In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "colab-prototype"
   },
   "source": [
    "# 🚀 GenAI Autonomous Data Agent - Colab GPU Prototype\n",
    "\n",
    "This notebook demonstrates GPU-accelerated data processing and ML training using NVIDIA RAPIDS.\n",
    "\n",
    "## Setup Instructions for Google Colab:\n",
    "1. Runtime → Change runtime type → GPU\n",
    "2. Run the installation cell below\n",
    "3. Restart runtime when prompted\n",
    "4. Run all subsequent cells"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "install-packages"
   },
   "outputs": [],
   "source": [
    "# Install dependencies for Colab.\n",
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip install -q pandas numpy scikit-learn plotly matplotlib seaborn\n",
    "%pip install -q xgboost lightgbm catboost\n",
    "%pip install -q shap lime\n",
    "\n",
    "# Install RAPIDS from the NVIDIA package index.\n",
    "# NOTE(review): the -cu11 wheels target CUDA 11 - confirm this matches the runtime's CUDA version.\n",
    "%pip install -q cuml-cu11 cudf-cu11 --extra-index-url=https://pypi.nvidia.com\n",
    "\n",
    "# A failed install does not stop the cell, so report completion rather than success.\n",
    "print(\"✅ Installation commands finished - check the output above for errors.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "import-libraries"
   },
   "outputs": [],
   "source": [
    "# Import libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import plotly.express as px\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "# Try importing RAPIDS and record the outcome in GPU_AVAILABLE, which later cells branch on.\n",
    "# Exception (not only ImportError) is caught on purpose: when the wheels are installed but\n",
    "# no usable GPU/driver is present, the import can fail with other error types, and the\n",
    "# notebook should still fall back to the CPU path instead of stopping here.\n",
    "try:\n",
    "    import cudf\n",
    "    import cuml\n",
    "    print(\"✅ RAPIDS installed successfully!\")\n",
    "    print(f\"cuDF version: {cudf.__version__}\")\n",
    "    print(f\"cuML version: {cuml.__version__}\")\n",
    "    GPU_AVAILABLE = True\n",
    "except Exception as e:\n",
    "    print(f\"⚠️ RAPIDS not available: {e}\")\n",
    "    print(\"💻 Using CPU libraries\")\n",
    "    GPU_AVAILABLE = False\n",
    "\n",
    "# ML libraries\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor\n",
    "from sklearn.linear_model import LogisticRegression, LinearRegression\n",
    "from sklearn.metrics import accuracy_score, classification_report, r2_score, mean_squared_error\n",
    "import xgboost as xgb\n",
    "import lightgbm as lgb\n",
    "\n",
    "print(\"✅ All libraries imported successfully!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "load-explore-data"
   },
   "source": [
    "## 📊 Load and Explore Sample Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "generate-sample-data"
   },
   "outputs": [],
   "source": [
    "# Generate sample data (seeded so every run produces the same rows)\n",
    "np.random.seed(42)\n",
    "n_samples = 10000\n",
    "\n",
    "# E-commerce sample data: one NumPy array of length n_samples per column\n",
    "data = {\n",
    "    'customer_age': np.random.randint(18, 70, n_samples),\n",
    "    'annual_income': np.random.normal(50000, 20000, n_samples),\n",
    "    'spending_score': np.random.randint(1, 100, n_samples),\n",
    "    'product_category': np.random.choice(['Electronics', 'Clothing', 'Home'], n_samples),\n",
    "    'purchase_amount': np.random.exponential(100, n_samples),\n",
    "    'days_since_last_purchase': np.random.randint(1, 365, n_samples)\n",
    "}\n",
    "\n",
    "# Create target variable: 1 only when all three thresholds are exceeded\n",
    "data['high_value_customer'] = (\n",
    "    (data['annual_income'] > 60000) & \n",
    "    (data['spending_score'] > 60) & \n",
    "    (data['purchase_amount'] > 150)\n",
    ").astype(int)\n",
    "\n",
    "# Create DataFrame (cuDF when RAPIDS imported successfully, pandas otherwise)\n",
    "if GPU_AVAILABLE:\n",
    "    df = cudf.DataFrame(data)\n",
    "    print(\"🚀 Using GPU-accelerated cuDF\")\n",
    "else:\n",
    "    df = pd.DataFrame(data)\n",
    "    print(\"💻 Using CPU pandas\")\n",
    "\n",
    "print(f\"Dataset shape: {df.shape}\")\n",
    "print(f\"High-value customers: {data['high_value_customer'].sum()} ({data['high_value_customer'].sum()/n_samples*100:.1f}%)\")\n",
    "\n",
    "# Preview the first rows. This has to be the cell's final top-level expression:\n",
    "# an expression nested inside an if/else block is evaluated but never rendered.\n",
    "df.head().to_pandas() if GPU_AVAILABLE else df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "gpu-accelerated-eda"
   },
   "source": [
    "## 🔍 GPU-Accelerated EDA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "basic-statistics"
   },
   "outputs": [],
   "source": [
    "# Basic statistics (a cuDF result is converted to pandas for printing)\n",
    "print(\"📊 Dataset Statistics:\")\n",
    "stats = df.describe()\n",
    "print(stats.to_pandas() if GPU_AVAILABLE else stats)\n",
    "\n",
    "# Correlation matrix, computed in pandas; corr_data is reused by the histogram cell below\n",
    "numeric_cols = ['customer_age', 'annual_income', 'spending_score', 'purchase_amount', 'days_since_last_purchase']\n",
    "corr_data = df[numeric_cols].to_pandas() if GPU_AVAILABLE else df[numeric_cols]\n",
    "correlation_matrix = corr_data.corr()\n",
    "\n",
    "# Plot correlation heatmap\n",
    "plt.figure(figsize=(10, 8))\n",
    "sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')\n",
    "# Plain-text title: matplotlib's default font has no glyph for the emoji, which would\n",
    "# be drawn as an empty box and trigger a missing-glyph warning.\n",
    "plt.title('Feature Correlation Matrix')\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "data-visualization"
   },
   "outputs": [],
   "source": [
    "# Distribution plots: one histogram per feature on a 2x2 grid\n",
    "fig, axes = plt.subplots(2, 2, figsize=(15, 10))\n",
    "\n",
    "# (column, bins, color, title, x-label) for each panel, in row-major grid order\n",
    "hist_specs = [\n",
    "    ('customer_age', 20, 'skyblue', 'Age Distribution', 'Age'),\n",
    "    ('annual_income', 30, 'lightgreen', 'Annual Income Distribution', 'Annual Income'),\n",
    "    ('spending_score', 20, 'orange', 'Spending Score Distribution', 'Spending Score'),\n",
    "    ('purchase_amount', 30, 'pink', 'Purchase Amount Distribution', 'Purchase Amount'),\n",
    "]\n",
    "\n",
    "# axes.flat walks the grid row by row, matching the order of hist_specs\n",
    "for ax, (column, n_bins, color, title, xlabel) in zip(axes.flat, hist_specs):\n",
    "    ax.hist(corr_data[column], bins=n_bins, alpha=0.7, color=color)\n",
    "    ax.set_title(title)\n",
    "    ax.set_xlabel(xlabel)\n",
    "    ax.set_ylabel('Count')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "gpu-ml-training"
   },
   "source": [
    "## 🤖 GPU-Accelerated Machine Learning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "prepare-ml-data"
   },
   "outputs": [],
   "source": [
    "# Build the modelling table: integer-encode product_category and end up with a pandas frame\n",
    "df_ml = df.copy()\n",
    "if not GPU_AVAILABLE:\n",
    "    # CPU path: df is already pandas, so encode on a separate copy\n",
    "    df_pandas = df_ml.copy()\n",
    "    df_pandas['product_category'] = pd.Categorical(df_pandas['product_category']).codes\n",
    "else:\n",
    "    # GPU path: encode with cuDF first, then convert for sklearn compatibility\n",
    "    df_ml['product_category'] = df_ml['product_category'].astype('category').cat.codes\n",
    "    df_pandas = df_ml.to_pandas()\n",
    "\n",
    "# Model inputs (X) and the binary label (y)\n",
    "feature_cols = [\n",
    "    'customer_age',\n",
    "    'annual_income',\n",
    "    'spending_score',\n",
    "    'product_category',\n",
    "    'purchase_amount',\n",
    "    'days_since_last_purchase',\n",
    "]\n",
    "X = df_pandas[feature_cols]\n",
    "y = df_pandas['high_value_customer']\n",
    "\n",
    "# 80/20 split, stratified on y so both parts keep the same positive-class share\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X,\n",
    "    y,\n",
    "    test_size=0.2,\n",
    "    random_state=42,\n",
    "    stratify=y,\n",
    ")\n",
    "\n",
    "print(f\"Training set size: {X_train.shape}\")\n",
    "print(f\"Test set size: {X_test.shape}\")\n",
    "print(f\"Positive class in training: {y_train.mean():.3f}\")\n",
    "print(f\"Positive class in test: {y_test.mean():.3f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "train-multiple-models"
   },
   "outputs": [],
   "source": [
    "# Train multiple models and compare test accuracy and fit time\n",
    "import time\n",
    "\n",
    "models = {\n",
    "    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),\n",
    "    'XGBoost': xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss'),\n",
    "    'LightGBM': lgb.LGBMClassifier(n_estimators=100, random_state=42, verbose=-1)\n",
    "}\n",
    "\n",
    "# name -> {'accuracy', 'training_time', 'model'}; read by the SHAP, chart and summary cells\n",
    "results = {}\n",
    "\n",
    "for name, model in models.items():\n",
    "    print(f\"\\n🔄 Training {name}...\")\n",
    "    \n",
    "    # perf_counter is a monotonic, high-resolution clock meant for measuring intervals;\n",
    "    # time.time() follows the wall clock and can jump if the system time is adjusted.\n",
    "    start_time = time.perf_counter()\n",
    "    model.fit(X_train, y_train)\n",
    "    training_time = time.perf_counter() - start_time\n",
    "    \n",
    "    # Score on the held-out test set\n",
    "    y_pred = model.predict(X_test)\n",
    "    accuracy = accuracy_score(y_test, y_pred)\n",
    "    \n",
    "    results[name] = {\n",
    "        'accuracy': accuracy,\n",
    "        'training_time': training_time,\n",
    "        'model': model\n",
    "    }\n",
    "    \n",
    "    print(f\"✅ {name} - Accuracy: {accuracy:.4f}, Time: {training_time:.2f}s\")\n",
    "\n",
    "# Find best model by accuracy (on a tie, the model listed first in `models` wins)\n",
    "best_model = max(results.keys(), key=lambda x: results[x]['accuracy'])\n",
    "print(f\"\\n🏆 Best Model: {best_model} (Accuracy: {results[best_model]['accuracy']:.4f})\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "model-explainability"
   },
   "source": [
    "## 💡 Model Explainability with SHAP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "shap-analysis"
   },
   "outputs": [],
   "source": [
    "# Explain the best model's predictions with SHAP (skipped if SHAP is unavailable)\n",
    "try:\n",
    "    import shap\n",
    "    \n",
    "    # Get the best model and the rows to explain\n",
    "    best_model_obj = results[best_model]['model']\n",
    "    X_explain = X_test.iloc[:100]  # Use first 100 samples\n",
    "    \n",
    "    # Create SHAP explainer\n",
    "    explainer = shap.TreeExplainer(best_model_obj)\n",
    "    shap_values = explainer.shap_values(X_explain)\n",
    "    \n",
    "    # Depending on the SHAP version and model type, a classifier's SHAP values come back\n",
    "    # as a list with one array per class, as a 3-D array with a trailing class axis, or\n",
    "    # as a single 2-D array. Reduce the first two forms to the positive class (index 1)\n",
    "    # so summary_plot always receives a samples x features matrix.\n",
    "    if isinstance(shap_values, list):\n",
    "        shap_values = shap_values[1]\n",
    "    elif np.ndim(shap_values) == 3:\n",
    "        shap_values = shap_values[:, :, 1]\n",
    "    \n",
    "    # SHAP summary plot\n",
    "    plt.figure(figsize=(10, 6))\n",
    "    shap.summary_plot(shap_values, X_explain, feature_names=feature_cols, show=False)\n",
    "    \n",
    "    # Plain-text title: matplotlib's default font has no glyph for the emoji\n",
    "    plt.title('SHAP Feature Importance')\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "    \n",
    "    print(\"✅ SHAP analysis completed!\")\n",
    "    \n",
    "except ImportError:\n",
    "    print(\"⚠️ SHAP not available, skipping explainability analysis\")\n",
    "except Exception as e:\n",
    "    # Explainability is optional here: report the failure and let the notebook continue\n",
    "    print(f\"⚠️ SHAP analysis failed: {e}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "performance-visualization"
   },
   "source": [
    "## 📊 Performance Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "performance-charts"
   },
   "outputs": [],
   "source": [
    "# Collect per-model metrics in one consistent order for all three charts\n",
    "model_names = list(results)\n",
    "accuracies = [results[name]['accuracy'] for name in model_names]\n",
    "training_times = [results[name]['training_time'] for name in model_names]\n",
    "\n",
    "# Bar chart: test accuracy per model\n",
    "fig1 = px.bar(\n",
    "    x=model_names,\n",
    "    y=accuracies,\n",
    "    title='🏆 Model Accuracy Comparison',\n",
    "    labels={'x': 'Model', 'y': 'Accuracy'},\n",
    "    color=accuracies,\n",
    "    color_continuous_scale='viridis',\n",
    ")\n",
    "fig1.show()\n",
    "\n",
    "# Bar chart: fit time per model\n",
    "fig2 = px.bar(\n",
    "    x=model_names,\n",
    "    y=training_times,\n",
    "    title='⏱️ Training Time Comparison',\n",
    "    labels={'x': 'Model', 'y': 'Training Time (seconds)'},\n",
    "    color=training_times,\n",
    "    color_continuous_scale='plasma',\n",
    ")\n",
    "fig2.show()\n",
    "\n",
    "# Scatter: accuracy against fit time, one equally sized, labelled marker per model\n",
    "fig3 = px.scatter(\n",
    "    x=training_times,\n",
    "    y=accuracies,\n",
    "    text=model_names,\n",
    "    title='🎯 Accuracy vs Training Time',\n",
    "    labels={'x': 'Training Time (seconds)', 'y': 'Accuracy'},\n",
    "    size=[100]*len(model_names),\n",
    ")\n",
    "fig3.update_traces(textposition=\"top center\")\n",
    "fig3.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "key-insights"
   },
   "source": [
    "## 🎯 Key Insights and Next Steps\n",
    "\n",
    "### Performance Summary:\n",
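    "- **GPU Acceleration**: RAPIDS cuDF can provide significant speedups on large datasets; this notebook does not benchmark it, and the models above are trained with CPU estimators in their default settings\n",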
    "- **Best Model**: The model with highest accuracy can be used for production\n",
    "- **Feature Importance**: SHAP reveals which features drive predictions\n",
    "\n",
    "### Next Steps:\n",
    "1. **Scale Up**: Test with larger datasets (1M+ rows)\n",
    "2. **Feature Engineering**: Create more sophisticated features\n",
    "3. **Hyperparameter Tuning**: Optimize model parameters\n",
    "4. **Production Deployment**: Integrate with Streamlit app\n",
    "5. **Monitoring**: Set up model performance tracking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "save-results"
   },
   "outputs": [],
   "source": [
    "# Gather the headline facts of this run into one dict and print them\n",
    "summary = dict(\n",
    "    gpu_available=GPU_AVAILABLE,\n",
    "    dataset_size=df.shape,\n",
    "    best_model=best_model,\n",
    "    best_accuracy=results[best_model]['accuracy'],\n",
    "    models_tested=len(results),\n",
    ")\n",
    "\n",
    "print(\"📋 Experiment Summary:\")\n",
    "# One indented \"key: value\" line per entry, in insertion order\n",
    "print(\"\\n\".join(f\"  {key}: {value}\" for key, value in summary.items()))\n",
    "\n",
    "print(\"\\n🎉 Colab prototype completed successfully!\")\n",
    "print(\"Ready to integrate with the main Streamlit application.\")"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": [],
   "gpuType": "T4"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  },
  "accelerator": "GPU"
 },
 "nbformat": 4,
 "nbformat_minor": 0
}