<a href="https://colab.research.google.com/github/solidbridge/Auto-GPT/blob/master/finalproject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Enhanced package installation with version control
import subprocess
import sys
import os
from typing import Dict, List, Optional
import warnings
warnings.filterwarnings('ignore')

def install_packages(packages: Dict[str, str]) -> None:
    """Pip-install every requirement in *packages*, one pip call per entry.

    Args:
        packages: Mapping of distribution name to a version specifier
            (for example ``{"pandas": ">=2.0.0"}``). Name and specifier are
            concatenated to form the requirement passed to pip.

    A failing installation is reported on stdout and does not stop the
    remaining packages from being attempted.
    """
    pip_install = [sys.executable, "-m", "pip", "install"]
    for name, spec in packages.items():
        requirement = f"{name}{spec}"
        try:
            subprocess.check_call(pip_install + [requirement])
        except subprocess.CalledProcessError as err:
            print(f"❌ Failed to install {name}: {err}")
            continue
        print(f"✅ Successfully installed {requirement}")

# Core packages with compatible versions.
# Each value is a pip version specifier that install_packages() joins directly
# onto the package name (e.g. "pandas" + ">=2.0.0"). Every entry is a lower
# bound only, so pip may resolve to any newer release that satisfies it.
REQUIRED_PACKAGES = {
    "langchain": ">=0.1.0",
    "langchain-openai": ">=0.1.0",
    "langchain-experimental": ">=0.0.50",
    "langchain-community": ">=0.0.20",
    "openai": ">=1.0.0",
    "pandas": ">=2.0.0",
    "numpy": ">=1.24.0",
    "matplotlib": ">=3.7.0",
    "seaborn": ">=0.12.0",
    "scikit-learn": ">=1.3.0",
    "plotly": ">=5.15.0",
    "streamlit": ">=1.25.0",
    "faiss-cpu": ">=1.7.0",
    "pypdf": ">=3.15.0",
    "python-dotenv": ">=1.0.0",
    "sqlalchemy": ">=2.0.0",
    "chromadb": ">=0.4.0"
}

# Install packages (runs pip once per entry when this cell executes)
install_packages(REQUIRED_PACKAGES)


In [None]:
# Core data science libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Statistical analysis
from scipy import stats
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

# LangChain imports (fixed)
# NOTE: an ImportError here is only printed, not re-raised, and the except
# branch does not install anything despite its message. If one import fails,
# the names from that statement onward stay undefined for the cells below.
try:
    from langchain_experimental.agents.agent_toolkits.pandas.base import create_pandas_dataframe_agent
    from langchain_openai import ChatOpenAI
    from langchain.agents import create_sql_agent, AgentType
    from langchain_community.utilities import SQLDatabase
    from langchain_community.document_loaders import PyPDFLoader
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain_community.vectorstores import FAISS, Chroma
    from langchain_openai import OpenAIEmbeddings
    from langchain.chains import RetrievalQA
    from langchain.prompts import PromptTemplate
    print("✅ All LangChain imports successful")
except ImportError as e:
    print(f"⚠️ Import warning: {e}")
    print("Installing missing dependencies...")

# Database and file handling
import sqlite3
from sqlalchemy import create_engine
import json
from pathlib import Path
import logging

# Environment setup
from dotenv import load_dotenv
load_dotenv()


In [None]:
class DataScienceConfig:
    """Configuration management for the data science assistant.

    Instantiating the class configures logging, reads the OpenAI API key from
    the ``OPENAI_API_KEY`` environment variable and defines the per-model
    settings exposed as ``model_configs``.

    Attributes set by ``__init__``:
        logger: ``logging.Logger`` named after this module.
        openai_api_key: The key string, or ``None`` when the variable is unset.
        model_configs: Mapping of model name to ``ChatOpenAI`` keyword args.
    """

    def __init__(self):
        self.setup_logging()
        self.setup_api_keys()
        self.setup_model_config()

    def setup_logging(self):
        """Configure root logging (file + console) and create ``self.logger``."""
        # basicConfig() does nothing once the root logger has handlers, so only
        # build the handlers when they will actually be attached. Otherwise
        # each additional instance opens a log file handle that is never used
        # or closed.
        if not logging.getLogger().handlers:
            logging.basicConfig(
                level=logging.INFO,
                format='%(asctime)s - %(levelname)s - %(message)s',
                handlers=[
                    logging.FileHandler('data_science_assistant.log'),
                    logging.StreamHandler()
                ]
            )
        self.logger = logging.getLogger(__name__)

    def setup_api_keys(self):
        """Read the OpenAI API key from the environment.

        A missing or empty key is logged as a warning rather than raised, so
        the rest of the configuration can still be built.
        """
        self.openai_api_key = os.environ.get('OPENAI_API_KEY')
        if not self.openai_api_key:
            self.logger.warning("OpenAI API key not found in environment variables. Please add it to Colab Secrets Manager with the name OPENAI_API_KEY.")

        # The key is read from os.environ itself, so there is nothing to write
        # back: the previous "export if absent" step could only ever run with
        # a value of None, which os.environ rejects with a TypeError.

    def setup_model_config(self):
        """Define the keyword arguments used to construct each chat model."""
        self.model_configs = {
            'gpt-4': {
                'model_name': 'gpt-4',
                'temperature': 0.1,
                'max_tokens': 2000
            },
            'gpt-3.5-turbo': {
                'model_name': 'gpt-3.5-turbo',
                'temperature': 0.1,
                'max_tokens': 1500
            }
        }

# Initialize configuration: constructing it configures logging and reads
# OPENAI_API_KEY from the environment.
config = DataScienceConfig()

In [None]:
class LLMManager:
    """Enhanced LLM management with multiple model support.

    Builds one ``ChatOpenAI`` client per entry in ``config.model_configs`` and
    hands them out by name. Annotations are written as strings so the class
    can still be defined when the LangChain import failed earlier.
    """

    # Model returned when the requested one is not available.
    FALLBACK_MODEL = 'gpt-3.5-turbo'

    def __init__(self, config: "DataScienceConfig"):
        self.config = config
        self.models = {}
        self.setup_models()

    def setup_models(self):
        """Instantiate every configured model, logging failures individually."""
        for model_name, model_config in self.config.model_configs.items():
            try:
                # Pass the API key explicitly
                self.models[model_name] = ChatOpenAI(**model_config, openai_api_key=self.config.openai_api_key)
                self.config.logger.info(f"✅ {model_name} model initialized successfully")
            except Exception as e:
                self.config.logger.error(f"❌ Failed to initialize {model_name}: {e}")

    def get_model(self, model_name: str = 'gpt-4') -> "ChatOpenAI":
        """Return the requested model, or the fallback model if it is missing.

        Args:
            model_name: Key of the model in ``self.models``.

        Raises:
            KeyError: If neither the requested model nor the fallback model
                was initialized.
        """
        # Look the fallback up lazily: the old dict.get(..., self.models[...])
        # form evaluated it eagerly and raised KeyError even when the
        # requested model was available.
        if model_name in self.models:
            return self.models[model_name]
        if self.FALLBACK_MODEL in self.models:
            return self.models[self.FALLBACK_MODEL]
        raise KeyError(
            f"Model '{model_name}' is not available and fallback "
            f"'{self.FALLBACK_MODEL}' was not initialized"
        )

    def test_model_connection(self, model_name: str = 'gpt-4') -> bool:
        """Send a short prompt and report whether the expected reply came back.

        Returns:
            True when the response text contains "successful"; False on any
            exception (which is logged) or an unexpected reply.
        """
        try:
            model = self.get_model(model_name)
            response = model.invoke("Hello, please respond with 'Connection successful'")
            return "successful" in response.content.lower()
        except Exception as e:
            self.config.logger.error(f"Model connection test failed: {e}")
            return False

# Build the shared LLM manager from the configuration created above
llm_manager = LLMManager(config)

# Round-trip one prompt through the default model and report the outcome
print(
    "✅ LLM connection successful"
    if llm_manager.test_model_connection()
    else "❌ LLM connection failed"
)

In [None]:
class DataProcessor:
    """Enhanced data processing with multiple data sources.

    Keeps loaded DataFrames in ``self.data`` and the pandas agents built for
    them in ``self.agents``, both keyed by a caller-chosen name.
    """

    # Same logger object DataScienceConfig creates (identical module name), so
    # the class no longer needs the notebook-global ``config`` to exist.
    logger = logging.getLogger(__name__)

    def __init__(self, llm_manager: "LLMManager"):
        self.llm_manager = llm_manager
        self.data = {}
        self.agents = {}

    def load_csv_data(self, file_path: str, name: str = 'default') -> Optional[pd.DataFrame]:
        """Load a CSV file and register it under *name*.

        Returns:
            The loaded DataFrame, or ``None`` when reading failed (the error
            is logged rather than raised).
        """
        try:
            df = pd.read_csv(file_path)
            self.data[name] = df
            self.logger.info(f"✅ Loaded {name} dataset: {df.shape}")
            return df
        except Exception as e:
            self.logger.error(f"❌ Failed to load {file_path}: {e}")
            return None

    def create_enhanced_agent(self, df: pd.DataFrame, name: str = 'default') -> None:
        """Create a pandas DataFrame agent for *df* and store it under *name*.

        Failures are logged and leave ``self.agents`` unchanged.
        """
        try:
            # NOTE(review): confirm that the installed langchain-experimental
            # forwards handle_parsing_errors to the executor; some releases
            # expect it inside agent_executor_kwargs instead.
            agent = create_pandas_dataframe_agent(
                llm=self.llm_manager.get_model(),
                df=df,
                verbose=True,
                allow_dangerous_code=True,  # Required for pandas agent
                agent_type=AgentType.OPENAI_FUNCTIONS,
                handle_parsing_errors=True,
                max_iterations=5
            )
            self.agents[name] = agent
            self.logger.info(f"✅ Created agent for {name}")
        except Exception as e:
            self.logger.error(f"❌ Failed to create agent for {name}: {e}")

    def analyze_dataset(self, df: pd.DataFrame) -> Dict:
        """Summarize *df*: shape, columns, dtypes, missing values, statistics.

        Returns:
            Dict with keys ``shape``, ``columns``, ``dtypes``,
            ``missing_values``, ``numerical_summary`` and
            ``categorical_summary``. The last maps each object/category column
            to its unique count and its most frequent values.
        """
        # describe() raises on a frame without columns; report an empty
        # summary for that case instead of failing the whole analysis.
        numerical_summary = df.describe().to_dict() if len(df.columns) else {}

        analysis = {
            'shape': df.shape,
            'columns': list(df.columns),
            'dtypes': df.dtypes.to_dict(),
            'missing_values': df.isnull().sum().to_dict(),
            'numerical_summary': numerical_summary,
            'categorical_summary': {}
        }

        # Categorical analysis (pandas 'category' columns are included too)
        for col in df.select_dtypes(include=['object', 'category']).columns:
            analysis['categorical_summary'][col] = {
                'unique_count': df[col].nunique(),
                'top_values': df[col].value_counts().head().to_dict()
            }

        return analysis

# Initialize data processor (starts with empty `data` and `agents` registries)
data_processor = DataProcessor(llm_manager)

# Sample data creation for demonstration
def create_sample_dataset():
    """Build a reproducible 1000-row synthetic employee DataFrame.

    The global NumPy RNG is seeded with 42, so repeated calls return the same
    data. Two relationships are injected after sampling: rows whose
    performance score exceeds 85 are marked as promoted, and rows with job
    satisfaction above 8 then receive a 10-point performance bonus.
    """
    np.random.seed(42)
    row_count = 1000

    # The draws below must stay in this order: each one advances the seeded
    # global generator, so reordering would change every column.
    age = np.random.normal(45, 15, row_count).astype(int)
    income = np.random.lognormal(10, 0.5, row_count)
    education_years = np.random.randint(8, 20, row_count)
    experience = np.random.randint(0, 30, row_count)
    job_satisfaction = np.random.randint(1, 11, row_count)
    department = np.random.choice(['IT', 'Sales', 'HR', 'Finance', 'Marketing'], row_count)
    performance_score = np.random.normal(75, 15, row_count)
    training_hours = np.random.exponential(20, row_count)
    promotion = np.random.choice([0, 1], row_count, p=[0.7, 0.3])

    frame = pd.DataFrame({
        'age': age,
        'income': income,
        'education_years': education_years,
        'experience': experience,
        'job_satisfaction': job_satisfaction,
        'department': department,
        'performance_score': performance_score,
        'training_hours': training_hours,
        'promotion': promotion,
    })

    # Inject correlations; promotion is decided before the satisfaction bonus
    # is added to the score.
    top_performers = frame['performance_score'] > 85
    frame.loc[top_performers, 'promotion'] = 1
    highly_satisfied = frame['job_satisfaction'] > 8
    frame.loc[highly_satisfied, 'performance_score'] += 10

    return frame

# Create and load sample data.
# The frame is registered directly in `data_processor.data` (no CSV involved)
# and a pandas agent is built for it under the same 'employee_data' name.
sample_df = create_sample_dataset()
data_processor.data['employee_data'] = sample_df
data_processor.create_enhanced_agent(sample_df, 'employee_data')


In [None]:
class AdvancedVisualizer:
    """Advanced visualization suite with multiple chart types.

    Offers a static matplotlib/seaborn EDA grid, an interactive Plotly view
    and a set of target-conditioned statistical plots.
    """

    def __init__(self, data_processor: "DataProcessor"):
        self.data_processor = data_processor
        self.setup_style()

    def setup_style(self):
        """Apply the matplotlib style sheet and seaborn palette used by all plots."""
        plt.style.use('seaborn-v0_8')
        sns.set_palette("husl")

    def create_comprehensive_eda(self, df: pd.DataFrame, target_col: str = None) -> None:
        """Draw a 2x2 EDA grid for *df*.

        Panels: missing-value heatmap, correlation matrix, histograms of the
        numeric columns, and value counts of the first object column.

        Args:
            df: Data to explore.
            target_col: Accepted for API compatibility; not used by the plots.
        """

        # 1. Dataset Overview
        fig, axes = plt.subplots(2, 2, figsize=(20, 15))
        fig.suptitle('Comprehensive Exploratory Data Analysis', fontsize=16, y=1.02)

        # Missing values heatmap
        sns.heatmap(df.isnull(), cbar=True, ax=axes[0,0], cmap='viridis')
        axes[0,0].set_title('Missing Values Heatmap')

        # Correlation matrix
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 1:
            corr_matrix = df[numeric_cols].corr()
            sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=axes[0,1])
            axes[0,1].set_title('Correlation Matrix')

        # Distribution of numerical variables.
        # DataFrame.hist() needs one Axes per column; handed a single Axes for
        # several columns it clears the entire figure, erasing the two
        # heatmaps above. Overlay the histograms on this one panel instead.
        if len(numeric_cols) > 0:
            hist_ax = axes[1,0]
            for col in numeric_cols:
                hist_ax.hist(df[col].dropna(), bins=30, alpha=0.7, label=str(col))
            hist_ax.set_title('Distribution of Numerical Variables')
            hist_ax.legend()

        # Categorical variables
        categorical_cols = df.select_dtypes(include=['object']).columns
        if len(categorical_cols) > 0:
            cat_col = categorical_cols[0]
            df[cat_col].value_counts().plot(kind='bar', ax=axes[1,1])
            axes[1,1].set_title(f'Distribution of {cat_col}')
            axes[1,1].tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.show()

    def create_interactive_dashboard(self, df: pd.DataFrame) -> None:
        """Show an interactive scatter plot and correlation heatmap.

        Nothing is drawn unless *df* has at least two numeric columns.
        """
        numeric_cols = df.select_dtypes(include=[np.number]).columns

        if len(numeric_cols) >= 2:
            # Interactive scatter plot; points are colored by the last column
            # only when that column has object dtype.
            fig1 = px.scatter(df, x=numeric_cols[0], y=numeric_cols[1],
                             color=df.columns[-1] if df.columns[-1] in df.select_dtypes(include=['object']).columns else None,
                             title="Interactive Scatter Plot",
                             hover_data=[col for col in df.columns[:5]])
            fig1.show()

            # Interactive correlation heatmap
            corr_matrix = df[numeric_cols].corr()
            fig2 = px.imshow(corr_matrix, text_auto=True, aspect="auto",
                           title="Interactive Correlation Heatmap")
            fig2.show()

    def create_advanced_statistical_plots(self, df: pd.DataFrame, target_col: str) -> None:
        """Plot numeric features against *target_col*.

        Draws box, violin and density plots grouped by the target. When the
        target has object/category dtype, it also prints a t-test (two
        groups) or one-way ANOVA (more groups) for the first numeric feature.
        Prints a message and returns early if *target_col* is not a column.
        """

        if target_col not in df.columns:
            print(f"Target column '{target_col}' not found")
            return

        fig, axes = plt.subplots(2, 3, figsize=(24, 16))
        fig.suptitle(f'Advanced Statistical Analysis for {target_col}', fontsize=16)

        # Box plots for numerical variables
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        numeric_cols = [col for col in numeric_cols if col != target_col]

        if len(numeric_cols) >= 2:
            sns.boxplot(data=df, x=target_col, y=numeric_cols[0], ax=axes[0,0])
            axes[0,0].set_title(f'{numeric_cols[0]} by {target_col}')

            sns.violinplot(data=df, x=target_col, y=numeric_cols[1], ax=axes[0,1])
            axes[0,1].set_title(f'{numeric_cols[1]} Distribution by {target_col}')

        # Density plots (at most the first three target values)
        if len(numeric_cols) >= 1:
            for category in df[target_col].unique()[:3]:
                subset = df[df[target_col] == category]
                sns.kdeplot(data=subset[numeric_cols[0]], ax=axes[0,2], label=f'{category}')
            axes[0,2].set_title(f'{numeric_cols[0]} Density by {target_col}')
            axes[0,2].legend()

        # Statistical test results visualization
        if df[target_col].dtype in ['object', 'category'] and len(numeric_cols) >= 1:
            # Perform ANOVA or t-test
            groups = [df[df[target_col] == cat][numeric_cols[0]].dropna() for cat in df[target_col].unique()]
            # Both tests need at least two groups; f_oneway raises otherwise.
            if len(groups) >= 2:
                if len(groups) == 2:
                    stat, p_value = stats.ttest_ind(groups[0], groups[1])
                    test_name = "T-Test"
                else:
                    stat, p_value = stats.f_oneway(*groups)
                    test_name = "ANOVA"

                axes[1,0].text(0.1, 0.5, f'{test_name}\nStatistic: {stat:.4f}\nP-value: {p_value:.4f}',
                              fontsize=12, transform=axes[1,0].transAxes)
                axes[1,0].set_title('Statistical Test Results')
                axes[1,0].axis('off')

        plt.tight_layout()
        plt.show()

# Initialize visualizer (also applies the global matplotlib/seaborn style)
visualizer = AdvancedVisualizer(data_processor)

# Create comprehensive EDA for sample data.
# 'promotion' is the 0/1 column produced by create_sample_dataset().
visualizer.create_comprehensive_eda(sample_df, 'promotion')
visualizer.create_interactive_dashboard(sample_df)
visualizer.create_advanced_statistical_plots(sample_df, 'promotion')


In [None]:
class MLPipeline:
    """Comprehensive machine learning pipeline.

    Prepares a DataFrame for classification, trains several scikit-learn
    classifiers, plots a comparison and optionally tunes hyperparameters.
    """

    # Same logger object DataScienceConfig creates (identical module name), so
    # the class no longer needs the notebook-global ``config`` to exist.
    logger = logging.getLogger(__name__)

    # Models that are fitted and evaluated on the standardized features.
    _SCALED_MODELS = frozenset({'Logistic Regression', 'Neural Network'})

    def __init__(self, llm_manager: "LLMManager"):
        self.llm_manager = llm_manager
        self.models = {}
        self.results = {}
        self.scalers = {}

    def prepare_data(self, df: pd.DataFrame, target_col: str, test_size: float = 0.2):
        """Encode, split, impute and scale *df* for classification.

        Args:
            df: Input data containing the features and *target_col*.
            target_col: Name of the label column.
            test_size: Fraction of rows held out for testing.

        Returns:
            ``(X_train, X_test, y_train, y_test, X_train_scaled,
            X_test_scaled)``. The scaled arrays come from a ``StandardScaler``
            fitted on the training split and stored in
            ``self.scalers[target_col]``.
        """
        # Separate features and target
        X = df.drop(columns=[target_col])
        y = df[target_col]

        # Handle categorical variables (object columns become integer codes)
        X_processed = X.copy()
        for col in X.select_dtypes(include=['object']).columns:
            X_processed[col] = LabelEncoder().fit_transform(X[col].astype(str))

        # Split data (stratified so both splits keep the class balance)
        X_train, X_test, y_train, y_test = train_test_split(
            X_processed, y, test_size=test_size, random_state=42, stratify=y
        )

        # Handle missing values with statistics from the training split only,
        # so no information from the test rows leaks into the imputed values.
        train_means = X_train.mean()
        X_train = X_train.fillna(train_means)
        X_test = X_test.fillna(train_means)

        # Scale features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        self.scalers[target_col] = scaler

        return X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled

    def train_multiple_models(self, X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled):
        """Train and evaluate four classifiers.

        Logistic regression and the neural network use the scaled features;
        the tree ensembles use the unscaled ones. A model that fails is logged
        and left out of the result.

        Returns:
            Dict keyed by model name with the fitted model, test accuracy,
            AUC, 5-fold CV mean/std, predictions, positive-class scores and
            the text classification report. Also stored in ``self.results``.
        """

        models = {
            'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
            'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
            'Gradient Boosting': GradientBoostingClassifier(random_state=42),
            'Neural Network': MLPClassifier(random_state=42, max_iter=1000)
        }

        results = {}

        for name, model in models.items():
            try:
                # Use scaled data for models that benefit from it
                if name in self._SCALED_MODELS:
                    fit_X, eval_X = X_train_scaled, X_test_scaled
                else:
                    fit_X, eval_X = X_train, X_test

                model.fit(fit_X, y_train)
                y_pred = model.predict(eval_X)
                # Column 1 is the score of the second class, so this (and the
                # AUC below) assumes a binary target.
                y_pred_proba = model.predict_proba(eval_X)[:, 1]

                # Calculate metrics
                accuracy = model.score(eval_X, y_test)
                auc_score = roc_auc_score(y_test, y_pred_proba)

                # Cross-validation
                cv_scores = cross_val_score(model, fit_X, y_train, cv=5, scoring='accuracy')

                results[name] = {
                    'model': model,
                    'accuracy': accuracy,
                    'auc_score': auc_score,
                    'cv_mean': cv_scores.mean(),
                    'cv_std': cv_scores.std(),
                    'y_pred': y_pred,
                    'y_pred_proba': y_pred_proba,
                    'classification_report': classification_report(y_test, y_pred)
                }

                self.logger.info(f"✅ {name} trained successfully - Accuracy: {accuracy:.4f}")

            except Exception as e:
                self.logger.error(f"❌ Failed to train {name}: {e}")

        self.results = results
        return results

    def create_model_comparison(self, results: Dict) -> None:
        """Plot accuracy, AUC, CV scores and Random Forest feature importances.

        Args:
            results: Mapping as returned by ``train_multiple_models``.
        """

        # Model performance comparison
        model_names = list(results.keys())
        accuracies = [results[name]['accuracy'] for name in model_names]
        auc_scores = [results[name]['auc_score'] for name in model_names]
        cv_means = [results[name]['cv_mean'] for name in model_names]

        fig, axes = plt.subplots(2, 2, figsize=(20, 15))
        fig.suptitle('Comprehensive Model Comparison', fontsize=16)

        # Accuracy comparison
        bars1 = axes[0,0].bar(model_names, accuracies, color='skyblue', alpha=0.7)
        axes[0,0].set_title('Model Accuracy Comparison')
        axes[0,0].set_ylabel('Accuracy')
        axes[0,0].tick_params(axis='x', rotation=45)

        # Add value labels on bars
        for bar, acc in zip(bars1, accuracies):
            axes[0,0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                          f'{acc:.3f}', ha='center', va='bottom')

        # AUC comparison
        bars2 = axes[0,1].bar(model_names, auc_scores, color='lightcoral', alpha=0.7)
        axes[0,1].set_title('Model AUC Score Comparison')
        axes[0,1].set_ylabel('AUC Score')
        axes[0,1].tick_params(axis='x', rotation=45)

        for bar, auc_value in zip(bars2, auc_scores):
            axes[0,1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                          f'{auc_value:.3f}', ha='center', va='bottom')

        # Cross-validation scores
        cv_stds = [results[name]['cv_std'] for name in model_names]
        axes[1,0].errorbar(model_names, cv_means, yerr=cv_stds, fmt='o', capsize=5, capthick=2)
        axes[1,0].set_title('Cross-Validation Scores with Standard Deviation')
        axes[1,0].set_ylabel('CV Accuracy')
        axes[1,0].tick_params(axis='x', rotation=45)

        # Feature importance (for Random Forest)
        if 'Random Forest' in results:
            forest = results['Random Forest']['model']
            feature_importance = forest.feature_importances_
            # Label bars with the real column names when the model was fitted
            # on a DataFrame; fall back to positional names otherwise.
            feature_names = getattr(forest, 'feature_names_in_', None)
            if feature_names is None:
                feature_names = [f'Feature_{i}' for i in range(len(feature_importance))]
            sorted_idx = np.argsort(feature_importance)[-10:]  # Top 10 features

            axes[1,1].barh(range(len(sorted_idx)), feature_importance[sorted_idx])
            axes[1,1].set_yticks(range(len(sorted_idx)))
            axes[1,1].set_yticklabels([str(feature_names[i]) for i in sorted_idx])
            axes[1,1].set_title('Top 10 Feature Importances (Random Forest)')
            axes[1,1].set_xlabel('Importance')

        plt.tight_layout()
        plt.show()

    def hyperparameter_tuning(self, X_train, y_train, model_name: str = 'Random Forest'):
        """Grid-search hyperparameters for a supported model.

        Args:
            X_train: Training features.
            y_train: Training labels.
            model_name: ``'Random Forest'`` or ``'Gradient Boosting'``.

        Returns:
            The best estimator found by 5-fold grid search, or ``None`` (with
            a logged warning) when *model_name* is not supported.
        """

        if model_name == 'Random Forest':
            model = RandomForestClassifier(random_state=42)
            param_grid = {
                'n_estimators': [50, 100, 200],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            }
        elif model_name == 'Gradient Boosting':
            model = GradientBoostingClassifier(random_state=42)
            param_grid = {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7]
            }
        else:
            self.logger.warning(f"Hyperparameter tuning not implemented for {model_name}")
            return None

        grid_search = GridSearchCV(
            model, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1
        )

        self.logger.info(f"Starting hyperparameter tuning for {model_name}...")
        grid_search.fit(X_train, y_train)

        self.logger.info(f"Best parameters for {model_name}: {grid_search.best_params_}")
        self.logger.info(f"Best CV score: {grid_search.best_score_:.4f}")

        return grid_search.best_estimator_

# Initialize ML pipeline
ml_pipeline = MLPipeline(llm_manager)

# Train models on sample data: predict the binary 'promotion' column, keeping
# both the raw and the standardized feature splits for the different models.
X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled = ml_pipeline.prepare_data(sample_df, 'promotion')
ml_results = ml_pipeline.train_multiple_models(X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled)
ml_pipeline.create_model_comparison(ml_results)


In [None]:
class DatabaseManager:
    """Enhanced database management with multiple database support.

    Keeps SQLAlchemy engines in ``self.engines`` and the SQL agents built on
    them in ``self.agents``, both keyed by database file name.
    """

    # Same logger object DataScienceConfig creates (identical module name), so
    # the class no longer needs the notebook-global ``config`` to exist.
    logger = logging.getLogger(__name__)

    def __init__(self, llm_manager: "LLMManager"):
        self.llm_manager = llm_manager
        self.engines = {}
        self.agents = {}

    def create_sample_database(self, df: pd.DataFrame, db_name: str = 'sample_db.sqlite'):
        """Write *df* and a small departments table to a SQLite file.

        Existing ``employee_data`` and ``departments`` tables are replaced.

        Returns:
            The SQLAlchemy engine (also stored in ``self.engines[db_name]``),
            or ``None`` if anything failed; the error is logged.
        """
        try:
            engine = create_engine(f'sqlite:///{db_name}')
            df.to_sql('employee_data', engine, if_exists='replace', index=False)

            # Create additional tables for demonstration
            departments_df = pd.DataFrame({
                'department': ['IT', 'Sales', 'HR', 'Finance', 'Marketing'],
                'budget': [500000, 300000, 200000, 400000, 250000],
                'head_count': [50, 80, 25, 30, 40]
            })
            departments_df.to_sql('departments', engine, if_exists='replace', index=False)

            self.engines[db_name] = engine
            self.logger.info(f"✅ Created database: {db_name}")
            return engine

        except Exception as e:
            self.logger.error(f"❌ Failed to create database: {e}")
            return None

    def create_sql_agent(self, db_name: str):
        """Create SQL agent for natural language querying.

        Returns:
            The agent (also stored in ``self.agents[db_name]``), or ``None``
            when *db_name* has no registered engine or creation failed.
        """
        try:
            if db_name not in self.engines:
                self.logger.error(f"Database {db_name} not found")
                return None

            db = SQLDatabase(self.engines[db_name])
            # Inside a method body this name resolves to the module-level
            # LangChain factory, not to this method of the same name.
            agent = create_sql_agent(
                llm=self.llm_manager.get_model(),
                db=db,
                agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
                verbose=True,
                handle_parsing_errors=True,
                max_iterations=5
            )

            self.agents[db_name] = agent
            self.logger.info(f"✅ Created SQL agent for {db_name}")
            return agent

        except Exception as e:
            self.logger.error(f"❌ Failed to create SQL agent: {e}")
            return None

    def execute_natural_language_query(self, db_name: str, query: str):
        """Run a natural-language *query* through the agent for *db_name*.

        Returns:
            Whatever the agent's ``invoke`` returns, or ``None`` when no agent
            is registered for *db_name* or the call raised (logged).
        """
        try:
            if db_name not in self.agents:
                self.logger.error(f"SQL agent for {db_name} not found")
                return None

            return self.agents[db_name].invoke(query)

        except Exception as e:
            self.logger.error(f"❌ Query execution failed: {e}")
            return None

# Initialize database manager
db_manager = DatabaseManager(llm_manager)

# Create sample database (writes sample_db.sqlite in the working directory)
# and build the SQL agent on top of it.
engine = db_manager.create_sample_database(sample_df)
sql_agent = db_manager.create_sql_agent('sample_db.sqlite')

# Example queries (only listed below; none of them is executed in this cell)
sample_queries = [
    "How many employees are in each department?",
    "What's the average performance score by department?",
    "Show me the top 10 employees by performance score",
    "What's the correlation between education years and income?",
    "Which department has the highest promotion rate?"
]

print("Sample Natural Language Queries:")
for i, query in enumerate(sample_queries, 1):
    print(f"{i}. {query}")


In [None]:
class RAGSystem:
    """Enhanced Retrieval-Augmented Generation system.

    Builds FAISS vector stores from text, wraps them in ``RetrievalQA``
    chains and answers questions against them. Stores and chains are kept in
    ``self.vector_stores`` and ``self.qa_chains``, keyed by store name.
    """

    # Same logger object DataScienceConfig creates (identical module name), so
    # the class no longer needs the notebook-global ``config`` to exist.
    logger = logging.getLogger(__name__)

    def __init__(self, llm_manager: "LLMManager"):
        self.llm_manager = llm_manager
        self.embeddings = OpenAIEmbeddings()
        self.vector_stores = {}
        self.qa_chains = {}

    def create_document_store(self, documents: List[str], store_name: str = 'default'):
        """Create vector store from documents.

        Args:
            documents: Either plain strings or LangChain document objects
                (decided by the type of the first element).
            store_name: Key under which the store is registered.

        Returns:
            The FAISS vector store, or ``None`` when *documents* is empty or
            building the store failed; the error is logged.
        """
        if not documents:
            self.logger.error("❌ Failed to create vector store: no documents provided")
            return None

        try:
            # Split documents into chunks
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=200,
                length_function=len
            )

            # Handle both string documents and document objects
            if isinstance(documents[0], str):
                splits = text_splitter.split_text('\n\n'.join(documents))
                vector_store = FAISS.from_texts(splits, self.embeddings)
            else:
                # Index the chunks as documents so their metadata is kept and
                # shows up in the source documents returned by queries.
                splits = text_splitter.split_documents(documents)
                vector_store = FAISS.from_documents(splits, self.embeddings)

            self.vector_stores[store_name] = vector_store

            self.logger.info(f"✅ Created vector store '{store_name}' with {len(splits)} chunks")
            return vector_store

        except Exception as e:
            self.logger.error(f"❌ Failed to create vector store: {e}")
            return None

    def create_qa_chain(self, store_name: str = 'default'):
        """Create QA chain for the vector store.

        The chain retrieves the 3 nearest chunks and stuffs them into a
        custom prompt.

        Returns:
            The chain (also stored in ``self.qa_chains[store_name]``), or
            ``None`` when the store is unknown or creation failed.
        """
        try:
            if store_name not in self.vector_stores:
                self.logger.error(f"Vector store '{store_name}' not found")
                return None

            vector_store = self.vector_stores[store_name]

            # Custom prompt template
            prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Answer:"""

            PROMPT = PromptTemplate(
                template=prompt_template, input_variables=["context", "question"]
            )

            qa_chain = RetrievalQA.from_chain_type(
                llm=self.llm_manager.get_model(),
                chain_type="stuff",
                retriever=vector_store.as_retriever(search_kwargs={"k": 3}),
                chain_type_kwargs={"prompt": PROMPT},
                return_source_documents=True
            )

            self.qa_chains[store_name] = qa_chain
            self.logger.info(f"✅ Created QA chain for '{store_name}'")
            return qa_chain

        except Exception as e:
            self.logger.error(f"❌ Failed to create QA chain: {e}")
            return None

    def query_documents(self, question: str, store_name: str = 'default'):
        """Query the document store.

        Returns:
            ``{'answer': ..., 'source_documents': ...}`` taken from the chain
            result, or ``None`` when no chain exists for *store_name* or the
            call raised (logged).
        """
        try:
            if store_name not in self.qa_chains:
                self.logger.error(f"QA chain for '{store_name}' not found")
                return None

            result = self.qa_chains[store_name].invoke({"query": question})

            return {
                'answer': result['result'],
                'source_documents': result['source_documents']
            }

        except Exception as e:
            self.logger.error(f"❌ Query failed: {e}")
            return None

# Initialize RAG system (constructs the OpenAI embeddings client)
rag_system = RAGSystem(llm_manager)

# Create sample documents for demonstration.
# These are plain strings, so create_document_store() joins them with blank
# lines and splits the combined text into chunks.
sample_documents = [
    """
    Data Science Best Practices:
    1. Always start with exploratory data analysis (EDA)
    2. Handle missing values appropriately
    3. Feature engineering is crucial for model performance
    4. Cross-validation prevents overfitting
    5. Document your methodology and assumptions
    """,
    """
    Machine Learning Model Selection Guidelines:
    - Use logistic regression for simple binary classification
    - Random Forest is good for handling mixed data types
    - Gradient boosting often provides high accuracy
    - Neural networks work well with large datasets
    - Always compare multiple models using cross-validation
    """,
    """
    Data Visualization Principles:
    - Choose the right chart type for your data
    - Use color purposefully and consistently
    - Avoid chart junk and unnecessary elements
    - Make sure text is readable
    - Include proper labels and legends
    - Consider your audience when designing visualizations
    """
]

# Create document store and QA chain (both registered as 'data_science_guide')
vector_store = rag_system.create_document_store(sample_documents, 'data_science_guide')
qa_chain = rag_system.create_qa_chain('data_science_guide')

# Example RAG queries (only listed below; none of them is executed in this cell)
rag_queries = [
    "What are the key steps in data science projects?",
    "How should I select a machine learning model?",
    "What are important principles for data visualization?",
    "How can I prevent overfitting in my models?"
]

print("Sample RAG Queries:")
for i, query in enumerate(rag_queries, 1):
    print(f"{i}. {query}")


In [None]:
class TestSuite:
    """Smoke-test suite for the data science assistant's components.

    Each ``test_*`` method runs a few boolean checks against one component,
    records them in ``self.test_results`` and returns them as a
    ``{check_name: bool}`` dict. A check that raises an ``Exception`` counts
    as failed instead of aborting the run; ``KeyboardInterrupt`` and
    ``SystemExit`` are deliberately not swallowed.
    """

    def __init__(self, data_processor, ml_pipeline, db_manager, rag_system):
        """Store the components under test; no checks run at construction."""
        self.data_processor = data_processor
        self.ml_pipeline = ml_pipeline
        self.db_manager = db_manager
        self.rag_system = rag_system
        self.test_results = {}

    @staticmethod
    def _passes(check):
        """Return ``bool(check())``, treating any ``Exception`` as a failure."""
        try:
            return bool(check())
        except Exception:
            return False

    def test_data_processing(self):
        """Check dataset loading, agent creation and dataset analysis.

        Returns:
            dict with the boolean results ``data_loading``,
            ``agent_creation`` and ``data_analysis``.
        """
        tests = {}
        processor = self.data_processor

        # Test data loading. ``test_df`` is bound up front so the analysis
        # check below never sees an undefined name when the lookup fails.
        test_df = None
        try:
            test_df = processor.data.get('employee_data')
            tests['data_loading'] = test_df is not None and not test_df.empty
        except Exception:
            tests['data_loading'] = False

        # Test agent creation
        tests['agent_creation'] = self._passes(
            lambda: processor.agents.get('employee_data') is not None
        )

        # Test data analysis (cannot pass without a loaded dataset)
        def analysis_ok():
            if test_df is None:
                return False
            analysis = processor.analyze_dataset(test_df)
            return 'shape' in analysis and 'columns' in analysis

        tests['data_analysis'] = self._passes(analysis_ok)

        self.test_results['data_processing'] = tests
        return tests

    def test_ml_pipeline(self):
        """Check that models were trained and each beats 0.5 accuracy.

        Returns:
            dict with the boolean results ``model_training`` and
            ``model_accuracy``.
        """
        tests = {}

        # Test model training
        tests['model_training'] = self._passes(
            lambda: len(self.ml_pipeline.results) > 0
        )

        # Test model accuracy
        def accuracy_ok():
            accuracies = [
                result['accuracy'] for result in self.ml_pipeline.results.values()
            ]
            # all() over an empty list is True, so require at least one model;
            # otherwise an untrained pipeline would "pass" the accuracy check.
            return bool(accuracies) and all(acc > 0.5 for acc in accuracies)

        tests['model_accuracy'] = self._passes(accuracy_ok)

        self.test_results['ml_pipeline'] = tests
        return tests

    def test_database_integration(self):
        """Check that the sample database has an engine and a SQL agent.

        Returns:
            dict with the boolean results ``database_creation`` and
            ``sql_agent``.
        """
        tests = {}

        # Test database creation
        tests['database_creation'] = self._passes(
            lambda: 'sample_db.sqlite' in self.db_manager.engines
        )

        # Test SQL agent
        tests['sql_agent'] = self._passes(
            lambda: 'sample_db.sqlite' in self.db_manager.agents
        )

        self.test_results['database'] = tests
        return tests

    def test_rag_system(self):
        """Check the demo vector store, its QA chain and one live query.

        Returns:
            dict with the boolean results ``vector_store``, ``qa_chain`` and
            ``query_functionality``.
        """
        tests = {}

        # Test vector store creation
        tests['vector_store'] = self._passes(
            lambda: 'data_science_guide' in self.rag_system.vector_stores
        )

        # Test QA chain
        tests['qa_chain'] = self._passes(
            lambda: 'data_science_guide' in self.rag_system.qa_chains
        )

        # Test query functionality
        def query_ok():
            result = self.rag_system.query_documents(
                "What is data science?", 'data_science_guide'
            )
            return result is not None and 'answer' in result

        tests['query_functionality'] = self._passes(query_ok)

        self.test_results['rag_system'] = tests
        return tests

    def run_all_tests(self):
        """Run every test group, print a report and return the raw results.

        Returns:
            dict mapping a display category name to that group's
            ``{check_name: bool}`` dict.
        """
        print("🧪 Running Comprehensive Test Suite...")
        print("=" * 50)

        # Run individual test suites
        data_tests = self.test_data_processing()
        ml_tests = self.test_ml_pipeline()
        db_tests = self.test_database_integration()
        rag_tests = self.test_rag_system()

        # Generate test report
        all_tests = {
            'Data Processing': data_tests,
            'ML Pipeline': ml_tests,
            'Database Integration': db_tests,
            'RAG System': rag_tests
        }

        print("\n📊 Test Results Summary:")
        print("-" * 30)

        total_tests = 0
        passed_tests = 0

        for category, tests in all_tests.items():
            # Results are booleans, so summing them counts the passes.
            category_passed = sum(tests.values())
            category_total = len(tests)
            total_tests += category_total
            passed_tests += category_passed

            status = "✅" if category_passed == category_total else "⚠️"
            print(f"{status} {category}: {category_passed}/{category_total} tests passed")

            for test_name, result in tests.items():
                status_icon = "✅" if result else "❌"
                print(f"   {status_icon} {test_name}")

        print("-" * 30)
        overall_status = "✅" if passed_tests == total_tests else "⚠️"
        print(f"{overall_status} Overall: {passed_tests}/{total_tests} tests passed")

        if passed_tests == total_tests:
            print("\n🎉 All tests passed! System is ready for use.")
        else:
            print(f"\n⚠️ {total_tests - passed_tests} test(s) failed. Please check the issues above.")

        return all_tests

# Initialize and run test suite
# The suite runs as soon as this cell executes. The module-level `test_suite`
# instance is also what InteractiveDemo's menu option 6 calls, and
# `test_results` holds the per-category result dicts from this first run.
test_suite = TestSuite(data_processor, ml_pipeline, db_manager, rag_system)
test_results = test_suite.run_all_tests()


In [None]:
class InteractiveDemo:
    """Text-menu front end that walks through each assistant component.

    All interaction goes through ``input()`` and ``print()``. Two menu
    actions rely on module-level names defined in other cells of this file:
    option 6 calls ``test_suite`` and the visualization demo reads
    ``sample_df``.
    """

    def __init__(self, data_processor, ml_pipeline, db_manager, rag_system, visualizer):
        """Store the components the individual demos operate on."""
        self.data_processor = data_processor
        self.ml_pipeline = ml_pipeline
        self.db_manager = db_manager
        self.rag_system = rag_system
        self.visualizer = visualizer

    def run_interactive_demo(self):
        """Show the menu in a loop until the user enters ``0``."""
        print("🚀 Welcome to the Enhanced LangChain Data Science Assistant!")
        print("=" * 60)

        # Menu key -> handler. The lambda defers the lookup of the global
        # ``test_suite`` until option 6 is actually chosen.
        actions = {
            '1': self.demo_data_analysis,
            '2': self.demo_ml_pipeline,
            '3': self.demo_database_queries,
            '4': self.demo_rag_system,
            '5': self.demo_visualizations,
            '6': lambda: test_suite.run_all_tests(),
        }

        while True:
            print("\nAvailable Demos:")
            print("1. 📊 Data Analysis with Natural Language")
            print("2. 🤖 Machine Learning Pipeline")
            print("3. 🗄️ Natural Language Database Queries")
            print("4. 📚 RAG Document Q&A")
            print("5. 📈 Advanced Visualizations")
            print("6. 🧪 Run System Tests")
            print("0. Exit")

            choice = input("\nEnter your choice (0-6): ").strip()

            if choice == '0':
                print("👋 Thank you for using the Data Science Assistant!")
                break

            action = actions.get(choice)
            if action is None:
                print("❌ Invalid choice. Please try again.")
                continue
            action()

    def demo_data_analysis(self):
        """Ask the 'employee_data' agent one natural-language question.

        An empty input falls back to the first sample question. Any error
        raised while querying the agent is printed instead of propagating.
        """
        print("\n📊 Data Analysis Demo")
        print("-" * 30)

        sample_questions = [
            "How many rows and columns are in the dataset?",
            "What is the average age of employees?",
            "Show me the correlation between performance score and promotion",
            "What percentage of employees got promoted?",
            "Create a summary of the dataset"
        ]

        print("Sample questions you can ask:")
        for i, q in enumerate(sample_questions, 1):
            print(f"{i}. {q}")

        question = input("\nEnter your question (or press Enter for a sample): ").strip()
        if not question:
            question = sample_questions[0]

        try:
            agent = self.data_processor.agents.get('employee_data')
            if not agent:
                print("❌ Data analysis agent not available")
                return

            print(f"\n🤖 Processing: {question}")
            response = agent.invoke(question)
            # Agent executors typically return a dict such as
            # {'input': ..., 'output': ...}; show just the answer text when
            # that key is present, otherwise print the response as-is.
            if isinstance(response, dict) and 'output' in response:
                response = response['output']
            print(f"📋 Answer: {response}")
        except Exception as e:
            print(f"❌ Error: {e}")

    def demo_ml_pipeline(self):
        """Print accuracy and AUC for every model in ``ml_pipeline.results``."""
        print("\n🤖 Machine Learning Pipeline Demo")
        print("-" * 40)

        if not self.ml_pipeline.results:
            print("❌ ML models not trained yet")
            return

        print("📈 Model Performance Summary:")
        for model_name, results in self.ml_pipeline.results.items():
            print(f"  • {model_name}: {results['accuracy']:.3f} accuracy, {results['auc_score']:.3f} AUC")

    def demo_database_queries(self):
        """Run one natural-language question against the sample database.

        Does nothing further when the user enters an empty question.
        """
        print("\n🗄️ Natural Language Database Queries Demo")
        print("-" * 50)

        if 'sample_db.sqlite' not in self.db_manager.agents:
            print("❌ Database agent not available")
            return

        question = input("Enter your database question: ").strip()
        if not question:
            return

        try:
            result = self.db_manager.execute_natural_language_query('sample_db.sqlite', question)
            print(f"📋 Result: {result}")
        except Exception as e:
            print(f"❌ Error: {e}")

    def demo_rag_system(self):
        """Ask one question against the 'data_science_guide' QA chain.

        Does nothing further when the user enters an empty question.
        """
        print("\n📚 RAG Document Q&A Demo")
        print("-" * 30)

        if 'data_science_guide' not in self.rag_system.qa_chains:
            print("❌ RAG system not available")
            return

        question = input("Ask a question about data science: ").strip()
        if not question:
            return

        try:
            result = self.rag_system.query_documents(question, 'data_science_guide')
            if result:
                print(f"📋 Answer: {result['answer']}")
                print(f"📄 Sources: {len(result['source_documents'])} documents used")
            else:
                # An empty/None result means the query did not produce an
                # answer; say so rather than returning to the menu silently.
                print("❌ No answer returned. Check the log output for details.")
        except Exception as e:
            print(f"❌ Error: {e}")

    def demo_visualizations(self):
        """Render the EDA dashboard for the module-level ``sample_df``.

        Any exception (including ``sample_df`` being undefined) is printed
        as a visualization error.
        """
        print("\n📈 Advanced Visualizations Demo")
        print("-" * 40)

        try:
            self.visualizer.create_comprehensive_eda(sample_df, 'promotion')
            print("✅ Visualization created successfully!")
        except Exception as e:
            print(f"❌ Visualization error: {e}")

# Initialize interactive demo
# Only constructs the object; the menu loop starts when
# demo.run_interactive_demo() is called (main() offers to do that).
demo = InteractiveDemo(data_processor, ml_pipeline, db_manager, rag_system, visualizer)


In [None]:
class ProductionUtils:
    """Helpers that write deployment artifacts into the current directory.

    All methods are static and overwrite existing files of the same name.
    Text files are written as UTF-8 explicitly: the generated Streamlit
    source contains emoji, which cannot be encoded under some platform
    default encodings.
    """

    @staticmethod
    def create_streamlit_app():
        """Write a skeleton Streamlit app to ``streamlit_app.py``.

        The generated file is a template: its import line and most page
        bodies are placeholders to be filled in by hand.
        """
        app_code = '''
import streamlit as st
import pandas as pd
import plotly.express as px
from your_enhanced_data_science_assistant import *

st.set_page_config(page_title="Data Science Assistant", layout="wide")

st.title("🤖 Enhanced LangChain Data Science Assistant")

# Sidebar navigation
st.sidebar.title("Navigation")
page = st.sidebar.selectbox("Choose a feature", [
    "Data Analysis", "Machine Learning", "Database Queries", "RAG Q&A", "Visualizations"
])

if page == "Data Analysis":
    st.header("📊 Data Analysis")

    uploaded_file = st.file_uploader("Upload CSV file", type="csv")
    if uploaded_file:
        df = pd.read_csv(uploaded_file)
        st.write("Dataset Preview:")
        st.dataframe(df.head())

        question = st.text_input("Ask a question about your data:")
        if question and st.button("Analyze"):
            # Process with your data agent
            st.write("Analysis results would appear here")

elif page == "Machine Learning":
    st.header("🤖 Machine Learning Pipeline")
    # ML interface code here

elif page == "Database Queries":
    st.header("🗄️ Natural Language Database Queries")
    # Database interface code here

elif page == "RAG Q&A":
    st.header("📚 Document Q&A")
    # RAG interface code here

elif page == "Visualizations":
    st.header("📈 Advanced Visualizations")
    # Visualization interface code here
'''

        # Explicit UTF-8: the template contains emoji, and Python reads
        # source files as UTF-8 by default.
        with open('streamlit_app.py', 'w', encoding='utf-8') as f:
            f.write(app_code)

        print("✅ Streamlit app created: streamlit_app.py")
        print("Run with: streamlit run streamlit_app.py")

    @staticmethod
    def create_docker_config():
        """Write ``Dockerfile`` and ``requirements.txt`` for the Streamlit app."""
        dockerfile = '''
FROM python:3.9-slim

WORKDIR /app

COPY requirements.txt .
RUN pip install -r requirements.txt

COPY . .

EXPOSE 8501

CMD ["streamlit", "run", "streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
'''

        requirements = '''
langchain>=0.1.0
langchain-openai>=0.1.0
langchain-experimental>=0.0.50
streamlit>=1.25.0
pandas>=2.0.0
plotly>=5.15.0
scikit-learn>=1.3.0
'''

        with open('Dockerfile', 'w', encoding='utf-8') as f:
            f.write(dockerfile)

        with open('requirements.txt', 'w', encoding='utf-8') as f:
            f.write(requirements)

        print("✅ Docker configuration created")
        print("Build with: docker build -t data-science-assistant .")
        print("Run with: docker run -p 8501:8501 data-science-assistant")

    @staticmethod
    def save_models_and_data(ml_pipeline, data_processor):
        """Pickle ``ml_pipeline.results`` and ``data_processor.data`` to disk.

        Writes ``trained_models.pkl`` and ``processed_data.pkl``.

        NOTE: pickle files execute arbitrary code when loaded; only load
        these files back from a trusted location.
        """
        import pickle

        # Save models
        with open('trained_models.pkl', 'wb') as f:
            pickle.dump(ml_pipeline.results, f)

        # Save processed data
        with open('processed_data.pkl', 'wb') as f:
            pickle.dump(data_processor.data, f)

        print("✅ Models and data saved successfully")

# Create production utilities
# Every ProductionUtils method is static, so this instance is just a
# convenient handle; main() calls the helpers through it.
prod_utils = ProductionUtils()


In [None]:
def main():
    """Report component status and offer the demo and deployment steps.

    Prints a ✅/❌ line for each expected module-level component. When all
    are present, asks whether to run the interactive demo and whether to
    write the production deployment files; otherwise prints a warning and
    returns without prompting.
    """
    print("🚀 Enhanced LangChain Data Science Assistant")
    print("=" * 50)

    # Check if all components are initialized.
    # Names are looked up through globals() so that a component whose cell
    # failed (leaving the name undefined) is reported as ❌ rather than
    # raising NameError before any status is printed.
    namespace = globals()
    component_names = {
        "Configuration": "config",
        "LLM Manager": "llm_manager",
        "Data Processor": "data_processor",
        "Visualizer": "visualizer",
        "ML Pipeline": "ml_pipeline",
        "Database Manager": "db_manager",
        "RAG System": "rag_system"
    }
    components_status = {
        label: namespace.get(name) is not None
        for label, name in component_names.items()
    }

    print("\n📋 Component Status:")
    for component, status in components_status.items():
        status_icon = "✅" if status else "❌"
        print(f"{status_icon} {component}")

    if not all(components_status.values()):
        print("\n⚠️ Some components failed to initialize. Please check the errors above.")
        return

    print("\n🎉 All components initialized successfully!")
    print("\nAvailable features:")
    print("• 📊 Automated EDA and data analysis")
    print("• 🤖 Multiple ML model training and comparison")
    print("• 📈 Advanced interactive visualizations")
    print("• 🗄️ Natural language database querying")
    print("• 📚 RAG-based document Q&A")
    print("• 🧪 Comprehensive testing suite")
    print("• 🚀 Production deployment utilities")

    # Option to run interactive demo
    run_demo = input("\nWould you like to run the interactive demo? (y/n): ").lower().strip()
    if run_demo == 'y':
        demo.run_interactive_demo()
    else:
        print("\n💡 You can run demo.run_interactive_demo() anytime to explore features!")

    # Option to create production files
    create_prod = input("\nWould you like to create production deployment files? (y/n): ").lower().strip()
    if create_prod == 'y':
        prod_utils.create_streamlit_app()
        prod_utils.create_docker_config()
        prod_utils.save_models_and_data(ml_pipeline, data_processor)

# Execute main function
# Runs main() only when this code executes as the top-level program
# (__name__ == "__main__"); importing the file as a module skips it.
if __name__ == "__main__":
    main()
