# Data Analysis Starter Notebook

This notebook provides a comprehensive template for data analysis projects. It includes data loading, exploration, visualization, and basic machine learning workflows.

## Table of Contents
1. [Install Required Packages](#install)
2. [Import Libraries](#imports)
3. [Load and Explore Data](#load-data)
4. [Data Cleaning and Preprocessing](#preprocessing)
5. [Exploratory Data Analysis](#eda)
6. [Feature Engineering](#feature-engineering)
7. [Machine Learning Modeling](#modeling)
8. [Results and Conclusions](#conclusions)

## 1. Install Required Packages {#install}

Install the essential Python packages and dependencies. The install commands in the cell below are commented out; uncomment and run them only if the packages are not already installed.

In [None]:
# Install required packages (uncomment if needed)
# !pip install pandas numpy matplotlib seaborn plotly scikit-learn jupyter

# For additional packages
# !pip install -r ../requirements.txt

## 2. Import Libraries {#imports}

Import all necessary libraries for data analysis and machine learning.

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Machine learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score

# System and utilities
import os
import sys
import warnings
from pathlib import Path

# Custom modules (add src to path)
sys.path.append('../src')
from data.loader import load_csv_data, basic_data_info
from visualization.plots import plot_distribution, correlation_heatmap
from features.engineering import create_datetime_features, encode_categorical_features

# Configuration
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings raised by the
# imported libraries — confirm that is intended for a starter template.
warnings.filterwarnings('ignore')
# Select the matplotlib style sheet and the seaborn colour palette that the
# plots in the following cells are drawn with.
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a recent
# matplotlib release — confirm against the pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
%matplotlib inline

print("All libraries imported successfully!")

## 3. Load and Explore Data {#load-data}

Load your dataset and perform initial exploration.

In [None]:
# Define data paths (relative to the notebook's working directory)
DATA_PATH = Path('../data')
RAW_DATA_PATH = DATA_PATH / 'raw'
PROCESSED_DATA_PATH = DATA_PATH / 'processed'

# Make sure the data directories exist. On a fresh checkout they may be
# missing, and DataFrame.to_csv does not create parent directories.
for data_dir in (RAW_DATA_PATH, PROCESSED_DATA_PATH):
    data_dir.mkdir(parents=True, exist_ok=True)

# Path.glob yields entries in arbitrary order; sort so that the file picked
# below is the same on every run and every machine.
raw_csv_files = sorted(RAW_DATA_PATH.glob('*.csv'))

if not raw_csv_files:
    # Create sample dataset if none exists
    print("No CSV files found in raw data directory.")
    print("Creating sample dataset for demonstration...")

    # Fixed seed so the generated sample is reproducible
    np.random.seed(42)
    sample_data = {
        'date': pd.date_range('2023-01-01', periods=1000, freq='D'),
        'category': np.random.choice(['A', 'B', 'C'], 1000),
        'value1': np.random.normal(50, 15, 1000),
        'value2': np.random.normal(100, 25, 1000),
        'target': np.random.randint(0, 2, 1000)
    }

    sample_path = RAW_DATA_PATH / 'sample_data.csv'
    df = pd.DataFrame(sample_data)
    df.to_csv(sample_path, index=False)
    print(f"Sample dataset saved to {sample_path}")
else:
    # Load existing data: the alphabetically first CSV in the raw directory
    data_file = raw_csv_files[0]
    print(f"Loading data from {data_file}")
    df = load_csv_data(data_file)

print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Print an overview of the dataset built from the summary returned by
# basic_data_info: shape, columns, per-column dtypes and missing-value counts.
info = basic_data_info(df)

print("Dataset Overview:")
print(f"Shape: {info['shape']}")
print(f"Columns: {info['columns']}")

print(f"\nData Types:")
for column_name, column_dtype in info['dtypes'].items():
    print(f"  {column_name}: {column_dtype}")

print(f"\nMissing Values:")
# Keep only the columns that actually have missing entries
missing_data = {
    column_name: missing_count
    for column_name, missing_count in info['missing_values'].items()
    if missing_count > 0
}
if not missing_data:
    print("  No missing values found")
for column_name, missing_count in missing_data.items():
    print(f"  {column_name}: {missing_count}")

In [None]:
# Descriptive statistics of the loaded frame; the last expression of the cell
# is what the notebook renders as output.
print("Statistical Summary:")
summary_statistics = df.describe()
summary_statistics

## 4. Data Cleaning and Preprocessing {#preprocessing}

Clean and preprocess the data for analysis.

In [None]:
# Data cleaning steps
# Work on a copy so the originally loaded frame `df` stays untouched.
df_clean = df.copy()

# Handle missing values: numeric columns get their median, text (object)
# columns get their most frequent value.
print("Handling missing values...")
numeric_columns = df_clean.select_dtypes(include=[np.number]).columns
categorical_columns = df_clean.select_dtypes(include=['object']).columns

# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on df_clean[col]: that chained in-place form operates
# on an intermediate object, is deprecated in pandas 2.x and leaves df_clean
# unchanged once Copy-on-Write is enabled.
for col in numeric_columns:
    if df_clean[col].isnull().any():
        df_clean[col] = df_clean[col].fillna(df_clean[col].median())

for col in categorical_columns:
    if df_clean[col].isnull().any():
        modes = df_clean[col].mode()
        # mode() is empty when every value in the column is missing; leave
        # such a column as it is rather than failing on the first-element lookup.
        if not modes.empty:
            df_clean[col] = df_clean[col].fillna(modes.iloc[0])

# Remove duplicates (the surviving rows keep their original index labels)
duplicate_count = df_clean.duplicated().sum()
print(f"Removing {duplicate_count} duplicate rows...")
df_clean = df_clean.drop_duplicates()

# Data type conversions: a CSV round-trip stores dates as text
if 'date' in df_clean.columns:
    df_clean['date'] = pd.to_datetime(df_clean['date'])

print(f"Cleaned dataset shape: {df_clean.shape}")
print("Data cleaning completed!")

## 5. Exploratory Data Analysis {#eda}

Explore the data through visualizations and statistical analysis.

In [None]:
# Draw a distribution plot for each of the first three numeric columns.
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns

# Slicing an empty Index yields nothing to iterate over, so the loop alone
# covers the "no numeric columns" case.
for numeric_col in numeric_cols[:3]:
    plot_distribution(df_clean, numeric_col)

In [None]:
# Correlation heatmap, drawn only when at least two numeric columns exist
has_multiple_numeric_cols = len(numeric_cols) >= 2
if has_multiple_numeric_cols:
    correlation_heatmap(df_clean)

In [None]:
# Bar chart of value frequencies for the first two text (object) columns.
categorical_cols = df_clean.select_dtypes(include=['object']).columns

# An empty slice simply skips the loop, so no separate emptiness check is used.
for cat_col in categorical_cols[:2]:
    category_counts = df_clean[cat_col].value_counts()

    plt.figure(figsize=(10, 6))
    category_counts.plot(kind='bar')
    plt.title(f'Distribution of {cat_col}')
    plt.xlabel(cat_col)
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# Interactive Plotly scatter of the first two numeric columns, coloured by the
# first categorical column when one is available.
if len(numeric_cols) >= 2:
    col1 = numeric_cols[0]
    col2 = numeric_cols[1]

    if len(categorical_cols) > 0:
        color_col = categorical_cols[0]
    else:
        color_col = None

    scatter_options = {
        'x': col1,
        'y': col2,
        'color': color_col,
        'title': f'{col2} vs {col1}',
        # Show every numeric column in the hover tooltip
        'hover_data': numeric_cols.tolist(),
    }
    fig = px.scatter(df_clean, **scatter_options)
    fig.show()

## 6. Feature Engineering {#feature-engineering}

Create new features and prepare data for modeling.

In [None]:
# Feature engineering
# Start from a copy so df_clean remains available for comparison below.
df_features = df_clean.copy()

# Create datetime features if date column exists (the original column is kept)
if 'date' in df_features.columns:
    df_features = create_datetime_features(df_features, 'date', drop_original=False)
    print("Created datetime features")

# Encode categorical variables: every object-typed column except 'date'
categorical_cols = [
    col
    for col in df_features.select_dtypes(include=['object']).columns
    if col != 'date'
]

if categorical_cols:
    df_features = encode_categorical_features(df_features, categorical_cols, method='onehot')
    print(f"Encoded categorical features: {categorical_cols}")

# Create interaction features (example): product of the first two numeric
# columns. The label column is excluded from the candidates, otherwise a
# feature computed from 'target' would leak the label into the model inputs.
numeric_cols = df_features.select_dtypes(include=[np.number]).columns
interaction_candidates = [col for col in numeric_cols if col != 'target']
if len(interaction_candidates) >= 2:
    col1, col2 = interaction_candidates[0], interaction_candidates[1]
    df_features[f'{col1}_{col2}_interaction'] = df_features[col1] * df_features[col2]
    print(f"Created interaction feature: {col1}_{col2}_interaction")

# Report the added columns in frame order (a set difference would print them
# in an arbitrary, run-dependent order).
original_columns = set(df_clean.columns)
new_columns = [col for col in df_features.columns if col not in original_columns]

print(f"\nFeature engineering completed. New shape: {df_features.shape}")
print(f"New columns: {new_columns}")

## 7. Machine Learning Modeling {#modeling}

Build and evaluate machine learning models.

In [None]:
# Prepare data for modeling
# Assuming 'target' column exists, otherwise create a sample target
if 'target' not in df_features.columns:
    # Create a sample target variable: 1 where the first numeric column is
    # above its median, 0 otherwise.
    # NOTE(review): that source column stays among the features, so models
    # trained on this fallback target can reproduce it trivially — it is for
    # demonstration only.
    numeric_col = df_features.select_dtypes(include=[np.number]).columns[0]
    df_features['target'] = (df_features[numeric_col] > df_features[numeric_col].median()).astype(int)
    print("Created sample binary target variable")

# Select features and target
feature_columns = [col for col in df_features.columns if col not in ['target', 'date']]
X = df_features[feature_columns]
y = df_features['target']

# Handle any remaining non-numeric columns.
# Boolean columns are kept as well: np.number does not cover the bool dtype,
# so one-hot indicator columns stored as booleans (presumably what the encoder
# produces on recent pandas — verify) would otherwise be dropped silently.
X = X.select_dtypes(include=[np.number, 'bool'])

# Represent boolean indicators as 0/1 integers so the feature matrix is purely numeric
bool_feature_columns = X.select_dtypes(include=['bool']).columns
X = X.astype({col: 'int64' for col in bool_feature_columns})

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Feature columns: {list(X.columns)}")

In [None]:
# Hold out 20% of the rows for testing, keeping the class proportions of y in
# both parts (stratified split with a fixed seed).
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

# Standardise the features: the scaler learns its statistics from the training
# rows only and applies them unchanged to the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled successfully")

In [None]:
# Train models
# Fit each classifier on the training split, score it on the held-out test
# split and with 5-fold cross-validation, and collect the metrics in `results`.
from sklearn.pipeline import make_pipeline

models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Logistic regression is trained on the standardised features; the random
    # forest is trained on the raw feature values.
    if name == 'Logistic Regression':
        fit_features, eval_features = X_train_scaled, X_test_scaled
        # For cross-validation the scaler is put inside a pipeline so it is
        # re-fitted on the training part of every fold. Scoring on the
        # pre-scaled matrix would let each validation fold influence the
        # scaling statistics.
        cv_estimator = make_pipeline(StandardScaler(), model)
    else:
        fit_features, eval_features = X_train, X_test
        cv_estimator = model

    # Train the model
    model.fit(fit_features, y_train)
    y_pred = model.predict(eval_features)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Cross-validation score (cross_val_score works on clones, so the fitted
    # `model` stored in `models` is not modified)
    cv_scores = cross_val_score(cv_estimator, X_train, y_train, cv=5)

    results[name] = {
        'accuracy': accuracy,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'predictions': y_pred
    }

    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"CV Score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    print(f"Classification Report:")
    print(classification_report(y_test, y_pred))

In [None]:
# Tabulate the metrics gathered in `results` (one row per model) and plot them.
comparison_rows = [
    {
        'Model': model_name,
        'Test Accuracy': scores['accuracy'],
        'CV Mean': scores['cv_mean'],
        'CV Std': scores['cv_std'],
    }
    for model_name, scores in results.items()
]
model_comparison = pd.DataFrame(
    comparison_rows,
    columns=['Model', 'Test Accuracy', 'CV Mean', 'CV Std'],
)

print("Model Comparison:")
print(model_comparison)

# Two side-by-side panels: test accuracy (left), cross-validation scores (right)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
ax1, ax2 = axes

# Left panel: bar chart of test accuracy per model
model_comparison.plot(x='Model', y='Test Accuracy', kind='bar', ax=ax1, color='skyblue')
ax1.set_title('Model Test Accuracy Comparison')
ax1.set_ylabel('Accuracy')
ax1.tick_params(axis='x', rotation=45)

# Right panel: mean CV score with one standard deviation as the error bar
ax2.errorbar(
    model_comparison['Model'],
    model_comparison['CV Mean'],
    yerr=model_comparison['CV Std'],
    capsize=5,
    marker='o',
    linewidth=2,
)
ax2.set_title('Cross-Validation Scores')
ax2.set_ylabel('CV Score')
ax2.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 8. Results and Conclusions {#conclusions}

Summarize findings and next steps.

In [None]:
# Rank the features by the importances of the fitted Random Forest and show
# the ten highest as a table and as a horizontal bar chart.
if 'Random Forest' in results:
    rf_model = models['Random Forest']

    importance_table = pd.DataFrame({
        'feature': X.columns,
        'importance': rf_model.feature_importances_
    })
    feature_importance = importance_table.sort_values('importance', ascending=False)
    top_features = feature_importance.head(10)

    print("Top 10 Most Important Features:")
    print(top_features)

    # Plot feature importance
    bar_positions = range(len(top_features))
    plt.figure(figsize=(10, 6))
    plt.barh(bar_positions, top_features['importance'])
    plt.yticks(bar_positions, top_features['feature'])
    plt.xlabel('Importance')
    plt.title('Top 10 Feature Importance (Random Forest)')
    # Flip the axis so the most important feature is drawn at the top
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

In [None]:
# Summary of analysis
# Prints a recap of the dataset, the cleaning and feature-engineering steps,
# the best model by test accuracy, and suggested follow-up work.
print("=" * 50)
print("DATA ANALYSIS SUMMARY")
print("=" * 50)

print(f"\n📊 Dataset Information:")
print(f"   • Original shape: {df.shape}")
print(f"   • Final shape: {df_features.shape}")
print(f"   • Features used for modeling: {X.shape[1]}")

print(f"\n🧹 Data Cleaning:")
print(f"   • Missing values handled: ✓")
print(f"   • Duplicates removed: ✓")
print(f"   • Data types optimized: ✓")

# The interaction feature is only created when enough numeric columns exist,
# so report it from the actual columns instead of always printing a check mark.
has_interaction_feature = any(
    str(col).endswith('_interaction') for col in df_features.columns
)

print(f"\n🔧 Feature Engineering:")
print(f"   • Datetime features: {'✓' if 'date' in df_clean.columns else '✗'}")
print(f"   • Categorical encoding: {'✓' if len(categorical_cols) > 0 else '✗'}")
print(f"   • Interaction features: {'✓' if has_interaction_feature else '✗'}")

print(f"\n🤖 Machine Learning:")
# Row with the highest test accuracy (the first one wins on ties)
best_model = model_comparison.loc[model_comparison['Test Accuracy'].idxmax(), 'Model']
best_accuracy = model_comparison['Test Accuracy'].max()
print(f"   • Best performing model: {best_model}")
print(f"   • Best accuracy: {best_accuracy:.4f}")

print(f"\n📈 Next Steps:")
print(f"   • Hyperparameter tuning for better performance")
print(f"   • Try advanced feature engineering techniques")
print(f"   • Explore other algorithms (XGBoost, Neural Networks)")
print(f"   • Deploy the best model for production use")

print("\n" + "=" * 50)
print("Analysis completed successfully! 🎉")
print("=" * 50)

In [None]:
# Save results
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before writing (no-op when it already does).
PROCESSED_DATA_PATH.mkdir(parents=True, exist_ok=True)

# Save processed data
processed_data_file = PROCESSED_DATA_PATH / 'processed_data.csv'
df_features.to_csv(processed_data_file, index=False)
print(f"Processed data saved to {processed_data_file}")

# Save model comparison results
model_comparison_file = PROCESSED_DATA_PATH / 'model_comparison.csv'
model_comparison.to_csv(model_comparison_file, index=False)
print(f"Model comparison saved to {model_comparison_file}")

print("\nAll results saved successfully! 💾")