In [28]:
"""
Machine Learning Master Pipeline System
Covers classification and regression models with full preprocessing and evaluation
"""

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    mean_squared_error, r2_score, mean_absolute_error
)
from sklearn.datasets import fetch_openml

# ======================
# Model Imports
# ======================

# Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier
)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Regression Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor
)
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor



In [29]:
# Data manipulation and analysis
import numpy as np
import pandas as pd

# Scikit-learn components
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    mean_squared_error, r2_score, mean_absolute_error
)
from sklearn.datasets import fetch_openml

# ======================
# Model Imports
# ======================
# Purpose: Import various ML algorithms for comparison

# Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier
)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Regression Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor
)
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


## Purpose: Create reusable components for feature engineering

In [30]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """
    Custom feature engineering transformer
    Handles:
    - Creating new features
    - Removing unnecessary columns
    - Data type conversions
    """

    def fit(self, X, y=None):
        """No fitting required for this transformer"""
        return self

    def transform(self, X):
        """
        Apply feature engineering transformations
        1. Create family_size feature from existing columns
        2. Create age groups using binning
        3. Remove irrelevant columns
        """
        X = X.copy()

        # Feature 1: Family size calculation
        if all(col in X.columns for col in ['sibsp', 'parch']):
            X['family_size'] = X['sibsp'] + X['parch']

        # Feature 2: Age categorization
        if 'age' in X.columns:
            bins = [0, 18, 35, 50, 100]
            labels = ['child', 'young', 'adult', 'senior']
            X['age_group'] = pd.cut(X['age'], bins=bins, labels=labels)

        # Remove non-feature columns
        cols_to_drop = ['name', 'ticket', 'cabin', 'boat', 'body', 'home.dest']
        return X.drop(
            columns=[c for c in cols_to_drop if c in X.columns],
            errors='ignore'  # Silently ignore missing columns
        )

##  Purpose: Create reusable preprocessing steps

In [31]:
def create_preprocessor(numeric_features, categorical_features):
    """
    Create preprocessing pipeline for different feature types
    Returns ColumnTransformer with:
    - Numeric pipeline: Imputation + Scaling
    - Categorical pipeline: Imputation + OneHotEncoding
    """

    # Numeric feature processing
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),  # Handle missing values
        ('scaler', StandardScaler())  # Standardize features
    ])

    # Categorical feature processing
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing categories
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # Convert to numeric
    ])

    return ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='drop'  # Discard unused columns
    )

## Purpose: Define models with optimized hyperparameters

In [32]:
CLASSIFICATION_MODELS = {
    'Logistic Regression': LogisticRegression(
        solver='liblinear',  # For small datasets
        random_state=42,
        max_iter=1000  # Ensure convergence
    ),
    'Decision Tree': DecisionTreeClassifier(
        random_state=42,
        max_depth=5  # Prevent overfitting
    ),
    'Random Forest': RandomForestClassifier(
        random_state=42,
        n_estimators=200  # More trees for better performance
    ),
    'SVM': SVC(
        probability=True,  # Enable predict_proba
        random_state=42,
        kernel='rbf'  # Non-linear kernel
    ),
    'K-Neighbors': KNeighborsClassifier(
        n_neighbors=7  # Optimal for many classification tasks
    ),
    'XGBoost': XGBClassifier(
        random_state=42,
        eval_metric='logloss'  # Proper metric for classification
    ),
    'LightGBM': LGBMClassifier(
        random_state=42,
        verbose=-1  # Suppress output
    ),
    'Gradient Boosting': GradientBoostingClassifier(
        random_state=42,
        n_estimators=200  # More estimators for better performance
    )
}

## Regression Models Dictionary

In [33]:
REGRESSION_MODELS = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(
        random_state=42,
        alpha=0.5  # Regularization strength
    ),
    'Lasso Regression': Lasso(
        random_state=42,
        alpha=0.01  # Moderate regularization
    ),
    'Decision Tree': DecisionTreeRegressor(
        random_state=42,
        max_depth=5  # Prevent overfitting
    ),
    'Random Forest': RandomForestRegressor(
        random_state=42,
        n_estimators=200  # More trees for better performance
    ),
    'SVR': SVR(
        kernel='rbf'  # Non-linear kernel
    ),
    'XGBoost': XGBRegressor(
        random_state=42
    ),
    'LightGBM': LGBMRegressor(
        random_state=42,
        verbose=-1  # Suppress output
    ),
    'Gradient Boosting': GradientBoostingRegressor(
        random_state=42,
        n_estimators=200  # More estimators for better performance
    )
}

## Purpose: Combine all components into a single workflow

In [34]:
def create_model_pipeline(model, numeric_features, categorical_features):
    """
    Create end-to-end ML pipeline:
    1. Feature engineering
    2. Preprocessing
    3. Model training
    """
    return Pipeline([
        ('feature_engineer', FeatureEngineer()),  # Custom transformations
        ('preprocessor', create_preprocessor(numeric_features, categorical_features)),  # Data preprocessing
        ('model', model)  # ML algorithm
    ])

## Purpose: Standardized model performance assessment

In [35]:
def evaluate_model(pipeline, X_test, y_test, problem_type):
    """
    Calculate performance metrics for given problem type
    Returns dictionary with appropriate metrics
    """
    y_pred = pipeline.predict(X_test)

    metrics = {}
    if problem_type == 'classification':
        metrics.update({
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred, average='macro'),  # Macro average for multi-class
            'recall': recall_score(y_test, y_pred, average='macro'),
            'f1': f1_score(y_test, y_pred, average='macro')
        })
    else:
        metrics.update({
            'mae': mean_absolute_error(y_test, y_pred),  # Mean Absolute Error
            'mse': mean_squared_error(y_test, y_pred),  # Mean Squared Error
            'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),  # Root MSE
            'r2': r2_score(y_test, y_pred)  # R-squared
        })
    return metrics

## Purpose: Automated testing of all models on selected dataset

In [36]:
def run_full_test_suite(dataset_name='titanic'):
    """
    Main testing function that:
    - Loads dataset
    - Splits data
    - Tests all models
    - Evaluates performance
    - Provides example predictions
    """

    # Dataset Configuration
    if dataset_name == 'titanic':
        # Classification problem setup
        data = fetch_openml('titanic', version=1, as_frame=True)
        df = data.frame.dropna(subset=['embarked'])  # Remove incomplete cases
        target = 'survived'
        problem_type = 'classification'
        numeric_features = ['age', 'fare', 'pclass', 'sibsp', 'parch']
        categorical_features = ['sex', 'embarked']

    elif dataset_name == 'diabetes':
        # Regression problem setup
        from sklearn.datasets import load_diabetes
        data = load_diabetes()
        df = pd.DataFrame(data.data, columns=data.feature_names)
        df['target'] = data.target  # Add target column
        target = 'target'
        problem_type = 'regression'
        numeric_features = data.feature_names  # All features are numeric
        categorical_features = []

    else:
        raise ValueError("Supported datasets: 'titanic' or 'diabetes'")

    # Data Preparation
    X = df.drop(target, axis=1)
    y = df[target]

    if problem_type == 'classification':
        y = y.astype(int)  # Ensure integer labels

    # Train-Test Split (80-20)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42  # Reproducible splits
    )

    # Model Selection
    models = CLASSIFICATION_MODELS if problem_type == 'classification' else REGRESSION_MODELS

    # Model Testing Loop
    results = {}
    for model_name, model in models.items():
        print(f"\n{'='*30}\nTraining {model_name}...\n{'='*30}")

        # Pipeline Creation and Training
        pipeline = create_model_pipeline(
            model=model,
            numeric_features=numeric_features,
            categorical_features=categorical_features
        )
        pipeline.fit(X_train, y_train)  # Train entire pipeline

        # Performance Evaluation
        results[model_name] = evaluate_model(
            pipeline, X_test, y_test, problem_type
        )

        # Cross-Validation (3-fold)
        cv_metric = 'accuracy' if problem_type == 'classification' else 'r2'
        cv_scores = cross_val_score(
            pipeline, X, y,
            cv=3,  # 3-fold cross-validation
            scoring=cv_metric
        )

        # Results Reporting
        print(f"\n{model_name} Results:")
        print("Test Metrics:", {k: f"{v:.3f}" for k, v in results[model_name].items()})
        print(f"CV {cv_metric.capitalize()} Scores:", [f"{s:.3f}" for s in cv_scores])
        print(f"Mean CV Score: {np.mean(cv_scores):.3f}")

        # Example Prediction
        sample = X_test.iloc[[0]]  # First test case
        print(f"\nSample Prediction ({model_name}):")
        print("Input Features:")
        print(sample)
        print("Predicted:", pipeline.predict(sample)[0])
        print("Actual:", y_test.iloc[0])
        print("="*60)

    return results

## Purpose: Run complete test suites

In [37]:
if __name__ == "__main__":
    # Run classification tests on Titanic dataset
    print("\n\n{'='*40}\nStarting Classification Test Suite\n{'='*40}")
    classification_results = run_full_test_suite('titanic')

    # Run regression tests on Diabetes dataset
    print("\n\n{'='*40}\nStarting Regression Test Suite\n{'='*40}")
    regression_results = run_full_test_suite('diabetes')



{'='*40}
Starting Classification Test Suite
{'='*40}

Training Logistic Regression...

Logistic Regression Results:
Test Metrics: {'accuracy': '0.771', 'precision': '0.763', 'recall': '0.758', 'f1': '0.760'}
CV Accuracy Scores: ['0.385', '0.794', '0.634']
Mean CV Score: 0.604

Sample Prediction (Logistic Regression):
Input Features:
      pclass                   name   sex  age  sibsp  parch ticket   fare  \
1166       3  Saade, Mr. Jean Nassr  male  NaN      0      0   2676  7.225   

     cabin embarked boat  body home.dest  
1166   NaN        C  NaN   NaN       NaN  
Predicted: 0
Actual: 0

Training Decision Tree...

Decision Tree Results:
Test Metrics: {'accuracy': '0.752', 'precision': '0.773', 'recall': '0.712', 'f1': '0.718'}
CV Accuracy Scores: ['0.424', '0.622', '0.618']
Mean CV Score: 0.555

Sample Prediction (Decision Tree):
Input Features:
      pclass                   name   sex  age  sibsp  parch ticket   fare  \
1166       3  Saade, Mr. Jean Nassr  male  NaN      0  




LightGBM Results:
Test Metrics: {'accuracy': '0.779', 'precision': '0.771', 'recall': '0.767', 'f1': '0.769'}
CV Accuracy Scores: ['0.431', '0.557', '0.625']
Mean CV Score: 0.538

Sample Prediction (LightGBM):
Input Features:
      pclass                   name   sex  age  sibsp  parch ticket   fare  \
1166       3  Saade, Mr. Jean Nassr  male  NaN      0      0   2676  7.225   

     cabin embarked boat  body home.dest  
1166   NaN        C  NaN   NaN       NaN  
Predicted: 0
Actual: 0

Training Gradient Boosting...

Gradient Boosting Results:
Test Metrics: {'accuracy': '0.782', 'precision': '0.776', 'recall': '0.767', 'f1': '0.771'}
CV Accuracy Scores: ['0.417', '0.571', '0.628']
Mean CV Score: 0.539

Sample Prediction (Gradient Boosting):
Input Features:
      pclass                   name   sex  age  sibsp  parch ticket   fare  \
1166       3  Saade, Mr. Jean Nassr  male  NaN      0      0   2676  7.225   

     cabin embarked boat  body home.dest  
1166   NaN        C  NaN   NaN 




Gradient Boosting Results:
Test Metrics: {'mae': '44.734', 'mse': '2981.884', 'rmse': '54.607', 'r2': '0.437'}
CV R2 Scores: ['0.411', '0.354', '0.342']
Mean CV Score: 0.369

Sample Prediction (Gradient Boosting):
Input Features:
          age       sex       bmi        bp        s1        s2        s3  \
287  0.045341 -0.044642 -0.006206 -0.015999  0.125019  0.125198  0.019187   

           s4        s5       s6  
287  0.034309  0.032432 -0.00522  
Predicted: 164.9203458082221
Actual: 219.0
