In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

### 1) Loading the Data
    - Display the first 10 rows of the dataframe
    - Helper function (splitting numeric/categorical data)
    - Use bdplib to access the SSMS at work

In [None]:
# Location of the input file, relative to the notebook's working directory.
file_name = 'data/stud.csv'

# The file is pipe-separated, so the separator is given explicitly.
df = pd.read_csv(file_name, sep="|")

In [None]:
# Sample Data Generation (Replace this with your actual dataset)
# NOTE: this cell rebinds `df`, so the frame read from `file_name` in the
# previous cell is replaced by this six-row example.
data = dict(
    Make=['Toyota', 'Honda', 'Ford', 'Toyota', 'Honda', 'Ford'],
    Model=['Camry', 'Civic', 'F-150', 'Corolla', 'Accord', 'Mustang'],
    Term_Length=[36, 48, 60, 24, 36, 48],
    Vehicle_Age=[2, 5, 1, 3, 4, 2],
    State=['CA', 'TX', 'FL', 'CA', 'TX', 'FL'],
    Canceled=[0, 1, 0, 1, 0, 1],  # Target variable
)

df = pd.DataFrame(data)

In [None]:
# Show the first 10 rows of the dataframe (fewer if the frame is shorter).
# Being the last expression in the cell, the result is rendered as its output.
df.head(10)

### 2) Summary Statistics
    - Check for missing values
    - Check for duplicates
    - Check datatype info
    - Check unique values
    - Summary Stats

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Number of rows that repeat an earlier row exactly (first occurrences are not counted).
df.duplicated().sum()

In [None]:
# Print the column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Number of distinct values per column (missing values are not counted by default).
df.nunique()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

### 3) Data Visualization
    - State the question(s) you want the visualizations to answer
    - Distributions, histograms, kernel density estimates (KDE), bar charts, violin plots

### 4) Data Processing
    - Filling missing data
    - LabelEncoding the Categorical Variables
    - Scaling the data (standardization to zero mean and unit variance)
    - Define the Features and Target Variable
    - Splitting the dataframe into 80/20

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Label Encoding for categorical variables
# One encoder is created per categorical column and kept in `label_encoders`
# so the fitted label-to-code mapping stays available after this cell.
label_encoders = {column: LabelEncoder() for column in ('Make', 'Model', 'State')}

for column, le in label_encoders.items():
    # Replaces the text column in `df` with its integer codes.
    df[column] = le.fit_transform(df[column])

In [None]:
# Feature and Target Variables
X = df.drop('Canceled', axis=1)
y = df['Canceled']

In [None]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standard Scaling (for Logistic Regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 5) Data Modeling
    - Splitting the dataframe 80/20
    - Applying Machine Learning Model (Unsupervised and Supervised Learning)
    - Model Evaluation (Confusion Matrix, Precision/Recall/F1-Score, AUC/ROC Curve, K-Fold Cross-Validation)
 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score, roc_curve, auc

In [None]:
# Models
# Estimators whose fitting involves randomness are seeded so that a fresh
# Restart & Run All reproduces the same scores.
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest':       RandomForestClassifier(random_state=42),
    # NOTE(review): `use_label_encoder` looks deprecated in recent xgboost
    # releases -- confirm against the installed version before removing it.
    'XGBoost':             XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    'Neural Network':      MLPClassifier(activation = 'relu',
                                        alpha = 0.005,
                                        hidden_layer_sizes = (8,5,8),
                                        learning_rate = 'constant',
                                        solver = 'adam',
                                        max_iter=1000,
                                        random_state=42)
}

# Models that are trained and evaluated on the standardized feature matrices.
scaled_models = {'Logistic Regression', 'Neural Network'}

# Cross-validation splitter (up to 10 stratified folds).
# StratifiedKFold rejects n_splits larger than the size of every class, so the
# fold count is capped by the smallest class in the training target. This keeps
# the cell runnable on very small datasets; on larger data it stays at 10.
# The splitter does not depend on the model, so it is built once.
max_splits = 10
class_counts = np.unique(y_train, return_counts=True)[1]
n_splits = max(2, min(max_splits, int(class_counts.min())))
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Model Evaluation
results = {}
for name, model in models.items():
    if name in scaled_models:
        X_train_model = X_train_scaled
        X_test_model = X_test_scaled
    else:
        X_train_model = X_train
        X_test_model = X_test

    # Stratified K-Fold Cross-Validation on the training partition
    # NOTE(review): for the scaled models the scaler was fitted on the whole
    # training partition, so each validation fold has influenced the scaling
    # statistics; wrap scaler + model in a Pipeline if that matters.
    cv_scores = cross_val_score(model, X_train_model, y_train, cv=cv, scoring='roc_auc')

    # Training the model on the full training partition
    model.fit(X_train_model, y_train)

    # Predictions: hard labels and the probability of the positive class
    y_pred = model.predict(X_test_model)
    y_pred_proba = model.predict_proba(X_test_model)[:, 1]

    # Evaluation Metrics on the held-out test partition
    conf_matrix = confusion_matrix(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    # Store results
    results[name] = {
        'CV AUC Scores': cv_scores,
        'Confusion Matrix': conf_matrix,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    }

    # ROC Curve
    # The area under this curve is the ROC AUC computed above, so it is reused
    # for the label instead of being computed a second time.
    # NOTE: with %matplotlib inline the figure these lines are drawn on is
    # rendered when this cell finishes executing.
    fpr, tpr = roc_curve(y_test, y_pred_proba)[:2]

    plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')

In [None]:
# Plot ROC Curves
# With %matplotlib inline a figure is rendered and closed at the end of the
# cell that created it, so lines drawn in an earlier cell are not part of the
# figure shown here. The curves are therefore redrawn from the fitted models
# on this cell's own axes, which also gives the legend its entries.
fig, ax = plt.subplots()

for name, model in models.items():
    # Same inputs each model was trained on: scaled features for Logistic
    # Regression and the Neural Network, raw features for the others.
    if name == 'Logistic Regression' or name == 'Neural Network':
        X_test_model = X_test_scaled
    else:
        X_test_model = X_test

    y_pred_proba = model.predict_proba(X_test_model)[:, 1]
    fpr, tpr = roc_curve(y_test, y_pred_proba)[:2]
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')

# Diagonal reference line: the ROC curve of a random classifier.
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc='lower right')
plt.show()


In [None]:
# Print Results
# One text report per model, read from the `results` dictionary.
for name, result in results.items():
    fold_scores = result['CV AUC Scores']
    print(f"Model: {name}")
    # The fold count in the label is taken from the number of stored scores,
    # so it stays correct if the number of CV folds is ever changed.
    print(f"{len(fold_scores)}-Fold CV AUC Scores: {fold_scores}")
    print(f"Mean CV AUC: {np.mean(fold_scores):.2f}")
    print(f"Confusion Matrix:\n{result['Confusion Matrix']}")
    print(f"F1 Score: {result['F1 Score']:.2f}")
    print(f"ROC AUC: {result['ROC AUC']:.2f}")
    print("\n")