# Student Assignment: End-to-End ML Project (Completed)

This notebook follows the requested structure: EDA, preprocessing, feature engineering, model building (Logistic Regression, Decision Tree, SVM, Random Forest, XGBoost), hyperparameter tuning, evaluation, and conclusions. The dataset used is `synthetic_dataset_10000x20.csv` (uploaded by you).

In [None]:

# Imports
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, plot_importance

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Read the assignment CSV into memory. The path is relative, so the file must
# be located in the current working directory.
data = pd.read_csv(filepath_or_buffer='synthetic_dataset_10000x20.csv')

# Quick sanity check: report (rows, columns) and preview the first five records.
print('Dataset shape:', data.shape)
data.head(n=5)


In [None]:

# Basic EDA
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the cell output.
data.info()

# Summary statistics for every column (numeric and non-numeric), transposed so
# that each feature becomes one row.
display(data.describe(include='all').T)
print('\nMissing values per column:\n', data.isnull().sum())

# For each categorical-ish column, report its cardinality and up to 10 of its
# distinct non-null values.
for col in data.select_dtypes(include=['object','category']).columns:
    print('\n', col, '->', data[col].nunique(), 'unique values; sample:', data[col].dropna().unique()[:10])


In [None]:

# Target distribution
# Fail fast when the label column is absent: everything after this cell needs it.
if 'target_default_risk' not in data.columns:
    raise ValueError('Expected target column: target_default_risk')

# Absolute number of rows per class.
print('Target value counts:')
print(data['target_default_risk'].value_counts())

# The same breakdown expressed as a percentage of all rows.
print('\nTarget distribution (percent):')
print(data['target_default_risk'].value_counts(normalize=True).mul(100))


In [None]:

# Numeric distributions: one histogram + boxplot pair per numeric feature.
# The target is excluded so that only predictors are listed.
numeric_cols = [
    name
    for name in data.select_dtypes(include=[np.number]).columns
    if name != 'target_default_risk'
]
print('Numeric columns:', numeric_cols)

# Only the first 8 numeric columns are plotted to keep the notebook readable.
for col in numeric_cols[:8]:
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 4))

    # Left panel: histogram with a KDE overlay (missing values removed first).
    sns.histplot(data[col].dropna(), kde=True, ax=hist_ax)
    hist_ax.set_title(f'Histogram of {col}')

    # Right panel: boxplot of the same column.
    sns.boxplot(x=data[col], ax=box_ax)
    box_ax.set_title(f'Boxplot of {col}')

    plt.show()


In [None]:

# Correlation heatmap (numeric features)
# Pairwise correlations between every numeric predictor and the target.
corr = data[numeric_cols + ['target_default_risk']].corr()

# Draw on an explicitly created axes; the diverging palette is centred on 0 so
# positive and negative correlations get opposite colours.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='vlag', center=0, ax=ax)
ax.set_title('Correlation heatmap (numeric features + target)')
plt.show()


In [None]:

# Preprocessing & feature engineering
# Work on a copy so the raw `data` frame shown in the EDA cells stays untouched.
df = data.copy()

# Example: Fix common typos in 'education' if present
# Values are trimmed and lower-cased first, then known variants are mapped to
# the canonical label 'bachelors'.
if 'education' in df.columns:
    df['education'] = df['education'].str.strip().str.lower().replace({
        'bachlors':'bachelors', 'bachlor':'bachelors', 'bs':'bachelors', 'graduation':'bachelors'
    })

# Convert signup_date to datetime and create recency feature if present
if 'signup_date' in df.columns:
    # Unparseable dates become NaT instead of raising.
    df['signup_date'] = pd.to_datetime(df['signup_date'], errors='coerce')
    max_date = df['signup_date'].max()
    # Days between each signup and the most recent signup in the data.
    recency_days = (max_date - df['signup_date']).dt.days
    # Gaps are filled with the median *recency* (a number of days). Filling with
    # the median of `signup_date` itself would inject a Timestamp into a column
    # of day counts and turn it into a mixed-type object column.
    df['signup_recency_days'] = recency_days.fillna(recency_days.median())

# Basic missing value imputation strategies
# NOTE: the imputers are fitted on the full dataset, i.e. before the train/test
# split performed later in the notebook, so test rows influence the fill values.
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
num_cols = [c for c in num_cols if c != 'target_default_risk']

cat_cols = df.select_dtypes(include=['object','category']).columns.tolist()

num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')

# SimpleImputer rejects an input with zero columns, so each group is imputed
# only when it actually contains columns.
if num_cols:
    df[num_cols] = num_imputer.fit_transform(df[num_cols])
if cat_cols:
    df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# The parsed `signup_date` column is datetime-typed, so it belongs to neither
# num_cols nor cat_cols and is not imputed; any NaT it holds is counted here.
print('After imputation missing values:', df.isnull().sum().sum())


In [None]:

# Encoding and scaling pipeline
# Categorical columns with at most 10 distinct values are one-hot encoded; the
# remaining (higher-cardinality) ones fall back to ordinal encoding.
ohe_cols = [c for c in cat_cols if df[c].nunique() <= 10 and c!='target_default_risk']
ord_cols = [c for c in cat_cols if c not in ohe_cols and c!='target_default_risk']

print('One-hot cols:', ohe_cols)
print('Ordinal cols (fallback):', ord_cols)

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the current name first and fall back to the
# legacy one so the cell runs on both old and new releases.
try:
    one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    one_hot = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Columns not listed in any transformer are dropped; sparse_threshold=0 forces
# a dense output matrix.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_cols),
    ('ohe', one_hot, ohe_cols),
    ('ord', OrdinalEncoder(), ord_cols)
], remainder='drop', sparse_threshold=0)

# Prepare X and y
X = df.drop(columns=['target_default_risk'])
y = df['target_default_risk'].astype(int)

# Fit preprocessor and transform
# NOTE: the preprocessor is fitted on all rows, before the train/test split
# performed later in the notebook, so scaling/encoding statistics include test rows.
X_processed = preprocessor.fit_transform(X)

# Rebuild column names in the same order the ColumnTransformer emits its
# blocks: scaled numerics, one-hot indicator columns, ordinal columns.
feature_names = []

feature_names.extend(num_cols)
if ohe_cols:
    ohe_names = preprocessor.named_transformers_['ohe'].get_feature_names_out(ohe_cols).tolist()
    feature_names.extend(ohe_names)
feature_names.extend(ord_cols)

print('Processed feature matrix shape:', X_processed.shape)


In [None]:

# Hold out 20% of the rows as a test set. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print('Train shape, Test shape:', X_train.shape, X_test.shape)


In [None]:

def evaluate_model(model, X_train, y_train, X_test, y_test, name='Model'):
    """Fit ``model`` on the training data and report its test-set metrics.

    The model is (re)fitted in place on ``X_train``/``y_train`` and then used to
    predict ``X_test``. Accuracy, precision, recall and F1 are computed with the
    scikit-learn metric functions called with their default arguments, printed
    rounded to four decimals, and followed by the confusion matrix and the
    classification report.

    Parameters:
        model: estimator exposing ``fit`` and ``predict``.
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels used for scoring.
        name: label printed in the report header.

    Returns:
        dict with the fitted estimator under ``'model'`` and the unrounded
        scores under ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1'``.
    """
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # Unrounded scores, keyed exactly as they appear in the returned dict.
    scores = {
        'accuracy': accuracy_score(y_test, preds),
        'precision': precision_score(y_test, preds),
        'recall': recall_score(y_test, preds),
        'f1': f1_score(y_test, preds),
    }

    # Printed label for each score, in report order.
    report_labels = (
        ('Accuracy:', 'accuracy'),
        ('Precision:', 'precision'),
        ('Recall:', 'recall'),
        ('F1-score:', 'f1'),
    )

    print(f'--- {name} ---')
    for label, key in report_labels:
        print(label, round(scores[key], 4))
    print('Confusion Matrix:\n', confusion_matrix(y_test, preds))
    print('\nClassification Report:\n', classification_report(y_test, preds))

    return {'model': model, **scores}

# Baseline candidates, one per model family requested by the assignment.
# Every estimator gets the same seed so repeated runs give the same numbers.
models = {
    'LogisticRegression': LogisticRegression(
        max_iter=1000,
        random_state=42,
    ),
    'DecisionTree': DecisionTreeClassifier(
        random_state=42,
    ),
    'SVM': SVC(
        probability=True,
        random_state=42,
    ),
    'RandomForest': RandomForestClassifier(
        n_estimators=200,
        n_jobs=-1,
        random_state=42,
    ),
    'XGBoost': XGBClassifier(
        n_estimators=500,
        eval_metric='logloss',
        use_label_encoder=False,
        random_state=42,
    ),
}

# Fit and score each baseline on the shared split; results are keyed by model name.
results = {
    label: evaluate_model(estimator, X_train, y_train, X_test, y_test, name=label)
    for label, estimator in models.items()
}


In [None]:

# Hyperparameter tuning: RandomizedSearchCV for RandomForest and XGBoost
# Both searches share the same shuffled, stratified 5-fold splitter.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


def _randomized_f1_search(estimator, param_space, n_iter):
    """Run a seeded, F1-scored RandomizedSearchCV on the training split and return it fitted."""
    search = RandomizedSearchCV(
        estimator,
        param_space,
        n_iter=n_iter,
        scoring='f1',
        cv=cv,
        n_jobs=-1,
        verbose=1,
        random_state=42,
    )
    search.fit(X_train, y_train)
    return search


# --- Random Forest: 20 sampled configurations ---
rf_param = {
    'n_estimators': [200, 500, 800],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2,5,10],
    'min_samples_leaf': [1,2,4],
    'max_features': ['sqrt', 'log2', None]
}

rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_search = _randomized_f1_search(rf, rf_param, n_iter=20)
print('RF best params:', rf_search.best_params_)
print('RF best CV score (f1):', rf_search.best_score_)

# --- XGBoost: 30 sampled configurations ---
xgb_param = {
    'n_estimators': [500, 1000, 1500],
    'max_depth': [3,5,7,9],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'subsample': [0.6,0.8,1.0],
    'colsample_bytree': [0.6,0.8,1.0],
    'gamma': [0,1,5],
    'min_child_weight': [1,3,5]
}

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_search = _randomized_f1_search(xgb, xgb_param, n_iter=30)
print('XGB best params:', xgb_search.best_params_)
print('XGB best CV score (f1):', xgb_search.best_score_)


In [None]:

# Score the search winners on the held-out test set.
# evaluate_model fits the estimator it receives, so each winner is refitted on
# the training split before being scored.
best_rf = rf_search.best_estimator_
best_xgb = xgb_search.best_estimator_

print('\nEvaluating tuned Random Forest on test set:')
rf_results = evaluate_model(
    best_rf,
    X_train,
    y_train,
    X_test,
    y_test,
    name='RandomForest_Tuned',
)

print('\nEvaluating tuned XGBoost on test set:')
xgb_results = evaluate_model(
    best_xgb,
    X_train,
    y_train,
    X_test,
    y_test,
    name='XGBoost_Tuned',
)


In [None]:

# Feature importance (from XGBoost tuned)
# In this notebook `xgb` is the XGBClassifier instance created for the search,
# not the xgboost module, so `xgb.plot_importance` can never resolve. The
# module-level `plot_importance` function (imported from xgboost at the top of
# the notebook) is called instead.
# Plotting stays best-effort: a failure is reported and the summary still runs.
try:
    plot_importance(best_xgb, importance_type='gain')
    plt.title('XGBoost Feature Importance (gain)')
    plt.show()
except Exception as e:
    print('Could not plot importance:', e)

# Summary table: one row per evaluated model, baselines first, tuned models last.
metric_keys = ('accuracy', 'precision', 'recall', 'f1')
scored_runs = list(results.items()) + [
    ('RandomForest_Tuned', rf_results),
    ('XGBoost_Tuned', xgb_results),
]

# Keep only the numeric scores from each result dict (the fitted estimator
# stored under 'model' is left out) and label the row with the model name.
summary = [
    {'model': label, **{key: run[key] for key in metric_keys}}
    for label, run in scored_runs
]

summary_df = pd.DataFrame(summary).set_index('model')
display(summary_df)

print('\nConclusions:')
for takeaway in (
    ' - Review preprocessing choices (imputation, encoding).',
    ' - Tuned models likely perform better; consider ensembling/stacking if you want further gains.',
):
    print(takeaway)


## Files created

- `student_assignment_completed.ipynb` — Completed notebook with EDA, preprocessing, models, tuning, and conclusions.
- `assignment_report.md` — Short 1-2 page report summarizing key findings and conclusions.

You can download both files using the links provided after running the notebook.