<a href="https://colab.research.google.com/github/saksham-42/Summer-Analytics-2025-Assignments/blob/main/SA_Hackathon_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Install imbalanced-learn for SMOTE
!pip install imbalanced-learn

# 2. Import Libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from google.colab import files

# 3. Uploading files
# Open the Colab upload widget; iterating the result yields the uploaded names.
uploaded = files.upload()
file_list = list(uploaded)

# Pick the first uploaded name containing 'train' and the first containing 'test'.
train_path = next((f for f in file_list if 'train' in f), None)
test_path = next((f for f in file_list if 'test' in f), None)

# Fail early with a readable message instead of letting pd.read_csv(None)
# raise an obscure error, or silently loading the same file as both splits.
if train_path is None or test_path is None or train_path == test_path:
    raise FileNotFoundError(
        "Expected two distinct uploads, one with 'train' and one with 'test' "
        f"in the file name; got: {file_list}"
    )

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

# 4. Preprocessing: Fill ALL missing values in all features before SMOTE
categorical_features = ['RIAGENDR', 'PAQ605', 'DIQ010']
numeric_features = ['BMXBMI', 'LBXGLU', 'LBXGLT', 'LBXIN']

# Categorical columns get the most frequent value, numeric columns the mean.
imputer_cat = SimpleImputer(strategy='most_frequent')
imputer_num = SimpleImputer(strategy='mean')

# Imputers are fitted on the training split only and then reused on the test
# split, so no test-set statistics leak into the fill values.
df_train[categorical_features] = imputer_cat.fit_transform(df_train[categorical_features])
df_test[categorical_features] = imputer_cat.transform(df_test[categorical_features])
df_train[numeric_features] = imputer_num.fit_transform(df_train[numeric_features])
df_test[numeric_features] = imputer_num.transform(df_test[numeric_features])

# Drop training rows whose target is missing. Filling the label with the mode
# would invent ground truth and feed mislabelled rows to the classifier.
# The index is reset so later positional concatenation stays aligned.
df_train = df_train.dropna(subset=['age_group']).reset_index(drop=True)

# Encode target if needed (string / categorical labels -> 0/1)
if df_train['age_group'].dtype == object or str(df_train['age_group'].dtype).startswith('category'):
    df_train['age_group'] = df_train['age_group'].map({'Adult': 0, 'Senior': 1})

# 5. Feature Engineering
def bmi_category(bmi):
    """Return the ordinal bucket (0-3) that ``bmi`` falls into.

    Buckets: 0 for values below 18.5, 1 for [18.5, 25), 2 for [25, 30) and
    3 for everything else. NaN fails every comparison and therefore also
    ends up in bucket 3.
    """
    # Walk the ascending upper bounds; the first one the value is below
    # determines the bucket.
    for bucket, upper_bound in enumerate((18.5, 25, 30)):
        if bmi < upper_bound:
            return bucket
    return 3

def glucose_category(glu):
    """Return the ordinal bucket (0-3) that the glucose value ``glu`` falls into.

    Buckets: 0 for values below 70, 1 for [70, 100), 2 for [100, 126) and
    3 for everything else. NaN fails every comparison and therefore ends up
    in bucket 3.

    The bounds are half-open so that fractional inputs (for example a
    mean-imputed value such as 99.5) land in the neighbouring bucket instead
    of falling through a gap between 99 and 100, or 125 and 126, into the
    top bucket. Integer inputs are classified exactly as before.
    """
    if glu < 70:
        return 0
    if glu < 100:
        return 1
    if glu < 126:
        return 2
    return 3

# Bucketed versions of BMXBMI and LBXGLU, added to both splits in place.
for frame in (df_train, df_test):
    frame['BMI_cat'] = frame['BMXBMI'].apply(bmi_category)
    frame['Glucose_cat'] = frame['LBXGLU'].apply(glucose_category)

# Interaction term (scaled BMI * scaled Glucose)
# Each scaler is fitted on the training column only and reused for the test
# column. The scaled values are kept in local arrays rather than temporary
# DataFrame columns, so nothing has to be dropped afterwards.
scaler_bmi = MinMaxScaler()
scaler_glu = MinMaxScaler()
bmi_train_scaled = scaler_bmi.fit_transform(df_train[['BMXBMI']]).ravel()
bmi_test_scaled = scaler_bmi.transform(df_test[['BMXBMI']]).ravel()
glu_train_scaled = scaler_glu.fit_transform(df_train[['LBXGLU']]).ravel()
glu_test_scaled = scaler_glu.transform(df_test[['LBXGLU']]).ravel()

df_train['BMI_GLU_interaction'] = bmi_train_scaled * glu_train_scaled
df_test['BMI_GLU_interaction'] = bmi_test_scaled * glu_test_scaled

# Polynomial features for numeric columns
# degree=2 with interaction_only=True and include_bias=False: the transformer
# is fitted on the training split and reused unchanged on the test split.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
poly_features_train = poly.fit_transform(df_train[numeric_features])
poly_features_test = poly.transform(df_test[numeric_features])
poly_feature_names = poly.get_feature_names_out(numeric_features)

# Wrap the raw matrices in DataFrames carrying the generated column names.
poly_df_train, poly_df_test = (
    pd.DataFrame(matrix, columns=poly_feature_names)
    for matrix in (poly_features_train, poly_features_test)
)

# Swap the original numeric columns for the expanded set. Indices are reset
# first so the column-wise concat lines rows up by position.
train_without_numeric = df_train.reset_index(drop=True).drop(columns=numeric_features)
test_without_numeric = df_test.reset_index(drop=True).drop(columns=numeric_features)
df_train = pd.concat([train_without_numeric, poly_df_train], axis=1)
df_test = pd.concat([test_without_numeric, poly_df_test], axis=1)

# Impute any remaining missing values in engineered features
# Every column except the target is filled with its most frequent training value.
all_features = [name for name in df_train.columns if name != 'age_group']
imputer_final = SimpleImputer(strategy='most_frequent')
df_train[all_features] = imputer_final.fit_transform(df_train[all_features])
df_test[all_features] = imputer_final.transform(df_test[all_features])

# 6. SMOTE and Random Forest
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import cross_val_predict, cross_val_score

feature_cols = [col for col in df_train.columns if col != 'age_group']
X = df_train[feature_cols]
y = df_train['age_group']

# SMOTE is a pipeline step rather than a one-off resampling of the whole
# training set. The imblearn pipeline runs samplers only while fitting, so in
# every cross-validation split the synthetic rows are generated from the
# training part alone and the held-out part contains real rows only.
# Oversampling before splitting leaks near-duplicates of validation rows into
# training and inflates every CV score.
smote = SMOTE(random_state=42)
model = ImbPipeline(steps=[
    ('smote', smote),
    ('rf', RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1)),
])

# 7. Hyperparameter tuning for F1
# Keys carry the 'rf__' prefix so they are routed to the forest step.
param_grid = {
    'rf__n_estimators': [200, 300],
    'rf__max_depth': [8, 12, None],
    'rf__min_samples_split': [2, 5],
    'rf__min_samples_leaf': [1, 2],
    'rf__criterion': ['gini']
}
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='f1', n_jobs=-1, verbose=1)
grid_search.fit(X, y)
# With the default refit=True this is the best pipeline refitted on all of X, y;
# it is the fitted model used for the final predictions.
best_model = grid_search.best_estimator_

# 8. Stratified Cross-validation F1
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_f1_scores = cross_val_score(best_model, X, y, cv=skf, scoring='f1')
print(f"Stratified CV F1 Score: {cv_f1_scores.mean():.4f} ± {cv_f1_scores.std():.4f}")

# 9. Threshold tuning for best F1
# Use out-of-fold probabilities: each row is scored by a clone of the pipeline
# that never saw it. Scoring the rows the model was trained on yields a
# near-perfect F1 at almost any threshold and says nothing about unseen data.
probs = cross_val_predict(best_model, X, y, cv=skf, method='predict_proba')[:, 1]
best_f1 = 0
best_thresh = 0.5
for thresh in np.arange(0.1, 0.91, 0.01):
    preds = (probs > thresh).astype(int)
    f1 = f1_score(y, preds)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = thresh
print(f"Best out-of-fold F1: {best_f1:.4f} at threshold {best_thresh:.2f}")

# 10. Predict on test data using best threshold
# Computed once; the second, copy-pasted prediction pass was removed because
# it recomputed exactly the same values.
X_test = df_test[feature_cols]
test_probs = best_model.predict_proba(X_test)[:, 1]
test_preds = (test_probs > best_thresh).astype(int)

# Keep a human-readable label column on df_test for inspection. It is not
# part of feature_cols and is not written to the submission.
df_test['predicted_age_group'] = test_preds
df_test['predicted_age_group'] = df_test['predicted_age_group'].map({0: 'Adult', 1: 'Senior'})

# 11. Format as sample submission (only 'age_group' column, 0/1 values)
submission = pd.DataFrame({'age_group': test_preds})

# 12. Save predictions to CSV in sample submission format
output_filename = "submission.csv"
submission.to_csv(output_filename, index=False)
files.download(output_filename)

Upload BOTH your train.csv and test.csv files together:


Saving train.csv to train.csv
Saving test.csv to test.csv
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Stratified CV F1 Score: 0.8983 ± 0.0104
Best F1 on training (SMOTE data): 1.0000 at threshold 0.36


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

   age_group
0          0
1          1
2          0
3          0
4          0
