Build Advanced Model #1 (Ridge Regression)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# --- 1. Re-generate Identical Data ---
# Synthetic student cohort. The global NumPy seed is fixed and every draw below
# advances that shared generator, so the sampling statements must stay in this
# exact order for the data to come out the same on every run.
np.random.seed(42)
num_students = 300

grades = np.random.choice([9, 10, 11, 12], size=num_students, p=[0.3, 0.3, 0.2, 0.2])
# Continuous traits: draw from a normal distribution, then clamp to a valid range.
gpa = np.random.normal(loc=3.3, scale=0.4, size=num_students).clip(2.0, 4.0)
mile_time_min = np.random.normal(loc=9.0, scale=1.5, size=num_students).clip(5.5, 15.0)
weekly_exercise_hours = np.random.normal(loc=5, scale=2, size=num_students).clip(0, 15)
# Binary flags (1 = yes).
has_sibling_in_club = np.random.choice([0, 1], size=num_students, p=[0.85, 0.15])
access_to_bike = np.random.choice([0, 1], size=num_students, p=[0.6, 0.4])
# Sport label -> sampling probability (insertion order defines the category order).
sport_probabilities = {
    'None': 0.3,
    'Soccer': 0.2,
    'CrossCountry': 0.15,
    'Swimming': 0.1,
    'Basketball': 0.1,
    'Track': 0.15,
}
sports = list(sport_probabilities)
previous_sport = np.random.choice(
    sports, size=num_students, p=list(sport_probabilities.values())
)

# Target Formula
base_score = 50
noise = np.random.normal(loc=0, scale=5, size=num_students)
# Points added to the score for each prior sport.
sport_boost = {'None': 0, 'Basketball': 2, 'Soccer': 4, 'Swimming': 5, 'Track': 8, 'CrossCountry': 10}
sport_scores = np.array(list(map(sport_boost.__getitem__, previous_sport)))

# Contributions are accumulated strictly left to right, starting from the base
# score, so the floating-point result does not depend on summation order.
score_terms = [
    gpa * 5,
    (12.0 - mile_time_min) * 3,  # a faster mile (smaller time) earns more points
    weekly_exercise_hours * 2,
    has_sibling_in_club * 10,
    access_to_bike * 5,
    sport_scores,
    noise,
]
score = base_score
for term in score_terms:
    score = score + term
# Keep the final target on a 0-100 scale.
final_score = np.clip(score, 0, 100)

df = pd.DataFrame({
    'Grade_Level': grades,
    'GPA': gpa,
    'Mile_Time_Min': mile_time_min,
    'Weekly_Exercise_Hours': weekly_exercise_hours,
    'Previous_Sport': previous_sport,
    'Has_Sibling_In_Club': has_sibling_in_club,
    'Access_To_Bike': access_to_bike,
    'Recruit_Potential_Score': final_score,
})

# --- 2. Split Data ---
# Separate the target column from the predictors, then hold out 20% of the rows
# for testing. The fixed random_state keeps the partition repeatable.
y = df['Recruit_Potential_Score']
X = df.drop(columns=['Recruit_Potential_Score'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- 3. Setup Ridge Pipeline ---
# Continuous columns are standardized and the sport label is one-hot encoded.
# Any column not named in either list is forwarded to the model unchanged
# (remainder='passthrough').
numeric_features = ['Grade_Level', 'GPA', 'Mile_Time_Min', 'Weekly_Exercise_Hours']
categorical_features = ['Previous_Sport']

numeric_step = ('num', StandardScaler(), numeric_features)
# handle_unknown='ignore': a category unseen during fit encodes as all zeros
# instead of raising at predict time.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder='passthrough',
)

# Define Pipeline with Ridge
# The step names matter: the tuning grid addresses the estimator's
# hyperparameters through the 'regressor__' prefix.
ridge_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge()),
])

# --- 4. Hyperparameter Tuning (Finding Best Alpha) ---
# Candidate regularization strengths span five orders of magnitude; each one is
# scored with 5-fold cross-validated R^2 on the training split only.
param_grid = {
    'regressor__alpha': [0.01, 0.1, 1.0, 10.0, 100.0],
}
grid_search = GridSearchCV(
    estimator=ridge_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
)
grid_search.fit(X_train, y_train)

# Get best model
# The winning alpha and the pipeline GridSearchCV exposes for it
# (refit on the full training split under the default refit=True).
best_alpha = grid_search.best_params_['regressor__alpha']
best_ridge_model = grid_search.best_estimator_

# --- 5. Evaluate ---
# Score the tuned pipeline on the held-out test rows, which played no part in
# the cross-validated search.
y_pred = best_ridge_model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# One print call, one metric per line.
print(
    f"Best Ridge Alpha: {best_alpha}",
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"R2 Score: {r2:.4f}",
    sep="\n",
)

# --- 6. Show Coefficients (Interpretability) ---
# Read the output column names from the fitted ColumnTransformer instead of
# assembling them by hand. The transformer reports names in the same order as
# the matrix it feeds to Ridge, so every label lines up with its coefficient
# even if the feature lists or the frame's column order change.
fitted_preprocessor = best_ridge_model.named_steps['preprocessor']
# Names come back prefixed with the transformer name ('num__GPA',
# 'cat__Previous_Sport_Track', 'remainder__Access_To_Bike'); keep only the part
# after the first '__'. A name without a prefix is kept as is.
feature_names = [
    name.split('__', 1)[-1]
    for name in fitted_preprocessor.get_feature_names_out()
]

# Get coefficients from the Ridge step
coefficients = best_ridge_model.named_steps['regressor'].coef_

assert len(feature_names) == len(coefficients), (
    f"{len(feature_names)} feature names for {len(coefficients)} coefficients"
)

coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
coef_df['Abs_Coefficient'] = coef_df['Coefficient'].abs()
coef_df = coef_df.sort_values(by='Abs_Coefficient', ascending=False)

# Caveat when reading the ranking: the numeric features were standardized, so
# their coefficients are per one standard deviation, while the one-hot and
# passthrough 0/1 columns are unscaled, so theirs are the shift when the flag is
# set. Magnitudes are therefore not on a common scale across the two groups.
print("\nRidge Regression Coefficients (Feature Importance):")
print(coef_df[['Feature', 'Coefficient']])

Best Ridge Alpha: 1.0
Mean Squared Error (MSE): 24.33
R2 Score: 0.4545

Ridge Regression Coefficients (Feature Importance):
                        Feature  Coefficient
10          Has_Sibling_In_Club     5.533724
11               Access_To_Bike     4.504144
5   Previous_Sport_CrossCountry     4.289920
2                 Mile_Time_Min    -3.185698
6           Previous_Sport_None    -3.162829
4     Previous_Sport_Basketball    -2.699644
9          Previous_Sport_Track     2.600705
3         Weekly_Exercise_Hours     2.182982
1                           GPA     1.638091
7         Previous_Sport_Soccer    -0.885356
0                   Grade_Level     0.366457
8       Previous_Sport_Swimming    -0.142796
