Build Advanced Model #2 (Random Forest)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# --- 1. Re-generate Identical Data ---
# Seeded synthetic student roster. Every draw below uses NumPy's global RNG, so
# the order of the np.random calls must stay exactly as written for the seed to
# reproduce the same dataset.
np.random.seed(42)
num_students = 300

grades = np.random.choice([9, 10, 11, 12], size=num_students, p=[0.3, 0.3, 0.2, 0.2])
gpa = np.random.normal(3.3, 0.4, num_students).clip(2.0, 4.0)
mile_time_min = np.random.normal(9.0, 1.5, num_students).clip(5.5, 15.0)
weekly_exercise_hours = np.random.normal(5, 2, num_students).clip(0, 15)
has_sibling_in_club = np.random.choice([0, 1], size=num_students, p=[0.85, 0.15])
access_to_bike = np.random.choice([0, 1], size=num_students, p=[0.6, 0.4])

sports = ['None', 'Soccer', 'CrossCountry', 'Swimming', 'Basketball', 'Track']
previous_sport = np.random.choice(sports, size=num_students, p=[0.3, 0.2, 0.15, 0.1, 0.1, 0.15])

# Target Formula
# Linear combination of the features plus Gaussian noise, clipped to [0, 100].
base_score = 50
noise = np.random.normal(0, 5, num_students)  # last RNG draw; keep it after previous_sport
sport_boost = {'None': 0, 'Basketball': 2, 'Soccer': 4, 'Swimming': 5, 'Track': 8, 'CrossCountry': 10}
sport_scores = np.array([sport_boost[sport] for sport in previous_sport])

# Per-feature contributions to the score.
gpa_points = gpa * 5
pace_points = (12.0 - mile_time_min) * 3  # faster mile (smaller time) earns more
exercise_points = weekly_exercise_hours * 2
sibling_points = has_sibling_in_club * 10
bike_points = access_to_bike * 5

# Terms are summed left to right in a fixed order so floating-point results
# stay bit-for-bit reproducible.
score = (
    base_score + gpa_points + pace_points + exercise_points
    + sibling_points + bike_points + sport_scores + noise
)
final_score = score.clip(0, 100)

# Column order matters downstream: columns not listed in a transformer are
# passed through in the order they appear here.
df = pd.DataFrame({
    'Grade_Level': grades,
    'GPA': gpa,
    'Mile_Time_Min': mile_time_min,
    'Weekly_Exercise_Hours': weekly_exercise_hours,
    'Previous_Sport': previous_sport,
    'Has_Sibling_In_Club': has_sibling_in_club,
    'Access_To_Bike': access_to_bike,
    'Recruit_Potential_Score': final_score,
})

# --- 2. Split Data ---
# Separate the target column from the predictors, then hold out 20% of the
# rows for evaluation. The fixed random_state makes the split repeatable.
target_column = 'Recruit_Potential_Score'
y = df[target_column]
X = df.drop(columns=[target_column])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- 3. Setup Random Forest Pipeline ---
# Column groups: the numeric columns are standardized and the categorical
# column is one-hot encoded. Every other column of X is forwarded unchanged
# because of remainder='passthrough'.
numeric_features = ['Grade_Level', 'GPA', 'Mile_Time_Min', 'Weekly_Exercise_Hours']
categorical_features = ['Previous_Sport']

column_transformers = [
    ('num', StandardScaler(), numeric_features),
    # handle_unknown='ignore': categories not seen during fit do not raise at predict time.
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
]
preprocessor = ColumnTransformer(
    transformers=column_transformers,
    remainder='passthrough',
)

# Define Pipeline with Random Forest
# The step names 'preprocessor' and 'regressor' are looked up later through
# named_steps, so they must not change.
forest = RandomForestRegressor(n_estimators=100, random_state=42)
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', forest),
])

# --- 4. Train Model ---
# Fits the preprocessing steps and the forest on the training split only.
rf_pipeline.fit(X_train, y_train)

# --- 5. Evaluate ---
# Score the model on the held-out split.
y_pred = rf_pipeline.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"R2 Score: {r2:.4f}",
    sep="\n",
)

# --- 6. Feature Importance ---
# Take the column labels from the fitted ColumnTransformer instead of
# rebuilding them by hand. It reports one name per output column in the order
# the forest received them (scaled numerics, one-hot columns, then the
# passthrough remainder), so labels cannot drift out of step with the
# importances if the feature lists or the frame's column order change.
fitted_preprocessor = rf_pipeline.named_steps['preprocessor']

# With default settings the names come back prefixed with the transformer name
# ("num__", "cat__", "remainder__"); strip that prefix for display. Using [-1]
# also copes with unprefixed names (verbose_feature_names_out=False).
feature_names = [
    name.split('__', 1)[-1]
    for name in fitted_preprocessor.get_feature_names_out()
]

# Get importances from the Random Forest step
importances = rf_pipeline.named_steps['regressor'].feature_importances_

# One row per model input column, most important first.
imp_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

print("\nRandom Forest Feature Importances:")
print(imp_df)

Mean Squared Error (MSE): 28.58
R2 Score: 0.3592

Random Forest Feature Importances:
                        Feature  Importance
2                 Mile_Time_Min    0.339326
3         Weekly_Exercise_Hours    0.188428
11               Access_To_Bike    0.140509
1                           GPA    0.106326
10          Has_Sibling_In_Club    0.054237
6           Previous_Sport_None    0.048887
5   Previous_Sport_CrossCountry    0.047354
0                   Grade_Level    0.030849
9          Previous_Sport_Track    0.018287
4     Previous_Sport_Basketball    0.011856
7         Previous_Sport_Soccer    0.008930
8       Previous_Sport_Swimming    0.005011
