#first step: data creation

In [1]:
import pandas as pd
import numpy as np
import random

# Set random seed for reproducibility
np.random.seed(42)

# 1. Define the parameters for data generation
num_students = 300

# Generate synthetic features for female students
grades = np.random.choice([9, 10, 11, 12], size=num_students, p=[0.3, 0.3, 0.2, 0.2])
gpa = np.clip(np.random.normal(3.3, 0.4, num_students), 2.0, 4.0)
mile_time_min = np.clip(np.random.normal(9.0, 1.5, num_students), 5.5, 15.0) # Lower is better
weekly_exercise_hours = np.clip(np.random.normal(5, 2, num_students), 0, 15)
has_sibling_in_club = np.random.choice([0, 1], size=num_students, p=[0.85, 0.15])
access_to_bike = np.random.choice([0, 1], size=num_students, p=[0.6, 0.4])

# Previous sport categories
sports = ['None', 'Soccer', 'CrossCountry', 'Swimming', 'Basketball', 'Track']
weights = [0.3, 0.2, 0.15, 0.1, 0.1, 0.15]
previous_sport = np.random.choice(sports, size=num_students, p=weights)

# 2. Create a "Recruit Potential Score" (Target Variable for Regression)
# Formula: weighted sum of features + noise
# High GPA, Low Mile Time, Sibling, Bike Access, Specific Sports (XC/Track) boost score.

base_score = 50
noise = np.random.normal(0, 5, num_students)

# Calculate scores
score = base_score + \
        (gpa * 5) + \
        ((12.0 - mile_time_min) * 3) + \
        (weekly_exercise_hours * 2) + \
        (has_sibling_in_club * 10) + \
        (access_to_bike * 5)

# Add sport effect
sport_boost = {
    'None': 0,
    'Basketball': 2,
    'Soccer': 4,
    'Swimming': 5,
    'Track': 8,
    'CrossCountry': 10
}
sport_scores = np.array([sport_boost[s] for s in previous_sport])

final_score = score + sport_scores + noise
final_score = np.clip(final_score, 0, 100) # Clip between 0-100

# 3. Create DataFrame
df = pd.DataFrame({
    'Grade_Level': grades,
    'GPA': np.round(gpa, 2),
    'Mile_Time_Min': np.round(mile_time_min, 1),
    'Weekly_Exercise_Hours': np.round(weekly_exercise_hours, 1),
    'Previous_Sport': previous_sport,
    'Has_Sibling_In_Club': has_sibling_in_club,
    'Access_To_Bike': access_to_bike,
    'Recruit_Potential_Score': np.round(final_score, 1)
})

# 4. Save to CSV
csv_filename = "high_school_mtb_recruits.csv"
df.to_csv(csv_filename, index=False)

#Step 2:Build Baseline Model (Linear Regression)

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Separate features and target
X = df.drop('Recruit_Potential_Score', axis=1)
y = df['Recruit_Potential_Score']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing Pipeline
# Numeric features to scale
numeric_features = ['Grade_Level', 'GPA', 'Mile_Time_Min', 'Weekly_Exercise_Hours']
# Categorical features to encode
categorical_features = ['Previous_Sport']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough' # Keep binary columns as is
)

# Create Baseline Pipeline
baseline_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Train Baseline
baseline_model.fit(X_train, y_train)

# Predict
y_pred = baseline_model.predict(X_test)

# Evaluate
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Data saved to {csv_filename}")
print(df.head())
print("\nBaseline Linear Regression Performance:")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R2 Score: {r2:.4f}")

Data saved to high_school_mtb_recruits.csv
   Grade_Level   GPA  Mile_Time_Min  Weekly_Exercise_Hours Previous_Sport  \
0           10  3.32            8.9                    3.0     Basketball   
1           12  3.04            7.2                    3.5   CrossCountry   
2           11  4.00            8.0                    4.9           None   
3           10  3.55            9.1                    5.5          Track   
4            9  2.49            7.7                    8.1          Track   

   Has_Sibling_In_Club  Access_To_Bike  Recruit_Potential_Score  
0                    0               0                     88.7  
1                    0               1                    100.0  
2                    1               1                    100.0  
3                    0               1                    100.0  
4                    0               1                     98.6  

Baseline Linear Regression Performance:
Mean Squared Error: 24.49
R2 Score: 0.4520
