## Student Performance Prediction Modeling Pipeline

Ran three regression models - Linear Regression, Random Forest and SVM

Used a 80-20 split
Ran 5 fold cross validatoin

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [11]:
# Loading dataset

df = pd.read_csv('student_performance_final.csv')
print(f"Dataset loaded: {df.shape}")
print("Columns:", df.columns.tolist())

Dataset loaded: (500, 10)
Columns: ['study_hours_per_week', 'attendance_percentage', 'previous_score', 'assignments_completed', 'sleep_hours', 'class_participation', 'internet_quality', 'extracurricular_hours', 'assignment_completion_rate', 'final_score']


In [31]:
print("Cleaned dataset shape:", df.shape)
print("\nDataset columns:")
print(df.columns.tolist())

# Printing the number of independent variables
independent_var = df.columns.tolist()
independent_var.remove('final_score')
print("The number of independent variables is:", len(independent_var))

Cleaned dataset shape: (500, 10)

Dataset columns:
['study_hours_per_week', 'attendance_percentage', 'previous_score', 'assignments_completed', 'sleep_hours', 'class_participation', 'internet_quality', 'extracurricular_hours', 'assignment_completion_rate', 'final_score']
The number of independent variables is: 9


In [13]:
df.head()

Unnamed: 0,study_hours_per_week,attendance_percentage,previous_score,assignments_completed,sleep_hours,class_participation,internet_quality,extracurricular_hours,assignment_completion_rate,final_score
0,33,79,93,17,7,7,3,12,85.0,71.89
1,19,87,61,5,7,9,1,2,25.0,56.53
2,12,67,67,4,6,9,4,0,20.0,45.78
3,25,100,79,15,9,8,3,5,75.0,75.16
4,23,98,88,12,9,10,4,15,60.0,61.52


In [16]:
# Features


target_col = 'final_score'
feature_cols = [col for col in df.columns if col != target_col]
print(f"\n Independent variables {len(feature_cols)} features:")
for col in feature_cols:
    print(f"{col}")


 Independent variables 9 features:
study_hours_per_week
attendance_percentage
previous_score
assignments_completed
sleep_hours
class_participation
internet_quality
extracurricular_hours
assignment_completion_rate


In [17]:
# Prepare X, y
X = df[feature_cols].astype('float64')
y = df['final_score'].astype('float64')

In [19]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"\nX shape: {X.shape}, y shape: {y.shape}")
print(f"Train: {X_train.shape}, Test: {X_test.shape}")


X shape: (500, 9), y shape: (500,)
Train: (400, 9), Test: (100, 9)


In [23]:
# Running three regression models
# 1. Linear Regression
# 2. Random Forest
# 3. SVM

# Imports needed
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import pandas as pd
import pickle
import joblib

print("\n" + "="*70)
print("3-MODEL COMPARISON: Linear Regression | Random Forest | SVM")
print("="*70)

# Modeling
models = {
    'Linear Regression (OLS)': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'SVM (SVR)': SVR(kernel='rbf')
}

# For a structured results table
print(f"{'Model':<22} {'CV R²':<8} {'Test R²':<8} {'RMSE':<8} {'MAE':<8}")
print("-"*55)

results = []
trained_models = {}

for name, model in models.items():
    # 5-fold Cross-Validation
    cv_r2 = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')

    # Train & Test
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Metrics
    test_r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)

    results.append({
        'Model': name,
        'CV_R2_mean': cv_r2.mean(),
        'CV_R2_std': cv_r2.std(),
        'Test_R2': test_r2,
        'RMSE': rmse,
        'MAE': mae
    })

    trained_models[name] = model
    print(f"{name:<22} {cv_r2.mean():<8.3f} {test_r2:<8.3f} {rmse:<8.1f} {mae:<8.1f}")



3-MODEL COMPARISON: Linear Regression | Random Forest | SVM
Model                  CV R²    Test R²  RMSE     MAE     
-------------------------------------------------------
Linear Regression (OLS) 0.748    0.769    4.7      3.9     
Random Forest          0.658    0.652    5.8      4.7     
SVM (SVR)              0.552    0.631    5.9      5.0     


In [27]:
# ============================================================================
# SAVING ONLY LINEAR REGRESSION MODEL (.pkl)

print("\n" + "="*50)
print("SAVING LINEAR REGRESSION MODEL ONLY")
print("="*50)

# Save Linear Regression model (trained on train set)
with open('linear_regression_model.pkl', 'wb') as f:
    pickle.dump(linear_regression_model, f)
print("linear_regression_model.pkl")


SAVING LINEAR REGRESSION MODEL ONLY
linear_regression_model.pkl
