In [4]:
## define the all models

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
 ## modeling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')


In [5]:
df = pd.read_csv('StudentsPerformance.csv')

In [6]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [7]:
X = df.drop(columns = ['math score'], axis = 1)

In [8]:
y = df['math score']

In [12]:
# Select numerical and categorical columns
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

# Import necessary preprocessing tools
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Define the transformers
numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(handle_unknown='ignore')  # Always add handle_unknown for safety

# Combine the transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('StandardScaler', numeric_transformer, num_features),
        ('OneHotEncoder', oh_transformer, cat_features)
    ],
    remainder='passthrough'
)


In [13]:
X = preprocessor.fit_transform(X)

In [14]:
## separate the data into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 19), (200, 19), (800,), (200,))

In [15]:
## create the evaluate function to give all metrics after training the model

def evaluate_model(model, X_train, y_train, X_test, y_test):
    mse_train = mean_squared_error(y_train, model.predict(X_train))
    mse_test = mean_squared_error(y_test, model.predict(X_test))    
    rmse_train = np.sqrt(mse_train)
    rmse_test = np.sqrt(mse_test)
    mae_train = mean_absolute_error(y_train, model.predict(X_train))
    mae_test = mean_absolute_error(y_test, model.predict(X_test))
    r2_train = r2_score(y_train, model.predict(X_train))
    r2_test = r2_score(y_test, model.predict(X_test))
    print(f"Train RMSE: {rmse_train:.2f}, Test RMSE: {rmse_test:.2f}")
    print(f"Train MAE: {mae_train:.2f}, Test MAE    : {mae_test:.2f}")
    print(f"Train R2: {r2_train:.2f}, Test R2: {r2_test:.2f}")                          

In [24]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Updated evaluation function
def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    r2 = r2_score(y_true, y_pred) * 100  # R2 as percentage

    avg_value = np.mean(y_true)
    mae_accuracy = 100 - (mae / avg_value * 100)
    rmse_accuracy = 100 - (rmse / avg_value * 100)

    return mae_accuracy, rmse_accuracy, r2

# Define models
models = {
    "LinearRegression": LinearRegression(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'AdaBoostRegressor': AdaBoostRegressor(),
    'SVR': SVR(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'CatBoostRegressor': CatBoostRegressor(verbose=0),
    'XGBRegressor': XGBRegressor(eval_metric='rmse'),
}

model_list = []
r2_list = []

# Loop through models
for name, model in models.items():
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_mae_acc, train_rmse_acc, train_r2 = evaluate_model(y_train, y_train_pred)
    test_mae_acc, test_rmse_acc, test_r2 = evaluate_model(y_test, y_test_pred)

    print(f"\nModel: {name}")
    print("Train Dataset Performance (out of 100%):")
    print(f"- MAE Accuracy : {train_mae_acc:.2f}%")
    print(f"- RMSE Accuracy: {train_rmse_acc:.2f}%")
    print(f"- R² Score     : {train_r2:.2f}%")

    print("Test Dataset Performance (out of 100%):")
    print(f"- MAE Accuracy : {test_mae_acc:.2f}%")
    print(f"- RMSE Accuracy: {test_rmse_acc:.2f}%")
    print(f"- R² Score     : {test_r2:.2f}%")
    print("=" * 45)

    model_list.append(name)
    r2_list.append(test_r2)



Model: LinearRegression
Train Dataset Performance (out of 100%):
- MAE Accuracy : 93.58%
- RMSE Accuracy: 91.99%
- R² Score     : 87.43%
Test Dataset Performance (out of 100%):
- MAE Accuracy : 93.46%
- RMSE Accuracy: 91.63%
- R² Score     : 88.04%

Model: KNeighborsRegressor
Train Dataset Performance (out of 100%):
- MAE Accuracy : 93.21%
- RMSE Accuracy: 91.41%
- R² Score     : 85.52%
Test Dataset Performance (out of 100%):
- MAE Accuracy : 91.28%
- RMSE Accuracy: 88.75%
- R² Score     : 78.38%

Model: DecisionTreeRegressor
Train Dataset Performance (out of 100%):
- MAE Accuracy : 99.97%
- RMSE Accuracy: 99.58%
- R² Score     : 99.97%
Test Dataset Performance (out of 100%):
- MAE Accuracy : 89.92%
- RMSE Accuracy: 87.19%
- R² Score     : 71.96%

Model: RandomForestRegressor
Train Dataset Performance (out of 100%):
- MAE Accuracy : 97.23%
- RMSE Accuracy: 96.52%
- R² Score     : 97.63%
Test Dataset Performance (out of 100%):
- MAE Accuracy : 92.81%
- RMSE Accuracy: 90.73%
- R² Score 

In [25]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model', 'R2 Score']).sort_values(by='R2 Score', ascending=False).reset_index(drop=True)

Unnamed: 0,Model,R2 Score
0,Ridge,88.059315
1,LinearRegression,88.04333
2,RandomForestRegressor,85.324268
3,CatBoostRegressor,85.163189
4,AdaBoostRegressor,84.913706
5,Lasso,82.532008
6,XGBRegressor,82.122052
7,KNeighborsRegressor,78.377026
8,SVR,72.860015
9,DecisionTreeRegressor,71.960858
