In [21]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor

In [6]:
df = pd.read_csv('data/stud.csv')
df

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [8]:
X = df.drop('math_score', axis = 1)

In [9]:
y = df['math_score']

In [12]:
num_features = X.select_dtypes(exclude = "object").columns
cat_features = X.select_dtypes(include = "object").columns

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  cat_features = X.select_dtypes(include = "object").columns


In [13]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [14]:
numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler",numeric_transformer, num_features)
    ]
)

In [15]:
X = preprocessor.fit_transform(X)

In [16]:
X.shape

(1000, 19)

In [17]:
from sklearn.model_selection import train_test_split

X_train,X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 42)
X_train.shape, X_test.shape

((800, 19), (200, 19))

In [18]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae,rmse,r2_square


In [28]:
models = {
    "Linear Regression" : LinearRegression(),
    "Lasso" : Lasso(),
    "Ridge" : Ridge(),
    "K-Neighbours Regressor" : KNeighborsRegressor(),
    "Decision Tree" : DecisionTreeRegressor(),
    "Random Forest" : RandomForestRegressor(),
    "CatBoosting regressor" : CatBoostRegressor(verbose= False),
    "AdaBoost" : AdaBoostRegressor(),
    "Xgboost" : XGBRegressor()
}

model_list = []
r2_list = []


for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)


    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)

    model_train_mae, model_train_mse, model_train_r2 = evaluate_model(y_train,y_pred_train)
    model_test_mae, model_test_mse, model_test_r2 = evaluate_model(y_test,y_pred_test)

    model_list.append(list(models.keys())[i])
    r2_list.append(model_test_r2)


    print('Model performance for testing set',model_test_mae, model_test_mse, model_test_r2)


Model performance for testing set 4.21476314247485 5.393993869732843 0.8804332983749565
Model performance for testing set 5.157881810347764 6.519694535667422 0.8253197323627852
Model performance for testing set 4.211100688014261 5.390387016935642 0.8805931485028737
Model performance for testing set 5.621 7.253040741647602 0.7838129945787431
Model performance for testing set 6.39 8.096295449154509 0.7306220473217544
Model performance for testing set 4.573792857142857 5.920189594900586 0.8559674722954981
Model performance for testing set 4.612531714976557 6.008631956907363 0.8516318920747058
Model performance for testing set 4.719856187359924 6.064885859918462 0.8488407892308041
Model performance for testing set 5.057730674743652 6.47330716071189 0.8277965784072876


In [29]:
model_list

['Linear Regression',
 'Lasso',
 'Ridge',
 'K-Neighbours Regressor',
 'Decision Tree',
 'Random Forest',
 'CatBoosting regressor',
 'AdaBoost',
 'Xgboost']

In [30]:
r2_list

[0.8804332983749565,
 0.8253197323627852,
 0.8805931485028737,
 0.7838129945787431,
 0.7306220473217544,
 0.8559674722954981,
 0.8516318920747058,
 0.8488407892308041,
 0.8277965784072876]

In [31]:
pd.DataFrame(list(zip(model_list, r2_list)), columns = ['Model Name', 'R2 Score']).sort_values(by = ['R2 Score'], ascending = False)

Unnamed: 0,Model Name,R2 Score
2,Ridge,0.880593
0,Linear Regression,0.880433
5,Random Forest,0.855967
6,CatBoosting regressor,0.851632
7,AdaBoost,0.848841
8,Xgboost,0.827797
1,Lasso,0.82532
3,K-Neighbours Regressor,0.783813
4,Decision Tree,0.730622


In [32]:
pred_Df = pd.DataFrame({'Actual Value' : y_test, 'Predicted value' : y_pred_test, 'Differnce' : y_test - y_pred_test})
pred_Df

Unnamed: 0,Actual Value,Predicted value,Differnce
521,91,72.256554,18.743446
737,53,51.827824,1.172176
740,80,79.382545,0.617455
660,74,74.103836,-0.103836
411,84,84.875084,-0.875084
...,...,...,...
408,52,46.441635,5.558365
332,62,60.034035,1.965965
208,74,67.617630,6.382370
613,65,69.601791,-4.601791
