In [99]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor




In [100]:
df=pd.read_csv("data/stud.csv")

In [101]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [102]:
print("the categories in the gender are:",df.gender.unique())
print("the categories in the race_ethnicity are:",df.race_ethnicity.unique())
print("the categories in the parental_level_of_education are:",df.parental_level_of_education.unique())
print("the categories in the lunch are:",df.lunch.unique())
print("the categories in the test_preparation_course are:",df.test_preparation_course.unique())

the categories in the gender are: ['female' 'male']
the categories in the race_ethnicity are: ['group B' 'group C' 'group A' 'group D' 'group E']
the categories in the parental_level_of_education are: ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
the categories in the lunch are: ['standard' 'free/reduced']
the categories in the test_preparation_course are: ['none' 'completed']


In [103]:
x=df.drop(["math_score"],axis=1)
y=df["math_score"]

In [104]:
x.shape

(1000, 7)

In [105]:
x.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [106]:
y.shape

(1000,)

In [107]:
y.head()

0    72
1    69
2    90
3    47
4    76
Name: math_score, dtype: int64

In [108]:
numerical_features=x.select_dtypes(exclude='object').columns
categorical_features=x.select_dtypes(include='object').columns

print(numerical_features)
print(categorical_features)

Index(['reading_score', 'writing_score'], dtype='object')
Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course'],
      dtype='object')


In [109]:
preprocessor=ColumnTransformer(
    [
        ("onehotencoder",OneHotEncoder(),categorical_features),
        ("standardscaler",StandardScaler(),numerical_features)
    ]
)

In [110]:
X=preprocessor.fit_transform(x)

In [111]:
X.shape

(1000, 19)

In [112]:
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [113]:
x_train.shape, x_test.shape

((800, 19), (200, 19))

In [114]:
def model_evaluate(y,y_pred):
    mae=mean_absolute_error(y,y_pred)
    mse=mean_squared_error(y,y_pred)
    r2=r2_score(y,y_pred)
    return mae,mse,r2

In [115]:
models={
    "LinearRegression":LinearRegression(),
    "Lasso":Lasso(),
    "Ridge":Ridge(),
    "KNN":KNeighborsRegressor(),
    "SVM":SVR(),
    "DecisionTree":DecisionTreeRegressor(),
    "RandomForest":RandomForestRegressor(),
    "XGBoost":XGBRegressor(),
    "CatBoost":CatBoostRegressor(),
}

model_list=[]
r2_list=[]

In [116]:
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(x_train,y_train)
    y_train_pred=model.predict(x_train)
    y_test_pred=model.predict(x_test)

    mae_train,mse_train,r2_score_train =model_evaluate(y_train,y_train_pred)
    mae_test,mse_test,r2_score_test =model_evaluate(y_test,y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Mean Absolute Error:  {:.4f}".format(mae_train))
    print("- Mean Squared Error: {:.4f}".format(mse_train))
    print("- R2 Score: {:.4f}".format(r2_score_train))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Mean Absolute Error:  {:.4f}".format(mae_test))
    print("- Mean Squared Error: {:.4f}".format(mse_test))
    print("- R2 Score: {:.4f}".format(r2_score_test))
    r2_list.append(r2_score_test)

    

LinearRegression
Model performance for Training set
- Mean Absolute Error:  4.2667
- Mean Squared Error: 28.3349
- R2 Score: 0.8743
----------------------------------
Model performance for Test set
- Mean Absolute Error:  4.2148
- Mean Squared Error: 29.0952
- R2 Score: 0.8804
Lasso
Model performance for Training set
- Mean Absolute Error:  5.2063
- Mean Squared Error: 43.4784
- R2 Score: 0.8071
----------------------------------
Model performance for Test set
- Mean Absolute Error:  5.1579
- Mean Squared Error: 42.5064
- R2 Score: 0.8253
Ridge
Model performance for Training set
- Mean Absolute Error:  4.2650
- Mean Squared Error: 28.3378
- R2 Score: 0.8743
----------------------------------
Model performance for Test set
- Mean Absolute Error:  4.2111
- Mean Squared Error: 29.0563
- R2 Score: 0.8806
KNN
Model performance for Training set
- Mean Absolute Error:  4.5270
- Mean Squared Error: 32.6859
- R2 Score: 0.8550
----------------------------------
Model performance for Test set
- M

In [117]:
r2_list

[0.8804332983749564,
 0.8253197323627852,
 0.8805931485028738,
 0.7836806685669011,
 0.7286001513223704,
 0.7464436356941608,
 0.8525346050484465,
 0.8277965784072876,
 0.8516318920747058]

In [118]:
model_list

['LinearRegression',
 'Lasso',
 'Ridge',
 'KNN',
 'SVM',
 'DecisionTree',
 'RandomForest',
 'XGBoost',
 'CatBoost']

In [119]:
print(list(zip(model_list, r2_list)))

[('LinearRegression', 0.8804332983749564), ('Lasso', 0.8253197323627852), ('Ridge', 0.8805931485028738), ('KNN', 0.7836806685669011), ('SVM', 0.7286001513223704), ('DecisionTree', 0.7464436356941608), ('RandomForest', 0.8525346050484465), ('XGBoost', 0.8277965784072876), ('CatBoost', 0.8516318920747058)]


In [120]:
models_performance=pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

In [121]:
models_performance

Unnamed: 0,Model Name,R2_Score
2,Ridge,0.880593
0,LinearRegression,0.880433
6,RandomForest,0.852535
8,CatBoost,0.851632
7,XGBoost,0.827797
1,Lasso,0.82532
3,KNN,0.783681
5,DecisionTree,0.746444
4,SVM,0.7286
