In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor


In [2]:
df=pd.read_csv('stud.csv')

# Preparing x and y variables
We will be predicting the math scores 

In [3]:
X=df.drop(columns=['math_score'],axis=1)
y=df['math_score']

In [4]:
X.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [5]:
y

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math_score, Length: 1000, dtype: int64

# Preprocessing and splitting

In [10]:
#Creating column transformers
num_features=X.select_dtypes(exclude='object').columns
cat_features=X.select_dtypes(include='object').columns

from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer=StandardScaler()
cat_transformer=OneHotEncoder()

preprocessor=ColumnTransformer([("OneHotEncoder",cat_transformer,cat_features),("StandardScaler",numeric_transformer,num_features)])



In [11]:
X=preprocessor.fit_transform(X)

In [12]:
X.shape

(1000, 19)

In [14]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)


# Model Building and Evaluation

In [15]:
#Evaluation function
def evaluate_model(true,predicted):
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    rmse=np.sqrt(mse)
    r2_square=r2_score(true,predicted)
    
    return mae,mse,rmse,r2_square

In [21]:
models={
        "Linear Regression":LinearRegression(),
        "Lasso":Lasso(),
        "Ridge":Ridge(),
        "K-Neighbors Regeressor":KNeighborsRegressor(),
        "Decision Tree":DecisionTreeRegressor(),
        "Random Forest Regressor":RandomForestRegressor(),
        "XGBRegressor":XGBRegressor(),
        "CatBoost Regressor":CatBoostRegressor(verbose=False),
        "Adaboost Regressor":AdaBoostRegressor()
}
model_list=[]
r2_list=[]

for i in range(len(models)):
    model=list(models.values())[i]
    model.fit(X_train,y_train)
    
    #make predictions
    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)
    
    #evaluation of predictions
    mae_train,mse_train,rmse_train,r2_train=evaluate_model(y_train_pred,y_train)
    mae_test,mse_test,rmse_test,r2_test=evaluate_model(y_test_pred,y_test)
    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print("Model performance on training set")
    print("-Root Mean Squared Error: {:.4f}".format(rmse_train))
    print("-Mean Absolute Error: {:.4f}".format(mae_train))
    print("-R2 Score: {:.4f}".format(r2_train))
    
    print('----------------------------------')
    
    print("Model performance on testing set")
    print("-Root Mean Squared Error: {:.4f}".format(rmse_test))
    print("-Mean Absolute Error: {:.4f}".format(mae_test))
    print("-R2 Score: {:.4f}".format(r2_test))
    
    r2_list.append(r2_test)
    
    print('='*35)
    print('\n')

Linear Regression
Model performance on training set
-Root Mean Squared Error: 5.3231
-Mean Absolute Error: 4.2667
-R2 Score: 0.8563
----------------------------------
Model performance on testing set
-Root Mean Squared Error: 5.3940
-Mean Absolute Error: 4.2148
-R2 Score: 0.8670


Lasso
Model performance on training set
-Root Mean Squared Error: 6.5938
-Mean Absolute Error: 5.2063
-R2 Score: 0.6904
----------------------------------
Model performance on testing set
-Root Mean Squared Error: 6.5197
-Mean Absolute Error: 5.1579
-R2 Score: 0.7275


Ridge
Model performance on training set
-Root Mean Squared Error: 5.3233
-Mean Absolute Error: 4.2650
-R2 Score: 0.8558
----------------------------------
Model performance on testing set
-Root Mean Squared Error: 5.3904
-Mean Absolute Error: 4.2111
-R2 Score: 0.8668


K-Neighbors Regeressor
Model performance on training set
-Root Mean Squared Error: 5.7055
-Mean Absolute Error: 4.5122
-R2 Score: 0.7985
----------------------------------
Model 

# Results

In [24]:
pd.DataFrame(list(zip(model_list,r2_list)),columns=['Model Name','R2 Score']).sort_values(by=['R2 Score'],ascending=False)

Unnamed: 0,Model Name,R2 Score
0,Linear Regression,0.867044
2,Ridge,0.866795
5,Random Forest Regressor,0.827875
7,CatBoost Regressor,0.811383
8,Adaboost Regressor,0.794338
6,XGBRegressor,0.793257
4,Decision Tree,0.749834
1,Lasso,0.727537
3,K-Neighbors Regeressor,0.646804
