In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.metrics import r2_score, mean_squared_error , mean_absolute_error , root_mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor , AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression , Lasso , Ridge
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split , RandomizedSearchCV
from catboost import CatBoostRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer


In [5]:
df=pd.read_csv("stud.csv")
df

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [None]:
# We are prediction math score by taking all other as independent features

In [11]:
x=df.drop(["math_score"], axis=1)
y=df["math_score"]

In [12]:
x

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75
...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,99,95
996,male,group C,high school,free/reduced,none,55,55
997,female,group C,high school,free/reduced,completed,71,65
998,female,group D,some college,standard,completed,78,77


In [13]:
y

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math_score, Length: 1000, dtype: int64

In [None]:
num_features=[feature for feature in x.columns if df[feature].dtype!="O"]
cat_features=[feature for feature in x.columns if df[feature].dtype=="O"]

numeric_transformer=StandardScaler()
oh_transformer=OneHotEncoder()

preprocessor=ColumnTransformer(
    [
        ("OneHotEncoder" , oh_transformer , cat_features),
        ("StandardScaler" , numeric_transformer , num_features)
    ]
)


In [None]:
x=preprocessor.fit_transform(X=x)


In [23]:
xtrain, xtest, ytrain, ytest= train_test_split(x , y , test_size=0.2, random_state=42)
xtrain.shape

(800, 19)

In [44]:
def evaluation_model(true, pred):
    mae=mean_absolute_error(true , pred)
    mse=mean_squared_error(true , pred)
    rmse=root_mean_squared_error(true , pred)
    rscore=r2_score(true , pred)
    print("Mean Absolute Error is " , mae)
    print("Mean squared error is " , mse)
    print("Root mse is " , rmse)
    print("r2 score is " , rscore)

In [None]:
models={
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "KNeighborsRegressor": KNeighborsRegressor(),
    "DecisionTreeRegressor": DecisionTreeRegressor(),
    "RandomForestRegressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "AdaBoostRegressor": AdaBoostRegressor(),
    "CatBoostRegressor": CatBoostRegressor(verbose=False)
    
}
for model in models:
    model=models[model]
    print(model)
    model.fit(xtrain , ytrain)
    ypred=model.predict(xtest)
    evaluation_model(ytest ,ypred)


LinearRegression()
Mean Absolute Error is  4.214763142474852
Mean squared error is  29.095169866715516
Root mse is  5.393993869732845
r2 score is  0.8804332983749564
Lasso()
Mean Absolute Error is  5.157881810347763
Mean squared error is  42.506416838411624
Root mse is  6.519694535667421
r2 score is  0.8253197323627852
Ridge()
Mean Absolute Error is  4.211100688014261
Mean squared error is  29.056272192348302
Root mse is  5.390387016935639
r2 score is  0.8805931485028737
KNeighborsRegressor()
Mean Absolute Error is  5.609
Mean squared error is  52.553799999999995
Root mse is  7.249399975170358
r2 score is  0.7840299763621361
DecisionTreeRegressor()
Mean Absolute Error is  6.095
Mean squared error is  58.605
Root mse is  7.655390257851
r2 score is  0.7591625489441863
RandomForestRegressor()
Mean Absolute Error is  4.6270428571428575
Mean squared error is  35.05731034750566
Root mse is  5.920921410347013
r2 score is  0.8559318613605347
XGBRegressor(base_score=None, booster=None, callback

array([74.70949863, 54.70687603, 76.84209741, 76.5802884 , 87.29749142,
       78.99179393, 64.88788737, 54.08994111, 75.22914668, 47.95569865,
       52.32406602, 46.39793318, 68.06160316, 55.23645422, 84.08910518,
       69.27351917, 44.7095697 , 48.71248341, 48.98590788, 53.32017699,
       74.64364857, 39.13196961, 60.18964643, 35.73198951, 73.20493117,
       83.48841626, 74.59183687, 51.03538409, 38.62973206, 51.7005766 ,
       66.08906614, 67.63411366, 64.49818272, 81.30636101, 82.82049792,
       48.08556578, 75.85411099, 72.35905188, 67.44072223, 17.89806093,
       77.64590584, 65.12564085, 69.56992507, 59.53362069, 81.50281988,
       66.09206678, 69.50964572, 32.11692348, 85.12701735, 68.1795038 ,
       76.9772759 , 69.96337565, 79.46707522, 45.66938387, 67.09851217,
       69.17207058, 86.74269367, 59.70676042, 85.95263781, 81.11535384,
       49.47483804, 68.50015628, 68.20361323, 56.17880718, 84.83835377,
       58.44915885, 61.17877005, 55.63323245, 60.30124658, 86.50