In [45]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [46]:
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

In [47]:
# Load the dataset from a CSV file; the path is relative to the notebook's working directory.
df=pd.read_csv('data/study.csv')

In [48]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [49]:
# The math score is the regression target; every remaining column is a predictor.
y = df['math score']
x = df.drop(columns=['math score'])

In [50]:
x.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [51]:
y.head()

0    72
1    69
2    90
3    47
4    76
Name: math score, dtype: int64

In [52]:
df.nunique()

gender                          2
race/ethnicity                  5
parental level of education     6
lunch                           2
test preparation course         2
math score                     81
reading score                  72
writing score                  77
dtype: int64

In [53]:
# Show the distinct values of each categorical column, one array per column.
for column in ('gender', 'race/ethnicity', 'parental level of education',
               'lunch', 'test preparation course'):
    print(df[column].unique())

['female' 'male']
['group B' 'group C' 'group A' 'group D' 'group E']
["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
['standard' 'free/reduced']
['none' 'completed']


In [54]:
# Partition the feature columns by dtype: object-dtype columns are treated as
# categorical, every other column as numerical.
is_object_col = {col: x[col].dtype == 'O' for col in x.columns}
num_features = [col for col, is_obj in is_object_col.items() if not is_obj]
cat_features = [col for col, is_obj in is_object_col.items() if is_obj]
print(f'we have {len(num_features)} numerical features', num_features)
print(f'we have {len(cat_features)} categorical features', cat_features)

we have 2 numerical features ['reading score', 'writing score']
we have 5 categorical features ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']


In [55]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

# Standardise the numerical columns and one-hot encode the categorical ones.
# Each step is a (name, transformer, column list) triple handed to the ColumnTransformer.
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder()
preprocessing_steps = [
    ('standard_scaler', num_transformer, num_features),
    ('onehotencoder', cat_transformer, cat_features),
]
transformer = ColumnTransformer(transformers=preprocessing_steps)

In [56]:
# Fit the preprocessing on the raw feature frame and encode it.
# The input is rebuilt from `df` instead of being read from `x`, because `x`
# is rebound to the encoded matrix on this line: re-running the cell with `x`
# as input would pass the already-encoded array to a transformer that selects
# its columns by name.
# NOTE(review): the transformer is fitted on all rows before the train/test
# split further down, so the scaler statistics include the held-out rows.
# Consider splitting first and fitting on the training portion only.
x = transformer.fit_transform(df.drop(columns=['math score']))

In [57]:
x.shape

(1000, 19)

# train test split

In [58]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2
)
(x_train.shape, x_test.shape)

((800, 19), (200, 19))

# Create an evaluation function to calculate scores

In [59]:
def evaluate_model(true,predicted):
    """Print regression metrics for a set of predictions and return them.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Predictions aligned element-wise with ``true``.

    Returns
    -------
    dict
        The printed values keyed by ``'mean_absolute_error'``,
        ``'mean_squared_error'``, ``'rmse'`` and ``'r2_score'``, so callers
        can collect scores instead of only reading the printed text.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and derive the RMSE from it rather than
    # recomputing the squared error a second time.
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(true, predicted)

    print('mean_absolute_error:', mae)
    print('mean_squared_error:', mse)
    print('rmse:', rmse)
    print('r2_score', r2)

    return {
        'mean_absolute_error': mae,
        'mean_squared_error': mse,
        'rmse': rmse,
        'r2_score': r2,
    }

# models

In [60]:
# Candidate regressors, keyed by the display name used when reporting scores.
# The estimators that use randomness while fitting are seeded so that
# re-running the notebook reproduces the same fitted models and metrics; the
# value is the same seed the train/test split uses.
RANDOM_STATE = 42
models={'LinearRegression':LinearRegression(),
        'Lasso':Lasso(),
        'Ridge':Ridge(),
        'KNeighborsRegressor':KNeighborsRegressor(),
        'DecisionTreeRegressor':DecisionTreeRegressor(random_state=RANDOM_STATE),
        'RandomForestRegressor':RandomForestRegressor(random_state=RANDOM_STATE),
        'XGBRegressor':XGBRegressor(random_state=RANDOM_STATE),
        'AdaBoostRegressor':AdaBoostRegressor(random_state=RANDOM_STATE)}

In [67]:
# Fit every candidate model and report its metrics on the training and test splits.
# `model_list` and `r2_list` are filled in lockstep so that index i of each
# refers to the same model.
model_list=[]
r2_list=[]
for model_name, model in models.items():
    model.fit(x_train,y_train)
    y_train_pred=model.predict(x_train)
    y_test_pred=model.predict(x_test)
    model_list.append(model_name)
    # Record the held-out R^2; previously this list was created but never filled.
    r2_list.append(r2_score(y_test,y_test_pred))
    print(model_name)
    print('training performance of model........................................')
    evaluate_model(y_train,y_train_pred)
    print('testing performance of model.........................................')
    evaluate_model(y_test,y_test_pred)


LinearRegression
training performance of model........................................
mean_absolute_error: 4.266711846071956
mean_squared_error: 28.33487038064859
rmse: 5.323050852720514
r2_score 0.8743172040139593
testing performance of model.........................................
mean_absolute_error: 4.214763142474849
mean_squared_error: 29.095169866715466
rmse: 5.3939938697328405
r2_score 0.8804332983749565
Lasso
training performance of model........................................
mean_absolute_error: 5.206296077972952
mean_squared_error: 43.47829788272618
rmse: 6.593807540619166
r2_score 0.8071466723085148
testing performance of model.........................................
mean_absolute_error: 5.157879138921816
mean_squared_error: 42.50633235127344
rmse: 6.519688056285626
r2_score 0.825320079562973
Ridge
training performance of model........................................
mean_absolute_error: 4.264987823725981
mean_squared_error: 28.337788233082442
rmse: 5.323324922741654
r2

In [68]:
# Refit a plain linear regression on the training split and print its R^2 on the test split.
linearmodel = LinearRegression().fit(x_train, y_train)
y_pred = linearmodel.predict(x_test)
print(r2_score(y_test, y_pred))

0.8804332983749565
