In [38]:
import numpy as np
import pandas as pd
import seaborn as sn
from sklearn.preprocessing import _encoders
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
#from Catboost import CatBoostRegressor
import warnings
warnings.filterwarnings('ignore')

In [39]:
df = pd.read_csv('Data/Stud_Data.csv')

In [40]:
df.isnull().sum()

roll_no                         0
gender                          0
race_ethnicity                  0
parental_level_of_education     0
lunch                           0
test_preparation_course         0
math_score                      0
reading_score                   0
writing_score                   0
science_score                  14
total_score                     0
grade                           0
dtype: int64

In [41]:
df.dropna(inplace=True)

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9766 entries, 0 to 9779
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   roll_no                      9766 non-null   object 
 1   gender                       9766 non-null   object 
 2   race_ethnicity               9766 non-null   object 
 3   parental_level_of_education  9766 non-null   object 
 4   lunch                        9766 non-null   int64  
 5   test_preparation_course      9766 non-null   int64  
 6   math_score                   9766 non-null   int64  
 7   reading_score                9766 non-null   int64  
 8   writing_score                9766 non-null   int64  
 9   science_score                9766 non-null   float64
 10  total_score                  9766 non-null   int64  
 11  grade                        9766 non-null   object 
dtypes: float64(1), int64(6), object(5)
memory usage: 991.9+ KB


In [43]:
df.columns

Index(['roll_no', 'gender', 'race_ethnicity', 'parental_level_of_education',
       'lunch', 'test_preparation_course', 'math_score', 'reading_score',
       'writing_score', 'science_score', 'total_score', 'grade'],
      dtype='object')

In [44]:
df_numerical_features = df.select_dtypes(include='number')
df_categorical_features = df.select_dtypes(exclude='number')

In [45]:
print('Numeric', df_numerical_features.columns)
print('Cat', df_categorical_features.columns)

Numeric Index(['lunch', 'test_preparation_course', 'math_score', 'reading_score',
       'writing_score', 'science_score', 'total_score'],
      dtype='object')
Cat Index(['roll_no', 'gender', 'race_ethnicity', 'parental_level_of_education',
       'grade'],
      dtype='object')


In [46]:
df.gender=pd.Categorical(df.gender).codes
df.race_ethnicity=pd.Categorical(df.race_ethnicity).codes
df.parental_level_of_education=pd.Categorical(df.parental_level_of_education).codes
df.race_ethnicity=pd.Categorical(df.race_ethnicity).codes
df.grade=pd.Categorical(df.grade).codes

In [47]:
df.drop(columns='roll_no',axis=1, inplace=True)

In [48]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,science_score,total_score,grade
0,1,8,4,1,1,89,38,85,26.0,238,2
1,1,6,2,1,0,65,100,67,96.0,328,0
2,1,7,3,1,0,10,99,97,58.0,264,1
3,1,8,4,1,1,22,51,41,84.0,198,3
4,1,7,4,0,1,26,58,64,65.0,213,2


In [49]:
#Target variable - total_score

X = df.drop('total_score',axis=1)
y = df['total_score']

In [50]:
# splitting data into training and test set for independent attributes
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=1)

In [51]:
print("X_Train:",X_train.shape)
print("y_train:",y_train.shape)

X_Train: (7812, 10)
y_train: (7812,)


In [52]:
print("X_Test:",X_test.shape)
print("y_test:",y_test.shape)

X_Test: (1954, 10)
y_test: (1954,)


In [53]:
#Sclaing the Data Set - Normalizations
mm = StandardScaler()
X_train_scale = mm.fit_transform(X_train)
X_test_scale = mm.transform(X_test)

In [None]:
###KNeighborsRegressor
KNN_model=KNeighborsRegressor()
KNN_model.fit(X_train_scale,y_train)

In [None]:
## R-Squre for DT Train-set & Test-set
KNN_Train_Score = KNN_model.score(X_train_scale, y_train)
KNN_Test_Score = KNN_model.score(X_test_scale, y_test)
print("R-Sqr for KNN-Train:", KNN_Train_Score)
print("R-Sqr for KNN-Test:", KNN_Test_Score)

R-Sqr for KNN-Train: 0.9595314883524346
R-Sqr for KNN-Test: 0.9333352179297284


In [62]:
## RMSE for for KNN Train-set & Test-set
KNN_RMSE_Train_Predict = KNN_model.predict((X_train_scale))
KNN_RMSE_Train = np.sqrt(mean_squared_error(y_train,KNN_RMSE_Train_Predict))
print("RMSE for KNN Train-Set:",KNN_RMSE_Train )
KNN_RMSE_Test_Predict = KNN_model.predict((X_test_scale))
KNN_RMSE_Test = np.sqrt(mean_squared_error(y_test,KNN_RMSE_Test_Predict))
print("RMSE for KNN Test-Set:",KNN_RMSE_Test)

RMSE for KNN Train-Set: 8.518404519967119
RMSE for KNN Test-Set: 10.881210270454591


In [63]:
## MAP for for KNN Train-set & Test-set
KNN_MAE_Train = mean_absolute_error(y_train,KNN_RMSE_Train_Predict)
print("MAE for KNN Train-Set:",KNN_MAE_Train )
KNN_MAE_Test = mean_absolute_error(y_test,KNN_RMSE_Test_Predict)
print("MAE for KNN Train-Set:",KNN_MAE_Train)

MAE for KNN Train-Set: 6.845878136200717
MAE for KNN Train-Set: 6.845878136200717


In [65]:
# Decition Tree Regressor
DT_model=DecisionTreeRegressor()
DT_model.fit(X_train_scale,y_train)

In [66]:
## R-Squre for DT Train-set & Test-set
DT_Train_Score = DT_model.score(X_train_scale, y_train)
DT_Test_Score = DT_model.score(X_test_scale, y_test)
print("R-Sqr for KNN-Train:", DT_Train_Score)
print("R-Sqr for KNN-Test:", DT_Test_Score)

R-Sqr for KNN-Train: 1.0
R-Sqr for KNN-Test: 0.9519724350230512


In [67]:
## RMSE for for DT Train-set & Test-set
DT_RMSE_Train_Predict = DT_model.predict((X_train_scale))
DT_RMSE_Train = np.sqrt(mean_squared_error(y_train,DT_RMSE_Train_Predict))
print("RMSE for DT Train-Set:",DT_RMSE_Train )
DT_RMSE_Test_Predict = DT_model.predict((X_test_scale))
DT_RMSE_Test = np.sqrt(mean_squared_error(y_test,DT_RMSE_Test_Predict))
print("RMSE for KNN Test-Set:",DT_RMSE_Test)

RMSE for DT Train-Set: 0.0
RMSE for KNN Test-Set: 9.2357943700504


In [68]:
## MAP for for DT Train-set & Test-set
DT_MAE_Train = mean_absolute_error(y_train,DT_RMSE_Train_Predict)
print("MAE for DT Train-Set:",DT_MAE_Train )
DT_MAE_Test = mean_absolute_error(y_test,DT_RMSE_Test_Predict)
print("MAE for DT Train-Set:",DT_MAE_Train)

MAE for DT Train-Set: 0.0
MAE for DT Train-Set: 0.0


In [69]:
# Randome Forest Tree Regressor
RF_model=RandomForestRegressor()
RF_model.fit(X_train_scale,y_train)

In [70]:
## R-Squre for RF Train-set & Test-set
RF_Train_Score = RF_model.score(X_train_scale, y_train)
RF_Test_Score = RF_model.score(X_test_scale, y_test)
print("R-Sqr for RF-Train:", RF_Train_Score)
print("R-Sqr for RF-Test:", RF_Test_Score)

R-Sqr for RF-Train: 0.9977582881191187
R-Sqr for RF-Test: 0.984385196361329


In [71]:
## RMSE for for RF Train-set & Test-set
RF_RMSE_Train_Predict = RF_model.predict((X_train_scale))
RF_RMSE_Train = np.sqrt(mean_squared_error(y_train,RF_RMSE_Train_Predict))
print("RMSE for RF Train-Set:",RF_RMSE_Train )
RF_RMSE_Test_Predict = RF_model.predict((X_test_scale))
RF_RMSE_Test = np.sqrt(mean_squared_error(y_test,RF_RMSE_Test_Predict))
print("RMSE for RF Test-Set:",RF_RMSE_Test)

RMSE for RF Train-Set: 2.004885341603083
RMSE for RF Test-Set: 5.266198614803941


In [72]:
## MAP for for RF Train-set & Test-set
RF_MAE_Train = mean_absolute_error(y_train,RF_RMSE_Train_Predict)
print("MAE for RF Train-Set:",RF_MAE_Train )
RF_MAE_Test = mean_absolute_error(y_test,RF_RMSE_Test_Predict)
print("MAE for RF Train-Set:",RF_MAE_Train)

MAE for RF Train-Set: 1.5030235535074243
MAE for RF Train-Set: 1.5030235535074243


In [73]:
# ADA Boost Tree Regressor
ADA_model=AdaBoostRegressor()
ADA_model.fit(X_train_scale,y_train)

In [74]:
## R-Squre for ADA Train-set & Test-set
ADA_Train_Score = ADA_model.score(X_train_scale, y_train)
ADA_Test_Score = ADA_model.score(X_test_scale, y_test)
print("R-Sqr for ADA-Train:", ADA_Train_Score)
print("R-Sqr for ADA-Test:", ADA_Test_Score)

R-Sqr for ADA-Train: 0.8778833282842163
R-Sqr for ADA-Test: 0.8779169394632649


In [75]:
## RMSE for for ADA Train-set & Test-set
ADA_RMSE_Train_Predict = ADA_model.predict((X_train_scale))
ADA_RMSE_Train = np.sqrt(mean_squared_error(y_train,ADA_RMSE_Train_Predict))
print("RMSE for ADA Train-Set:",ADA_RMSE_Train )
ADA_RMSE_Test_Predict = ADA_model.predict((X_test_scale))
ADA_RMSE_Test = np.sqrt(mean_squared_error(y_test,ADA_RMSE_Test_Predict))
print("RMSE for ADA Test-Set:",ADA_RMSE_Test)

RMSE for ADA Train-Set: 14.797458251677718
RMSE for ADA Test-Set: 14.725046503159696


In [76]:
## MAP for for ADA Train-set & Test-set
ADA_MAE_Train = mean_absolute_error(y_train,ADA_RMSE_Train_Predict)
print("MAE for ADA Train-Set:",ADA_MAE_Train )
ADA_MAE_Test = mean_absolute_error(y_test,ADA_RMSE_Test_Predict)
print("MAE for ADA Train-Set:",ADA_MAE_Train)

MAE for ADA Train-Set: 12.43867140472632
MAE for ADA Train-Set: 12.43867140472632


In [77]:
# Gradiant Boost Tree Regressor
GB_model=GradientBoostingRegressor()
GB_model.fit(X_train_scale,y_train)

In [78]:
## R-Squre for GB Train-set & Test-set
GB_Train_Score = GB_model.score(X_train_scale, y_train)
GB_Test_Score = GB_model.score(X_test_scale, y_test)
print("R-Sqr for GB-Train:", GB_Train_Score)
print("R-Sqr for GB-Test:", GB_Test_Score)

R-Sqr for GB-Train: 0.9644467437881825
R-Sqr for GB-Test: 0.9595584962362481


In [79]:
## RMSE for for GB Train-set & Test-set
GB_RMSE_Train_Predict = GB_model.predict((X_train_scale))
GB_RMSE_Train = np.sqrt(mean_squared_error(y_train,GB_RMSE_Train_Predict))
print("RMSE for GB Train-Set:",GB_RMSE_Train )
GB_RMSE_Test_Predict = GB_model.predict((X_test_scale))
GB_RMSE_Test = np.sqrt(mean_squared_error(y_test,GB_RMSE_Test_Predict))
print("RMSE for GB Test-Set:",GB_RMSE_Test)

RMSE for GB Train-Set: 7.984345748858227
RMSE for GB Test-Set: 8.475056825577093


In [80]:
## MAP for for GB Train-set & Test-set
GB_MAE_Train = mean_absolute_error(y_train,GB_RMSE_Train_Predict)
print("MAE for GB Train-Set:",GB_MAE_Train )
GB_MAE_Test = mean_absolute_error(y_test,GB_RMSE_Test_Predict)
print("MAE for GB Train-Set:",GB_MAE_Train)

MAE for GB Train-Set: 6.571526283626649
MAE for GB Train-Set: 6.571526283626649


In [81]:
# XGBoost Tree Regressor
XGB_model=XGBRegressor()
XGB_model.fit(X_train_scale,y_train)

In [82]:
## R-Squre for XGB Train-set & Test-set
XGB_Train_Score = XGB_model.score(X_train_scale, y_train)
XGB_Test_Score = XGB_model.score(X_test_scale, y_test)
print("R-Sqr for XGB-Train:", XGB_Train_Score)
print("R-Sqr for XGB-Test:", XGB_Test_Score)

R-Sqr for XGB-Train: 0.9984846651617156
R-Sqr for XGB-Test: 0.9922537367004456


In [83]:
## RMSE for for XGB Train-set & Test-set
XGB_RMSE_Train_Predict = XGB_model.predict((X_train_scale))
XGB_RMSE_Train = np.sqrt(mean_squared_error(y_train,XGB_RMSE_Train_Predict))
print("RMSE for XGB Train-Set:",XGB_RMSE_Train )
XGB_RMSE_Test_Predict = XGB_model.predict((X_test_scale))
XGB_RMSE_Test = np.sqrt(mean_squared_error(y_test,XGB_RMSE_Test_Predict))
print("RMSE for XGB Test-Set:",XGB_RMSE_Test)

RMSE for XGB Train-Set: 1.648367137989882
RMSE for XGB Test-Set: 3.709155991422469


In [84]:
## MAP for for XGB Train-set & Test-set
XGB_MAE_Train = mean_absolute_error(y_train,XGB_RMSE_Train_Predict)
print("MAE for XGB Train-Set:",XGB_MAE_Train )
XGB_MAE_Test = mean_absolute_error(y_test,XGB_RMSE_Test_Predict)
print("MAE for XGB Train-Set:",XGB_MAE_Train)

MAE for XGB Train-Set: 1.2691509121818172
MAE for XGB Train-Set: 1.2691509121818172


In [85]:
# Model Comparision Data Frame:

Models_Comparision = pd.DataFrame({'KNN_Model_Train': [KNN_Train_Score,KNN_RMSE_Train,KNN_MAE_Train],
           'KNN_Model_Test' : [KNN_Test_Score,KNN_RMSE_Test,KNN_MAE_Test],
         'DT_Model_Train': [DT_Train_Score,DT_RMSE_Train,DT_MAE_Train], 
         'DT_Model_Test': [DT_Test_Score,DT_RMSE_Test,DT_MAE_Test],
         'RF_Model_Train': [RF_Train_Score,RF_RMSE_Train,RF_MAE_Train],  
         'RF_Model_Test': [RF_Test_Score,RF_RMSE_Test,RF_MAE_Test],
         'ADA_Model_Train':[ADA_Train_Score,ADA_RMSE_Train,ADA_MAE_Train], 
         'ADA_Model_Test':[ADA_Test_Score,ADA_RMSE_Test,ADA_MAE_Test],
         'GB_Model_Train':[GB_Train_Score,GB_RMSE_Train,GB_MAE_Train],
         'GB_Model_Test':[GB_Test_Score,GB_RMSE_Test,GB_MAE_Test],
         'XGB_Model_Train':[XGB_Train_Score,XGB_RMSE_Train,XGB_MAE_Train],
         'XGB_Model_Test':[XGB_Test_Score,XGB_RMSE_Test,XGB_MAE_Test]      
         },index=['R2_Score','RMSE','MAE'])

In [88]:
Models_Comparision.round(2).T

Unnamed: 0,R2_Score,RMSE,MAE
KNN_Model_Train,0.96,8.52,6.85
KNN_Model_Test,0.93,10.88,8.74
DT_Model_Train,1.0,0.0,0.0
DT_Model_Test,0.95,9.24,7.05
RF_Model_Train,1.0,2.0,1.5
RF_Model_Test,0.98,5.27,3.91
ADA_Model_Train,0.88,14.8,12.44
ADA_Model_Test,0.88,14.73,12.31
GB_Model_Train,0.96,7.98,6.57
GB_Model_Test,0.96,8.48,6.95
