In [83]:
import numpy as np
import pandas as pd
import seaborn as sn
from sklearn.preprocessing import _encoders
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
#from Catboost import CatBoostRegressor
import warnings
warnings.filterwarnings('ignore')

In [14]:
df = pd.read_csv('Data/Stud_Data.csv')

In [15]:
df.isnull().sum()

roll_no                         0
gender                          0
race_ethnicity                  0
parental_level_of_education     0
lunch                           0
test_preparation_course         0
math_score                      0
reading_score                   0
writing_score                   0
science_score                  14
total_score                     0
grade                           0
dtype: int64

In [17]:
df.dropna(inplace=True)

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9766 entries, 0 to 9779
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   roll_no                      9766 non-null   object 
 1   gender                       9766 non-null   object 
 2   race_ethnicity               9766 non-null   object 
 3   parental_level_of_education  9766 non-null   object 
 4   lunch                        9766 non-null   int64  
 5   test_preparation_course      9766 non-null   int64  
 6   math_score                   9766 non-null   int64  
 7   reading_score                9766 non-null   int64  
 8   writing_score                9766 non-null   int64  
 9   science_score                9766 non-null   float64
 10  total_score                  9766 non-null   int64  
 11  grade                        9766 non-null   object 
dtypes: float64(1), int64(6), object(5)
memory usage: 991.9+ KB


In [16]:
df.columns

Index(['roll_no', 'gender', 'race_ethnicity', 'parental_level_of_education',
       'lunch', 'test_preparation_course', 'math_score', 'reading_score',
       'writing_score', 'science_score', 'total_score', 'grade'],
      dtype='object')

In [19]:
df_numerical_features = df.select_dtypes(include='number')
df_categorical_features = df.select_dtypes(exclude='number')

In [21]:
print('Numeric', df_numerical_features.columns)
print('Cat', df_categorical_features.columns)

Numeric Index(['lunch', 'test_preparation_course', 'math_score', 'reading_score',
       'writing_score', 'science_score', 'total_score'],
      dtype='object')
Cat Index(['roll_no', 'gender', 'race_ethnicity', 'parental_level_of_education',
       'grade'],
      dtype='object')


In [50]:
df.gender=pd.Categorical(df.gender).codes
df.race_ethnicity=pd.Categorical(df.race_ethnicity).codes
df.parental_level_of_education=pd.Categorical(df.parental_level_of_education).codes
df.race_ethnicity=pd.Categorical(df.race_ethnicity).codes
df.grade=pd.Categorical(df.grade).codes

In [39]:
df.drop(columns='roll_no',axis=1, inplace=True)

In [52]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,science_score,total_score,grade
0,0,8,2,1,1,89,38,85,26.0,238,2
1,0,7,1,1,0,65,100,67,96.0,328,4
2,0,9,5,1,0,10,99,97,58.0,264,3
3,0,8,2,1,1,22,51,41,84.0,198,1
4,0,9,2,0,1,26,58,64,65.0,213,2


In [54]:
#Target variable - total_score

X = df.drop('total_score',axis=1)
y = df['total_score']

In [61]:
# splitting data into training and test set for independent attributes
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=1)

In [65]:
print("X_Train:",X_train.shape)
print("y_train:",y_train.shape)

X_Train: (7812, 10)
y_train: (7812,)


In [64]:
print("X_Test:",X_test.shape)
print("y_test:",y_test.shape)

X_Test: (1954, 10)
y_test: (1954,)


In [70]:
#Sclaing the Data Set - Normalizations
mm = StandardScaler()
X_train_scale = mm.fit_transform(X_train)
X_test_scale = mm.transform(X_test)

In [77]:
###KNeighborsRegressor
KNN_model=KNeighborsRegressor()
KNN_model.fit(X_train_scale,y_train)

In [79]:
## R-Squre for KNN Train-set & Test-set
KNN_Train_Score = KNN_model.score(X_train_scale, y_train)
KNN_Test_Score = KNN_model.score(X_test_scale, y_test)
print("R-Sqr for KNN-Train:", KNN_Train_Score)
print("R-Sqr for KNN-Test:", KNN_Test_Score)

R-Sqr for KNN-Train: 0.9592978996830448
R-Sqr for KNN-Test: 0.939205094846878


In [85]:
## RMSE for for KNN Train-set & Test-set
KNN_RMSE_Train_Predict = KNN_model.predict((X_train_scale))
KNN_RMSE_Train = np.sqrt(mean_squared_error(y_train,KNN_RMSE_Train_Predict))
print("RMSE for KNN Train-Set:",KNN_RMSE_Train )
KNN_RMSE_Test_Predict = KNN_model.predict((X_test_scale))
KNN_RMSE_Test = np.sqrt(mean_squared_error(y_test,KNN_RMSE_Test_Predict))
print("RMSE for KNN Test-Set:",KNN_RMSE_Test)

RMSE for KNN Train-Set: 8.542953726418972
RMSE for KNN Test-Set: 10.391124864362657


In [86]:
## MAP for for KNN Train-set & Test-set
KNN_MAE_Train = mean_absolute_error(y_train,KNN_RMSE_Train_Predict)
print("MAE for KNN Train-Set:",KNN_MAE_Train )
KNN_MAE_Test = mean_absolute_error(y_test,KNN_RMSE_Test_Predict)
print("MAE for KNN Train-Set:",KNN_MAE_Train)

MAE for KNN Train-Set: 6.82268305171531
MAE for KNN Train-Set: 6.82268305171531
