In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns

import xgboost as xgb
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
from yellowbrick.regressor import residuals_plot
from yellowbrick.regressor import prediction_error



In [None]:
# Load the student test-score table from the Kaggle dataset and report its (rows, columns).
csv_path = '/kaggle/input/predict-test-scores-of-students/test_scores.csv'
data = pd.read_csv(csv_path)
print(data.shape)

In [None]:
# Preview the first rows of the raw table.
data.head()

In [None]:
# Summary statistics for the loaded table.
data.describe()

In [None]:
# Count the missing values in each column.
data.isna().sum()

3 numerical variables, 8 categorical variables


- `n_student`: number of students in the class
- `pretest`: score on the pre-exam
- `posttest`: final score (the target to be predicted)

In [None]:
# List the column names of the raw table.
data.columns

In [None]:
# Names of the categorical columns in the raw table.
cat_cols = [
    'school',
    'school_setting',
    'school_type',
    'classroom',
    'teaching_method',
    'student_id',
    'gender',
    'lunch',
]

EDA

In [None]:
# Post-test score (x axis) for each school (y axis).
data.plot.scatter(x='posttest',y='school', c='DarkBlue')

In [None]:
# Pre-test score (x axis) for each school (y axis).
data.plot.scatter(x='pretest',y='school', c='DarkBlue')

In [None]:
# Post-test score (x axis) for each school setting (y axis).
data.plot.scatter(x='posttest',y='school_setting', c='DarkBlue')

In [None]:
# Post-test score (x axis) for each school type (y axis).
data.plot.scatter(x='posttest',y='school_type', c='DarkBlue')

In [None]:
# Post-test score (x axis) for each classroom (y axis).
data.plot.scatter(x='posttest',y='classroom', c='DarkBlue')

In [None]:
# Number of distinct values in the classroom column.
len(data.classroom.unique())

In [None]:
# Class size (x axis) against post-test score (y axis).
data.plot.scatter(x='n_student',y='posttest')

In [None]:
# Histogram of class sizes.
data.n_student.hist() 

In [None]:
# Distinct class sizes that occur in the data.
data.n_student.unique() 

In [None]:
# Post-test score (x axis) for each teaching method (y axis).
data.plot.scatter(x='posttest',y='teaching_method', c='DarkBlue')

In [None]:
# Post-test score (x axis) for each gender value (y axis).
data.plot.scatter(x='posttest',y='gender', c='DarkBlue')

In [None]:
# Post-test score (x axis) for each lunch category (y axis).
data.plot.scatter(x='posttest',y='lunch', c='DarkBlue')

In [None]:
# Histogram of pre-test scores. The plot itself is the output; the trailing
# semicolon keeps the Axes repr out of the cell output.
data['pretest'].hist();

In [None]:
# Histogram of post-test scores (the prediction target). The trailing semicolon
# keeps the Axes repr out of the cell output.
data['posttest'].hist();

Creating some new features

In [None]:
# School-level feature: the mean pre-test score of each student's school.
# 'pretest' is selected *before* aggregating so that only that column is averaged;
# averaging the whole frame would also try to average the string columns, which
# raises on pandas >= 2.0.
school_level = data.groupby('school')['pretest'].mean().rename('school_pretest')

# Attach the school mean to every row, then replace the raw school label with it.
data = data.join(school_level, on='school').drop(columns=['school'])

In [None]:
# Classroom-level feature: the mean pre-test score of each student's classroom.
# 'pretest' is selected *before* aggregating so that only that column is averaged;
# averaging the whole frame would also try to average the string columns, which
# raises on pandas >= 2.0.
class_level = data.groupby('classroom')['pretest'].mean().rename('class_pretest')

# Attach the classroom mean to every row, then replace the raw classroom label with it.
data = data.join(class_level, on='classroom').drop(columns=['classroom'])

In [None]:
# nstudent_level = data.groupby('n_student').mean()['pretest']
# nstudent_level.name = 'nstud_pretest'
# data=data.join(nstudent_level,on='n_student')
# data.drop(['n_student'], axis = 1, inplace = True)

In [None]:
# enco_cols = ['school_setting', 'school_type','teaching_method', 'gender', 'lunch', 'n_student']

# enco_cols = ['school_setting', 'school_type','teaching_method', 'gender', 'lunch', 'classroom', 'school']

# One-hot encode the listed categorical columns; columns not listed are left as they are.
enco_cols = [
    'school_setting',
    'school_type',
    'teaching_method',
    'gender',
    'lunch',
]

encoded_data = pd.get_dummies(data=data, columns=enco_cols)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max rescale the numeric columns.
scaler = MinMaxScaler()
num_cols = ['n_student', 'pretest', 'posttest', 'class_pretest', 'school_pretest']

# The scaler is fitted on every row of encoded_data, and the list includes the
# target column 'posttest', so downstream error metrics are in scaled units.
scaled_values = scaler.fit_transform(encoded_data[num_cols])
encoded_data[num_cols] = scaled_values
encoded_data.head()

In [None]:
import matplotlib.pyplot as plt

# Correlation matrix of the encoded table, drawn as a heat-map.
# 'student_id' is an identifier rather than a measurement, so it is excluded;
# if it holds text (presumably it does — verify), DataFrame.corr() would raise
# on pandas >= 2.0 instead of silently skipping it.
plt.matshow(encoded_data.drop(columns=['student_id']).corr())
plt.show()

In [None]:
# Summary statistics of the encoded, rescaled table.
encoded_data.describe()


# Target and feature views of the full encoded table.
Y = encoded_data['posttest']
# drop() must return a new frame here: with inplace=True it returns None (so X
# would be None) and strips 'posttest' out of encoded_data, which the
# train/test split below still reads.
X = encoded_data.drop(columns=['posttest'])


In [None]:
# Hold out part of the students: every unique student_id gets a uniform random
# draw and goes to the training set when the draw is below 0.8, otherwise to test.
stud_ids = encoded_data['student_id'].unique()

np.random.seed(42)
msk = np.random.rand(len(stud_ids)) < 0.8
train_ids, test_ids = stud_ids[msk], stud_ids[~msk]

# Select the rows belonging to each group of ids.
student_col = encoded_data['student_id']
train = encoded_data.loc[student_col.isin(train_ids)]
test = encoded_data.loc[student_col.isin(test_ids)]

In [None]:
# Box plot of the target in the training split. The trailing semicolon keeps the
# Axes repr out of the cell output.
train.boxplot(column='posttest');


In [None]:
# Box plot of the target in the test split. The trailing semicolon keeps the
# Axes repr out of the cell output.
test.boxplot(column='posttest');

In [None]:
# Separate the target from the model inputs. Neither the target nor the
# student identifier is used as a feature.
non_feature_cols = ['posttest', 'student_id']

Y_train = train['posttest']
X_train = train.drop(columns=non_feature_cols)

Y_test = test['posttest']
X_test = test.drop(columns=non_feature_cols)

In [None]:
np.random.seed(42)

from sklearn.ensemble import RandomForestRegressor

# Baseline: a random forest with default hyper-parameters, fitted on the training
# split; the cell output is its score on the held-out split.
model = RandomForestRegressor().fit(X_train, Y_train)

model.score(X_test, Y_test)

In [None]:
# Evaluate the current forest on the held-out split.
y_pred = model.predict(X_test)

MSE = mse(Y_test, y_pred)
RMSE = np.sqrt(MSE)
R_squared = r2_score(Y_test, y_pred)

# end="\n\n" leaves one blank line between the two metrics.
print("\nRMSE: ", np.round(RMSE, 2), end="\n\n")
print("R-Squared: ", np.round(R_squared, 2))

Feature Importance

In [None]:
# Pair each training column with the importance the fitted forest assigned to it,
# then chart the ten largest as horizontal bars.
importance_values = model.feature_importances_
feat_importances = pd.Series(data=importance_values, index=X_train.columns)
top_ten = feat_importances.nlargest(10)
top_ten.plot.barh()

In [None]:
# Names of the features whose importance exceeds 0.001.
feat_importances[feat_importances > 0.001].index

In [None]:
# Refit the same model object using only the features whose importance exceeds 0.001.
model.fit(X_train.loc[:,feat_importances[feat_importances > 0.001].index], Y_train)


In [None]:
# Display the training inputs restricted to the features above the 0.001 importance cut-off.
X_train.loc[:,feat_importances[feat_importances > 0.001].index]

In [None]:
# Score the refitted model on the test split, restricted to the same feature subset.
model.score(X_test.loc[:,feat_importances[feat_importances > 0.001].index], Y_test)

In [None]:
from sklearn.model_selection import cross_val_score

# y_preds = model.predict(X_test)

# 10-fold cross-validated R^2 on the training split; the cell output is the mean over folds.
cv_score = cross_val_score(estimator=model, X=X_train, y=Y_train, scoring='r2', cv=10)
cv_score.mean()

In [None]:
# Show the individual per-fold scores.
cv_score

In [None]:
# 5-fold cross-validated R^2 on a feature subset chosen by importance; the cell
# output is the mean over folds.
# NOTE(review): the cut-off here is 0.00001, while every other cell in this
# notebook filters at 0.001 — confirm which threshold is intended.
cv_score = cross_val_score(model, X_train.loc[:,feat_importances[feat_importances > 0.00001].index], Y_train, scoring='r2', cv=5)
np.mean(cv_score)

In [None]:
# Keep only the features whose importance exceeds 0.001, in both splits.
selected_features = feat_importances[feat_importances > 0.001].index
X_train = X_train.loc[:, selected_features]
X_test = X_test.loc[:, selected_features]

In [None]:
# Display the training inputs after feature selection.
X_train

Model Training

In [None]:
from sklearn.model_selection import RandomizedSearchCV


# Random-forest regressor tuned with a randomized search over the grid below.
rf_reg_model = RandomForestRegressor(n_jobs=2)

rf_grid = {
    'n_estimators': [200, 500, 1000, 1200, 1500, 1800],
    'max_depth': [None, 5, 20, 30, 45, 60],
    # 1.0 = consider every feature at each split. That is what the legacy 'auto'
    # value meant for regressors; 'auto' itself is rejected by scikit-learn >= 1.3.
    'max_features': [1.0, 'sqrt'],
    'min_samples_split': [4, 6, 8, 10],
    'min_samples_leaf': [2, 4, 7, 10],
}

# Draw 10 configurations from the grid and score each with 5-fold CV.
# random_state pins which configurations are drawn, so the search is repeatable.
rs_rf_model = RandomizedSearchCV(estimator=rf_reg_model,
                                 param_distributions=rf_grid,
                                 n_iter=10,
                                 cv=5,
                                 verbose=2,
                                 random_state=42)
rs_rf_model.fit(X_train, Y_train)

In [None]:
# Hyper-parameters of the best configuration found by the randomized search.
rs_rf_model.best_params_

In [None]:
# Evaluate the tuned forest on the held-out split.
rs_y_preds = rs_rf_model.predict(X_test)

MSE = mse(Y_test, rs_y_preds)
RMSE = np.sqrt(MSE)
R_squared = r2_score(Y_test, rs_y_preds)

# end="\n\n" leaves one blank line between the two metrics.
print("\nRMSE: ", np.round(RMSE, 2), end="\n\n")
print("R-Squared: ", np.round(R_squared, 2))

In [None]:
# Yellowbrick diagnostics for the tuned forest. The helper calls are made directly
# rather than inside print(), which would only add the returned object's repr to
# the output.
print("\nPrediction Error Plot")
prediction_error(rs_rf_model, X_train, Y_train, X_test, Y_test)

print("\nResiduals Plot")
residuals_plot(rs_rf_model, X_train, Y_train, X_test, Y_test);


In [None]:
# Gradient-boosted baseline with fixed hyper-parameters.
# The loss is selected with the `objective` keyword; a misspelled keyword is not
# a recognised XGBoost parameter and would not set the loss.
xgb_reg = xgb.XGBRegressor(max_depth=5, n_estimators=100, n_jobs=2,
                           objective='reg:squarederror', booster='gbtree',
                           random_state=42, learning_rate=0.05)

xgb_reg.fit(X_train, Y_train)

# Evaluate on the held-out split.
y_pred = xgb_reg.predict(X_test)
MSE = mse(Y_test, y_pred)
RMSE = np.sqrt(MSE)

R_squared = r2_score(Y_test, y_pred)

print("\nRMSE: ", np.round(RMSE, 2))
print()
print("R-Squared: ", np.round(R_squared, 2))

# Yellowbrick diagnostics; the helpers are called directly rather than inside
# print(), which would only add the returned object's repr to the output.
print("\nPrediction Error Plot")
prediction_error(xgb_reg, X_train, Y_train, X_test, Y_test)

print("\nResiduals Plot")
residuals_plot(xgb_reg, X_train, Y_train, X_test, Y_test);


In [None]:
# XGBoost regressor tuned with a randomized search.
# The loss is selected with the `objective` keyword; a misspelled keyword is not
# a recognised XGBoost parameter and would not set the loss.
xgb_reg = xgb.XGBRegressor(n_jobs=2, objective='reg:squarederror', booster='gbtree',
                           random_state=42)

# Only parameters XGBoost understands are searched. 'max_leaf_nodes' is a
# scikit-learn tree parameter that XGBoost does not use, so listing it would only
# add a dimension to the search that has no effect on the fitted models.
xgb_grid = {'n_estimators' : [ 100, 200, 500, 1000, 1200, 1500, 1800],
        'max_depth' : [None, 25, 30, 45], #3, 5, 15,
        'learning_rate' : [0.02, 0.01, 0.005],
        'min_child_weight': [1, 2, 4],#, 7, 10],
        'colsample_bytree'  : [0.25, 0.33, 0.5, 0.75, 1.0],
         'colsample_bylevel': [0.25, 0.33, 0.5, 0.75, 1.0]}

# 10-fold CV per sampled configuration; random_state pins which configurations
# are drawn, so the search is repeatable.
xgb_rs_model = RandomizedSearchCV(estimator = xgb_reg, 
                              param_distributions = xgb_grid,
                              cv = 10,
                              verbose = 0,
                              random_state = 42)

# from sklearn.model_selection import GridSearchCV

# xgb_rs_model = GridSearchCV(estimator = xgb_reg, 
#                               param_grid = xgb_grid,
#                               cv = 20,
#                               verbose = 0)

xgb_rs_model.fit(X_train, Y_train)

print(xgb_rs_model.best_params_)

# Evaluate the best configuration on the held-out split.
rs_y_preds = xgb_rs_model.predict(X_test)
MSE = mse(Y_test, rs_y_preds)
RMSE = np.sqrt(MSE)

R_squared = r2_score(Y_test, rs_y_preds)

print("\nRMSE: ", np.round(RMSE, 2))
print()
print("R-Squared: ", np.round(R_squared, 2))

In [None]:
# Report the best configuration found by the XGBoost search and its held-out metrics.
print(xgb_rs_model.best_params_)

rs_y_preds = xgb_rs_model.predict(X_test)

MSE = mse(Y_test, rs_y_preds)
RMSE = np.sqrt(MSE)
R_squared = r2_score(Y_test, rs_y_preds)

# end="\n\n" leaves one blank line between the two metrics.
print("\nRMSE: ", np.round(RMSE, 2), end="\n\n")
print("R-Squared: ", np.round(R_squared, 2))
#10 fold CV
#{'n_estimators': 1800, 'min_child_weight': 4, 'max_leaf_nodes': 4, 'max_depth': None, 'learning_rate': 0.005, 'colsample_bytree': 0.5, 'colsample_bylevel': 0.5}

#20 fold
#{'n_estimators': 1800, 'min_child_weight': 4, 'max_leaf_nodes': 4, 'max_depth': 30, 'learning_rate': 0.005, 'colsample_bytree': 0.75, 'colsample_bylevel': 0.33}

# 5 fold
# {'n_estimators': 1500, 'min_child_weight': 1, 'max_leaf_nodes': 1, 'max_depth': 25, 'learning_rate': 0.02, 'colsample_bytree': 1.0, 'colsample_bylevel': 0.33}
