<a href="https://colab.research.google.com/github/siti-alawiyah/ibresultprediction/blob/main/RandomForestReg_1.02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RandomForest Regressor

In [1]:
# imports
# ignore future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, plot_confusion_matrix,make_scorer, r2_score, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn import metrics
import pickle

import statsmodels.api as sm
from scipy import stats

In [2]:
# links to dataset
URL1 = "https://raw.githubusercontent.com/siti-alawiyah/ibresultprediction/main/data/2020IB.csv"
URL2 = "https://raw.githubusercontent.com/siti-alawiyah/ibresultprediction/main/data/2019IB.csv"
URL3 = "https://raw.githubusercontent.com/siti-alawiyah/ibresultprediction/main/data/2018IB.csv"   
URL4 = "https://raw.githubusercontent.com/siti-alawiyah/ibresultprediction/main/data/2017IB.csv"      

In [3]:
# read the url for the datasets
df20 = pd.read_csv(URL1)
df19 = pd.read_csv(URL2)
df18 = pd.read_csv(URL3)
df17 = pd.read_csv(URL4)

In [4]:
# getting the columns prior for modelling
col_name = ["Ma Std","Ma Teacher","Ma ATL","Ma Compl. Of Work","Ma Sub Achievement","Predicted Grade","Actual Grade","Scaled Total"]

df20 = df20[col_name]
df19 = df19[col_name]
df18 = df18[col_name]
df17 = df17[col_name]

In [5]:
# combine the dataframes into 1 before modelling
frames = [df20,df19,df18,df17]
df = pd.concat(frames)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 681 entries, 0 to 177
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Ma Std              657 non-null    object 
 1   Ma Teacher          657 non-null    object 
 2   Ma ATL              657 non-null    float64
 3   Ma Compl. Of Work   654 non-null    float64
 4   Ma Sub Achievement  657 non-null    float64
 5   Predicted Grade     654 non-null    float64
 6   Actual Grade        652 non-null    float64
 7   Scaled Total        653 non-null    float64
dtypes: float64(6), object(2)
memory usage: 47.9+ KB


In [7]:
df.isnull().sum()

Ma Std                24
Ma Teacher            24
Ma ATL                24
Ma Compl. Of Work     27
Ma Sub Achievement    24
Predicted Grade       27
Actual Grade          29
Scaled Total          28
dtype: int64

In [8]:
#drop rows that have null values
df.dropna(axis=0, inplace=True)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 650 entries, 0 to 177
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Ma Std              650 non-null    object 
 1   Ma Teacher          650 non-null    object 
 2   Ma ATL              650 non-null    float64
 3   Ma Compl. Of Work   650 non-null    float64
 4   Ma Sub Achievement  650 non-null    float64
 5   Predicted Grade     650 non-null    float64
 6   Actual Grade        650 non-null    float64
 7   Scaled Total        650 non-null    float64
dtypes: float64(6), object(2)
memory usage: 45.7+ KB


In [10]:
# dummify Ma Std and Ma Teacher
col= ['Ma Std','Ma Teacher']

# Creaing dummies 
df = pd.get_dummies(columns=col, data=df,drop_first=True)

In [11]:
# create train test split 
X = df.drop(['Predicted Grade','Actual Grade'],axis=1)
y = df['Actual Grade']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [12]:
pipe  = Pipeline([('rf', RandomForestRegressor(random_state=42))])

In [13]:
pipe_params = {
    'rf__n_jobs': [-1],
    'rf__n_estimators': [100,101,102,103,104,105,106,107,108,109,110],
    'rf__max_depth': [None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'rf__random_state': [42]
    }

In [14]:
scorers = {'RMSE': make_scorer(mean_squared_error), 'R2': make_scorer(r2_score)}

In [15]:
# instantiate GridSeachCV
gs = GridSearchCV(pipe, param_grid = pipe_params, cv = 3, scoring = scorers, n_jobs = -1, refit = 'R2')

In [16]:
# fit grid search to training data
gs.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('rf',
                                        RandomForestRegressor(random_state=42))]),
             n_jobs=-1,
             param_grid={'rf__max_depth': [None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'rf__n_estimators': [100, 101, 102, 103, 104, 105, 106,
                                              107, 108, 109, 110],
                         'rf__n_jobs': [-1], 'rf__random_state': [42]},
             refit='R2',
             scoring={'R2': make_scorer(r2_score),
                      'RMSE': make_scorer(mean_squared_error)})

In [17]:
#print the best score for CV = 3
print(gs.best_score_)

0.940070337373175


In [18]:
#model score on training set
gs.score(X_train, y_train)

0.9880954394790418

In [19]:
#score model on test set
gs.score(X_test, y_test)

0.9604987060654657

In [20]:
#get the best params
gs.best_params_

{'rf__max_depth': 7,
 'rf__n_estimators': 102,
 'rf__n_jobs': -1,
 'rf__random_state': 42}

In [21]:
#RandomForest AUC/Accuracy Score
rf_r2_train = gs.score(X_train,y_train)
rf_r2_test = gs.score(X_test,y_test)
rf_rmse_train = mean_squared_error(gs.predict(X_train),y_train)
rf_rmse_test = mean_squared_error(gs.predict(X_test),y_test)

In [22]:
print(f'RandomForestRegressor R2 train score {rf_r2_train}')
print(f'RandomForestRegressor R2 test score {rf_r2_test}')
print(f'RandomForestRegressor RMSE train score {rf_rmse_train}')
print(f'RandomForestRegressor RMSE test score {rf_rmse_test}')

RandomForestRegressor R2 train score 0.9880954394790418
RandomForestRegressor R2 test score 0.9604987060654658
RandomForestRegressor RMSE train score 0.015893486780001045
RandomForestRegressor RMSE test score 0.05168566412672756


# Feature Ranking


In [27]:
feature_names = pd.DataFrame(gs.best_estimator_.named_steps.rf.feature_names_in_, columns=['feature_name'])
feature_names['importance'] = gs.best_estimator_.named_steps.rf.feature_importances_

In [28]:
feature_names

Unnamed: 0,feature_name,importance
0,Ma ATL,0.002303
1,Ma Compl. Of Work,0.002946
2,Ma Sub Achievement,0.016033
3,Scaled Total,0.950796
4,Ma Std_ SL,0.00204
5,Ma Std_HL,0.000853
6,Ma Std_SL,0.010019
7,Ma Teacher_ CLe,0.00054
8,Ma Teacher_ CNa,0.000546
9,Ma Teacher_ FBe,0.000919


In [29]:
feature_names.sort_values(by="importance",ascending=False)[:10]

Unnamed: 0,feature_name,importance
3,Scaled Total,0.950796
2,Ma Sub Achievement,0.016033
6,Ma Std_SL,0.010019
1,Ma Compl. Of Work,0.002946
16,Ma Teacher_FBe,0.002714
0,Ma ATL,0.002303
4,Ma Std_ SL,0.00204
18,Ma Teacher_GGo,0.001628
12,Ma Teacher_BGe,0.001547
21,Ma Teacher_JTo,0.001477


# Model Prediction of Student's Grade


In [30]:
# rounding the results off to nearest whole number

y_test_pred = gs.predict(X_test).round().astype('int')

In [31]:
X_test

Unnamed: 0,Ma ATL,Ma Compl. Of Work,Ma Sub Achievement,Scaled Total,Ma Std_ SL,Ma Std_HL,Ma Std_SL,Ma Teacher_ CLe,Ma Teacher_ CNa,Ma Teacher_ FBe,Ma Teacher_ RCh,Ma Teacher_ SSn,Ma Teacher_BGe,Ma Teacher_CLe,Ma Teacher_CNa,Ma Teacher_DCh,Ma Teacher_FBe,Ma Teacher_FYi,Ma Teacher_GGo,Ma Teacher_GTu,Ma Teacher_Gtu,Ma Teacher_JTo,Ma Teacher_LLP,Ma Teacher_RCh,Ma Teacher_TSS
165,7.0,7.0,7.0,87.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
60,6.0,6.0,4.0,57.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
112,6.0,6.0,5.0,75.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5,6.0,5.0,4.0,58.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
76,7.0,7.0,7.0,85.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154,7.0,7.0,5.0,61.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
96,6.0,6.0,6.0,67.0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
143,5.0,5.0,4.0,42.0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
115,6.0,6.0,5.0,52.0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [32]:
X_test['model prediction'] = y_test_pred

In [33]:
X_test.head()

Unnamed: 0,Ma ATL,Ma Compl. Of Work,Ma Sub Achievement,Scaled Total,Ma Std_ SL,Ma Std_HL,Ma Std_SL,Ma Teacher_ CLe,Ma Teacher_ CNa,Ma Teacher_ FBe,Ma Teacher_ RCh,Ma Teacher_ SSn,Ma Teacher_BGe,Ma Teacher_CLe,Ma Teacher_CNa,Ma Teacher_DCh,Ma Teacher_FBe,Ma Teacher_FYi,Ma Teacher_GGo,Ma Teacher_GTu,Ma Teacher_Gtu,Ma Teacher_JTo,Ma Teacher_LLP,Ma Teacher_RCh,Ma Teacher_TSS,model prediction
165,7.0,7.0,7.0,87.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,7
60,6.0,6.0,4.0,57.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,5
112,6.0,6.0,5.0,75.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,6
5,6.0,5.0,4.0,58.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,5
76,7.0,7.0,7.0,85.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,7
