<a href="https://colab.research.google.com/github/siti-alawiyah/ibresultprediction/blob/main/RandomForestReg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# imports
# ignore future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn import metrics

import statsmodels.api as sm
from scipy import stats

In [None]:
# Raw-CSV links to the yearly datasets (2020 -> URL1 ... 2017 -> URL4).
URL1, URL2, URL3, URL4 = (
    f"https://raw.githubusercontent.com/siti-alawiyah/ibresultprediction/main/data/{year}IB.csv"
    for year in (2020, 2019, 2018, 2017)
)

In [None]:
# Load each yearly CSV into its own DataFrame, in the same order as the URLs
# (URL1 -> df20, URL2 -> df19, URL3 -> df18, URL4 -> df17).
df20, df19, df18, df17 = (
    pd.read_csv(link) for link in (URL1, URL2, URL3, URL4)
)

In [None]:
# Columns to keep from every yearly frame before modelling.
col_name = [
    "Ma Std",
    "Ma Teacher",
    "Ma ATL",
    "Ma Compl. Of Work",
    "Ma Sub Achievement",
    "Predicted Grade",
    "Actual Grade",
    "Scaled Total",
]

# Restrict each yearly frame to that shared column set.
df20, df19, df18, df17 = (
    yearly[col_name] for yearly in (df20, df19, df18, df17)
)

In [None]:
# Combine the yearly dataframes into one before modelling.
frames = [df20, df19, df18, df17]
# ignore_index=True gives the stacked frame a fresh 0..n-1 index; without it each
# yearly frame keeps its own 0-based labels, so the combined index contains
# duplicate labels and label-based lookups become ambiguous.
df = pd.concat(frames, ignore_index=True)

In [None]:
# Show dtypes and non-null counts of the combined frame (before rows with nulls are dropped).
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 692 entries, 0 to 178
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Ma Std              657 non-null    object 
 1   Ma Teacher          657 non-null    object 
 2   Ma ATL              657 non-null    float64
 3   Ma Compl. Of Work   654 non-null    float64
 4   Ma Sub Achievement  657 non-null    float64
 5   Predicted Grade     654 non-null    float64
 6   Actual Grade        652 non-null    float64
 7   Scaled Total        653 non-null    float64
dtypes: float64(6), object(2)
memory usage: 48.7+ KB


In [None]:
# Count the missing values in each column.
df.isna().sum()

Ma Std                35
Ma Teacher            35
Ma ATL                35
Ma Compl. Of Work     38
Ma Sub Achievement    35
Predicted Grade       38
Actual Grade          40
Scaled Total          39
dtype: int64

In [None]:
# Discard every row that contains at least one null value.
df = df.dropna(axis=0)

In [None]:
# Re-check dtypes and non-null counts after the rows with nulls were dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 650 entries, 0 to 177
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Ma Std              650 non-null    object 
 1   Ma Teacher          650 non-null    object 
 2   Ma ATL              650 non-null    float64
 3   Ma Compl. Of Work   650 non-null    float64
 4   Ma Sub Achievement  650 non-null    float64
 5   Predicted Grade     650 non-null    float64
 6   Actual Grade        650 non-null    float64
 7   Scaled Total        650 non-null    float64
dtypes: float64(6), object(2)
memory usage: 45.7+ KB


In [None]:
# Dummy-encode the two object-typed columns, Ma Std and Ma Teacher.
col = ["Ma Std", "Ma Teacher"]

# Creating dummies; drop_first=True leaves out the first level of each column.
df = pd.get_dummies(df, columns=col, drop_first=True)

In [None]:
# Build the target and feature matrix, then hold out 30% of the rows for testing.
y = df["Actual Grade"]
# Both grade columns are removed from the features; every other column stays.
X = df.drop(columns=["Predicted Grade", "Actual Grade"])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,  # fixed seed so the split is reproducible
)

In [None]:
# functions for comparing models 

# RMSE function
def rmse(model, X, y):
    """Return the mean root-mean-squared error of ``model`` over 5 CV folds.

    ``cross_val_score`` is run on ``(X, y)`` with the
    ``neg_mean_squared_error`` scorer; each fold's score is negated back to
    an MSE, square-rooted, and the per-fold RMSE values are averaged.
    """
    fold_mse = -cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5)
    fold_rmse = np.sqrt(fold_mse)
    return fold_rmse.mean()

#compare train, test, and cv score
def display_r2(model, X_train, y_train, X_test, y_test):
    """Print the train, test and cross-validated scores of a fitted model.

    Parameters
    ----------
    model : estimator exposing ``score(X, y)`` and usable with
        ``cross_val_score`` (for scikit-learn regressors ``score`` is R^2).
    X_train, y_train : training features and target.
    X_test, y_test : held-out features and target.

    Returns
    -------
    None. The three scores are printed, rounded to 7 decimals.
    """
    print('Train Score: ', round(model.score(X_train, y_train), 7))
    print('Test Score: ', round(model.score(X_test, y_test), 7))
    # cross_val_score refits clones of the estimator on the data it is given,
    # so it must see the training split only; the held-out test rows are kept
    # exclusively for the 'Test Score' line above.
    cv_mean = cross_val_score(model, X_train, y_train).mean()
    print('Cross Val Score:', round(cv_mean, 7))

# model comparisons
# Module-level registry: model name -> [RMSE, R2 score]; filled by add_model.
model_dictionary = {}
def add_model(name, model, X_test, y_test):
    """Store a model's metrics under ``name`` and return the comparison table.

    The RMSE comes from ``rmse(model, X_test, y_test)`` and the second value
    from ``model.score(X_test, y_test)``; both are rounded to 7 decimals.
    The entry is written into the module-level ``model_dictionary``
    (overwriting any previous entry with the same name), and the whole
    dictionary is returned as a DataFrame indexed by model name.

    NOTE(review): ``rmse`` cross-validates, i.e. refits the model on the data
    passed here, whereas the R2 value uses the model as already fitted.
    """
    cv_rmse = round(rmse(model, X_test, y_test), 7)
    r2_score = round(model.score(X_test, y_test), 7)
    model_dictionary[name] = [cv_rmse, r2_score]
    return pd.DataFrame.from_dict(
        model_dictionary,
        orient='index',
        columns=['RMSE', 'R2 Score'],
    )
    
# Plot Residuals and Predictions
def plot_pred(model, X_test, y_test):
    """Plot residuals and predicted-vs-actual values for a fitted model.

    Draws a two-panel figure: the left panel shows residuals
    (``y_test - prediction``) against the predictions with a red zero line,
    the right panel shows actual values against predictions with a red
    identity line. The figure is displayed with ``plt.show()``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test : features to predict on.
    y_test : true target values matching ``X_test``.
    """
    pred = model.predict(X_test)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))

    # Plot Residuals
    ax1.set_title('Residuals Plot')
    ax1.set(xlabel='Predicted values', ylabel='Residuals')
    ax1.scatter(pred, y_test - pred)
    # NumPy reductions instead of the builtin min/max, which iterate the
    # prediction array element by element in Python.
    ax1.hlines(y=0, xmin=np.min(pred), xmax=np.max(pred), colors='red', linestyles='solid')

    # Plot Predictions
    ax2.set_title('Predictions vs Actuals')
    ax2.set(xlabel='Predicted values', ylabel='Actual Values')
    ax2.scatter(pred, y_test)

    lims = [
        np.min([ax2.get_xlim(), ax2.get_ylim()]),  # min of both axes
        np.max([ax2.get_xlim(), ax2.get_ylim()]),  # max of both axes
    ]

    # Identity line (perfect predictions). The colour is given once, via the
    # keyword: combining a 'k-' format string with a colour keyword defines
    # the colour twice and makes matplotlib emit a redundancy warning.
    ax2.plot(lims, lims, '-', color='red', zorder=0)

    plt.show()