# Introduction
This notebook predicts students' post-test scores from their pre-test score and a few features describing the student and their school.

Data source: https://www.kaggle.com/kwadwoofosu/predict-test-scores-of-students

### Modelling setup

In [None]:
# import libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# import encoder
from sklearn.preprocessing import LabelEncoder

# import joblib
from joblib import dump, load

# import sklearn library
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# read the Kaggle test-scores CSV (expected in the notebook's working directory) into a DataFrame
df = pd.read_csv("test_scores.csv")
# display the frame as the cell output
df

In [None]:
# print the DataFrame summary: column names, dtypes and non-null counts
df.info()

In [None]:
# summary statistics for every column; include='all' also covers the non-numeric columns
df.describe(include='all')

### Encode labels

In [None]:
# drop columns that are not required for now.
# errors='ignore' keeps this cell idempotent: `df` is rebound to its own
# result, so re-running the cell would otherwise raise a KeyError because
# 'student_id' has already been removed.
df = df.drop(columns=['student_id'], errors='ignore')

In [None]:
# label-encode the `lunch` column; the fitted encoder stays in `lunch_le`
# so the integer codes can be mapped back to the original labels
lunch_le = LabelEncoder()
df = df.assign(lunch=lunch_le.fit_transform(df['lunch']))

In [None]:
# label-encode the `gender` column; `gender_le` keeps the fitted mapping
gender_le = LabelEncoder()
df = df.assign(gender=gender_le.fit_transform(df['gender']))

In [None]:
# label-encode the `school_setting` column; `school_setting_le` keeps the fitted mapping
school_setting_le = LabelEncoder()
df = df.assign(school_setting=school_setting_le.fit_transform(df['school_setting']))

In [None]:
# label-encode the `school_type` column; `school_type_le` keeps the fitted mapping
school_type_le = LabelEncoder()
df = df.assign(school_type=school_type_le.fit_transform(df['school_type']))

In [None]:
# label-encode the `teaching_method` column; `teaching_method_le` keeps the fitted mapping
teaching_method_le = LabelEncoder()
df = df.assign(teaching_method=teaching_method_le.fit_transform(df['teaching_method']))

In [None]:
# take a quick look at the distribution of the `n_student` column
# NOTE(review): the previous comment called this the "score distribution",
# but the column plotted is `n_student` — confirm which one was intended.
plt.hist(df['n_student'])
plt.xlabel('n_student')
plt.ylabel('count')
plt.show()

In [None]:
# keep only the numeric columns (this includes the label-encoded ones);
# any remaining non-numeric columns are left out of `df_temp`
df_temp = df.select_dtypes(include='number')

df_temp

### Setting the baseline

In [None]:
# histogram of the per-row score gain (posttest minus pretest)
plt.hist(df_temp['posttest'].sub(df_temp['pretest']))

In [None]:
# Naive baseline on the full dataset: predict posttest as pretest + 12.
# The constant 12 is presumably the rounded mean of (posttest - pretest),
# which a later cell computes — TODO confirm.
# RMSE is symmetric in its arguments; they are passed as (y_true, y_pred)
# to match the model cells.
print(np.sqrt(mean_squared_error(df_temp['posttest'], df_temp['pretest'] + 12)))
# actual on x, predicted on y — the same orientation as the model scatter plots
plt.scatter(df_temp['posttest'], df_temp['pretest'] + 12)
plt.xlabel('actual posttest')
plt.ylabel('baseline prediction (pretest + 12)')
plt.show()

### Model training

In [None]:
# features: every numeric column except the target (`pretest` stays in as a
# feature; an earlier variant of this cell dropped it as well)
X = df_temp.drop(columns=['posttest'])
# target: the posttest score
y = df_temp['posttest']

In [None]:
# hold out 20% of the rows for testing; random_state pins the shuffle so the
# split — and every metric computed from it — is reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# mean score gain (posttest minus pretest) over the full dataset;
# the baseline cells use a fixed offset of 12
df_temp['posttest'].sub(df_temp['pretest']).mean()

In [None]:
# Same naive baseline (pretest + 12), evaluated on the held-out test split so
# its RMSE is directly comparable with the models fitted below.
print(np.sqrt(mean_squared_error(y_test, X_test['pretest'] + 12)))
# actual on x, predicted on y — the same orientation as the model scatter plots
plt.scatter(y_test, X_test['pretest'] + 12)
plt.xlabel('actual posttest')
plt.ylabel('baseline prediction (pretest + 12)')
plt.show()

In [None]:
# fit a linear regression on the training split (fit() returns the estimator)
lr = LinearRegression().fit(X_train, y_train)
lr_pred = lr.predict(X_test)

# RMSE on the held-out test split
print(np.sqrt(mean_squared_error(y_test, lr_pred)))

# actual (x) vs. predicted (y)
plt.scatter(y_test, lr_pred)

In [None]:
# decision tree regressor; random_state makes the fitted tree reproducible
# for a given training split (without it, split selection involves randomness
# and the reported RMSE can change between runs)
tree = DecisionTreeRegressor(random_state=42)
tree.fit(X_train, y_train)
tree_pred = tree.predict(X_test)

# RMSE on the held-out test split
print(np.sqrt(mean_squared_error(y_test, tree_pred)))

# actual (x) vs. predicted (y)
plt.scatter(y_test, tree_pred)

In [None]:
# random forest regressor; random_state fixes the bootstrap and feature
# sampling so the fitted model — which is exported later in this notebook —
# is reproducible for a given training split
forest = RandomForestRegressor(random_state=42)
forest.fit(X_train, y_train)
forest_pred = forest.predict(X_test)

# RMSE on the held-out test split
print(np.sqrt(mean_squared_error(y_test, forest_pred)))

# actual (x) vs. predicted (y)
plt.scatter(y_test, forest_pred)

In [None]:
# random-forest prediction for the first row of X_test
forest_pred[0]

In [None]:
# residuals (actual minus predicted) plotted against each test row's index label
plt.scatter(y_test.index, y_test.sub(forest_pred))

### Exporting the model as a .joblib file

In [None]:
# persist the fitted random forest to 'forest_v1.joblib' in the working directory
# NOTE(review): only the model is saved; the LabelEncoders fitted earlier are not,
# so scoring raw (un-encoded) data later would need them re-created — confirm
# whether they should be dumped as well
dump(forest, 'forest_v1.joblib')

In [None]:
# reload the .joblib file written by the previous cell
# (joblib.load is pickle-based: only load files from a trusted source)
loaded_forest = load('forest_v1.joblib')
# display the reloaded estimator
loaded_forest

In [None]:
# take a quick look at the held-out feature rows
X_test

In [None]:
# get the first row of X_test (not a random pick); the double brackets keep it
# as a one-row DataFrame, which is the form later passed to predict()
X_test.iloc[[0]]

In [None]:
# the actual posttest value for that same first test row
y_test.iloc[0]

In [None]:
# predict with the reloaded model on the first test row; the result is expected
# to match forest_pred[0] from the in-memory model
loaded_forest.predict(X_test.iloc[[0]])