# Libraries

In [None]:
from time import time

# The star import comes first so that the explicit imports below win on any name clash.
from plotnine import *
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotnine
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Data

In [None]:
# Load the student test-score table from the Kaggle input directory.
raw_data = pd.read_csv("/kaggle/input/predict-test-scores-of-students/test_scores.csv")

# EDA

#### Getting a feel of the fields

In [None]:
# Show the first 21 rows to get a feel for the available columns.
raw_data.head(21)

#### Sanity check: Seeing if count of all unique kids (student_id) is the same as n_student (all students of the class were included)

In [None]:
# Columns that identify one class.
class_keys = ["school", "classroom"]

# Number of distinct student ids observed in each class.
unique_kids = (
    raw_data[class_keys + ["student_id"]]
    .drop_duplicates()
    .groupby(class_keys)
    .count()
    .reset_index()
)
# Attach the reported class size (n_student) to those counts.
n_student = unique_kids.merge(raw_data[class_keys + ["n_student"]], on=class_keys, how="left")

# Smallest and largest value of every column within each class.
per_class = n_student.groupby(class_keys)
school_min = per_class.min().reset_index()
school_max = per_class.max().reset_index()
school = school_min.merge(school_max, on=class_keys, suffixes=["_min", "_max"])

# Difference between the counted ids and the reported class size; a non-zero
# value would flag a class where the two disagree.
school["min_id_max_n"] = school["student_id_min"] - school["n_student_max"]
school.describe()

Observation: all student ids are counted into n_student

#### Comparing test scores between different groups

##### School setting

In [None]:
# Mean pre/post test score per school setting. The score columns are named
# explicitly so the mean is never applied to the text columns, and the
# aggregation is given by name rather than as a NumPy callable.
pd.pivot_table(raw_data, values=["pretest", "posttest"], columns="school_setting", aggfunc="mean")

Observation: 
- In general Suburban students have a higher score pre and post test
- In general ~ 12 to 13 points increase from pre to post test scores, but bigger relative change in Rural and Urban students because of lower baseline (pre test scores)

##### Gender

In [None]:
# Mean pre/post test score per school type, school setting and gender. Only
# the two score columns are aggregated, so text columns never reach the mean.
pd.pivot_table(raw_data, values=["pretest", "posttest"],
               columns=["school_type", "school_setting", "gender"],
               aggfunc="mean")

Observation:
- In general scores seem very comparable across male and females

#### Free / reduced fee lunch

##### Non-public schools

In [None]:
# Non-public schools: mean pre/post test score per school setting, gender and
# lunch status. Only the two score columns are aggregated.
pd.pivot_table(raw_data.query("school_type == 'Non-public'"),
               values=["pretest", "posttest"],
               columns=["school_type", "school_setting", "gender", "lunch"], aggfunc="mean")

##### Public schools

In [None]:
# Public schools: mean pre/post test score per school setting, gender and
# lunch status. Only the two score columns are aggregated.
pd.pivot_table(raw_data.query("school_type == 'Public'"),
               values=["pretest", "posttest"],
               columns=["school_type", "school_setting", "gender", "lunch"], aggfunc="mean")

Observation:
- The lunch feature seems important: being poor reduces student performance in general

#### A good visual summary of the effect of school_type, school_setting, and need for lunch support in the post test scores
- Non-public schools
    - Not a big difference between rural, suburban and urban for students that don't need lunch support
    - Students that need lunch support and whose school is in an urban setting perform substantially worse than those in suburban and rural settings
- Public schools
    - For students that don't need lunch support, suburban students are doing better than urban and rural students
    - This difference is less marked for students that need lunch support

In [None]:
# Histogram of post-test scores coloured by school setting,
# one panel per lunch status (rows) and school type (columns).
plotnine.options.figure_size = (10, 10)
(
    ggplot(data=raw_data, mapping=aes(x="posttest", fill="school_setting"))
    + geom_histogram(bins=30, alpha=0.3, color="black")
    + facet_grid("lunch ~ school_type")
)

In [None]:
# Box plot of post-test scores per school setting,
# one panel per lunch status (rows) and school type (columns).
plotnine.options.figure_size = (10, 10)
(
    ggplot(data=raw_data, mapping=aes(x="school_setting", y="posttest"))
    + geom_boxplot(outlier_shape="o")
    + facet_grid("lunch ~ school_type")
)

#### Effect of teaching method

In [None]:
# Histogram of post-test scores coloured by teaching method, one panel per
# lunch status (rows) and school type / school setting combination (columns).
plotnine.options.figure_size = (15, 5)
(
    ggplot(data=raw_data, mapping=aes(x="posttest", fill="teaching_method"))
    + geom_histogram(bins=30, alpha=0.3, color="black")
    + facet_grid("lunch ~ school_type + school_setting")
)

In [None]:
# Box plot of post-test scores per teaching method, one panel per
# lunch status (rows) and school type / school setting combination (columns).
plotnine.options.figure_size = (15, 5)
(
    ggplot(data=raw_data, mapping=aes(x="teaching_method", y="posttest"))
    + geom_boxplot(outlier_shape="o")
    + facet_grid("lunch ~ school_type + school_setting")
)

Observation:
- The teaching method seems to have a positive effect, but its magnitude varies greatly across groups

#### Correlation between pre and post test score

In [None]:
# Pre/post test correlation within each school type, setting and lunch group.
# Only the 'pretest' row of each correlation matrix is kept, so the 'posttest'
# column holds the pretest-posttest correlation; groups are sorted by it.
(
    raw_data
    .groupby(["school_type", "school_setting", "lunch"])[["pretest", "posttest"]]
    .corr()
    .reset_index()
    .query("level_3=='pretest'")
    .sort_values("posttest", ascending=False)
)

In [None]:
# Pre-test vs post-test scatter with a linear fit per school setting,
# one panel per lunch status (rows) and school type (columns).
plotnine.options.figure_size = (10, 10)
(
    ggplot(data=raw_data, mapping=aes(x="pretest", y="posttest", fill="school_setting"))
    + geom_point(size=3, alpha=0.5)
    + geom_smooth(mapping=aes(color="school_setting"), method='lm')
    + facet_grid("lunch ~ school_type")
)

Observation:
- Pre and post test scores are highly correlated for most subgroups, but for some the correlation is more moderate (0.5 - 0.6)

#### Correlation between n_student (class size) and posttest score

In [None]:
# Class size vs post-test scatter with a linear fit per school setting,
# one panel per lunch status (rows) and school type (columns).
plotnine.options.figure_size = (10, 10)
(
    ggplot(data=raw_data, mapping=aes(x="n_student", y="posttest", fill="school_setting"))
    + geom_point(size=3, alpha=0.5)
    + geom_smooth(mapping=aes(color="school_setting"), method='lm')
    + facet_grid("lunch ~ school_type")
)

Observation:
- Public schools: in general we see that the more students the lower the posttest score
- Non-public schools: it seems to be an opposite trend (except for urban schools)
    - I think this is because the range of students in non-public schools is never as big as in public schools

#### Number of students in each subgroup (to see if all subgroups are well represented)

##### Number of students/class by school_type and school_setting

In [None]:
# Mean class size (n_student) per school type and school setting. The
# aggregation is given by name rather than as a NumPy callable.
pd.pivot_table(raw_data, values="n_student", columns=["school_type", "school_setting"], aggfunc="mean")

Observation:
- In general non-public schools have fewer students/class than public schools
- In general urban schools have more students/class than suburban and rural schools

#### Distribution of gender across school_type and school_setting

In [None]:
# Number of rows per school type, school setting and gender.
raw_data.pivot_table(
    values="student_id",
    columns=["school_type", "school_setting", "gender"],
    aggfunc=len,
)

Observation:
- Males and females seem to be well represented across school_type and school_setting, with seemingly more males in rural schools

#### Distribution of lunch assistance across school type, school setting and gender

##### Non-public schools

In [None]:
# Non-public schools: number of rows per school setting, gender and lunch status.
non_public = raw_data.query("school_type == 'Non-public'")
non_public.pivot_table(
    values="student_id",
    columns=["school_type", "school_setting", "gender", "lunch"],
    aggfunc=len,
)

##### Public schools

In [None]:
# Public schools: number of rows per school setting, gender and lunch status.
public = raw_data.query("school_type == 'Public'")
public.pivot_table(
    values="student_id",
    columns=["school_type", "school_setting", "gender", "lunch"],
    aggfunc=len,
)

Observation:
- More students qualify for lunch in urban (\~60%) and rural (\~40%) school settings. Much less (\~30%) in suburban settings
- That's why the scores are ordered in the same manner, socioeconomical status seems to be very important in determining test scores
- Lunch support distribution seems to be similar between male and female students

#### Distribution of treatment (teaching_method) by school_type, school_setting, gender

##### Non-public schools

In [None]:
# Non-public schools: number of rows per school setting, gender and teaching method.
non_public = raw_data.query("school_type == 'Non-public'")
non_public.pivot_table(
    values="student_id",
    columns=["school_type", "school_setting", "gender", "teaching_method"],
    aggfunc=len,
)

##### Public schools

In [None]:
# Public schools: number of rows per school setting, gender and teaching method.
public = raw_data.query("school_type == 'Public'")
public.pivot_table(
    values="student_id",
    columns=["school_type", "school_setting", "gender", "teaching_method"],
    aggfunc=len,
)

Observation:
- It seems that in non-public schools the use of the experimental method was more pervasive, especially in the rural school setting (i.e. very few students in Non-public rural schools received the standard method)

#### Distribution of teaching method by lunch status

Non-public schools

In [None]:
# Non-public schools: number of rows per school setting, lunch status and teaching method.
non_public = raw_data.query("school_type == 'Non-public'")
non_public.pivot_table(
    values="student_id",
    columns=["school_type", "school_setting", "lunch", "teaching_method"],
    aggfunc=len,
)

Public schools

In [None]:
# Public schools: number of rows per school setting, lunch status and teaching method.
public = raw_data.query("school_type == 'Public'")
public.pivot_table(
    values="student_id",
    columns=["school_type", "school_setting", "lunch", "teaching_method"],
    aggfunc=len,
)

Observation:
- For Non-public schools, it seems that for suburban and urban settings, students qualifying for lunch support were more likely to receive experimental teaching than students that didn't qualify to lunch support
- For public schools the trend seems to be the opposite: students that didn't qualify for lunch support were more likely to receive the experimental teaching method than students that did qualify, and this holds in all school settings

# Model fitting
## Steps:
- Explore features (e.g. for continuous variables is it gaussian or skewed (e.g. left, right)?), if not Gaussian should use a transformation to make it more Gaussian (e.g. use log)
- ~Find promising features based on EDA (done above)~
    - Pretest scores, teaching method, lunch support, school setting
- Fit POC (I'll try with linear regression and a random forest regressor)
    - Split data into train, validation, test
    - Normalize (mean 0, sd 1)
    - Fit models
    - Tune hyperparams of the best model using validation set
- Evaluate on test set:
    - R2
    - RMSE
    - Median absolute error
- Conclude

#### Exploring distribution of continuous features

In [None]:
# Pairwise scatter plots with KDE diagonals, coloured by teaching method.
scatter_style = dict(alpha=0.6, s=80, edgecolor='k')
sns.pairplot(
    data=raw_data,
    hue='teaching_method',
    diag_kind='kde',
    plot_kws=scatter_style,
    height=4,
)

Observation:
- Continuous features are not awfully asymmetrical. I won't transform them

#### Dropping unused features and generate dummy variables for categorical ones

In [None]:
# Drop the identifier columns, then one-hot encode every remaining column
# except the three numeric ones (first level of each category is dropped).
processed_data = raw_data.drop(columns=["school", "classroom", "student_id"])
processed_data = pd.get_dummies(
    processed_data,
    columns=[
        col for col in processed_data.columns
        if col not in ("n_student", "pretest", "posttest")
    ],
    drop_first=True,
)

In [None]:
# Preview the encoded feature table.
processed_data.head()

#### Split data into train, validation and test sets (70, 20, 10)

In [None]:
# Getting the train and validation-test split
x_train, x_val, y_train, y_val = train_test_split(processed_data.drop(columns="posttest"),
                                                  processed_data["posttest"], 
                                                  test_size=0.30, random_state=42)
# Splitting validation-test into validation and test sets
x_val, x_test, y_val, y_test = train_test_split(x_val, y_val, 
                                                  test_size=1/3, random_state=42)

#### Creating pipelines for scaling and model fitting

In [None]:
# Both pipelines standardise the features before handing them to the estimator.
# Linear regression
steps = [("scale", StandardScaler()), ("lr", LinearRegression())]
pipe_lr = Pipeline(steps)
pipe_lr.fit(x_train, y_train)
# Random forest regressor. The forest is seeded with the same value used for
# the train/validation/test split so that re-running the notebook gives the
# same fitted model and metrics.
steps = [("scale", StandardScaler()), ("rf", RandomForestRegressor(random_state=42))]
pipe_rf = Pipeline(steps)
pipe_rf.fit(x_train, y_train)

#### Evaluate training and validation performance

Training set performance metrics

In [None]:
# Training-set metrics for both pipelines. Each model predicts once and the
# three metrics reuse that prediction; the report is built as a single string
# so every line starts at column 0.
train_pred_lr = pipe_lr.predict(x_train)
train_pred_rf = pipe_rf.predict(x_train)
print(
    "Linear regression metrics on training set:\n"
    f"R2: {r2_score(y_train, train_pred_lr)}\n"
    f"RMSE: {mean_squared_error(y_train, train_pred_lr) ** 0.5}\n"
    f"median_absolute_error: {median_absolute_error(y_train, train_pred_lr)}\n"
    "--------------------------\n"
    "Random forest metrics on training set:\n"
    f"R2: {r2_score(y_train, train_pred_rf)}\n"
    f"RMSE: {mean_squared_error(y_train, train_pred_rf) ** 0.5}\n"
    f"median_absolute_error: {median_absolute_error(y_train, train_pred_rf)}"
)

Validation set performance metrics

In [None]:
# Validation-set metrics for both pipelines. Each model predicts once and the
# three metrics reuse that prediction; the report is built as a single string
# so every line starts at column 0.
val_pred_lr = pipe_lr.predict(x_val)
val_pred_rf = pipe_rf.predict(x_val)
print(
    "Linear regression metrics on validation set:\n"
    f"R2: {r2_score(y_val, val_pred_lr)}\n"
    f"RMSE: {mean_squared_error(y_val, val_pred_lr) ** 0.5}\n"
    f"median_absolute_error: {median_absolute_error(y_val, val_pred_lr)}\n"
    "--------------------------\n"
    "Random forest metrics on validation set:\n"
    f"R2: {r2_score(y_val, val_pred_rf)}\n"
    f"RMSE: {mean_squared_error(y_val, val_pred_rf) ** 0.5}\n"
    f"median_absolute_error: {median_absolute_error(y_val, val_pred_rf)}"
)

In [None]:
# Observed (x) vs predicted (y) post-test scores of the linear regression,
# drawn for the training split first and the validation split second.
for split_x, split_y, split_label in ((x_train, y_train, "training"), (x_val, y_val, "validation")):
    plt.scatter(split_y, pipe_lr.predict(split_x), alpha=0.5, marker='o', label=split_label)
plt.title("Observed vs predicted in training and validation sets: Linear regression")
plt.legend()

In [None]:
# Observed (x) vs predicted (y) post-test scores of the random forest,
# drawn for the training split first and the validation split second.
for split_x, split_y, split_label in ((x_train, y_train, "training"), (x_val, y_val, "validation")):
    plt.scatter(split_y, pipe_rf.predict(split_x), alpha=0.5, marker='o', label=split_label)
plt.title("Observed vs predicted in training and validation sets: Random forest")
plt.legend()

Observation:
- Random forest seems to be overfitting the data
- Still, on the validation set the performance is very comparable between linear regression and random forest
    - Tuning the random forest can potentially increase performance even further

### Random forest hyperparameter tuning

Setting up a grid to do random search

In [None]:
# Candidate values for the random search. Every key carries the "rf__" prefix
# so the value is applied to the "rf" step of the pipeline.

# Number of trees in random forest: 100, 200, ..., 2000
n_estimators = list(range(100, 2001, 100))
# Number of features to consider at every split. 1.0 means "all features",
# which is what 'auto' stood for in a regressor; 'auto' itself is no longer
# accepted by recent scikit-learn releases.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree: 10, 20, ..., 110, plus None (no limit)
max_depth = list(range(10, 111, 10))
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'rf__n_estimators': n_estimators,
               'rf__max_features': max_features,
               'rf__max_depth': max_depth,
               'rf__min_samples_split': min_samples_split,
               'rf__min_samples_leaf': min_samples_leaf,
               'rf__bootstrap': bootstrap}
print(random_grid)

Total number of models to try

In [None]:
# Size of the full grid: the product of the number of candidate values
# of every hyperparameter in random_grid.
n_models = 1
for key in random_grid.keys():
    n_models *= len(random_grid[key])
n_models

This is a lot of models to try in a timely manner. Good thing I'm using random search

Searching the parameter space

In [None]:
# Randomised search over random_grid: 100 sampled candidates, each scored with
# 5-fold cross-validation on the training set, using all available cores.
# random_state pins the sampled candidates so the search can be reproduced.
t0 = time()
print("Fitting started...")
search = RandomizedSearchCV(pipe_rf, param_distributions=random_grid, n_iter=100, cv=5, n_jobs=-1,
                            verbose=8, random_state=42)
search.fit(x_train, y_train)
print(f"Fitting took {time() - t0:0.3f}s.")

#### Evaluating training, validation and test performance of the random forest with best hyperparams

In [None]:
# Metrics of the tuned random forest on the three splits. The model predicts
# once per split and the three metrics reuse that prediction; each section is
# built as a single string so every printed line starts at column 0.
tuned_sections = []
for split_label, split_x, split_y in (
    ("training", x_train, y_train),
    ("validation", x_val, y_val),
    ("test", x_test, y_test),
):
    split_pred = search.predict(split_x)
    tuned_sections.append(
        f"Random forest metrics on {split_label} set:\n"
        f"R2: {r2_score(split_y, split_pred)}\n"
        f"RMSE: {mean_squared_error(split_y, split_pred) ** 0.5}\n"
        f"median_absolute_error: {median_absolute_error(split_y, split_pred)}"
    )
print("Evaluation of best performing random forest model")
print("\n--------------------------\n".join(tuned_sections))

In [None]:
# Observed (x) vs predicted (y) post-test scores of the tuned random forest,
# drawn in order for the training, validation and test splits.
for split_x, split_y, split_label in (
    (x_train, y_train, "training"),
    (x_val, y_val, "validation"),
    (x_test, y_test, "test"),
):
    plt.scatter(split_y, search.predict(split_x), alpha=0.5, marker='o', label=split_label)
plt.title("Observed vs predicted in training, validation, and test sets using a tuned Random forest model")
plt.legend()

Observation:
- The tuned random forest model has a similar performance between training, validation and test set
- Performance on the validation set is higher than the simple linear regression model

#### Feature importance

In [None]:
# Feature importances of the tuned forest (the "rf" step of the best pipeline),
# labelled with the training column names; the ten largest are plotted.
tuned_forest = search.best_estimator_.named_steps["rf"]
feat_importances = pd.Series(tuned_forest.feature_importances_, index=x_train.columns)
feat_importances.nlargest(10).plot.barh();

Observation:
- From the tuned random forest feature importance, it can be seen that the three most important features are, in order: pretest score, lunch support, and number of students

#### For a better understanding: linear regression coefficients

In [None]:
# Coefficients of the fitted linear regression (the "lr" step of its pipeline),
# labelled with the training column names; the ten largest are plotted.
linear_model = pipe_lr.named_steps["lr"]
feat_importances = pd.Series(linear_model.coef_, index=x_train.columns)
feat_importances.nlargest(10).plot.barh();

Observation:
- The most important feature was the pretest score: the higher the pretest score, the higher the posttest score
- This was followed by teaching method: The new teaching method has an adjusted effect of roughly 2 units in the posttest score
- Features that are proxies of socioeconomic status: lunch assistance, number of students per class have a negative effect in the posttest scores, while suburban schooling has a positive effect
- There are likely many interaction effects between these features that are ignored by this simple model

# Conclusions:
- Reasonably accurate predictions on the test set  (~ 96% R2) can be obtained using a random forest regressor using the features `n_student`, `pretest`, `school_setting`,`school_type`, `teaching_method`, `gender`, and `lunch`
- Future areas of exploration include error analysis, to identify for which subgroups this model is performing the worst, and which could hence benefit from remedial measures (e.g. collecting more data, feature engineering, other modeling approaches, etc)