In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        full_path = os.path.join(root, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the test-score data from the Kaggle input directory and preview the first rows.
csv_path = '/kaggle/input/predict-test-scores-of-students/test_scores.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Visual check for missing values: heatmap of the boolean "is missing" mask of df.
missing_mask = df.isna()
sns.heatmap(missing_mask)

In [None]:
# Summary statistics of df (pandas' describe() with its default settings).
df.describe()

In [None]:
# Number of distinct schools and distinct classrooms in the data.
df['school'].nunique(), df['classroom'].nunique()

In [None]:
# Keep the object-dtype columns, leaving out classroom and student_id.
categorical = df.select_dtypes('O').drop(columns = ['classroom', 'student_id'])

# 3 x 2 grid of panels, flattened so each panel can be addressed by position.
fig, ax = plt.subplots(nrows = 3, ncols = 2, figsize = (15, 15))
ax = ax.flatten()

# One boxplot of posttest per remaining categorical column, one column per panel.
for idx, column in enumerate(categorical.columns):
    sns.boxplot(data = df, x = column, y = 'posttest', ax = ax[idx])

Boxplots of each categorical variable against the target variable (posttest), to see which categorical features actually matter.
There is a large variation in posttest scores with respect to:
    school
    school_type
    lunch
And a slight difference with respect to:
    school_setting
    teaching_method

In [None]:
# Per-school boxplots of posttest by classroom, one panel per school.
schools = df['school'].unique()

# Size the grid from the number of schools instead of assuming it fits in 5 x 5:
# 5 panels per row and as many rows as needed (ceiling division, at least one row).
n_cols = 5
n_rows = max(1, -(-len(schools) // n_cols))
fig, ax = plt.subplots(n_rows, n_cols, figsize = (15, 3 * n_rows), squeeze = False)
ax = ax.flatten()

for idx, school in enumerate(schools):
    temp = df.loc[df['school'] == school]
    sns.boxplot(x = 'classroom', y = 'posttest', data = temp, ax = ax[idx])
    ax[idx].set_title(school)

# Hide any panels left over at the end of the grid so no empty axes are drawn.
for unused_ax in ax[len(schools):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

Here we can see that the classroom actually has an effect in certain schools. For example, for VKWQH (4, 5) the classroom can change the posttest score by 20 pts (roughly 50% higher than the lower test scores). How does this actually affect the fitting, though?

In [None]:
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

# Bin posttest into 5-point-wide buckets (floor division by 5); the result is stored as 'grade'.
df['grade'] = df['posttest'] // 5

In [None]:
# Histogram of the binned 'grade' column; discrete = True is passed because grade is a bin number.
sns.histplot(df['grade'], discrete = True)

In [None]:
# Feature configuration for the first experiment (classroom is NOT included).
cat = ['school', 'school_setting', 'lunch', 'school_type', 'teaching_method']
num = ['n_student', 'pretest']

# Min-max scale the numeric columns and one-hot encode the categorical ones.
# handle_unknown = 'ignore' keeps prediction from raising if a validation fold
# contains a category that was absent from the corresponding training fold.
ct = ColumnTransformer([
    ('Scaling', MinMaxScaler(), num),
    ('Onehot Encoding', OneHotEncoder(handle_unknown = 'ignore'), cat),
])
scores = []

# X: model features, y: regression target, z: binned target (the 'grade' column).
X = df[['school', 'school_setting', 'school_type', 'teaching_method', 'n_student', 'lunch', 'pretest']]
y = df['posttest']
z = df['grade']

In another trial, where I excluded classrooms, I used label encoding instead of one-hot encoding, and the results were pretty much the same. I switched to one-hot encoding and am using a ColumnTransformer for practice.

In [None]:
# Compare four regressors with repeated stratified 5-fold CV (3 repeats),
# stratifying the splits on z and recording train / validation RMSE per split.
rsf = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 3, random_state = 2)
models = [Ridge, RandomForestRegressor, KNeighborsRegressor, DecisionTreeRegressor]
model_names = ['Ridge', 'Forest', 'KNN', 'Tree']
# Seed the stochastic estimators so that re-running the cell gives the same scores.
model_kwargs = {'Forest': {'random_state': 2}, 'Tree': {'random_state': 2}}
train_scores = defaultdict(list)
val_scores = defaultdict(list)

for name, model in zip(model_names, models):
    for train_idx, val_idx in rsf.split(X, z):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        estimator = model(**model_kwargs.get(name, {}))
        pipe = Pipeline([('Column Transformer', ct), ('model', estimator)])
        pipe.fit(X_train, y_train)
        # RMSE is computed as sqrt(MSE); the `squared = False` keyword of
        # mean_squared_error is deprecated and absent from newer scikit-learn releases.
        train_pred = pipe.predict(X_train)
        train_scores[name].append(np.sqrt(mean_squared_error(y_train, train_pred)))
        val_pred = pipe.predict(X_val)
        val_scores[name].append(np.sqrt(mean_squared_error(y_val, val_pred)))

# Per-model summary: average, minimum and maximum RMSE over all splits.
for name in model_names:
    ts = train_scores[name]
    vs = val_scores[name]
    print(f'Model: {name} \n avg train:{sum(ts)/len(ts)}, avg val: {sum(vs)/len(vs)} \n min ts: {min(ts)}, min vs: {min(vs)} \n max ts: {max(ts)}, max vs: {max(vs)}')

In [None]:
# Ridge only: validation RMSE and R^2 for every CV split, then their averages.
val_score = []
ridge_score = []
for train_idx, val_idx in rsf.split(X, z):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    mod = Pipeline([('transformer', ct), ('model', Ridge())])
    mod.fit(X_train, y_train)
    val_pred = mod.predict(X_val)
    # RMSE is computed as sqrt(MSE); the `squared = False` keyword of
    # mean_squared_error is deprecated and absent from newer scikit-learn releases.
    val_score.append(np.sqrt(mean_squared_error(y_val, val_pred)))
    ridge_score.append(r2_score(y_val, val_pred))

# (mean validation RMSE, mean validation R^2)
np.mean(val_score), np.mean(ridge_score)

In [None]:
# Feature configuration for the second experiment: same as before plus 'classroom'.
cat = ['school', 'school_setting', 'lunch', 'school_type', 'teaching_method', 'classroom']
num = ['n_student', 'pretest']

# Min-max scale the numeric columns and one-hot encode the categorical ones.
# handle_unknown = 'ignore' keeps prediction from raising if a validation fold
# contains a category (e.g. a classroom) that was absent from the training fold.
ct = ColumnTransformer([
    ('Scaling', MinMaxScaler(), num),
    ('Onehot Encoding', OneHotEncoder(handle_unknown = 'ignore'), cat),
])
scores = []

# X: model features, y: regression target, z: binned target (the 'grade' column).
X = df[['school','classroom', 'school_setting', 'school_type', 'teaching_method', 'n_student', 'lunch', 'pretest']]
y = df['posttest']
z = df['grade']

In [None]:
# Repeat the four-model comparison on the current X (which now includes classroom):
# repeated stratified 5-fold CV (3 repeats), stratified on z, recording RMSE per split.
rsf = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 3, random_state = 2)
models = [Ridge, RandomForestRegressor, KNeighborsRegressor, DecisionTreeRegressor]
model_names = ['Ridge', 'Forest', 'KNN', 'Tree']
# Seed the stochastic estimators so that re-running the cell gives the same scores.
model_kwargs = {'Forest': {'random_state': 2}, 'Tree': {'random_state': 2}}
train_scores = defaultdict(list)
val_scores = defaultdict(list)

for name, model in zip(model_names, models):
    for train_idx, val_idx in rsf.split(X, z):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        estimator = model(**model_kwargs.get(name, {}))
        pipe = Pipeline([('Column Transformer', ct), ('model', estimator)])
        pipe.fit(X_train, y_train)
        # RMSE is computed as sqrt(MSE); the `squared = False` keyword of
        # mean_squared_error is deprecated and absent from newer scikit-learn releases.
        train_pred = pipe.predict(X_train)
        train_scores[name].append(np.sqrt(mean_squared_error(y_train, train_pred)))
        val_pred = pipe.predict(X_val)
        val_scores[name].append(np.sqrt(mean_squared_error(y_val, val_pred)))

# Per-model summary: average, minimum and maximum RMSE over all splits.
for name in model_names:
    ts = train_scores[name]
    vs = val_scores[name]
    print(f'Model: {name} \n avg train:{sum(ts)/len(ts)}, avg val: {sum(vs)/len(vs)} \n min ts: {min(ts)}, min vs: {min(vs)} \n max ts: {max(ts)}, max vs: {max(vs)}')

In [None]:
# Ridge only, on the current X: validation RMSE and R^2 for every CV split, then their averages.
val_score = []
ridge_score = []
for train_idx, val_idx in rsf.split(X, z):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    mod = Pipeline([('transformer', ct), ('model', Ridge())])
    mod.fit(X_train, y_train)
    val_pred = mod.predict(X_val)
    # RMSE is computed as sqrt(MSE); the `squared = False` keyword of
    # mean_squared_error is deprecated and absent from newer scikit-learn releases.
    val_score.append(np.sqrt(mean_squared_error(y_val, val_pred)))
    ridge_score.append(r2_score(y_val, val_pred))

# (mean validation RMSE, mean validation R^2)
np.mean(val_score), np.mean(ridge_score)

So we can see that adding the classroom category decreases our RMSE by about 10% (3.18 -> 2.87) and increases our R2 score by about 1%. When I ran GridSearchCV, nothing really changed when I varied alpha.

For the other models, the gap between training error and validation error shows that they are clearly overfitting, hence the lack of reduction in RMSE compared to Ridge. I previously tried grid search on XGB and Forest, and none of the parameters seemed to affect the level of overfitting by much (including a max depth of 1).

In [None]:
# Sanity check on a single validation fold: val_idx and val_pred are whatever the
# last iteration of the preceding cross-validation loop left behind.
# student_id is read straight from df, so the feature frame X is left unmodified.
val_ids = df['student_id'].iloc[val_idx]        # val_idx holds positional row indices
val_actual = df['posttest'].iloc[val_idx]       # actual scores for the same rows
# Give the predictions the same index as the ids so the pairing is explicit.
val_predicted = pd.Series(val_pred, index = val_ids.index, name = 'posttest')

sns.scatterplot(x = val_ids, y = val_actual, color = 'red', label = 'Actual')
sns.scatterplot(x = val_ids, y = val_predicted, color = 'blue', label = 'Predicted')
plt.title('Actual Scores (red) vs Predicted Scores (blue)')
plt.show()

Just to make sure nothing seems strange. 