In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import warnings
import graphviz
from sklearn import tree
warnings.filterwarnings("ignore") 
plt.style.use("bmh")

In [None]:
# Read the raw test-score table from the Kaggle input directory.
csv_path = "/kaggle/input/predict-test-scores-of-students/test_scores.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the raw table.
df.head(n=5)

In [None]:
# 2x2 grid of posttest density curves; each panel splits the data by one
# categorical column.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 7))
fig.suptitle("Distribution of posttest")
hue_columns = ["gender", "school_type", "lunch", "teaching_method"]
# axs.flat walks the grid row by row, matching the order of hue_columns.
for panel, hue_column in zip(axs.flat, hue_columns):
    sns.kdeplot(data=df, x="posttest", hue=hue_column, ax=panel)

In [None]:
# Density of posttest, one curve per school_setting value.
sns.kdeplot(x="posttest", hue="school_setting", data=df);

In [None]:
# Mean posttest score for every distinct n_student value, drawn as bars.
n_students = df["n_student"].unique()
n_students_mean_score = []
for size in n_students:
    rows_with_size = df["n_student"] == size
    n_students_mean_score.append(df.loc[rows_with_size, "posttest"].mean())

plt.figure(figsize=(12, 5))
plt.title("Mean posttest score by n_student")
sns.barplot(x=n_students, y=n_students_mean_score)
# Restrict the y-axis to the 45-85 window.
plt.ylim(bottom=45, top=85);

In [None]:
# Scatter of posttest against n_student, one point per row of df.
plt.figure(figsize=(12, 5))
sns.scatterplot(x="n_student", y="posttest", data=df);

In [None]:
# Scatter of pretest (y-axis) against posttest (x-axis), one point per row.
plt.figure(figsize=(12, 5))
sns.scatterplot(x="posttest", y="pretest", data=df);

In [None]:
# Spearman rank correlation between the numeric columns only.
# df still contains string columns at this point; pandas >= 2.0 no longer
# drops them silently inside corr() and raises instead, so select the numeric
# columns explicitly. This form also works on older pandas versions.
df.select_dtypes(include="number").corr(method="spearman")

"pretest" and "posttest" are highly correlated. That means that a linear regression on "pretest" alone would be able to predict "posttest" with good precision.<br>
If we leave "pretest" in the dataset, many (if not all) tree-based algorithms would do exactly that - depend only on "pretest".<br>
As this is boring and we want to learn more about the influences of the other features, we'll remove that feature before starting to fit our models.

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Categorical columns that get one-hot encoded.
target_columns = ["school_setting", "school_type", "teaching_method", "gender", "lunch"]

# Build one output name "<column>_<k>" per distinct value of each column.
# NOTE: k is a positional index into the encoder's own category order
# (see myEncoder.categories_), which need not match the appearance order
# printed below.
new_column_names = []
for target_column in target_columns:
    unique_values = df[target_column].unique()
    print(f"{target_column} has {len(unique_values)} unique values: {unique_values}")
    new_column_names.extend(f"{target_column}_{k}" for k in range(len(unique_values)))
print("New column names:", new_column_names)

# Leave the encoder's output-format argument at its default and densify with
# .toarray(): the old `sparse=False` keyword was renamed to `sparse_output`
# and later removed, so neither spelling works on every scikit-learn version.
myEncoder = OneHotEncoder(handle_unknown="ignore")
encoded = myEncoder.fit_transform(df[target_columns]).toarray()

# Name and type the indicator columns up front instead of renaming them
# afterwards; reuse df's index so the concat below aligns row by row.
df_onehot = pd.DataFrame(
    encoded.astype("int64"),
    columns=new_column_names,
    index=df.index,
)

# Drop the encoded source columns plus the identifier-like columns, then
# append the indicators. `columns=` replaces the positional axis argument,
# which current pandas no longer accepts.
df_Xy = pd.concat(
    [
        df.drop(columns=target_columns + ["school", "classroom", "student_id"]),
        df_onehot,
    ],
    axis=1,
)

In [None]:
# Preview the first five rows of the encoded table.
df_Xy.head(n=5)

In [None]:
# Feature columns: n_student plus the one-hot indicator columns.
# "pretest" is deliberately left out of the feature list.
feature_columns = [
    "n_student",
    "school_setting_0", "school_setting_1", "school_setting_2",
    "school_type_0", "school_type_1",
    "teaching_method_0", "teaching_method_1",
    "gender_0", "gender_1",
    "lunch_0", "lunch_1",
]
X = df_Xy[feature_columns].to_numpy()
# Double brackets select a one-column frame, so y stays two-dimensional.
y = df_Xy[["posttest"]].to_numpy()
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error

In [None]:
# Shallow regression tree (depth 3), fitted on the training split and used to
# predict the held-out rows.
# random_state is fixed because complementary one-hot columns (e.g. gender_0 /
# gender_1) give exactly tied splits; without a seed the tie-break, and with
# it the rendered tree, can differ from run to run.
reg_dt = DecisionTreeRegressor(max_depth=3, random_state=42)
reg_dt.fit(X_train, y_train)
y_hat_dt = reg_dt.predict(X_test)

In [None]:
# Test-set error metrics for the decision tree.
print("Mean squared error:", mean_squared_error(y_test, y_hat_dt))
print("Mean absolute error:", mean_absolute_error(y_test, y_hat_dt))
# The metric computed here is the median *absolute* error; the label now says so.
print("Median absolute error:", median_absolute_error(y_test, y_hat_dt))

In [None]:
# Compare actual and predicted posttest for the last 10 test rows.
# y_test is two-dimensional, so flatten it before plotting.
y_test_comp = np.ravel(y_test[-10:])
y_hat_comp_dt = y_hat_dt[-10:]
sample_idx = list(range(len(y_test_comp)))

plt.figure(figsize=(12,5))
plt.title("Comparison of some samples for decision trees")
# Label both series so the reader can tell actual from predicted values.
sns.scatterplot(x=sample_idx, y=y_test_comp, marker="_", s=300, label="actual")
sns.scatterplot(x=sample_idx, y=y_hat_comp_dt, marker="_", s=300, label="predicted")
plt.xlabel("test sample (last 10)")
plt.ylabel("posttest")
plt.legend();

In [None]:
# Export the fitted tree to DOT text and render it inline with graphviz.
# The names must be listed in the same order as the columns of X.
tree_feature_names = [
    "n_student",
    "school_setting_0", "school_setting_1", "school_setting_2",
    "school_type_0", "school_type_1",
    "teaching_method_0", "teaching_method_1",
    "gender_0", "gender_1",
    "lunch_0", "lunch_1",
]
dot_data = tree.export_graphviz(
    reg_dt,
    out_file=None,  # return the DOT source as a string
    feature_names=tree_feature_names,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph

First let's break down what the features mean:

school_setting has 3 unique values:  "Urban", "Suburban", "Rural"<br>
school_type has 2 unique values: "Non-public", "Public"<br>
teaching_method has 2 unique values: "Standard", "Experimental"<br>
gender has 2 unique values: "Female", "Male"<br>
lunch has 2 unique values: "Does not qualify", "Qualifies for reduced/free lunch"

As all categorical features are encoded using "one hot encoding", the feature "school_setting_1" represents "Suburban" with the possible values 0 = False and 1 = True.

One could expect the highest score if:
* Does **not** qualify for reduced/free lunch
* school_setting: Suburban
* n_student: <=26

Other interesting influences:
* school_type "Public" dramatically reduces the expected score
* Fewer students per class = Higher score

In [None]:
# Fit one decision tree per max_depth in 2..10 and record train and test MSE.
max_depths = list(range(2, 11))
train_score = []
test_score = []
for max_depth in max_depths:
    # Use a loop-local name so the depth-3 `reg_dt` fitted earlier (and drawn
    # by the graphviz cell) is not overwritten by this sweep.
    # random_state is fixed so tied splits resolve the same way on every run.
    reg_dt_depth = DecisionTreeRegressor(max_depth=max_depth, random_state=42)
    reg_dt_depth.fit(X_train, y_train)
    y_train_dt = reg_dt_depth.predict(X_train)
    y_test_dt = reg_dt_depth.predict(X_test)
    train_score.append(mean_squared_error(y_train, y_train_dt))
    test_score.append(mean_squared_error(y_test, y_test_dt))

Let's see when the decision tree model starts to overfit...

In [None]:
# Train vs. test MSE for each max_depth from the sweep.
plt.figure(figsize=(14,4))
plt.title("max_depth: MSE for train vs. test data")
# Label both series; without a legend the two curves cannot be told apart.
sns.scatterplot(x=max_depths, y=train_score, marker="_", s=300, label="train")
sns.scatterplot(x=max_depths, y=test_score, marker="_", s=300, label="test")
plt.xlabel("max_depth")
plt.ylabel("MSE")
plt.legend();

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest of 30 depth-3 trees, fitted on the training split.
# random_state is fixed so the reported metrics are reproducible across runs.
reg_rf = RandomForestRegressor(n_estimators=30, max_depth=3, random_state=42)
# The forest is fitted on a flattened (1-D) target.
reg_rf.fit(X_train, np.ravel(y_train))
y_hat_rf = reg_rf.predict(X_test)

In [None]:
# Test-set error metrics for the random forest.
print("Mean squared error:", mean_squared_error(y_test, y_hat_rf))
print("Mean absolute error:", mean_absolute_error(y_test, y_hat_rf))
# The metric computed here is the median *absolute* error; the label now says so.
print("Median absolute error:", median_absolute_error(y_test, y_hat_rf))

In [None]:
# Compare actual and predicted posttest for the last 10 test rows.
# y_test_comp comes from the decision-tree comparison cell above.
y_hat_comp_rf = y_hat_rf[-10:]
sample_idx = list(range(len(y_test_comp)))

plt.figure(figsize=(12,5))
plt.title("Comparison of some samples for random forests")
# Label both series so the reader can tell actual from predicted values.
sns.scatterplot(x=sample_idx, y=y_test_comp, marker="_", s=300, label="actual")
sns.scatterplot(x=sample_idx, y=y_hat_comp_rf, marker="_", s=300, label="predicted")
plt.xlabel("test sample (last 10)")
plt.ylabel("posttest")
plt.legend();

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Gradient boosting with 30 depth-3 trees, fitted on the training split.
# random_state is fixed so the fitted ensemble is reproducible across runs.
reg_gb = GradientBoostingRegressor(n_estimators=30, max_depth=3, random_state=42)
# The model is fitted on a flattened (1-D) target.
reg_gb.fit(X_train, np.ravel(y_train))
y_hat_gb = reg_gb.predict(X_test)

In [None]:
# Test-set error metrics for gradient boosting.
print("Mean squared error:", mean_squared_error(y_test, y_hat_gb))
print("Mean absolute error:", mean_absolute_error(y_test, y_hat_gb))
# The metric computed here is the median *absolute* error; the label now says so.
print("Median absolute error:", median_absolute_error(y_test, y_hat_gb))

In [None]:
# Compare actual and predicted posttest for the last 10 test rows.
# y_test_comp comes from the decision-tree comparison cell above.
y_hat_comp_gb = y_hat_gb[-10:]
sample_idx = list(range(len(y_test_comp)))

plt.figure(figsize=(12,5))
plt.title("Comparison of some samples for gradient boosted trees")
# Label both series so the reader can tell actual from predicted values.
sns.scatterplot(x=sample_idx, y=y_test_comp, marker="_", s=300, label="actual")
sns.scatterplot(x=sample_idx, y=y_hat_comp_gb, marker="_", s=300, label="predicted")
plt.xlabel("test sample (last 10)")
plt.ylabel("posttest")
plt.legend();

In [None]:
from sklearn.ensemble import AdaBoostRegressor

In [None]:
# AdaBoost with 30 estimators, fitted on the training split.
# random_state is fixed so the reported metrics are reproducible across runs.
reg_ab = AdaBoostRegressor(n_estimators=30, random_state=42)
# The model is fitted on a flattened (1-D) target.
reg_ab.fit(X_train, np.ravel(y_train))
y_hat_ab = reg_ab.predict(X_test)

In [None]:
# Test-set error metrics for AdaBoost.
print("Mean squared error:", mean_squared_error(y_test, y_hat_ab))
print("Mean absolute error:", mean_absolute_error(y_test, y_hat_ab))
# The metric computed here is the median *absolute* error; the label now says so.
print("Median absolute error:", median_absolute_error(y_test, y_hat_ab))

In [None]:
# Compare actual and predicted posttest for the last 10 test rows.
# y_test_comp comes from the decision-tree comparison cell above.
y_hat_comp_ab = y_hat_ab[-10:]
sample_idx = list(range(len(y_test_comp)))

plt.figure(figsize=(12,5))
# The model is an AdaBoostRegressor, not a gradient boosting model, so the
# title names it accordingly.
plt.title("Comparison of some samples for AdaBoost")
# Label both series so the reader can tell actual from predicted values.
sns.scatterplot(x=sample_idx, y=y_test_comp, marker="_", s=300, label="actual")
sns.scatterplot(x=sample_idx, y=y_hat_comp_ab, marker="_", s=300, label="predicted")
plt.xlabel("test sample (last 10)")
plt.ylabel("posttest")
plt.legend();