In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import seaborn as sns

In [None]:
# Read the student test-score CSV from the Kaggle input directory and show
# the resulting frame as the cell output.
csv_path = '../input/predict-test-scores-of-students/test_scores.csv'
df = pd.read_csv(csv_path)
df

# Visualize the data

In [None]:
# Two stacked panels: pre/post test score densities on top, class-size
# density underneath.
plt.style.use('ggplot')
fig, axes = plt.subplots(nrows=2, figsize=(16, 10))
score_ax, size_ax = axes

# Draw both score distributions on the same axes so they can be compared.
for score_col in ('pretest', 'posttest'):
    df[score_col].plot.kde(ax=score_ax)
score_ax.legend(['Pretest', 'Posttest'])
score_ax.set_title('Density plot of pre/post test scores')

df['n_student'].plot.kde(ax=size_ax)
size_ax.set_title('Density plot of number of students in class')

plt.tight_layout()
plt.show()

In [None]:
# Difference between the posttest and pretest medians.
median_shift = df['posttest'].median() - df['pretest'].median()
print(f'The median moved {median_shift} points from pretest to posttest')

In [None]:
# First figure: number of rows per school, in reversed value_counts order.
fig, axes = plt.subplots(figsize=(16, 7))
school_counts = df['school'].value_counts().iloc[::-1]
ax1 = school_counts.plot.bar(ax=axes)
ax1.set_title('School distribution')

# Second figure: side-by-side counts for school setting and school type.
fig, axes = plt.subplots(ncols=2, figsize=(16, 5))
panels = [('school_setting', 'School setting'), ('school_type', 'School type')]
for panel_ax, (column, panel_title) in zip(axes, panels):
    df[column].value_counts().plot.bar(ax=panel_ax)
    panel_ax.set_title(panel_title)
plt.tight_layout()

In [None]:
# Bar chart of each school's mean pretest score, sorted ascending.
mean_pretest_by_school = df.groupby('school')['pretest'].mean().sort_values()

fig, ax = plt.subplots(figsize=(15, 5))
mean_pretest_by_school.plot.bar(ax=ax)
ax.set_title('Comparison of average scores by school')
plt.show()

# Prep the data

In [None]:
from sklearn import preprocessing

# Peek at the first rows of the raw frame before the categorical columns are encoded.
df.head()

In [None]:
# disregard student_id from set
# Integer-encode each categorical column with its own LabelEncoder, keeping the
# fitted encoder in `label_decoder` so codes can be mapped back to the original
# labels later (e.g. label_decoder['school'].inverse_transform(...)).
categorical_cols = ['school', 'school_setting', 'school_type', 'classroom',
                    'teaching_method', 'gender', 'lunch']

label_decoder = dict()
encoded_cols = dict()
for col in categorical_cols:
    le = preprocessing.LabelEncoder()
    # Use the codes produced by the very encoder we store, so the frame and
    # the decoder can never disagree (previously one shared encoder was refit
    # on every column, clobbering the stored 'lunch' encoder along the way).
    encoded_cols[col] = le.fit_transform(df[col])
    label_decoder[col] = le

# dict insertion order keeps the columns in the same order as categorical_cols.
df_le = pd.DataFrame(encoded_cols, index=df.index)
df_le[['n_student', 'pretest', 'posttest']] = df[['n_student', 'pretest', 'posttest']]
# Place the score improvement right after n_student; posttest stays the last
# column, which later cells rely on when slicing with iloc.
df_le.insert(8, 'test_diff', df_le.posttest - df_le.pretest)

In [None]:
# Preview the label-encoded frame, including the derived test_diff column.
df_le.head()

In [None]:
# Rescale every column of the encoded frame to [0, 1] and compute the pairwise
# correlations between the rescaled columns.
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(df_le.values)
df_norm = pd.DataFrame(x_scaled, columns=df_le.columns)
corr = df_norm.corr()

# Mask the diagonal and the upper triangle so each pair is drawn only once.
mask = np.triu(np.ones_like(corr, dtype=bool))

with sns.axes_style('white'):
    fig, ax = plt.subplots(figsize=(15, 10))
    heat_cmap = sns.color_palette('light:#e24a33', as_cmap=True)
    sns.heatmap(corr, mask=mask, cmap=heat_cmap, xticklabels=True, linewidths=.5, ax=ax)
    ax.set_title('Heatmap showing correlation of variables')
    plt.show()

Interestingly, none of these factors correlates strongly with the actual improvement in test score between the pretest and the posttest (`test_diff`).

In [None]:
# Display the encoded frame once more before modelling.
# NOTE(review): this shows the whole of df_le; df_le.head() may be enough here — confirm.
df_le

# Regression

In [None]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

# The last column of df_le is the target; every other column is a candidate
# feature. Note that the candidates still include pretest and test_diff.
X = df_le.iloc[:, :-1]
y = df_le.iloc[:, -1]

# Recursive feature elimination with a linear SVR, keeping four features.
estimator = SVR(kernel='linear')
selector = RFE(estimator, n_features_to_select=4).fit(X, y)

# support_ is a boolean mask over X's columns marking the retained features.
selected_cols = X.columns[selector.support_].tolist()
print(f'Used recursive feature elimination to select the following columns for our training:\n\n{selected_cols}')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score

# Restrict the features to the RFE-selected columns and hold out a third of
# the rows for evaluation (fixed seed for a repeatable split).
X = df_le[selected_cols]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Fit a linear SVR on the training rows and predict the held-out rows.
clf = SVR(kernel='linear').fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Cell output: explained variance of the predictions on the held-out rows.
explained_variance_score(y_test, y_pred)

In [None]:
# Plot actual vs. predicted posttest scores for the held-out rows, ordered by
# pretest score.
# RFE picks the feature columns from the data, so 'pretest' is not guaranteed
# to be among them; fall back to index order instead of raising a KeyError.
if 'pretest' in X_test.columns:
    x_sorted = X_test.sort_values(by=['pretest'])
else:
    x_sorted = X_test.sort_index()

fig, ax = plt.subplots(figsize=(16,7))
x_predictions = pd.Series(clf.predict(x_sorted))
# Select the targets by label so they line up row-for-row with x_sorted.
y_actuals = y_test.loc[x_sorted.index]
# Drop the labels when plotting so both lines share the same 0..n-1 x-axis.
y_actuals.reset_index(drop=True).plot(ax=ax, linewidth=4)
x_predictions.plot(ax=ax, linewidth=.8)
ax.legend(['actual', 'predicted'])
ax.set_title('Plot of predictions made with the RFE-selected features')
# Render explicitly so the cell does not end on the Legend object's repr.
plt.show()

In [None]:
# Remember the index labels of the sorted test rows from the previous cell.
# NOTE(review): pretest_indices is not referenced again in the visible notebook — confirm it is still needed.
pretest_indices = x_sorted.index

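That was too easy: with `pretest` and `test_diff` available as features, the model can reconstruct the target exactly (`posttest = pretest + test_diff`). Let's drop those columns and see how well we can do without them.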

In [None]:
# Rebuild the feature matrix without any score-derived columns, then repeat
# the same split / fit / evaluate steps as before.
score_cols = ['test_diff', 'pretest', 'posttest']
X = df_le.drop(columns=score_cols)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

clf = SVR(kernel='linear').fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Cell output: explained variance of the predictions on the held-out rows.
explained_variance_score(y_test, y_pred)

In [None]:
# Order the held-out rows by index label and compare the model's predictions
# with the true targets for those rows.
x_sorted = X_test.sort_index()
x_predictions = pd.Series(clf.predict(x_sorted))
y_actuals = y_test.loc[x_sorted.index]

fig, ax = plt.subplots(figsize=(16, 7))
# Drop the labels when plotting so both lines share the same 0..n-1 x-axis.
y_actuals.reset_index(drop=True).plot(ax=ax, linewidth=4)
x_predictions.plot(ax=ax, linewidth=.8)
ax.legend(['actual', 'predicted'])
ax.set_title('Plot of predictions made with minimal data')
plt.show()

In [None]:
# Fit one linear SVR per school on the non-score features and compare its
# predictions with the actual posttest scores on a 15% hold-out, one subplot
# per school. The explained variance for each school is collected in
# `school_pred_acc` (school name -> score).
score_cols = ['test_diff', 'pretest', 'posttest']
# School names in order of first appearance, read straight from the raw frame.
school_names = df['school'].unique()

# Size the grid from the number of schools instead of hard-coding it.
n_cols = 3
n_rows = max(1, (len(school_names) + n_cols - 1) // n_cols)

school_pred_acc = dict()
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                         figsize=(15, 3.75 * n_rows), squeeze=False)
for i, school in enumerate(school_names):
    # df and df_le share the same index, so the mask built on df selects the
    # matching encoded rows.
    _df = df_le.loc[df['school'] == school]
    # Select features and target by name rather than by column position.
    X, y = _df.drop(columns=score_cols), _df['posttest']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

    clf = SVR(kernel='linear')
    clf.fit(X_train, y_train)

    # Predict once and reuse the result for both the score and the plot.
    y_pred = clf.predict(X_test)
    school_pred_acc[school] = explained_variance_score(y_test, y_pred)

    ax = axes[i // n_cols, i % n_cols]
    # Order both series by row label, then drop the labels so they share the
    # same 0..n-1 x-axis.
    y_actuals = y_test.sort_index().reset_index(drop=True)
    x_predictions = pd.Series(y_pred, index=X_test.index).sort_index().reset_index(drop=True)
    y_actuals.plot(ax=ax, linewidth=3)
    x_predictions.plot(ax=ax, linewidth=2)
    ax.legend(['actual', 'predicted'])
    ax.set_title(school)

fig.suptitle('School comparison of predicted vs actual scores', fontsize='xx-large')
fig.tight_layout()
fig.subplots_adjust(top=.95)
# Remove whichever trailing grid cells did not receive a school.
for unused_ax in axes.ravel()[len(school_names):]:
    fig.delaxes(unused_ax)
plt.show()

In [None]:
# Bar chart of the per-school explained-variance scores collected above.
school_scores = pd.Series(school_pred_acc)

fig, ax = plt.subplots(figsize=(15, 5))
# Draw on the freshly created axes explicitly instead of the implicit current axes.
school_scores.plot.bar(ax=ax)
ax.set_title('Prediction accuracy by school')
plt.show()