In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Loading the data: read the student test-score CSV from the Kaggle input directory
df = pd.read_csv('/kaggle/input/predict-test-scores-of-students/test_scores.csv')
# Preview the first rows to check that the file was parsed as expected
df.head()

In [None]:
# Summary statistics of the loaded data
df.describe()

In [None]:
# The student id is only an identifier, so it is removed before modelling
df = df.drop(columns=['student_id'])

In [None]:
# Checking for missing data: total number of missing values over all columns
df.isna().sum().sum()

In [None]:
# To see the impact of the categorical features, we check how the mean
# improvement (posttest minus pretest) depends on each categorical value.
# `assign` returns a new frame, so `df` itself is left untouched.
df_withdiff = df.assign(Difference=df.posttest - df.pretest)

In [None]:
# Checking categorical data: mean scores per school.
# numeric_only=True keeps the remaining string-valued columns out of the mean;
# pandas >= 2.0 raises a TypeError for them instead of silently dropping them.
df_withdiff.groupby('school').mean(numeric_only=True)

In [None]:
# Mean scores per school setting.
# numeric_only=True keeps the remaining string-valued columns out of the mean;
# pandas >= 2.0 raises a TypeError for them instead of silently dropping them.
df_withdiff.groupby('school_setting').mean(numeric_only=True)

In [None]:
# Mean scores per school type.
# numeric_only=True keeps the remaining string-valued columns out of the mean;
# pandas >= 2.0 raises a TypeError for them instead of silently dropping them.
df_withdiff.groupby('school_type').mean(numeric_only=True)

In [None]:
# Mean scores per teaching method.
# numeric_only=True keeps the remaining string-valued columns out of the mean;
# pandas >= 2.0 raises a TypeError for them instead of silently dropping them.
df_withdiff.groupby('teaching_method').mean(numeric_only=True)

In [None]:
# Mean scores per gender.
# numeric_only=True keeps the remaining string-valued columns out of the mean;
# pandas >= 2.0 raises a TypeError for them instead of silently dropping them.
df_withdiff.groupby('gender').mean(numeric_only=True)

In [None]:
# Mean scores per lunch category.
# numeric_only=True keeps the remaining string-valued columns out of the mean;
# pandas >= 2.0 raises a TypeError for them instead of silently dropping them.
df_withdiff.groupby('lunch').mean(numeric_only=True)

Of the categorical features, the teaching method appears to have the largest effect. We plot the posttest scores against the pretest scores, colored by teaching method, to illustrate the significance of both the pretest score and the teaching method.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One fitted regression line per teaching method; ci=None disables the confidence band
sns.lmplot(data=df, x='pretest', y='posttest', hue='teaching_method', ci=None)
plt.show()


In [None]:
# Converting the categorical data to dummy / one-hot encoded columns
# (drop_first=True is passed so the first level of each feature is dropped)
df = pd.get_dummies(data=df, drop_first=True)

In [None]:
# Creating the target vector y (posttest score) and the feature matrix X (everything else)
y = df['posttest']
X = df.drop(columns='posttest')
print(f'There are {len(X)} points in the dataset')

In [None]:
# Setting up the train and the test data. 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.4,      # 40 % of the rows are held out for testing
    random_state=17,    # fixed seed for the split
)
print(f'There are {len(X_train)} points in the training dataset and {len(X_test)} points in the test set')

In [None]:
# Scale the data based on the training set
from sklearn.preprocessing import StandardScaler
# The scaler is fitted on the training rows only; the same transform is then applied to both sets
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
def gridsearch_cv_scores(models, params, scoring, cross_validation_folds, X_train, X_test, y_train, y_test,
                         n_jobs=-1, verbose=2, random_state=17):
    """Run a cross-validated grid search per model and score each result on the test data.

    Parameters
    ----------
    models : dict
        Maps a model name to an unfitted estimator.
    params : dict
        Maps the same model names to the ``param_grid`` searched for that model.
    scoring : str
        Scorer name handed to ``GridSearchCV``; it is also used as the column
        name of the returned score table.
    cross_validation_folds
        Passed to ``GridSearchCV`` as ``cv``.
    X_train, y_train
        Data every grid search is fitted on.
    X_test, y_test
        Data each fitted grid search is scored on.
    n_jobs : int, default -1
        Passed to ``GridSearchCV``.
    verbose : int, default 2
        Passed to ``GridSearchCV``.
    random_state : int or None, default 17
        Seed for the global NumPy generator and for every estimator that has
        its own ``random_state`` parameter left at ``None``. An estimator whose
        ``random_state`` was already set by the caller keeps its value.
        ``None`` disables the estimator seeding.

    Returns
    -------
    model_gs_scores : pandas.DataFrame
        One row per model name and a single column named ``scoring`` holding
        the test-data score, sorted in ascending order.
    model_gs_best_param : dict
        Maps each model name to the ``best_params_`` of its grid search.
    """
    from copy import deepcopy  # local import keeps this cell self-contained

    model_gs_scores = {}
    model_gs_best_param = {}

    for name, model in models.items():
        # Seeds the global NumPy generator of this process.
        np.random.seed(random_state)

        # Worker processes used when n_jobs != 1 are not guaranteed to share the
        # global generator state seeded above, so the seed is also given to the
        # estimator itself. A copy is seeded to leave the caller's object untouched.
        own_params = model.get_params(deep=False)
        if (random_state is not None
                and 'random_state' in own_params
                and own_params['random_state'] is None):
            model = deepcopy(model)
            model.set_params(random_state=random_state)

        gs_model = GridSearchCV(model,
                                param_grid=params[name],
                                scoring=scoring,
                                n_jobs=n_jobs,
                                cv=cross_validation_folds,
                                verbose=verbose)

        gs_model.fit(X_train, y_train)

        # The reported score comes from the test data, not from the CV folds.
        model_gs_scores[name] = gs_model.score(X_test, y_test)
        model_gs_best_param[name] = gs_model.best_params_

    # One row per model; ascending sort puts the highest score last.
    model_gs_scores = pd.DataFrame(model_gs_scores, index=[scoring])
    model_gs_scores = model_gs_scores.transpose().sort_values(scoring)

    return model_gs_scores, model_gs_best_param

In [None]:
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Setting the scope for the Grid Search.
# Parameter names follow scikit-learn >= 1.0: 'squared_error' / 'absolute_error'
# replace the 'mse' / 'ls' and 'mae' / 'lad' spellings, which were removed in 1.2.
# Ridge's 'normalize' option was removed in 1.2 as well; the features have
# already been standardized with StandardScaler, so it is not searched.
models = {'Ridge': Ridge(),
          'KNeighborsRegressor': KNeighborsRegressor(),
          'RandomForestRegressor': RandomForestRegressor(),
          'GradientBoostingRegressor': GradientBoostingRegressor()}

params = {'Ridge': {'alpha': np.linspace(0, 1, 20)},
          'KNeighborsRegressor': {'n_neighbors': [1, 2, 5, 10, 20]},
          'RandomForestRegressor': {'n_estimators': [50, 200],
                                    'criterion': ['squared_error', 'absolute_error'],
                                    'oob_score': [True, False]},
          'GradientBoostingRegressor': {'criterion': ['squared_error', 'friedman_mse'],
                                        'loss': ['squared_error', 'absolute_error', 'huber', 'quantile']}
          }
cross_validation_folds = 3
# scikit-learn scorers are maximized, hence the negated RMSE
scoring = 'neg_root_mean_squared_error'

#### Performing a grid search for the provided models and parameters, with 3-fold cross-validation. The scoring function is the negative RMSE (`neg_root_mean_squared_error`), so scores closer to zero are better.

In [None]:
# Search every model/grid pair on the full feature set
model_gs_scores, model_gs_best_param = gridsearch_cv_scores(
    models=models,
    params=params,
    scoring=scoring,
    cross_validation_folds=cross_validation_folds,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

### Summary of best results for each model type:

In [None]:
# Temporarily format floats with three decimals while printing the score table
with pd.option_context('display.float_format', '{:,.3f}'.format):
    print(model_gs_scores)

In [None]:
# The score table is sorted in ascending order of the score, so the last rows
# hold the highest scores; with the negated RMSE scorer that is the smallest error.
print(f'The best model is the {model_gs_scores.index[-1]} model')
print(f'The best parameters are {model_gs_best_param[model_gs_scores.index[-1]]}')
print()
print(f'The second best model is the {model_gs_scores.index[-2]} model')
print(f'The second best parameters are {model_gs_best_param[model_gs_scores.index[-2]]}')

### Doing a finer search for the Ridge model

In [None]:
# Finer alpha grid for Ridge only.
# The 'normalize' option is not set: it was removed from Ridge in scikit-learn 1.2,
# and the earlier grid only pinned it to False here (the features are already standardized).
models = {'Ridge': Ridge()
          }

params = {'Ridge': {'alpha': np.linspace(0.9, 1.0, 40)}
          }
cross_validation_folds = 3
scoring = 'neg_root_mean_squared_error'

In [None]:
# Re-run the search with the finer Ridge grid on the full feature set
model_gs_scores, model_gs_best_param = gridsearch_cv_scores(
    models=models,
    params=params,
    scoring=scoring,
    cross_validation_folds=cross_validation_folds,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
# Temporarily format floats with three decimals while printing the score table
with pd.option_context('display.float_format', '{:,.3f}'.format):
    print(model_gs_scores)

In [None]:
# The score table is sorted in ascending order, so the last row is the highest score
print(f'The best model is the {model_gs_scores.index[-1]} model')
print(f'The best parameters are {model_gs_best_param[model_gs_scores.index[-1]]}')



### Analysis
Intuitively, the pre test score should be a reasonably good predictor for the post test result. What would the results be with only the pretest values as input?

In [None]:
# Setting up data with only the pretest feature (same split size and seed as the full-feature run)
Xpre = df[['pretest']]
Xpre_train, Xpre_test, ypre_train, ypre_test = train_test_split(
    Xpre, y,
    test_size=0.4,
    random_state=17,
)
# Fit the scaler on the training rows only, then apply it to both sets
prescaler = StandardScaler().fit(Xpre_train)
Xpre_train = prescaler.transform(Xpre_train)
Xpre_test = prescaler.transform(Xpre_test)

In [None]:
# Setting the scope for the Grid Search (pretest-only run).
# Parameter names follow scikit-learn >= 1.0: 'squared_error' / 'absolute_error'
# replace the 'mse' / 'ls' and 'mae' / 'lad' spellings, which were removed in 1.2.
# Ridge's 'normalize' option was removed in 1.2 as well; the feature has
# already been standardized with StandardScaler, so it is not searched.
models = {'Ridge': Ridge(),
          'KNeighborsRegressor': KNeighborsRegressor(),
          'RandomForestRegressor': RandomForestRegressor(),
          'GradientBoostingRegressor': GradientBoostingRegressor()}

params = {'Ridge': {'alpha': np.linspace(0, 1, 20)},
          'KNeighborsRegressor': {'n_neighbors': [1, 2, 5, 10, 20]},
          'RandomForestRegressor': {'n_estimators': [50, 100, 200],
                                    'criterion': ['squared_error', 'absolute_error'],
                                    'oob_score': [True, False]},
          'GradientBoostingRegressor': {'criterion': ['squared_error', 'friedman_mse'],
                                        'loss': ['squared_error', 'absolute_error', 'huber', 'quantile']}
          }
cross_validation_folds = 5
# scikit-learn scorers are maximized, hence the negated RMSE
scoring = 'neg_root_mean_squared_error'

In [None]:
# Search every model/grid pair using only the pretest feature
model_gs_scores, model_gs_best_param = gridsearch_cv_scores(
    models=models,
    params=params,
    scoring=scoring,
    cross_validation_folds=cross_validation_folds,
    X_train=Xpre_train,
    X_test=Xpre_test,
    y_train=ypre_train,
    y_test=ypre_test,
)

In [None]:
# Temporarily format floats with three decimals while printing the score table
with pd.option_context('display.float_format', '{:,.3f}'.format):
    print(model_gs_scores)

### Conclusion
We did get reasonably good results for predicting the post course test results from the data. For this dataset with 2000 points, we tried some simple ML models and a small Grid search. The best model was the Ridge model, and the RMSE was 2.9. The standard deviation of the post test scores was 14.0.

As expected, the pre test result is an important feature: running with only this feature results in an RMSE of 4.4.

Of the categorical features, the teaching method seems to be the most important.