# Lab 05 - Extended Exercises on Model Evaluation
## Predicting student performance

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, KFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler,  OneHotEncoder

# Directory holding the lab datasets, given relative to the notebook's working directory
DATA_DIR = "./../../data/"

In [None]:
import requests

# SECURITY NOTE: this cell downloads Python source from a remote server and
# executes it inside the kernel. The code is neither pinned nor verified, so
# only run this cell if you trust the host.
send_code_response = requests.get(
    "https://courdier.pythonanywhere.com/get-send-code",
    timeout=10,  # do not hang the kernel forever if the server is unreachable
)
# Fail loudly on an HTTP error instead of exec-ing an error page as Python.
send_code_response.raise_for_status()
exec(send_code_response.content)

# Session settings; the sender name is asked interactively.
npt_config = {
    'session_name': 'lab-05',
    'session_owner': 'mlbd',
    'sender_name': input("Your name: "),
}

## Introduction
The data has already been cleaned. It comes from 29 students, split into 3 different groups, who followed a 26-week course.

You already used this data in week 03. 

In this lab you will explore different models to predict the quiz grade. 

In [None]:
# Read the gzip-compressed grades table from the data directory
grades_path = f'{DATA_DIR}grades_in_time.csv.gz'
df = pd.read_csv(grades_path)

# Preview the first rows
df.head()

In [None]:
# Summary statistics for every column (include='all' keeps non-numeric columns too)
df.describe(include='all')

# 

## Task 1: Predict the quiz grade using the studying hours and the group
----------
### 1.1 Split the data. 80% to train and the rest to test. 


In [None]:
# Features: studying hours plus the (categorical) group; target: the quiz grade
feature_cols = ['studying_hours', 'group']
X = df[feature_cols]
y = df['quiz_grade']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### 1.2 Preprocess the data
Recall that group is a categorical feature.

Hint: Use ColumnTransformer.

In [None]:
# One transformer per column type: one-hot encode the group (dropping the
# first level) and rescale the studying hours with a min-max scaler.
group_encoder = OneHotEncoder(handle_unknown='ignore', drop='first')
hours_scaler = MinMaxScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', group_encoder, ['group']),
        ('numerical', hours_scaler, ['studying_hours']),
    ]
)

# Sanity check: display the transformed training matrix
preprocessor.fit_transform(X_train)

### 1.3 Create a pipeline (including the preprocessing steps) to predict the quiz grade using the studying hours and the group.

1. Use the model ElasticNet for the regression task.
2. Calculate the mean squared error of the prediction. 


Hint: Integrate the ColumnTransformer as a pipeline step

In [None]:
# Chain the preprocessing with an ElasticNet regressor (default hyperparameters)
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', ElasticNet()),
])

# Train on the training split, then predict the held-out split
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Report the test-set mean squared error, rounded to 3 decimals
error = round(mean_squared_error(y_test, y_pred), 3)
print(f"Mean Squared Error = {error}")

### 1.4 Compute the cross validation score

In [None]:
# Fit a pipeline with transformers and an estimator to the training data
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', ElasticNet())
])

(-1)*np.mean(cross_val_score(pipe, X, y, cv = 5, scoring = 'neg_mean_squared_error'))

### 1.5 Does the score in 1.3 differ from the score in 1.4? Why? 

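Answer = Yes. The score in 1.3 comes from a single train/test split (one fold), so it depends on which rows happened to end up in the test set, whereas the score in 1.4 is the average over multiple folds.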

### 1.6 What is wrong with the data split?
Answer: In some instances, we are using data from future weeks to predict the grade in earlier weeks. This is wrong because it does not reflect how the model would be used in practice.

In [None]:
# Show which (week, student) rows ended up in the test set.
# X_test.index holds index *labels*, so select with .loc (label-based) rather
# than .iloc (position-based); the two only coincide for a default RangeIndex.
df.loc[X_test.index, ['week', 'student']]

## Task 2: Time Validation

### 2.1 Train with the first 25 weeks and predict week 26.

Hint: You may re-use your pipeline

In [None]:
# Chronological split: every week before 26 is for training, week 26 is held out
before_last_week = df['week'] < 26
is_last_week = df['week'] == 26

df_train = df[before_last_week]
df_test = df[is_last_week]

In [None]:
# Columns handed to the pipeline and the column to predict.
# NOTE(review): the preprocessor defined earlier only lists 'group' and
# 'studying_hours', so 'week' is presumably ignored by the model
# (ColumnTransformer drops unlisted columns by default) — confirm this is intended.
time_features = ['studying_hours', 'group', 'week']
target_col = 'quiz_grade'

X_train, y_train = df_train[time_features], df_train[target_col]
X_test, y_test = df_test[time_features], df_test[target_col]

In [None]:
# Refit the existing pipeline on the weeks before 26 and predict week 26
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Mean squared error on week 26, rounded to 3 decimals
error = round(mean_squared_error(y_test, y_pred), 3)
print(f"Mean Squared Error = {error}")

### 2.2 Time splits
Would the model also be able to predict week 16 from all the previous weeks? 

What about week 5 from the previous weeks?

Create all the data splits so that the model predicts the next week given the information from the previous weeks. 


In [None]:
# One (train, test) pair per target week i in 4..26:
# train on every row from an earlier week, test on the rows of week i.
time_splits = []
for i in range(4, 27):
    train_idx = list(df.query('week < @i').index)
    test_idx = list(df.query('week == @i').index)
    time_splits.append((train_idx, test_idx))

### 2.3 Using the previously created splits, calculate the cross validation score

In [None]:
# Back to the full data set; the folds are given by the time-based splits
feature_cols = ['studying_hours', 'group']
X = df[feature_cols]
y = df['quiz_grade']

# One negated MSE per split, i.e. per predicted week
neg_mse_per_split = cross_val_score(
    pipe,
    X,
    y,
    cv=time_splits,
    scoring='neg_mean_squared_error',
)
errors = -neg_mse_per_split

# Average error over all time splits
np.mean(errors)

### 2.4 How does the error differ from the error of 2.1? Why?
Answer = It is much higher. The following plot shows that the more weeks are available for training (more information), the smaller the error.

In [None]:
# Error of each time split against the week being predicted (4..26, the same
# range that was used to build the splits). Labels make the figure readable
# on its own; the trailing ';' hides the textual repr of the return value.
ax = sns.lineplot(x=list(range(4, 27)), y=errors)
ax.set(
    xlabel='Predicted week',
    ylabel='Mean squared error',
    title='Validation error per predicted week',
);

## Task 3: Nested cross-validation

Now imagine we want to optimize the hyperparameters for the model.

We will "ignore" time for now and take the mean studying hours and quiz grade. 


In [None]:
# Collapse the weekly records into one row per student.
by_student = df.groupby('student')

# Average only the numeric columns: calling mean() on a non-numeric column
# either raises or silently drops the column, depending on the pandas version.
df_agg = by_student.mean(numeric_only=True)

# 'group' is a categorical label, not a quantity, so it must not be averaged;
# take each student's group as is.
# Assumes a student stays in the same group for the whole course — TODO confirm.
df_agg['group'] = by_student['group'].first()

X = df_agg[['studying_hours', 'group']]
y = df_agg['quiz_grade']

### 3.1 Gridsearch with cross validation

ElasticNet has two interesting parameters: alpha and l1_ratio.

Run a GridSearch to explore the following values:
* alpha = 0.1 and 1
* l1_ratio = 0.1, 0.5 and 1

What is the best score (smallest error)? 

In [None]:
# Candidate ElasticNet settings; the 'model__' prefix targets the pipeline's 'model' step
param_grid = {
    'model__alpha': [0.1, 1],
    'model__l1_ratio': [0.1, 0.5, 1],
}

# Shuffled 4-fold split with a fixed seed so the search is reproducible
search_cv = KFold(n_splits=4, shuffle=True, random_state=123)

search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=search_cv,
)
search.fit(X, y)

In [None]:
# Tabulate the grid-search results as a DataFrame for inspection
pd.DataFrame(search.cv_results_)

In [None]:
# The search scored with negated MSE, so negate the best score to report the error
-search.best_score_

### 3.2 Why is the error from the best model in 3.1 biased?

Answer = We are using the same data to tune model parameters and evaluate model performance.

### 3.3 Improve 3.1 to have an unbiased estimation of the generalization error

Hint: Use nested cross-validation

In [None]:
# Same hyperparameter grid as in 3.1
param_grid = {
    'model__alpha': [0.1, 1],
    'model__l1_ratio': [0.1, 0.5, 1],
}

# Inner folds pick the hyperparameters, outer folds estimate the error
inner_cv = KFold(n_splits=4, shuffle=True, random_state=123)
outer_cv = KFold(n_splits=3, shuffle=True, random_state=123)

search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=inner_cv,
)

# No `scoring` is passed here, so cross_val_score falls back on the search
# object's own score method, which uses the scorer the search was built with
# (negated MSE); hence the sign flip below.
outer_scores = cross_val_score(search, X=X, y=y, cv=outer_cv)
errors = -outer_scores

np.mean(errors)

In [None]:
# Show the individual errors of the outer folds (their mean is reported in the previous cell)
errors