In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

plt.style.use('fivethirtyeight')

In [None]:
## Importing and viewing the data
test_data = pd.read_csv('/kaggle/input/predict-test-scores-of-students/test_scores.csv')
test_data.head()

In [None]:
# Clean the data by removing columns that are not used in the analysis.
# A non-mutating drop with errors = 'ignore' keeps this cell idempotent: the original
# in-place drop raised KeyError when the cell was run a second time, because the
# columns had already been removed from the frame.
test_data = test_data.drop(columns = ['classroom', 'student_id'], errors = 'ignore')
test_data.head()

In [None]:
# Conduct checks on the data
len(test_data)

In [None]:
test_data.dtypes

In [None]:
test_data.info()

In [None]:
# Check for missing data
test_data.isna().sum()

In [None]:
# Run checks on the set of values in the categorical variables
# The number of different schools in the data
len(set(test_data['school']))

In [None]:
# Check the options for the categorical variables
def categorical_options(categories, data = None):
    '''
    Collect the distinct values found in each categorical column.

    Parameters
    ----------
    categories : iterable of str
        Names of the columns to inspect.
    data : pandas.DataFrame, optional
        Frame to read the columns from. When omitted, the notebook-level
        `test_data` is used, which is what the original one-argument call did.

    Returns
    -------
    dict
        Maps each column name to a list of its distinct values, in order of
        first appearance. (The original built the list from a `set`, whose
        ordering is arbitrary.)
    '''
    if data is None:
        data = test_data

    return {cat: data[cat].unique().tolist() for cat in categories}
        
categories = ['school_setting', 'school_type', 'teaching_method', 'gender', 'lunch']
categorical_options(categories)

## An exploratory data analysis on the data

In [None]:
# Extract pretest and posttest scores into numpy arrays
pretest = test_data['pretest'].values
posttest = test_data['posttest'].values

In [None]:
# Average pretest and posttest scores
avg_pre = np.mean(pretest)
avg_post = np.mean(posttest)
avg_pre, avg_post

In [None]:
# Number of students who performed better than average in both tests
np.sum(pretest > avg_pre), np.sum(posttest > avg_post)

In [None]:
# Highest pretest score
np.max(pretest)

In [None]:
# How many students obtained the highest pretest score?
# Compare against the computed maximum rather than a hard-coded 93, so the count
# stays correct if the underlying data changes.
np.sum(pretest == np.max(pretest))

In [None]:
# Which student obtained the highest pretest? What are his/her characteristics?
test_data.iloc[np.argmax(pretest), :]

In [None]:
# Highest score in the posttest
np.max(posttest)

In [None]:
# How many students obtained the highest posttest score?
# Compare against the computed maximum rather than a hard-coded 100.0, so the count
# stays correct if the underlying data changes.
np.sum(posttest == np.max(posttest))

In [None]:
# Look into the students who obtained the highest posttest score.
# The maximum is computed from the column instead of being hard-coded as 100.
test_data[test_data.posttest == test_data.posttest.max()]

Interestingly, they share the same features except for gender (2 females, 6 males) and their pretest scores.

In [None]:
# Are there students who did not improve upon their pretest scores?
np.any(pretest >= posttest)

In [None]:
# How many of such students are there?
np.sum(pretest >= posttest)

### Visualising the categorical variables

In [None]:
# Bar chart on school setting
sns.countplot(x= test_data.school_setting);

In [None]:
# Bar chart on school type
sns.countplot(x = test_data.school_type);

In [None]:
# Plot of teaching method
sns.countplot(x = test_data.teaching_method);

In [None]:
# Bar chart on gender
sns.countplot(x = test_data.gender);

In [None]:
# Bar chart on lunch
sns.countplot(x = test_data.lunch);

## Analysing the continuous variables (Pretest and Posttest)

In [None]:
# Run descriptive statistics on the continuous variables
test_data.describe()

In [None]:
# Visualise the pretest scores with a box plot and a histogram.
# Matplotlib 3.6 renamed the bundled seaborn style sheets with a 'seaborn-v0_8-' prefix
# and the old names were later removed, so use whichever name this installation provides.
darkgrid_style = 'seaborn-v0_8-darkgrid' if 'seaborn-v0_8-darkgrid' in plt.style.available else 'seaborn-darkgrid'
plt.style.use(darkgrid_style)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (12, 6))

sns.boxplot(x = test_data.pretest, ax = ax1, linewidth=1.5)
sns.histplot(x = test_data['pretest'], ax = ax2, kde = True)

ax1.tick_params(labelsize = 14)
ax2.tick_params(labelsize = 14)

In [None]:
# Visualise the posttest scores: a box plot split by gender (left axis)
# and a histogram with a KDE curve of all posttest scores (right axis)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (12, 6))

sns.boxplot(y = test_data.posttest, x = test_data.gender, ax = ax1, linewidth=1.5)
sns.histplot(x = test_data['posttest'], ax = ax2, kde = True)
ax1.tick_params(labelsize = 14)
ax2.tick_params(labelsize = 14)

In [None]:
# A quick plot of the pretest and posttest columns on the same axes
test_data[['pretest', 'posttest']].plot(figsize = (16, 9), linewidth = 1)

### Finding correlation between pre-test and post-test scores

In [None]:
# Is there a correlation between pretest and posttest scores?
# Visualise it by plotting the pretest scores against the posttest scores as a scatter of points

plt.style.use('fivethirtyeight')

fig, ax = plt.subplots(figsize = (12, 7))

# The 'o' format string draws markers only, which turns the line plot into a scatter
ax.plot('pretest', 'posttest', 'o', data = test_data, color = 'maroon', markersize = 5)
# (disabled) optional reference line; x and y are not defined in this cell:
# ax.plot(x, y, ls = '--', lw = 1.5, color = 'black')

# Add a title to the plot
fig.suptitle('A plot of pretest against posttest scores', fontweight = 'bold', fontsize = 18, color = 'maroon')

# Customise the x and y labels
ax.set_xlabel('Pretest scores', fontsize = 16, fontweight = 'bold', color = 'firebrick')
ax.set_ylabel('Posttest scores', fontsize = 16, fontweight = 'bold', color = 'firebrick')

# (disabled) optional horizontal and vertical lines at the mean posttest / pretest score:
# ax.axhline(test_data['posttest'].mean(), ls = '--', linewidth = 1.5)
# ax.axvline(test_data['pretest'].mean(), ls = '--', linewidth = 1.5, color = 'green')

ax.tick_params(labelsize = 14, labelcolor = 'orangered')
plt.show()

From the graph, it can clearly be seen that there exists a strong positive correlation between the pretest and posttest scores.

In [None]:
# Import SciPy together with its stats submodule.
# `import scipy as sp` on its own does not guarantee that `sp.stats` is loaded
# (older SciPy releases raise AttributeError), so the submodule is imported explicitly.
import scipy as sp
import scipy.stats

In [None]:
# Spearman's rank correlation between the two sets of scores
spearman_result = sp.stats.spearmanr(pretest, posttest)
print(spearman_result)

# Pearson's correlation between the two sets of scores:
# element 0 is the coefficient, element 1 is the p-value
pearsonr = sp.stats.pearsonr(pretest, posttest)
print('Pearsonr correlaton =', pearsonr[0], 'P-value =', pearsonr[1])

**Interpretation:** Both correlations show that there exists a statistically significant (p < 0.05) **strong positive correlation** between the two scores.

The Spearman's correlation specifically implies that there is a strong positive correlation when the students are ranked according to their performances in both tests. Thus students who ranked high in the pretest most likely ranked high in the post-test and vice-versa.

### Inferential statistics on the continuous variables
#### Perform paired sample t-test

**Test for normality in the scores**

In [None]:
# H0: The scores are normally distributed
# H1: The scores are not normally distributed

# check for normality on pretest scores
sp.stats.shapiro(test_data.pretest)

In [None]:
# H0: The scores are normally distributed
# H1: The scores are not normally distributed

# Test for normality from the posttest scores
sp.stats.shapiro(test_data.posttest)

**Test for homogeneity of variances**

Not necessary for paired samples because we are dealing with the same group.

In [None]:
# H0: The scores have equal variances
# H1: The scores do not have equal variances

sp.stats.levene(pretest, posttest)

**Interpretation:** The null hypothesis is upheld because p-value > 0.05 (Not statistically significant)

The tests for normality suggest that both scores fail the normality test profoundly. Thus, the null hypothesis of normality is rejected for both scores because they have significant p-values (p-value < 0.05).

*In view of this, t-test is not advisable but we will try it anyway.*

**Performing the paired sample T-test**

In [None]:
# H0: There is no difference between the means of pre- and post-test scores
# H1: There is a difference between the means of pre- and post-test scores

# Performing the paired sample t-test
sp.stats.ttest_rel(pretest, posttest)

**The null hypothesis is therefore rejected because p-value = 0.0[<0.05] (statistically significant).**

Hence, we accept the alternative hypothesis that there is a difference between the means of the pretest and posttest scores.

In [None]:
# A more appropriate test here is the Wilcoxon signed-rank test, which does not assume normality

# H0: The paired differences between pre- and post-test scores are symmetric about zero
# H1: The paired differences between pre- and post-test scores are not symmetric about zero
# NOTE(review): the Wilcoxon test is a statement about the distribution of the paired
# differences, not about the means - confirm the wording of the interpretation below.

sp.stats.wilcoxon(test_data.pretest, test_data.posttest)

**Like the T-test, the null hypothesis is therefore rejected because p-value = 0.0 [< 0.05] (statistically significant).**

#### Performing independent sample T-test
* For gender on pretest and posttest scores

In [None]:
# Examine the average performance on post_test and pre-test based on gender
test_data.groupby(['gender'])[['pretest', 'posttest']].mean()

In [None]:
# Independent Sample T-test between male and female on pre-test scores
male_pretest = test_data[test_data.gender == 'Male']['pretest']
fem_pretest = test_data[test_data.gender == 'Female']['pretest']

In [None]:
# Test for normality in the male and female pretest scores

# H0: The scores are normally distributed
# H1: The scores are not normally distributed

print('Male_pretest:',sp.stats.shapiro(male_pretest))
print('Female_pretest:',sp.stats.shapiro(fem_pretest))

**Interpretation:** Both the male and female pretest scores grossly failed the normality test. They are not normally distributed. This is evidenced by their p-values being < 0.05, implying statistical significance. As a result, the null hypothesis is rejected in favour of the alternative hypothesis.

In [None]:
# Test for homogeneity of variance of pretest scores between males and females
# H0: The scores have equal variances
# H1: The scores do not have equal variances
sp.stats.levene(male_pretest, fem_pretest)

**Implication:** The null hypothesis is rejected because p-value < 0.05 (statistically significant). Hence, the H1 is rather accepted.

With the data failing the normality and homogeneity tests, the Independent Sample T-test is not appropriate here. However, we'll use it anyway for the sake of practice.

In [None]:
# Independent Sample T-test between male and female on pre-test scores
# H0: There is no difference between the mean pre-test scores for both genders
# H1: There is a difference between the mean pre-test scores for both genders
# NOTE(review): called without equal_var, so SciPy's default applies; since the Levene
# test above is read as rejecting equal variances, consider equal_var = False (Welch's
# t-test) - that would change the p-value quoted in the interpretation below.

sp.stats.ttest_ind(male_pretest, fem_pretest)

**Implication:** With p-value of the statistic >0.05 (0.7801)[not statistically significant], the null hypothesis is upheld.

Thus, there is no statistically significant difference between the mean pre-test scores for males and females.

In [None]:
# The Mann-Whitney U test is more appropriate here because the pretest scores of both
# genders failed the normality and homogeneity-of-variance checks above
sp.stats.mannwhitneyu(male_pretest, fem_pretest)

**Similar to the independent sample t-test performed above, the null hypothesis is upheld because the p-value > 0.05 (0.399). Thus not statistically significant.**

**Conducting Independent Sample T test on gender for post-test scores**

In [None]:
# Posttest scores of each gender, selected with a boolean mask on the gender column
male_posttest = test_data.loc[test_data['gender'].eq('Male'), 'posttest']
female_posttest = test_data.loc[test_data['gender'].eq('Female'), 'posttest']

In [None]:
# Test of normality in both gender
print(sp.stats.shapiro(male_posttest))
print(sp.stats.shapiro(female_posttest))

>The posttest scores for both genders are not normally distributed. The null hypothesis is rejected in favour of the alternative because their p-values are < 0.05 (statistically significant).

In [None]:
# Test of homogeneity of variance among the 2 genders
sp.stats.levene(male_posttest, female_posttest)

The null hypothesis, that the post test scores for both gender have the same variance, is upheld. This is because p-value >0.05(0.19)

In [None]:
# Performing the MannWhitneyU Test
sp.stats.mannwhitneyu(male_posttest, female_posttest)

The null hypothesis is upheld [p-value = 0.3559] (p-value > 0.05).
This implies that there is no statistically significant difference between the posttest scores of males and females.
Note that the Mann-Whitney U test compares the two distributions (via ranks) rather than the means, and failing to reject the null hypothesis does not prove that the two groups are identical.

## A machine learning regression model to predict test scores

In [None]:
# Visualise the data again
test_data.head()

In [None]:
# Everything this cell needs from scikit-learn, grouped up front
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# The target is the posttest score; every remaining column is a feature
y = test_data['posttest']
X = test_data.drop(columns = 'posttest')

# One-hot encode the categorical columns and pass the other columns through unchanged
categories = ['school', 'school_setting', 'school_type', 'teaching_method', 'gender', 'lunch']
one_hot = OneHotEncoder()
transformer = ColumnTransformer(transformers = [('one_hot', one_hot, categories)],
                                remainder = 'passthrough')
X_transformed = transformer.fit_transform(X)

# Seed NumPy's global generator, then hold out 20% of the rows as the test set
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size = 0.2)

# Fit a Random Forest regressor with default settings on the training rows
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Score the fitted model on the held-out rows
model.score(X_test, y_test)

In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validate the model on the full encoded data with the default folds and scorer,
# then show the average of the per-fold scores
cv_score = cross_val_score(model, X_transformed, y)
cv_score.mean()

In [None]:
# Cross-validated mean absolute error.
# The 'neg_mean_absolute_error' scorer returns the error negated (so that a higher
# score is always better), so the sign is flipped to display the MAE itself.
# `cv_mae` still holds the raw (negative) per-fold scores.
cv_mae = cross_val_score(model, X_transformed, y, scoring = 'neg_mean_absolute_error')
-cv_mae.mean()

In [None]:
# Cross-validated mean squared error.
# The 'neg_mean_squared_error' scorer returns the error negated (so that a higher
# score is always better), so the sign is flipped to display the MSE itself.
# `cv_mse` still holds the raw (negative) per-fold scores.
cv_mse = cross_val_score(model, X_transformed, y, scoring = 'neg_mean_squared_error')
-cv_mse.mean()

## Hyperparameter tuning

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

def evaluation_metrics(y_true, y_preds):
    '''
    Score predictions against the true targets.

    Prints the R2 score (as a percentage), the MAE and the MSE, each to two
    decimal places, and returns the three values in a dictionary. In the
    dictionary the MAE and MSE are rounded to two decimals; the R2 score is
    stored unrounded.
    '''
    r2_value = r2_score(y_true, y_preds)
    mae_value = mean_absolute_error(y_true, y_preds)
    mse_value = mean_squared_error(y_true, y_preds)

    # Report the scores before handing them back to the caller
    for report_line in (f'R2_score: {r2_value * 100:.2f}%',
                        f'MAE: {mae_value:.2f}',
                        f'MSE: {mse_value:.2f}'):
        print(report_line)

    return {'r2_score' : r2_value,
            'mean absolute error' : round(mae_value, 2),
            'mean_squared error' : round(mse_value, 2)}

In [None]:
# Everything this cell needs from scikit-learn, grouped up front
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# The target is the posttest score; every remaining column is a feature
y = test_data['posttest']
X = test_data.drop(columns = 'posttest')

# One-hot encode the categorical columns and pass the other columns through unchanged
categories = ['school', 'school_setting', 'school_type', 'teaching_method', 'gender', 'lunch']
one_hot = OneHotEncoder()
transformer = ColumnTransformer(transformers = [('one_hot', one_hot, categories)],
                                remainder = 'passthrough')
X_transformed = transformer.fit_transform(X)

# Seed NumPy's global generator, keep 70% of the rows for training, then cut the
# remaining 30% in half to obtain the validation and test sets
np.random.seed(94)
X_train, X_val_test, y_train, y_val_test = train_test_split(X_transformed, y, test_size = 0.3)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size = 0.5)

# Fit a Random Forest regressor with default settings on the training rows
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Predict on the validation rows and keep the resulting metrics as the baseline
y_preds = model.predict(X_val)
baseline = evaluation_metrics(y_val, y_preds)

In [None]:
# Tune the Random Forest with RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder

# Split the data into X and y
X = test_data.drop('posttest', axis = 1)
y = test_data['posttest']

# Convert categorical values to numbers
categories = ['school', 'school_setting', 'school_type', 'teaching_method', 'gender', 'lunch']

one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', one_hot, categories)],
                                remainder = 'passthrough')
X_transformed = transformer.fit_transform(X)

# Split the transformed data to training, validation and test sets
# (same seed and split sizes as the baseline cell)
np.random.seed(94)

X_train, X_val_test, y_train, y_val_test = train_test_split(X_transformed, y, test_size = 0.3)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size = 0.5)

model = RandomForestRegressor(n_jobs = -1)

# Candidate hyperparameter values to sample from.
# max_features: 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where it
# makes every sampled candidate that uses it fail. For regressors it meant "use all
# features", which the float 1.0 expresses on both old and new versions.
grid = {'n_estimators' : [10, 100, 200, 500, 1000, 1200],
        'max_depth' : [None, 5, 10, 20, 30],
        'max_features' : [1.0, 'sqrt'],
        'min_samples_split' : [2, 4, 6 ],
        'min_samples_leaf' : [1, 2, 4]}

# Try 10 sampled parameter combinations, each scored with 5-fold cross-validation
rs_model = RandomizedSearchCV(estimator = model, 
                              param_distributions = grid,
                              n_iter = 10,
                              cv = 5,
                              verbose = 2)
rs_model.fit(X_train, y_train)

In [None]:
# Check the best paramters
rs_model.best_params_

In [None]:
# Make predictions with the rs_model on the validation set
rs_y_preds = rs_model.predict(X_val)

In [None]:
# Evaluate the RandomizedSearchCV
rs_metrics = evaluation_metrics(y_val, rs_y_preds)

In [None]:
# Apply the model on the test sets
rs_test_preds = rs_model.predict(X_test)

# Evaluate the model
rs_test_metrics = evaluation_metrics(y_test, rs_test_preds)