# ***Introduction***

With advances in technology and increasingly sophisticated database management tools, there has recently been growing interest in educational databases. These contain a variety of valuable information that could help less successful students improve their academic performance and help academic institutions optimize their resources to improve the overall wellbeing of their students. The objective of this task is to predict students' post-test scores from a given set of features.

### Importing libraries and dataset

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

pd.options.plotting.backend = "plotly"

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from plotly.figure_factory import create_distplot
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=True)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error,explained_variance_score
from xgboost import XGBRegressor

#tensorflow packages for bayesian NN's
#from tensorflow import keras
#from tensorflow.keras import layers
#import tensorflow_probability as tfp

import os

# Print the full path of every file found under the Kaggle input directory
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load the student test-score dataset into a DataFrame
data = pd.read_csv("/kaggle/input/predict-test-scores-of-students/test_scores.csv")

In [None]:
# Report the dimensions of the dataset
row_count, column_count = data.shape
print("The data contains", row_count, "rows and", column_count, "columns\n")
# Preview the first few rows
data.head()


# ***Exploratory Data Analysis***



In [None]:
# Summarise the columns of the dataframe
data.info()

separator = "-" * 50
print(separator)

# Report whether any column contains missing values
has_missing = data.isna().sum().any()
print("Any null values in the data:", has_missing)

print(separator)

# List the distinct values of every categorical (object-dtype) column
for col_name, column in data.items():
    if column.dtype != "object":
        continue
    print(col_name, ":", column.unique())

The data contains 8 categorical variables (dtype `object`) and 3 continuous features (dtype `float64`). The target variable is `posttest`.

In [None]:
# Build a 3x2 grid: five bar charts showing the share of students in each
# category of a feature, plus a pretest-vs-posttest scatter plot.
fig = make_subplots(
    rows=3,
    cols=2,
    subplot_titles=(
        "School Setting distribution",
        "School Type Distribution",
        'Teaching method distribution',
        'Gender distribution',
        'Lunch distribution',
        'Relationship Between Post-test and Pre-test',
    ),
)

# One entry per bar chart: (data column, x-axis title, subplot row, subplot col)
bar_panels = [
    ('school_setting', "School Setting", 1, 1),
    ('school_type', "School Type", 1, 2),
    ('teaching_method', "Teaching Method", 2, 1),
    ('gender', "Gender", 2, 2),
    ('lunch', "lunch", 3, 1),
]

for panel_column, panel_label, panel_row, panel_col in bar_panels:
    # Count each category once and convert the counts to fractions of the total
    counts = data[panel_column].value_counts()
    fig.add_trace(
        go.Bar(x=counts.index.values, y=counts.values / np.sum(counts)),
        row=panel_row,
        col=panel_col,
    )
    fig.update_xaxes(title_text=panel_label, row=panel_row, col=panel_col)
    # Fractions are rendered as whole-number percentages on the y axis
    fig.update_yaxes(title_text="Percent of total", tickformat=',.0%', row=panel_row, col=panel_col)

# Last panel: relationship between pretest and posttest scores
fig.add_trace(go.Scatter(x=data['pretest'], y=data['posttest'], mode='markers'), row=3, col=2)
fig.update_xaxes(title_text="Pretest", row=3, col=2)
fig.update_yaxes(title_text="Posttest", row=3, col=2)

# Update title and height
fig.update_layout(title_text="Ploting distribution of features", height=700)

fig.show()

The above graph shows that there is bias in this dataset.
1. The distributions of students across school settings, school types, teaching methods and lunch status are unbalanced.
2. Pre-test and post-test scores are highly correlated. We can further check this relationship within the setting of different features.

Unbalanced data may lead to poorer model predictions for some groups.

In [None]:
# Columns to compare; each one becomes a labelled series in the plot
group_labels = ['pretest', 'posttest']
hist_data = [data[label] for label in group_labels]

# Distribution plot with a custom bin size and the rug plot hidden
fig_distributions = ff.create_distplot(
    hist_data,
    group_labels,
    bin_size=.5,
    show_rug=False,
)
fig_distributions.update_layout(
    title_text="Distribution of posttest and pretest",
    height=500,
)
fig_distributions.show()

The distributions of pretest and posttest are fairly normal; we don't see any skewness.

In [None]:
def create_distplot(column_name):
    """Plot the distribution of post-test scores for each value of a column.

    Splits ``data['posttest']`` by the distinct values of
    ``data[column_name]``, draws one series per value (histogram shown,
    rug hidden) and displays the figure.

    Args:
        column_name: Name of a column in the module-level ``data`` DataFrame.
    """
    # NOTE: this function shares its name with the ``create_distplot`` imported
    # from plotly.figure_factory; the plotly version is reached via ``ff`` here.
    categories = data[column_name].unique()

    scores_by_category = []
    for category in categories:
        in_category = data[column_name] == category
        scores_by_category.append(data['posttest'][in_category].values)

    fig = ff.create_distplot(
        scores_by_category,
        categories,
        show_hist=True,
        show_rug=False,
        bin_size=.5,
    )
    fig.update_layout(
        title_text="Distribution of post-test by " + column_name,
        height=500,
    )
    fig.show()

In [None]:
# Plot the post-test distribution for every categorical feature,
# skipping the student_id and classroom columns
excluded_columns = ['student_id', 'classroom']
for col_name in data:
    if col_name in excluded_columns or data[col_name].dtype != "object":
        continue
    create_distplot(col_name)


In [None]:
# Box plot of every float64 column, used to look for outliers
fig = go.Figure()

for col_name, column in data.items():
    if column.dtype != "float64":
        continue
    fig.add_trace(go.Box(y=column.values, name=column.name))
fig.show()

# Summary EDA 

* The above distribution plots show the relationship between each categorical feature and the post-test score.
* We can see some overall top-performing schools: for example, "UKPGS" has significantly better pretest results than the others, while school "GOOBU" has significantly worse post-test results than the others. This indicates that school might be a factor in a student's performance.
* Suburban schools cover more of the higher test results, and urban schools cover more of the lower test results.
* The distributions by lunch show that students who do not qualify for free lunch have higher test results, while students with free lunch tend to have lower test results. This is counterintuitive given this study https://www.theatlantic.com/education/archive/2017/03/do-healthy-lunches-improve-student-test-scores/520272/ i.e. students who have the nutrition they need throughout the day to learn should perform better in tests.
* There are no significant differences between the post-test scores of different genders.


# ***Data preparation and Feature Transformation***

In [None]:
# Separate the target from the predictors; student_id is dropped as well
# because it is treated as having no significance to the data
y = data['posttest']
X = data.drop(columns=['posttest', 'student_id'])

# Column names routed to each branch of the preprocessing pipeline
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numeric_features = X.select_dtypes(include=['float64']).columns.tolist()

# Preprocessing: standardise numeric columns, one-hot encode categorical ones.
# handle_unknown='ignore' lets the encoder accept categories unseen during fit.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Hold out 30% of the rows for testing; fixed seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

In [None]:
def model_evaluation(y_test, y_predicted):
    """Print regression metrics for a set of predictions.

    Reports the mean absolute error, mean squared error, R2 and explained
    variance score, each rounded to three decimal places.

    Args:
        y_test: True target values.
        y_predicted: Predicted target values, aligned with ``y_test``.
    """
    mae = mean_absolute_error(y_test, y_predicted)
    mse = mean_squared_error(y_test, y_predicted)
    r2 = r2_score(y_test, y_predicted)
    exp_var_score = explained_variance_score(y_test, y_predicted)

    # All four metrics use the same rounding and "label: value" layout
    print(
        f"Mean absolute error: {round(mae, 3)}\n"
        f"Mean squared error: {round(mse, 3)}\n"
        f"R2: {round(r2, 3)}\n"
        f"Explained Variance Score: {round(exp_var_score, 3)}"
    )

In [None]:
#scatter plot to check overfitting
def check_overfitting_scatter(y_test,y_predict,model):
    """Show a scatter plot of predicted scores against true scores.

    An OLS trendline is drawn through the points.

    Args:
        y_test: True target values (plotted on the x axis).
        y_predict: Predicted target values (plotted on the y axis).
        model: Text used as the plot title.
    """
    # Plot the predictions passed in by the caller, not the global ``y_hat``
    fig = px.scatter(
        x=y_test,
        y=y_predict,
        trendline='ols',
        labels=dict(x='True scores', y='Predicted scores'),
        title=model,
    )
    fig.show()

# Linear Regression - Baseline Model

In [None]:
# Baseline model: plain linear regression on the preprocessed features,
# chosen because it is easy to understand and implement
Linear_regr = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regr', LinearRegression()),
    ]
)
Linear_regr.fit(x_train, y_train)

In [None]:
# Score of the fitted pipeline on the training split
train_score = Linear_regr.score(x_train, y_train)
print("Model score:", train_score)

# Predict the held-out test split
y_hat = Linear_regr.predict(x_test)

# Error metrics on the test split
model_evaluation(y_test, y_hat)

# Predicted vs true scores
check_overfitting_scatter(
    y_test,
    y_hat,
    'Linear Regression: Plotting predicted vs true scores',
)

# Tree Based Model - Random Forest

In [None]:
#random forest with gridsearch and cross validation
import re

import sklearn

# scikit-learn 1.0 renamed the split criteria 'mse'/'mae' to
# 'squared_error'/'absolute_error', and 1.2 removed the old names.
# Pick the spelling the installed version accepts so the grid search
# does not fail on every fit.
_sk_major_minor = tuple(
    int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
)
if _sk_major_minor >= (1, 0):
    rf_criteria = ['squared_error', 'absolute_error']
else:
    rf_criteria = ['mse', 'mae']

# Keys are prefixed with the pipeline step name ('ranfr') so GridSearchCV
# routes them to the RandomForestRegressor step
param_grid_rf = [{
    'ranfr__n_estimators': [500, 700,1000],
    'ranfr__criterion': rf_criteria,
    'ranfr__max_depth': [5, 8, 12],
    'ranfr__max_features': ['sqrt'],
    'ranfr__min_samples_leaf':[0.001, 0.003]
}]

random_f_regr = Pipeline(steps=[('preprocessor', preprocessor),('ranfr',RandomForestRegressor(random_state = 42))])

#grid search with 3-fold cross validation
random_f_cv = GridSearchCV(random_f_regr, param_grid = param_grid_rf, cv = 3, n_jobs = 1)
random_f_cv.fit(x_train, y_train)

In [None]:
# Winning hyperparameters and the score GridSearchCV recorded for them
best_params = random_f_cv.best_params_
best_score = random_f_cv.best_score_
print('Best Parameter Set: ', format(best_params))
print('Best Estimator Training Score: ', format(best_score))

# Pipeline refitted by the grid search with the best parameters
best_est_ramdom_f_cv = random_f_cv.best_estimator_

# Predict the held-out test split
y_hat = best_est_ramdom_f_cv.predict(x_test)

# Error metrics on the test split
model_evaluation(y_test, y_hat)

# Predicted vs true scores
check_overfitting_scatter(
    y_test,
    y_hat,
    'Random Forrest Regression: Plotting predicted vs true scores',
)

Random forest did not outperform linear regression. It was also highly resource intensive, which prevented me from experimenting with more hyperparameters.

# XGBOOST

In [None]:
# Gradient-boosted trees (XGBRegressor, logging silenced) on the same
# preprocessed features as the other models
xgboost = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('xgboost', XGBRegressor(verbosity=0)),
    ]
)
xgboost.fit(x_train, y_train)

In [None]:
# Score of the fitted pipeline on the training split
train_score = xgboost.score(x_train, y_train)
print("Model score:", train_score)

# Predict the held-out test split
y_hat = xgboost.predict(x_test)

# Error metrics on the test split
model_evaluation(y_test, y_hat)

# Predicted vs true scores
check_overfitting_scatter(
    y_test,
    y_hat,
    'Xgboost: Plotting predicted vs true scores',
)

Even though XGBoost is one of the most widely used algorithms in Kaggle competitions, in this task it seems to overfit the training data and underperforms compared to linear regression.

# Conclusions

All models performed well, and their scores were within acceptable ranges of accuracy. However, linear regression performed the best, with MAE ~ 2.2.

If I were to spend more time on this assignment, I would try more algorithms, such as Bayesian neural networks and AdaBoost, and further evaluate the results from linear regression to check which subsets are underperforming. I would also test the data against the Lasso, Ridge, and ElasticNet extensions of linear regression, which add a penalty term that aims to minimize complexity and/or reduce the number of features used in the final model.

