# Students Performance in Exams


This notebook mainly describes the process of training models and fine-tuning models.

# Load Data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the path of every file found below the Kaggle input directory,
# then print the paths one per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the exam-results CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/students-performance-in-exams/StudentsPerformance.csv'

# Read the file into a DataFrame; the bare name on the last line makes the
# notebook render the loaded table.
data = pd.read_csv(DATA_PATH)
data

# Split Dataset

Split the dataset into two parts: a training set and a testing set. The training set is used for training and cross-validation; the testing set is used only for the final testing.

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Hold out 20% of the rows as the final test set, stratified on
# "race/ethnicity"; the fixed random_state makes the split repeatable.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the splitter produces exactly one (train, test) pair.
train_index, test_index = next(split.split(data, data["race/ethnicity"]))

# split() returns row *positions*, not index labels, so select with .iloc.
# (.loc only gives the same rows while the frame has the default RangeIndex.)
train_set = data.iloc[train_index]
test_set = data.iloc[test_index]

# Visualize Data

Try to find out how each feature impacts the scores.

Distribution of feature values.

In [None]:
def add_attr_trace(fig, dataset, attr, r, c):
    """Add a probability-normalised histogram of one column to a subplot.

    Args:
        fig: Figure with a subplot grid; the trace is added via ``fig.add_trace``.
        dataset: Table that contains the column ``attr``.
        attr: Name of the column to plot; also used as the trace name.
        r: Subplot row passed to ``fig.add_trace``.
        c: Subplot column passed to ``fig.add_trace``.
    """
    # Values are sorted before plotting (presumably so that categories show up
    # in the same order in every subplot).
    sorted_values = dataset[attr].sort_values()
    histogram = go.Histogram(x=sorted_values, histnorm='probability', name=attr)
    fig.add_trace(histogram, row=r, col=c)

In [None]:
# One subplot column per feature: the top row shows the whole data set,
# the bottom row shows the held-out test split.
fig = make_subplots(
    rows=2,
    cols=5,
    shared_yaxes=True,
    subplot_titles=("entire data",) * 5 + ("testing data",) * 5,
)

# The first five columns are treated as the input features.
attrs = data.columns[:5]

for subplot_col, attr in enumerate(attrs, start=1):
    add_attr_trace(fig, data, attr, 1, subplot_col)
    add_attr_trace(fig, test_set, attr, 2, subplot_col)

fig.update_layout(showlegend=False, height=900)
fig.show()

Impact of feature values on scores.

In [None]:
def add_score_trace(fig, dataset, impact_attr, attr_value, score_type, r, c):
    """Add a score histogram for the rows where a feature has a given value.

    Args:
        fig: Figure with a subplot grid; the trace is added via ``fig.add_trace``.
        dataset: Table containing both ``impact_attr`` and ``score_type`` columns.
        impact_attr: Name of the feature column used to filter the rows.
        attr_value: Feature value a row must have to be included.
        score_type: Name of the score column whose values are plotted.
        r: Subplot row passed to ``fig.add_trace``.
        c: Subplot column passed to ``fig.add_trace``.
    """
    # Keep only the rows whose feature equals the requested value.
    matching_rows = dataset[dataset[impact_attr] == attr_value]
    trace_name = attr_value + ' - ' + score_type
    histogram = go.Histogram(
        x=matching_rows[score_type],
        histnorm='probability',
        name=trace_name,
    )
    fig.add_trace(histogram, row=r, col=c)

In [None]:
# Grid layout: one row per feature, one column per exam score.
title_rows = ("gender", "race/ethnicity", "parental level", "lunch", "test preparation")
title_subjects = ("math", "reading", "writing")

fig = make_subplots(
    rows=5,
    cols=3,
    shared_yaxes=True,
    subplot_titles=tuple(
        f"{row_label} - {subject}"
        for row_label in title_rows
        for subject in title_subjects
    ),
)

# Columns 5..7 hold the three exam scores.
score_types = data.columns[5:8]

# For every feature, overlay one histogram per distinct feature value in each
# of that feature's three score subplots.
for subplot_row, attr in enumerate(attrs, start=1):
    for attr_value in data[attr].unique():
        for subplot_col, score_type in enumerate(score_types, start=1):
            add_score_trace(fig, data, attr, attr_value, score_type, subplot_row, subplot_col)

fig.update_layout(showlegend=False, barmode='stack', height=1500)
fig.show()

# Prepare Data

Transform the categorical data into numeric features.

In [None]:
# The three exam scores are the prediction targets; every other column is a feature.
score_columns = ['math score', 'reading score', 'writing score']

X_train = train_set.drop(columns=score_columns)

# Independent copies of each target column, one per model family member.
y_train_math, y_train_reading, y_train_writing = (
    train_set[score_column].copy() for score_column in score_columns
)

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Every feature column selected earlier is categorical.
category_attrs = attrs

# A single preprocessing step: one-hot encode the categorical columns.
one_hot_step = ('category', OneHotEncoder(), category_attrs)
full_pipeline = ColumnTransformer([one_hot_step])

# NOTE: X_train is rebound to the encoded matrix here, so this cell expects the
# raw feature frame produced by the previous cell.
X_train = full_pipeline.fit_transform(X_train)

# Training Models

## Linear Regression

Training linear regression models.

In [None]:
from sklearn.linear_model import LinearRegression

# One linear model per exam score; fit() returns the fitted estimator itself,
# so construction and fitting can be chained.
lin_regr_math = LinearRegression().fit(X_train, y_train_math)
lin_regr_reading = LinearRegression().fit(X_train, y_train_reading)
lin_regr_writing = LinearRegression().fit(X_train, y_train_writing)

# Display the last fitted estimator as the cell output.
lin_regr_writing

## Decision Tree Regressor

Training decision tree regression models.

In [None]:
from sklearn.tree import DecisionTreeRegressor

# One decision tree per exam score, all seeded identically; fit() returns the
# fitted estimator itself, so construction and fitting can be chained.
tree_regr_math = DecisionTreeRegressor(random_state=42).fit(X_train, y_train_math)
tree_regr_reading = DecisionTreeRegressor(random_state=42).fit(X_train, y_train_reading)
tree_regr_writing = DecisionTreeRegressor(random_state=42).fit(X_train, y_train_writing)

# Display the last fitted estimator as the cell output.
tree_regr_writing

In [None]:
# plot tree
# from sklearn import tree
# tree.plot_tree(tree_regr_math)

In [None]:
# export tree
# import graphviz 
# dot_data = tree.export_graphviz(tree_regr_math, out_file=None) 
# graph = graphviz.Source(dot_data)
# graph.render("math")

## Random Forest Regressor

Training random forest regression models.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# One random forest per exam score, all seeded identically.
# Each model must be fitted on its *own* target column: fitting the reading and
# writing forests on the math scores would make their reported errors meaningless.
forest_regr_math = RandomForestRegressor(random_state=42)
forest_regr_math.fit(X_train, y_train_math)

forest_regr_reading = RandomForestRegressor(random_state=42)
forest_regr_reading.fit(X_train, y_train_reading)

forest_regr_writing = RandomForestRegressor(random_state=42)
forest_regr_writing.fit(X_train, y_train_writing)

# Predict Training Data

In [None]:
from sklearn.metrics import mean_squared_error

def predict(model, X, y, tag):
    """Print the root-mean-squared error of ``model``'s predictions for ``X``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix handed to ``model.predict``.
        y: True target values the predictions are compared against.
        tag: Label included in the printed line (e.g. the score name).
    """
    rmse = np.sqrt(mean_squared_error(y, model.predict(X)))
    print('prediction for ' + tag + ': rmse = ', rmse)

Linear models

In [None]:
print('Linear Regression ----------------')

# Report the training-set RMSE of each linear model against its own target.
linear_training_cases = (
    (lin_regr_math, y_train_math, 'math score'),
    (lin_regr_reading, y_train_reading, 'reading score'),
    (lin_regr_writing, y_train_writing, 'writing score'),
)
for fitted_model, target, tag in linear_training_cases:
    predict(fitted_model, X_train, target, tag)

Decision tree models

In [None]:
print('Decision Tree Regressor  ----------------')

# Report the training-set RMSE of each decision tree against its own target.
tree_training_cases = (
    (tree_regr_math, y_train_math, 'math score'),
    (tree_regr_reading, y_train_reading, 'reading score'),
    (tree_regr_writing, y_train_writing, 'writing score'),
)
for fitted_model, target, tag in tree_training_cases:
    predict(fitted_model, X_train, target, tag)

Random forest models

In [None]:
print('Random Forest Regressor ----------------')

# Report the training-set RMSE of each random forest against the matching target.
forest_training_cases = (
    (forest_regr_math, y_train_math, 'math score'),
    (forest_regr_reading, y_train_reading, 'reading score'),
    (forest_regr_writing, y_train_writing, 'writing score'),
)
for fitted_model, target, tag in forest_training_cases:
    predict(fitted_model, X_train, target, tag)

# Cross validate

Cross validation is used to test a model's ability to predict new data that was not used in training it.

In [None]:
def display_scores(scores):
    """Print the scores followed by their mean and standard deviation.

    Args:
        scores: Object exposing ``mean()`` and ``std()`` (e.g. a NumPy array).
    """
    print('Scores:', scores)
    # Each statistic is looked up and printed in turn, after the raw scores.
    for label, statistic in (('Mean:', 'mean'), ('Standard deviation:', 'std')):
        print(label, getattr(scores, statistic)())

In [None]:
def apply_cross_validation(estimator, X, y, tag):
    """Run 10-fold cross-validation and print the per-fold RMSE summary.

    Args:
        estimator: Estimator handed to ``cross_val_score``.
        X: Feature matrix.
        y: Target values.
        tag: Label printed in the banner above the scores.
    """
    fold_neg_mse = cross_val_score(estimator, X, y, cv=10, scoring='neg_mean_squared_error')
    # The scorer yields negated MSE values, so flip the sign before the square root.
    fold_rmse = np.sqrt(-fold_neg_mse)
    print('\n********** ' + tag + ' **********')
    display_scores(fold_rmse)

In [None]:
from sklearn.model_selection import cross_val_score

# Targets in the order the models of every family are listed below.
cv_targets = (
    (y_train_math, 'math score'),
    (y_train_reading, 'reading score'),
    (y_train_writing, 'writing score'),
)

# Banner text and the (math, reading, writing) estimators of each model family.
cv_families = (
    ('cross validation for linear regressions -----------------------------',
     (lin_regr_math, lin_regr_reading, lin_regr_writing)),
    ('cross validation for decision tree regressors -----------------------------',
     (tree_regr_math, tree_regr_reading, tree_regr_writing)),
    ('cross validation for random forest regressors -----------------------------',
     (forest_regr_math, forest_regr_reading, forest_regr_writing)),
)

for banner, family_estimators in cv_families:
    print()
    print(banner)
    for estimator, (target, tag) in zip(family_estimators, cv_targets):
        apply_cross_validation(estimator, X_train, target, tag)

# Fine Tuning

Get better models by using the grid search cross-validation method (`GridSearchCV`).

In [None]:
from sklearn.model_selection import GridSearchCV

def grid_search_cv(estimator, param_grid, X, y, tag):
    """Run a 10-fold grid search, print a summary, and return the fitted search.

    Printed output: a banner with ``tag``, the best parameter combination, the
    RMSE of every candidate (derived from its mean negated-MSE test score), and
    the feature importances of the refitted best estimator when it has any.

    Args:
        estimator: Estimator to tune.
        param_grid: Parameter grid handed to ``GridSearchCV``.
        X: Feature matrix.
        y: Target values.
        tag: Label printed in the banner.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    print()
    print(tag + ' ----------------------------------')

    grid_search = GridSearchCV(estimator, param_grid, verbose=1, cv=10,
                               scoring='neg_mean_squared_error',
                               return_train_score=True, refit=True)

    grid_search.fit(X, y)
    print()
    print('best_params_: ')
    print(grid_search.best_params_)

    # Scores are negated MSE values, so flip the sign before taking the root.
    results = grid_search.cv_results_
    for mean_score, params in zip(results['mean_test_score'], results['params']):
        print(np.sqrt(-mean_score), params)

    # Not every estimator has feature_importances_ (linear models, for example,
    # do not), so only print the section when the attribute exists instead of
    # failing with AttributeError after the whole search has already run.
    feature_importances = getattr(grid_search.best_estimator_, 'feature_importances_', None)
    if feature_importances is not None:
        print()
        print('feature_importances: ')
        print(feature_importances)

    return grid_search

In [None]:
# The same search space is used for both model families.
param_grid = {'max_depth': list(range(2, 10)), 'min_samples_split': [2, 3, 4, 5, 6]}


def _tune(estimator_cls, target, tag):
    """Grid-search a fresh ``estimator_cls(random_state=42)`` against one target."""
    return grid_search_cv(estimator_cls(random_state=42), param_grid, X_train, target, tag)


print('Grid search for decision tree regressors -------------------------------')
grid_search_tree_math = _tune(DecisionTreeRegressor, y_train_math, 'math score')
grid_search_tree_reading = _tune(DecisionTreeRegressor, y_train_reading, 'reading score')
grid_search_tree_writing = _tune(DecisionTreeRegressor, y_train_writing, 'writing score')

print()
print('Grid search for random forest regressors -------------------------------')
grid_search_forest_math = _tune(RandomForestRegressor, y_train_math, 'math score')
grid_search_forest_reading = _tune(RandomForestRegressor, y_train_reading, 'reading score')
grid_search_forest_writing = _tune(RandomForestRegressor, y_train_writing, 'writing score')

# Final Testing

In [None]:
def final_predict(grid_search, X, y, tag):
    """Print the RMSE of a grid search's best estimator on the given data.

    Args:
        grid_search: Fitted search object exposing ``best_estimator_``.
        X: Feature matrix handed to the best estimator's ``predict``.
        y: True target values the predictions are compared against.
        tag: Label included in the printed line (e.g. the score name).
    """
    best_model = grid_search.best_estimator_
    rmse = np.sqrt(mean_squared_error(y, best_model.predict(X)))
    print('final predict for ' + tag + ': rmse = ', rmse)

In [None]:
# Separate the held-out rows into features and the three score targets.
test_score_columns = ['math score', 'reading score', 'writing score']

y_test_math, y_test_reading, y_test_writing = (
    test_set[score_column].copy() for score_column in test_score_columns
)

# Encode the test features with the transformer that was already fitted on the
# training data (transform only; nothing is re-fitted here).
X_test = full_pipeline.transform(test_set.drop(columns=test_score_columns))

# Targets in the order the models of every family are listed below.
test_targets = (
    (y_test_math, 'math score'),
    (y_test_reading, 'reading score'),
    (y_test_writing, 'writing score'),
)

# Each entry: banner text, the reporting function to use, and the
# (math, reading, writing) models or grid searches of that family.
evaluation_plan = (
    ('Linear Regression  ----------------', predict,
     (lin_regr_math, lin_regr_reading, lin_regr_writing)),
    ('Decision Tree Regressor  ----------------', predict,
     (tree_regr_math, tree_regr_reading, tree_regr_writing)),
    ('Fine-tuned Decision Tree Regressor  ----------------', final_predict,
     (grid_search_tree_math, grid_search_tree_reading, grid_search_tree_writing)),
    ('Random Forest Regressor ----------------', predict,
     (forest_regr_math, forest_regr_reading, forest_regr_writing)),
    ('Fine-tuned Random Forest Regressor  ----------------', final_predict,
     (grid_search_forest_math, grid_search_forest_reading, grid_search_forest_writing)),
)

for position, (banner, report, fitted_family) in enumerate(evaluation_plan):
    # A blank line separates every family from the previous one.
    if position > 0:
        print()
    print(banner)
    for fitted, (target, tag) in zip(fitted_family, test_targets):
        report(fitted, X_test, target, tag)

# Conclusion

We explored three different models:
* Linear Regression
* Decision Tree
* Random Forest

It seems that linear regression performs better than the decision tree and random forest regressors on this dataset.

# Reference

1. https://github.com/ageron/handson-ml2
2. https://en.wikipedia.org/wiki/Cross-validation_(statistics)#:~:text=Cross%2Dvalidation%2C%20sometimes%20called%20rotation,to%20an%20independent%20data%20set.