# Introduction

Firstly, we will use EDA to get a basic idea about the dataset, and then we will train regression models to try to predict the exam score.

Importing relevant libraries

In [None]:
import os
import random
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import tensorflow as tf
import math
from scipy import special #comb, factorial
from keras import backend as K
from scipy.stats import uniform
from matplotlib import pyplot as plt
from sklearn import tree
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.preprocessing import MinMaxScaler, StandardScaler,LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score, recall_score, make_scorer, plot_confusion_matrix, confusion_matrix, accuracy_score,f1_score

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the raw CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/students-performance-in-exams/StudentsPerformance.csv'

sns.set_style('darkgrid')
df = pd.read_csv(DATA_PATH)
df.head()

Shape of the dataset:

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

Overview of the dataset

In [None]:
# Print the frame's summary (columns, dtypes, non-null counts)
df.info()

In [None]:
# Number of missing values in each column
df.isna().sum()

As we see, there are no nulls.

Let's first start with categorical features. Let's see the number of unique values per each variable.

In [None]:
cat_features = ['gender', 'race/ethnicity', 'parental level of education', 'lunch',
                'test preparation course']

# Number of distinct values per categorical column, largest first.
# `dropna=False` keeps the count equal to `unique().size`, which also counts NaN.
unique_counts = (
    df[cat_features]
    .nunique(dropna=False)
    .sort_values(ascending=False)
)

plt.figure(figsize=(11, 6))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
graph = sns.barplot(x=unique_counts.index.to_numpy(), y=unique_counts.to_numpy())
for p in graph.patches:
    # Centre each label on its bar instead of assuming a bar width of 0.8.
    graph.annotate(f"{p.get_height():.0f}",
                   (p.get_x() + p.get_width() / 2, p.get_height()),
                   ha='center', va='bottom',
                   color='black')

plt.title("Number of unique values per each feature")
plt.xticks(rotation=45)
plt.ylabel('Count')
plt.xlabel('Feature')
plt.show()

Now let's look at the univariate distributions.

In [None]:
cat_features = ['gender', 'race/ethnicity', 'parental level of education', 'lunch',
                'test preparation course']
WIDTH = 16
LENGTH = 30

# One stacked panel per categorical feature. The grid is sized from the list
# itself, and `squeeze=False` keeps `ax` an array even for a single feature.
fig, ax = plt.subplots(len(cat_features), 1, figsize=(WIDTH, LENGTH), squeeze=False)
ax = ax.flatten()
for i, feature in enumerate(cat_features):
    # Column passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
    sns.countplot(x=feature, data=df, ax=ax[i])
    ax[i].set_title(f'Distribution of a feature `{feature}`')

Now let's check the distributions of our numerical features (i.e., exam scores)

In [None]:
score = ['math score', 'reading score', 'writing score']
# Summary statistics of the three exam scores, rounded to two decimals.
df[score].describe().round(2)

Interestingly (but perhaps not so surprisingly), math turns out to be the **hardest** exam:
The average of math scores is lowest out of 3 subjects. Same goes for median, and Q3. But more striking difference appears when we consider minimum score achieved on the exams: the minimum score for the math is **0**, but for reading and writing, the scores are **17** and **10**, respectively.
So the upshot is: no matter what kind of metric we use, we see that (on average) the performance on the math exam is the worst.

Let's visualize the distributions.

In [None]:
cont_features = ['math score', 'reading score', 'writing score']
WIDTH, LENGTH = 15, 5

# Three panels per row, with as many rows as needed for all score columns.
rows = math.ceil(len(cont_features) / 3)
fig, ax = plt.subplots(rows, 3, figsize=(WIDTH, LENGTH))
ax = ax.flatten()
for panel, feature in zip(ax, cont_features):
    panel.hist(df[feature], alpha=0.6)
    panel.set_title(f'Distribution of a feature `{feature}`')

We see that the distributions are roughly normal (with a little tail to the left). Let's check the skewness.

In [None]:
cont_features = ['math score', 'reading score', 'writing score']
# Sample skewness of each exam-score column.
df.loc[:, cont_features].skew()

As we see, each variable has negative skew, which agrees with conclusions we made while observing the graphs.

Let's check the correlation between features.

In [None]:
cont_features = ['math score', 'reading score', 'writing score']

# Pairwise correlations between the three exam scores.
df1 = df.loc[:, cont_features]
corr = df1.corr()

plt.figure(figsize=(10, 7))
score_labels = df1.columns
sns.heatmap(
    corr,
    annot=True,
    xticklabels=score_labels,
    yticklabels=score_labels,
)
plt.title('Correlation matrix of the continuous features')
plt.show()

In [None]:
ex_scores = ['math score', 'reading score', 'writing score']

# Every unordered pair of exam scores, in the order (0,1), (0,2), (1,2).
score_pairs = [
    (ex_scores[first], ex_scores[second])
    for first in range(0, 3)
    for second in range(first + 1, 3)
]

fig, ax = plt.subplots(1, 3, figsize=(16, 6))
ax = ax.flatten()

# One panel per pair: fitted regression line (red) over the raw scatter.
for panel, (x_col, y_col) in zip(ax, score_pairs):
    sns.regplot(x=x_col, y=y_col, data=df, ax=panel, scatter=False, color='r')
    sns.scatterplot(x=x_col, y=y_col, data=df, ax=panel, alpha=0.4)
    panel.set_xlim([0, 100])
    panel.set_ylim([0, 100])
        


Positive correlation for each pair of scores is not really surprising: **generally** speaking, if a student performs well on exam $A$, then it is quite likely that the student is conscientious (unless, of course, they cheated or got a good grade thanks to other irregularities), implying that the other grades are likely to be high too (which agrees with the fact that the correlation between any two numeric features doesn't drop below $0.8$). Another unsurprising finding is that `writing score` is more correlated with `reading score` than with `math score`. Similarly, `reading score` is more correlated with `writing score` than with `math score`. This makes sense: [To be a good writer, one needs to be a voracious reader](https://writing.stackexchange.com/questions/14189/can-i-be-a-good-writer-without-reading-a-lot); yet to be good at math, one doesn't need to read a lot of non-mathematical literature. Hence, due to the inherent differences between math and the other two subjects, we observe a lower correlation.

Now let's consider the conditional distributions of each numeric variable (conditional on each discrete variable). We begin with `gender`

In [None]:
cont_features = ['math score', 'reading score', 'writing score']
cat_variable = 'gender'
WIDTH = 12
LENGTH = 5

# One box-plot panel per exam score, side by side. The grid is sized from
# `cont_features`, so the unused `rows` computation is dropped.
fig, ax = plt.subplots(1, len(cont_features), figsize=(WIDTH, LENGTH))
ax = ax.flatten()
for i, feature in enumerate(cont_features):
    sns.boxplot(x=cat_variable, y=feature, data=df, ax=ax[i], palette=["#0101DF", "#DF0101"])
    ax[i].set_title(f'Cond. dist. of feature `{feature}`')

One interesting pattern can be observed: boys perform better on math exams, but girls perform better on reading/writing exams. It also seems that there is one girl that performed very badly on *all* exams

In [None]:
# Rows where the math score is exactly zero
df.loc[df['math score'].eq(0)]

Now let's check (one more time) the min. values for the scores of all subjects

In [None]:
# Lowest score recorded for each exam
df.loc[:, ['math score', 'writing score', 'reading score']].min()

Indeed, we see that she performed the worst across all exams.

Now, let's have a look at the influence of `race/ethnicity` on exam scores.

In [None]:
cont_features = ['math score', 'reading score', 'writing score']
cat_variable = 'race/ethnicity'
WIDTH = 12
LENGTH = 5

# One box-plot panel per exam score, side by side. The grid is sized from
# `cont_features`, so the unused `rows` computation is dropped.
fig, ax = plt.subplots(1, len(cont_features), figsize=(WIDTH, LENGTH))
ax = ax.flatten()
for i, feature in enumerate(cont_features):
    sns.boxplot(x=cat_variable, y=feature, data=df, ax=ax[i])
    # Rotate the category labels. The former `Rotation=45` keyword is rejected
    # by Matplotlib >= 3.3, where property names are case-sensitive.
    ax[i].tick_params(axis='x', labelrotation=45)
    ax[i].set_title(f'Cond. dist. of feature `{feature}`')

One can observe that group `A` (in general) has the worst performance across the 3 exams, and group `E` tends to perform the best (although on the writing exam, the superiority is not as significant as on the math exam).

In [None]:
cont_features = ['math score', 'reading score', 'writing score']
cat_variable = 'parental level of education'
WIDTH = 12
LENGTH = 5

# One box-plot panel per exam score, side by side. The grid is sized from
# `cont_features`, so the unused `rows` computation is dropped.
fig, ax = plt.subplots(1, len(cont_features), figsize=(WIDTH, LENGTH))
ax = ax.flatten()
for i, feature in enumerate(cont_features):
    sns.boxplot(x=cat_variable, y=feature, data=df, ax=ax[i])
    # Rotate the category labels. The former `Rotation=45` keyword is rejected
    # by Matplotlib >= 3.3, where property names are case-sensitive.
    ax[i].tick_params(axis='x', labelrotation=45)
    ax[i].set_title(f'Cond. dist. of feature `{feature}`')

The results here are not surprising: the lower the level of education the parent has received, the more likely it is that the child will perform worse on an exam. It should be noted, however, that the difference between `master's degree` and `bachelor's degree` is not as significant as, for example, the difference between `high school` and `bachelor's degree`.

Now let's look at the distributions conditional on `lunch`

In [None]:
cont_features = ['math score', 'reading score', 'writing score']
cat_variable = 'lunch'
WIDTH = 12
LENGTH = 5

# One box-plot panel per exam score, side by side. The grid is sized from
# `cont_features`, so the unused `rows` computation is dropped.
fig, ax = plt.subplots(1, len(cont_features), figsize=(WIDTH, LENGTH))
ax = ax.flatten()
for i, feature in enumerate(cont_features):
    sns.boxplot(x=cat_variable, y=feature, data=df, ax=ax[i], palette=["#0101DF", "#DF0101"])
    # Rotate the category labels. The former `Rotation=45` keyword is rejected
    # by Matplotlib >= 3.3, where property names are case-sensitive.
    ax[i].tick_params(axis='x', labelrotation=45)
    ax[i].set_title(f'Cond. dist. of feature `{feature}`')

These results are also within our expectations: kids with worse meals tend to perform worse (most likely this happens because kids who get worse meals are likely to be coming from poor families, and poor families imply worse conditions for studying, which as the result negatively influence the scores kids get on the exam).

In [None]:
cont_features = ['math score', 'reading score', 'writing score']
cat_variable = 'test preparation course'
WIDTH = 12
LENGTH = 5

# One box-plot panel per exam score, side by side. The grid is sized from
# `cont_features`, so the unused `rows` computation is dropped.
fig, ax = plt.subplots(1, len(cont_features), figsize=(WIDTH, LENGTH))
ax = ax.flatten()
for i, feature in enumerate(cont_features):
    sns.boxplot(x=cat_variable, y=feature, data=df, ax=ax[i], palette=["#0101DF", "#DF0101"])
    # Rotate the category labels. The former `Rotation=45` keyword is rejected
    # by Matplotlib >= 3.3, where property names are case-sensitive.
    ax[i].tick_params(axis='x', labelrotation=45)
    ax[i].set_title(f'Cond. dist. of feature `{feature}`')

Similarly, the result is intuitive: you've completed the preparation course $\implies$ you are more prepared for the exam $\implies$ you are likely to get a better grade than those who didn't finish the course.

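Let's use one-way ANOVA to test, for each pair of a categorical feature and a numeric feature, whether the mean of the numeric feature differs between the categories (i.e., whether the two are dependent).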

In [None]:
from scipy.stats import f_oneway

cont_features = ['math score', 'reading score', 'writing score']
result_columns = ['Categorical', 'Numerical', 'p-value', 'p < 0.05', 'statistic']

# One result table per categorical feature; a later cell concatenates them.
dfs = []

for label in ['gender',
              'race/ethnicity',
              'parental level of education',
              'lunch', 'test preparation course']:

    records = []
    for feature in cont_features:
        # Scores of `feature` split by level of `label`, in order of first appearance.
        values = [df.loc[df[label] == value, feature].values
                  for value in df[label].unique()]

        # One-way ANOVA across the groups.
        statistic, pval = f_oneway(*values)

        records.append({'Categorical': label,
                        'Numerical': feature,
                        'p-value': pval,
                        'p < 0.05': pval < 0.05,
                        'statistic': statistic})

    dfs.append(pd.DataFrame(records, columns=result_columns))

In [None]:
anova_df = pd.concat(dfs)
math = anova_df[anova_df['Numerical'] == 'math score']
reading = anova_df[anova_df['Numerical'] == 'reading score']
writing = anova_df[anova_df['Numerical'] == 'writing score']

ANOVA feature independence test: **math** and categorical features

In [None]:
math.sort_values(by='p-value',ascending=True)

ANOVA feature independence test: **reading** and categorical features

In [None]:
reading.sort_values(by='p-value',ascending=True)

ANOVA feature independence test: **writing** and categorical features

In [None]:
writing.sort_values(by='p-value',ascending=True)

We see that every categorical feature is dependent on every numerical feature (we've set the significance threshold at $0.05$). Secondly, it seems that the features `test preparation course` and `lunch` are the most useful when predicting the exam score: for all 3 numeric variables (i.e., scores), `test preparation course` and `lunch` are in the top 3 features with the smallest $p$-value, and the smaller the $p$-value, the more unlikely it is that the features are independent.

# Regression

Feature preprocessing

In [None]:
from sklearn.preprocessing import OrdinalEncoder

df_new = df.copy()

# Ordinal-encode the features that have only two unique values.
features_to_encode = ['gender', 'lunch', 'test preparation course']
ord_enc = OrdinalEncoder()
df_new[features_to_encode] = ord_enc.fit_transform(df_new[features_to_encode])

# One-hot encode the remaining (multi-level) categorical features.
# `dtype=float` keeps the whole feature matrix floating point: pandas >= 2.0
# emits bool dummies by default, and a bool/float mix converts to an
# object-dtype array that Keras cannot turn into a tensor.
high_cardinality = ['race/ethnicity', 'parental level of education']
df_new = pd.get_dummies(df_new, columns=high_cardinality, dtype=float)

df_new.head()

We will try to predict the **average score**, i.e., 

$$\text{average score} = \frac{\text{math score} + \text{reading score} + \text{writing score}}{3}$$

Since all three variables are highly correlated, the predictions of the average score will be close to the predictions we would get when predicting the score for a single subject.

In [None]:
exam_scores = ['math score', 'reading score', 'writing score']

# Feature matrix: every column except the three exam scores.
X = df_new.drop(columns=exam_scores)

# Target variable: arithmetic mean of the three exam scores.
y = df_new[exam_scores].sum(axis=1) / 3

# Fixed seed so the split is reproducible; the split size is the library default.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11)

# Decision tree

In [None]:
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import mean_squared_error

# `max_features='auto'` and `criterion='mse'` are no longer accepted by recent
# scikit-learn releases. For a regression tree they meant "use all features"
# and "squared error", which are the defaults, so omitting them fits the same
# model on both old and new versions.
tree_clf = DecisionTreeRegressor(max_depth=4,
                                 min_samples_split=50).fit(X_train, y_train)

# Take the square root explicitly instead of passing `squared=False`, which
# newer scikit-learn versions deprecate and then remove.
rmse_test = np.sqrt(mean_squared_error(y_true=y_test, y_pred=tree_clf.predict(X_test)))
print(f"RMSE for test set: {rmse_test}")

# SVM

In [None]:
from sklearn.svm import SVR

# Hyperparameters for the RBF kernel.
# NOTE(review): presumably the result of an earlier search that is not part of
# this notebook — confirm where these values come from.
C_coeff = 16.42162462505648
gamma_coeff = 0.010323600491562047

svr_clf = SVR(C=C_coeff,
              gamma=gamma_coeff,
              kernel='rbf').fit(X_train, y_train)

# Take the square root explicitly instead of passing `squared=False`, which
# newer scikit-learn versions deprecate and then remove.
rmse_test = np.sqrt(mean_squared_error(y_true=y_test, y_pred=svr_clf.predict(X_test)))
print(f"RMSE for test set: {rmse_test}")


# Gradient Boost

In [None]:
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
# NOTE(review): the XGBoost docs give [0, 1] as the range for `eta`; 1.79 is
# outside it — confirm this value is intended.
param = {'max_depth': 1,
         'eta': 1.7928804283381974,
         'objective': 'reg:squarederror',
         'eval_metric': 'rmse'}

# Number of boosting rounds. This was previously set to 10 but never used,
# while training hard-coded 30; the value that was actually in effect is kept.
num_round = 30
bst = xgb.train(params=param, dtrain=dtrain, num_boost_round=num_round)

dtest = xgb.DMatrix(X_test)

# Take the square root explicitly instead of passing `squared=False`, which
# newer scikit-learn versions deprecate and then remove.
rmse_test = np.sqrt(mean_squared_error(y_true=y_test, y_pred=bst.predict(dtest)))
print(f"RMSE for test set: {rmse_test}")

# ANN

In [None]:
from tensorflow import keras

# Seed every random number generator involved so the run is repeatable.
seed_value = 0
os.environ['PYTHONHASHSEED'] = str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)  # tensorflow 2.x


def root_mean_squared_error(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Written with TensorFlow ops rather than the standalone `keras.backend`
    helpers, which are not available in every Keras version and should not be
    mixed with `tf.keras` models.
    """
    y_true = tf.cast(y_true, y_pred.dtype)
    return tf.sqrt(tf.reduce_mean(tf.square(y_pred - y_true)))


# Keras needs a purely numeric array; a DataFrame mixing bool and float
# columns would otherwise be converted to an object array.
X_train_nn = X_train.astype('float32').to_numpy()
X_test_nn = X_test.astype('float32').to_numpy()

# Two hidden ReLU layers of 7 units and a single linear output unit.
ann = keras.Sequential([
    keras.layers.Dense(units=7, activation='relu'),
    keras.layers.Dense(units=7, activation='relu'),
    keras.layers.Dense(units=1, activation='linear'),
])
ann.compile(optimizer='adam', loss=root_mean_squared_error, metrics=['MSE'])
ann.fit(X_train_nn, y_train, batch_size=32, epochs=100, verbose=0)

res = ann.predict(X_test_nn).flatten()
# Take the square root explicitly instead of passing `squared=False`, which
# newer scikit-learn versions deprecate and then remove.
print(f"RMSE for test set: {np.sqrt(mean_squared_error(y_true=y_test, y_pred=res))}")

In [None]:
# Test-set predictions of every fitted model, keyed by display name.
# The network gets an explicit float32 array, because a DataFrame mixing bool
# and float columns would be converted to an object array.
predictions = {
    'Decision Tree': tree_clf.predict(X_test),
    'SVM': svr_clf.predict(X_test),
    'Gradient Boost': bst.predict(xgb.DMatrix(X_test)),
    'ANN': ann.predict(X_test.astype('float32').to_numpy()).flatten(),
}

# RMSE per model, rounded to two decimals. The square root is taken explicitly
# because `squared=False` is deprecated/removed in newer scikit-learn versions.
rmse_score = [round(float(np.sqrt(mean_squared_error(y_true=y_test, y_pred=pred))), 2)
              for pred in predictions.values()]
models = list(predictions)
scoredf = pd.DataFrame({'RMSE': rmse_score, 'Model': models}).sort_values(by='RMSE', ascending=False)

# Plot from best (lowest RMSE) to worst, using local names so the EDA
# variables `cat_features` and `count` are not overwritten.
ranked = scoredf.sort_values(by='RMSE', ascending=True)

plt.figure(figsize=(11, 6))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
graph = sns.barplot(x=ranked['Model'].to_numpy(), y=ranked['RMSE'].to_numpy())
for p in graph.patches:
    # Centre each label on its bar instead of assuming a bar width of 0.8.
    graph.annotate(p.get_height(), (p.get_x() + p.get_width() / 2, p.get_height()),
                   ha='center', va='bottom',
                   color='black')

plt.title("Performance of the regression models")
plt.xticks(rotation=45)
plt.ylabel('RMSE score')
plt.xlabel('Model')
plt.show()