In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, fileNames in os.walk('/kaggle/input'):
    for fileName in fileNames:
        print(os.path.join(folder, fileName))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Data preparation
## 1.1. Load the data

In [None]:
# Read the students' exam results into a DataFrame and preview the first rows.
csvPath = '/kaggle/input/students-performance-in-exams/StudentsPerformance.csv'
data = pd.read_csv(csvPath)
data.head()

In [None]:
# Summarise the loaded DataFrame (columns, dtypes, non-null counts) to check for missing values.
data.info()

There are no missing values in this dataset.

## 1.2. Convert categorical values to numerical values

In [None]:
def encode_categories(column, codes):
    """Translate the labels of ``column`` into integer codes.

    ``Series.map`` followed by an explicit integer cast is used instead of
    ``Series.replace``: ``replace`` on a string column only yields an integer
    column through implicit downcasting, which recent pandas versions
    deprecate. Without that downcasting the new columns would stay ``object``
    and be silently dropped by the numeric column selection used for the PCA.

    Parameters
    ----------
    column : pandas.Series
        Column holding the category labels.
    codes : dict
        Mapping from every expected label to its integer code.

    Returns
    -------
    pandas.Series
        ``int64`` series aligned with ``column``.

    Raises
    ------
    ValueError
        If ``column`` contains a value that has no entry in ``codes``.
    """
    encoded = column.map(codes)
    unmapped = encoded.isna()
    if unmapped.any():
        # Fail loudly rather than feeding NaN / string values to the PCA.
        raise ValueError('Unexpected values in column %r: %s'
                         % (column.name, sorted(column[unmapped].astype(str).unique())))
    return encoded.astype('int64')


# Codes are arbitrary labels except for the parental level of education,
# whose codes follow the level of the degree (0 = lowest).
data['gender_num'] = encode_categories(data['gender'], {'female': 0, 'male': 1})
data['race/ethnicity_num'] = encode_categories(
    data['race/ethnicity'],
    {'group A': 0, 'group B': 1, 'group C': 2, 'group D': 3, 'group E': 4})
data['parental level of education_num'] = encode_categories(
    data['parental level of education'],
    {'some high school': 0, 'high school': 1, 'some college': 2,
     "associate's degree": 3, "bachelor's degree": 4, "master's degree": 5})
data['lunch_num'] = encode_categories(data['lunch'], {'standard': 0, 'free/reduced': 1})
data['test preparation course_num'] = encode_categories(
    data['test preparation course'], {'none': 0, 'completed': 1})
data.head()

This conversion is necessary for the analysis below. The numerical codes that replace the non-numerical values are assigned arbitrarily, except for "parental level of education", whose codes follow the level of the degree (lowest level = 0).

# 2. Analysis
## 2.1. Principal Component Analysis (PCA)

In [None]:
# Create a dataframe only with numerical columns
# Keep only the numeric columns of `data` as input for the PCA
pcaData = data.select_dtypes(include='number')
pcaData

In [None]:
# Standardize every column (zero mean, unit variance) so that no variable
# dominates the PCA purely because of its scale.
scaler = StandardScaler()
pcaData_scaled = scaler.fit_transform(pcaData)

# Fraction of the total variance that the retained components must explain.
varianceExplained = 0.8

# A float n_components in (0, 1) makes scikit-learn keep the smallest number of
# components whose cumulative explained variance exceeds that fraction, so one
# fit replaces the former "fit all components, count, refit" sequence.
# svd_solver='full' is the exact solver and is stated explicitly so the result
# is deterministic; with an integer n_components the default 'auto' policy may
# select the randomized solver, which needs a random_state to be reproducible.
pca = PCA(n_components=varianceExplained, svd_solver='full')
pcaData_projected = pca.fit_transform(pcaData_scaled) # for scatter plots

In [None]:
# Coefficients of each input variable (columns) on each retained component (rows).
# Row labels are generated from the fitted model so the table stays valid when
# the number of retained components is not exactly five.
pd.DataFrame(pca.components_,
             columns=pcaData.columns,
             index=['PC{}'.format(i + 1) for i in range(len(pca.components_))])

* PC1 is focused on the exam score (math score, reading score, writing score) that are negatively associated
* PC2 is focused on the gender (gender_num)
* PC3 describes the associations of race/ethnicity, level of education of the parents, and test preparation (race/ethnicity_num, parental level of education_num, test preparation course_num)
* PC4 highlights the positive associations between lunch and test preparation (lunch_num, test preparation course_num)
* PC5 is focused on the race/ethnicity (race/ethnicity_num)

### 2.1.1. PC1 vs PC3 (Race/ethnicity, parental level of education, exam results, test preparation)

In [None]:
# Loading plot: every input variable is drawn as a segment from the origin to
# its coefficients on PC1 (x axis) and PC3 (y axis), inside a unit circle.
axisStyle = dict(ticks="outside", tickwidth=2, tickcolor='grey', ticklen=10,
                 showline=True, linewidth=2, linecolor='grey')

fig = go.Figure()
for featureIdx in range(pca.components_.shape[1]):
    fig.add_trace(go.Scatter(x=[0, pca.components_[0, featureIdx]],
                             y=[0, pca.components_[2, featureIdx]],
                             name=pcaData.columns[featureIdx]))
fig.update_layout(
    plot_bgcolor='white', height=500, width=500,
    showlegend=False,
    shapes=[dict(type="circle", xref="x", yref="y", x0=-1, y0=-1, x1=1, y1=1, line_color="LightSeaGreen")],
    xaxis=dict(axisStyle),
    yaxis=dict(axisStyle),
    title=dict(text="Principal Component Analysis", font=dict(family="Verdana", size=25, color="Black")),
)
fig.update_xaxes(title=dict(text='PC1', font=dict(size=18)), showgrid=True, linecolor='black', ticks='outside')
fig.update_yaxes(title=dict(text='PC3', font=dict(size=18)), showgrid=True, linecolor='black', ticks='outside')
fig.show()

1. The race/ethnicity looks strongly positively correlated with the level of education of the parents.
2. The level of education of the parents does not seem correlated with the exam results.
3. The test preparation seems negatively correlated with the parental level of education

In [None]:
# Count the students for every (parental education level, race/ethnicity) pair:
# education levels become the rows, race/ethnicity groups the columns.
educationRaceColumns = ['parental level of education_num','race/ethnicity_num']
dataRaceEducationParents = data[educationRaceColumns].groupby(educationRaceColumns).size().unstack(fill_value=0)

# Express the counts as percentages within each race/ethnicity group (column-wise).
groupTotals = dataRaceEducationParents.sum(axis=0)
dataRaceEducationParents_Perc = (dataRaceEducationParents / groupTotals * 100).round(2)

# Nested list (one inner list per education level) used as the heatmap z values.
matrixPercentage = dataRaceEducationParents_Perc.values.tolist()

In [None]:
# Heatmap of the share (in %) of each parental education level (rows)
# within each race/ethnicity group (columns).
fig = go.Figure()
fig.add_trace(go.Heatmap(z=matrixPercentage,colorscale='gnbu'))

fig.update_layout(plot_bgcolor='white', width = 1000,
                  xaxis=dict(title='race/ethnicity',ticks="outside", tickwidth=2, tickcolor='grey', ticklen=10,showline=True, linewidth=2, linecolor='grey'),
                  yaxis=dict(title='parental level of education',ticks="outside", tickwidth=2, tickcolor='grey', ticklen=10,showline=True, linewidth=2, linecolor='grey'),
                  title=dict(text="Level of education of the parents vs race/ethnicity", font=dict(family="Verdana",size=25,color="Black")))
# `titlefont` is a deprecated axis attribute that recent Plotly releases no
# longer accept; `title_font` (i.e. title.font) is the supported spelling.
fig.update_xaxes(tickfont=dict(family='Verdana', color='grey', size=14), title_font=dict(family='Verdana', color='black', size=16))
fig.update_yaxes(tickfont=dict(family='Verdana', color='grey', size=14), title_font=dict(family='Verdana', color='black', size=16))

fig.show()

There is no meaningful correlation between race/ethnicity and the parental level of education. The values from 0 to 4 used to describe each group were assigned arbitrarily. Furthermore, no race/ethnicity group can be categorised as better or worse than another one. So the positive correlation between these two variables does not make sense here, and only the parental level of education will be used for further analysis.

However some groups perform slightly differently than others.
* Groups 0 and 1 of race/ethnicity are similar and have the lowest parental level of education: about 45% of the parents have no more than a high school degree.
* Groups 2, 3 and 4 are quite similar. Group 3 has the largest percentage of master's degrees. Group 4 has the largest share of parents with at least a high school degree (>71%).

In [None]:
# Distinct parental education codes, in ascending order (lowest level first)
listUniqueEducationLevel = list(data['parental level of education_num'].unique())
listUniqueEducationLevel.sort()

In [None]:
# One box (showing every individual point) of math scores per parental
# education level, in the order given by listUniqueEducationLevel.
axisStyle = dict(ticks="outside", tickwidth=2, tickcolor='grey', ticklen=10,
                 showline=True, linewidth=2, linecolor='grey')

fig = go.Figure()
for level in listUniqueEducationLevel:
    levelRows = data['parental level of education_num'] == level
    fig.add_trace(go.Box(y=pcaData.loc[levelRows, 'math score'],
                         name=data.loc[levelRows, 'parental level of education'].unique()[0],
                         boxpoints='all'))
fig.update_layout(
    plot_bgcolor='white',
    xaxis=dict(title='parental level of education', **axisStyle),
    yaxis=dict(title='math score', **axisStyle),
    title=dict(text="Math score vs level of education of the parents", font=dict(family="Verdana", size=25, color="Black")),
)
fig.show()

There is no clear correlation between the parental level of education and the math score:
- A higher parental level of education reduces the spread of the scores
- None of the students whose parents hold a master's degree got the maximum score (100)
- Students whose parents' education level is some college, associate's degree or bachelor's degree perform similarly in terms of median score (67.5, 67 and 68 respectively)

In [None]:
# One box (showing every individual point) of reading scores per parental
# education level, in the order given by listUniqueEducationLevel.
axisStyle = dict(ticks="outside", tickwidth=2, tickcolor='grey', ticklen=10,
                 showline=True, linewidth=2, linecolor='grey')

fig = go.Figure()
for level in listUniqueEducationLevel:
    levelRows = data['parental level of education_num'] == level
    fig.add_trace(go.Box(y=pcaData.loc[levelRows, 'reading score'],
                         name=data.loc[levelRows, 'parental level of education'].unique()[0],
                         boxpoints='all'))
fig.update_layout(
    plot_bgcolor='white',
    xaxis=dict(title='parental level of education', **axisStyle),
    yaxis=dict(title='reading score', **axisStyle),
    title=dict(text="Reading score vs level of education of the parents", font=dict(family="Verdana", size=25, color="Black")),
)
fig.show()

No clear correlation between the parental level of education and the reading score
- Higher parental level of education reduces the spread of the score
- Looking at the median score of each group, the higher the parental education the higher the median score (considering "some high school" similar as "high school")

In [None]:
# One box (showing every individual point) of writing scores per parental
# education level, in the order given by listUniqueEducationLevel.
axisStyle = dict(ticks="outside", tickwidth=2, tickcolor='grey', ticklen=10,
                 showline=True, linewidth=2, linecolor='grey')

fig = go.Figure()
for level in listUniqueEducationLevel:
    levelRows = data['parental level of education_num'] == level
    fig.add_trace(go.Box(y=pcaData.loc[levelRows, 'writing score'],
                         name=data.loc[levelRows, 'parental level of education'].unique()[0],
                         boxpoints='all'))
fig.update_layout(
    plot_bgcolor='white',
    xaxis=dict(title='parental level of education', **axisStyle),
    yaxis=dict(title='writing score', **axisStyle),
    title=dict(text="Writing score vs level of education of the parents", font=dict(family="Verdana", size=25, color="Black")),
)
fig.show()

Same comments as for reading score above.

In [None]:
# Number of students without (column 0) and with (column 1) test preparation,
# for each parental education level.
prepCounts = (
    data
    .groupby(['parental level of education','parental level of education_num','test preparation course_num'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)

# Convert both counts into percentages of the students at that education level.
studentsPerLevel = prepCounts[0] + prepCounts[1]
prepCounts['No_perc'] = (prepCounts[0] / studentsPerLevel * 100).round(2)
prepCounts['Yes_perc'] = (prepCounts[1] / studentsPerLevel * 100).round(2)

# Order the rows from the lowest to the highest education code.
dataParentEducationTestPreparation = prepCounts.sort_values(by='parental level of education_num')

In [None]:
# Horizontal bars: percentage of students who completed the test preparation
# course, one bar per parental education level.
axisStyle = dict(ticks="outside", tickwidth=2, tickcolor='grey', ticklen=10,
                 showline=True, linewidth=2, linecolor='grey')

fig = go.Figure()
fig.add_trace(go.Bar(
    x=dataParentEducationTestPreparation['Yes_perc'],
    y=dataParentEducationTestPreparation['parental level of education'],
    orientation='h',
))
fig.update_layout(
    plot_bgcolor='white', height=400,
    xaxis=dict(title='percentage of test preparation [%]', **axisStyle),
    yaxis=dict(title='parental level of education', **axisStyle),
    title=dict(text="Level of education of the parents vs percentage of test preparation", font=dict(family="Verdana", size=25, color="Black")),
)
fig.show()

- The trend "the higher the parental level of education, the higher the percentage of test preparation" only holds from the 'high school' to the 'bachelor's degree' categories
- Students whose parents have a 'some high school' education are the ones with the highest preparation rate
- Students whose parents have a master's degree are not the ones who prepare the most for their tests

### 2.1.2. PC1 vs PC4 (exam results, lunch, test preparation)

In [None]:
# Loading plot: every input variable is drawn as a segment from the origin to
# its coefficients on PC1 (x axis) and PC4 (y axis), inside a unit circle.
axisStyle = dict(ticks="outside", tickwidth=2, tickcolor='grey', ticklen=10,
                 showline=True, linewidth=2, linecolor='grey')

fig = go.Figure()
for featureIdx in range(pca.components_.shape[1]):
    fig.add_trace(go.Scatter(x=[0, pca.components_[0, featureIdx]],
                             y=[0, pca.components_[3, featureIdx]],
                             name=pcaData.columns[featureIdx]))
fig.update_layout(
    plot_bgcolor='white', height=500, width=500,
    showlegend=False,
    shapes=[dict(type="circle", xref="x", yref="y", x0=-1, y0=-1, x1=1, y1=1, line_color="LightSeaGreen")],
    xaxis=dict(axisStyle),
    yaxis=dict(axisStyle),
    title=dict(text="Principal Component Analysis", font=dict(family="Verdana", size=25, color="Black")),
)
fig.update_xaxes(title=dict(text='PC1', font=dict(size=18)), showgrid=True, linecolor='black', ticks='outside')
fig.update_yaxes(title=dict(text='PC4', font=dict(size=18)), showgrid=True, linecolor='black', ticks='outside')
fig.show()

1. Test results do not seem correlated with the test preparation
2. Test results do not seem correlated to lunch type

In [None]:
# For each exam, compare the score distribution of students without (No)
# and with (Yes) a completed test preparation course.
axisStyle = dict(ticks="outside", tickwidth=2, tickcolor='grey', ticklen=10,
                 showline=True, linewidth=2, linecolor='grey')
withoutPreparation = data['test preparation course_num'] == 0
withPreparation = data['test preparation course_num'] == 1

fig = go.Figure()
for exam in ['math score','reading score', 'writing score']:
    fig.add_trace(go.Box(y=data.loc[withoutPreparation, exam], name=exam + ' (No)'))
    fig.add_trace(go.Box(y=data.loc[withPreparation, exam], name=exam + ' (Yes)'))
fig.update_layout(
    plot_bgcolor='white',
    xaxis=dict(title='exam categories', **axisStyle),
    yaxis=dict(title='exam results', **axisStyle),
    title=dict(text="Exam results with (Yes) and without (No) preparation", font=dict(family="Verdana", size=25, color="Black")),
)
fig.show()

These boxplots clearly indicate that the group of students who prepared for the exams performed better than the group of students who did not. This only holds when comparing the two groups as a whole: individually, a student who did not prepare for the exams can still perform better than a student who did.

In [None]:
# For each exam, compare the score distribution of students with a standard
# lunch and students with a free/reduced lunch.
axisStyle = dict(ticks="outside", tickwidth=2, tickcolor='grey', ticklen=10,
                 showline=True, linewidth=2, linecolor='grey')
standardLunch = data['lunch'] == 'standard'
reducedLunch = data['lunch'] == 'free/reduced'

fig = go.Figure()
for exam in ['math score','reading score', 'writing score']:
    fig.add_trace(go.Box(y=data.loc[standardLunch, exam], name=exam + ' (Standard)'))
    fig.add_trace(go.Box(y=data.loc[reducedLunch, exam], name=exam + ' (Free/reduced)'))
fig.update_layout(
    plot_bgcolor='white',
    xaxis=dict(title='exam categories', **axisStyle),
    yaxis=dict(title='exam results', **axisStyle),
    title=dict(text="Exam results with standard and free/reduced lunch", font=dict(family="Verdana", size=25, color="Black")),
)
fig.show()

The group of students with standard lunch performs better than the group of students with free/reduced lunch. (Not sure about the exact definition of standard and free/reduced lunch to draw conclusion)

### 2.1.3. PC1 vs PC2 (exam results, gender)

In [None]:
# Loading plot: every input variable is drawn as a segment from the origin to
# its coefficients on PC1 (x axis) and PC2 (y axis), inside a unit circle.
axisStyle = dict(ticks="outside", tickwidth=2, tickcolor='grey', ticklen=10,
                 showline=True, linewidth=2, linecolor='grey')

fig = go.Figure()
for featureIdx in range(pca.components_.shape[1]):
    fig.add_trace(go.Scatter(x=[0, pca.components_[0, featureIdx]],
                             y=[0, pca.components_[1, featureIdx]],
                             name=pcaData.columns[featureIdx]))
fig.update_layout(
    plot_bgcolor='white', height=500, width=500,
    showlegend=False,
    shapes=[dict(type="circle", xref="x", yref="y", x0=-1, y0=-1, x1=1, y1=1, line_color="LightSeaGreen")],
    xaxis=dict(axisStyle),
    yaxis=dict(axisStyle),
    title=dict(text="Principal Component Analysis", font=dict(family="Verdana", size=25, color="Black")),
)
fig.update_xaxes(title=dict(text='PC1', font=dict(size=18)), showgrid=True, linecolor='black', ticks='outside')
fig.update_yaxes(title=dict(text='PC2', font=dict(size=18)), showgrid=True, linecolor='black', ticks='outside')
fig.show()

1. Gender does not look correlated with the exam results 

In [None]:
# For each exam, compare the score distribution of male and female students.
axisStyle = dict(ticks="outside", tickwidth=2, tickcolor='grey', ticklen=10,
                 showline=True, linewidth=2, linecolor='grey')
maleStudents = data['gender'] == 'male'
femaleStudents = data['gender'] == 'female'

fig = go.Figure()
for exam in ['math score','reading score', 'writing score']:
    fig.add_trace(go.Box(y=data.loc[maleStudents, exam], name=exam + ' (Male)'))
    fig.add_trace(go.Box(y=data.loc[femaleStudents, exam], name=exam + ' (Female)'))
fig.update_layout(
    plot_bgcolor='white',
    xaxis=dict(title='exam categories', **axisStyle),
    yaxis=dict(title='exam results', **axisStyle),
    title=dict(text="Exam results per gender", font=dict(family="Verdana", size=25, color="Black")),
)
fig.show()

- Men perform globally better than women on the math exams
- Women perform globally better on reading and writing exams

## 3. Conclusion
This work could be extended with further analysis, e.g. a probability analysis or a machine learning model to predict the score of a student.