<p style="font-family: Arial; line-height: 1.3; font-size: 30px; font-weight: bold; letter-spacing: 2px; text-align: center; color: #23527c">Analysing students' performance in exams</p>


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from plotly.validators.scatter.marker import SymbolValidator
import plotly.offline as pyo
pyo.init_notebook_mode()

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))
        

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<p style="font-family: Arial; line-height: 1.3; font-size: 27px; font-weight: bold; letter-spacing: 2px; text-align: center; color: #23527c">Reading the dataset</p>

In [None]:
# Load the exam-results CSV from the read-only Kaggle input directory.
df = pd.read_csv(
    "../input/students-performance-in-exams/StudentsPerformance.csv"
)
# Preview the first five rows (head() defaults to 5), colouring the numeric
# cells with the "Set2" colormap.
df.head().style.background_gradient(cmap="Set2")

In [None]:
# Count the missing values in each column (isna is the same check as isnull).
df.isna().sum()

### **No null values, yay!**

---

<p style="font-family: Arial; line-height: 1.3; font-size: 27px; font-weight: bold; letter-spacing: 2px; text-align: center; color: #23527c">Exploratory data analysis</p>

<span style="font-family: Arial; font-size: 27px; font-weight: bold; letter-spacing: 2px; color: #23527c">1. Gender</span>
<hr style="height: 0.5px; border: 0; background-color: black">


### Let's see how Gender affects grades

In [None]:
# Relation of gender with grades: average score per subject for each gender,
# transposed so that subjects are the rows and the genders are the columns.
subject_cols = ['math score', 'reading score', 'writing score']
gender_mean = df.groupby('gender')[subject_cols].mean().round(2).T

# Styling shared by the header row and the body cells of the table.
table_style = dict(line_color='darkslategray', align='center', height=40, font_size=20)

summary_table = go.Table(
    header=dict(
        values=['', 'Male', 'Female'],
        fill_color='lightskyblue',
        **table_style,
    ),
    cells=dict(
        # First column lists the subjects, then one column of means per gender.
        values=[gender_mean.index, gender_mean['male'], gender_mean['female']],
        fill_color='lightcyan',
        **table_style,
    ),
)

fig = go.Figure(data=[summary_table])
fig.update_layout(width=800, height=400)
fig.show()

### As we can see above from the mean scores:
- Male students scored higher in **Math** than female students
- Female students scored higher in **Reading** than male students
- Female students scored higher in **Writing** than male students

## Let's have a look at the score distribution

In [None]:
# Grid of histograms (with a KDE curve): one row per gender, one column per subject.
subjects = ['math score', 'reading score', 'writing score']
# (gender value in the data, panel title, bar colour); None keeps seaborn's default colour.
gender_rows = [('male', 'Male', None), ('female', 'Female', 'red')]

fig, axes = plt.subplots(2, 3, figsize=(20, 10))
for row_axes, (gender, title, color) in zip(axes, gender_rows):
    gender_scores = df[df['gender'] == gender]
    for ax, subject in zip(row_axes, subjects):
        sns.histplot(gender_scores[subject], kde=True, ax=ax, color=color)
        ax.set_title(title)
        # Same fixed x-range on every panel so the distributions are directly comparable.
        ax.set_xlim(0, 100)
plt.show()


### Looking at the distributions, we can see that male students' scores are more concentrated around the center than female students' scores.

<span style="font-family: Arial; font-size: 27px; font-weight: bold; letter-spacing: 2px; color: #23527c">2. Group</span>
<hr style="height: 0.5px; border: 0; background-color: black">


### Let's see how Group (Race/Ethnicity) affects grades

In [None]:
# Helper column of ones: summing it per group yields the number of students.
# It stays on `df` because other cells in this notebook also count rows
# through df['tmp'].
df['tmp'] = 1

# Donut chart showing the share of students in each race/ethnicity group.
# The title is set once, here, rather than passing a placeholder title that
# is immediately overwritten by a later update_layout call.
fig = px.pie(df, names='race/ethnicity', values='tmp', hole=0.8, title="Group percentage")
fig.update_traces(textposition='outside', textinfo='percent+label')
# Caption inside the hole of the donut. update_layout returns the figure, so
# as the last expression of the cell it is what gets rendered.
fig.update_layout(
    annotations=[dict(text='Group', x=0.5, y=0.5, font_size=20, showarrow=False)])

In [None]:

# Per-student average of the three exam scores. The column is called 'total'
# even though it is a mean; the name is kept because other cells read df['total'].
df['total'] = (df['math score'] + df['reading score'] + df['writing score']) / 3

# seaborn draws string categories in their order of first appearance in the
# data, so sort the groups explicitly to get a stable, alphabetical x-axis.
group_order = sorted(df['race/ethnicity'].dropna().unique())

# (column to average, panel title) for each of the four panels, left to right.
panels = [
    ('math score', "Math score"),
    ('reading score', "Reading score"),
    ('writing score', "Writing score"),
    ('total', "Total score"),
]

fig, axes = plt.subplots(1, 4, figsize=(25, 8))
for ax, (column, title) in zip(axes, panels):
    # Bar height is the group mean; ci=None turns the error bars off.
    # NOTE(review): newer seaborn releases spell this errorbar=None - confirm
    # against the installed version before changing it.
    sns.barplot(data=df, x='race/ethnicity', y=column, order=group_order, ax=ax, ci=None)
    ax.set_title(title)
    # Zoomed y-range so the differences between groups are visible; note that
    # the bars therefore do not start at zero.
    ax.set_ylim(55, 75)
plt.show()

### It turns out that **Group E** performs **best** in all subjects, while **Group A** performs **worst**.

<span style="font-family: Arial; font-size: 27px; font-weight: bold; letter-spacing: 2px; color: #23527c">3. Parental level of education</span>
<hr style="height: 0.5px; border: 0; background-color: black">


### Let's see how Parental level of education affects grades

In [None]:
# Horizontal bar charts of the mean score per parental education level:
# one panel per subject plus one for the per-student average ('total').
# (column to average, panel title), left to right.
score_panels = [
    ('math score', "Math score"),
    ('reading score', "Reading score"),
    ('writing score', "Writing score"),
    ('total', "Total score"),
]

# sharey=True: all panels share the same education-level axis.
fig, axes = plt.subplots(1, 4, figsize=(25, 10), sharey=True)
for ax, (column, title) in zip(axes, score_panels):
    # ci=None turns the error bars off.
    sns.barplot(data=df, y='parental level of education', x=column, ax=ax, ci=None)
    ax.set_title(title)
    # Zoomed x-range so the differences are visible; bars do not start at zero.
    ax.set_xlim(60, 77.5)

plt.show()

### Children whose parents have a **master's-level education** appear to get better grades and perform well in all subjects.

<span style="font-family: Arial; font-size: 27px; font-weight: bold; letter-spacing: 2px; color: #23527c">4. Lunch</span>
<hr style="height: 0.5px; border: 0; background-color: black">


### Let's see how Lunch affects grades

In [None]:
# Compute everything this cell needs locally, so it does not rely on the
# helper columns df['tmp'] / df['total'] that other cells create and drop.
avg_score = (df['math score'] + df['reading score'] + df['writing score']) / 3
by_lunch = avg_score.groupby(df['lunch'])
lunch_counts = by_lunch.size()   # number of students per lunch type
lunch_means = by_lunch.mean()    # mean of the per-student average per lunch type
# Labels come from the grouped index, so labels and values can never get out of step.
lunch_labels = lunch_counts.index.astype(str).tolist()

# Left: donut with the share of students per lunch type.
# Right: bar chart with the mean score per lunch type.
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "domain"}, {"type": "xy"}]],
)
fig.add_trace(
    go.Pie(labels=lunch_labels, values=lunch_counts, hole=0.8,
           textposition='outside', textinfo='percent+label'),
    row=1, col=1)

colors = ['red', 'blue']
fig.add_trace(
    go.Bar(x=lunch_labels, y=lunch_means, text=lunch_means.round(2),
           marker_color=colors, opacity=.5, textposition='outside'),
    row=1, col=2)

fig.update_layout(
    title="Score's relationship with Lunch",
    showlegend=False,  # the slice and bar labels already identify each group
    margin=dict(t=50, b=100, l=0),
    font=dict(
        size=15,
    ),
)
fig.show()

### Students having standard lunch have better grades than students having free/reduced lunch.

<span style="font-family: Arial; font-size: 27px; font-weight: bold; letter-spacing: 2px; color: #23527c">5. Preparatory course for test</span>
<hr style="height: 0.5px; border: 0; background-color: black">


### Let's see how the test preparation course affects grades

In [None]:
# Compute everything this cell needs locally, so it does not rely on the
# helper columns df['tmp'] / df['total'] that other cells create and drop.
avg_score = (df['math score'] + df['reading score'] + df['writing score']) / 3
by_prep = avg_score.groupby(df['test preparation course'])
prep_counts = by_prep.size()   # number of students per course status
prep_means = by_prep.mean()    # mean of the per-student average per course status
# Labels come from the grouped index (capitalised for display), so labels and
# values can never get out of step.
prep_labels = [str(status).capitalize() for status in prep_counts.index]

# Left: donut with the share of students per course status.
# Right: bar chart with the mean score per course status.
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "domain"}, {"type": "xy"}]],
)
fig.add_trace(
    go.Pie(labels=prep_labels, values=prep_counts, hole=0.8,
           textposition='outside', textinfo='percent+label'),
    row=1, col=1)

colors = ['red', 'blue']
fig.add_trace(
    go.Bar(x=prep_labels, y=prep_means, text=prep_means.round(2),
           marker_color=colors, opacity=.5, textposition='outside'),
    row=1, col=2)

fig.update_layout(
    title="Score's Relationship with preparation course",
    showlegend=False,  # the slice and bar labels already identify each group
    margin=dict(t=50, b=100, l=0),
    font=dict(
        size=15,
    ),
)
fig.show()

### Students who completed test preparation courses get higher grades

<span style="font-family: Arial; font-size: 27px; font-weight: bold; letter-spacing: 2px; color: #23527c">6. Relation between groups (race/ethnicity) and parents' education level</span>
<hr style="height: 0.5px; border: 0; background-color: black">


In [None]:
# Two-level sunburst: race/ethnicity group on the inner ring, parental
# education level on the outer ring. Sector sizes are driven by df['total'].
# NOTE(review): sizing by 'total' weights sectors by score rather than by a
# plain student count - confirm this is the intended measure.
ring_levels = ['race/ethnicity', 'parental level of education']
fig = px.sunburst(data_frame=df, path=ring_levels, values='total')
fig.show()

---

<p style="font-family: Arial; line-height: 1.3; font-size: 27px; font-weight: bold; letter-spacing: 2px; text-align: center; color: #23527c">Processing the Data
</p>

In [None]:
# Build the feature matrix X and the target frame y without mutating `df`.
# A non-in-place drop keeps this cell re-runnable (an in-place drop raises
# KeyError the second time) and leaves the helper columns available to the
# plotting cells that read them.
score_cols = ['math score', 'reading score', 'writing score']
helper_cols = ['tmp', 'total']  # plotting helpers, not real features

# errors='ignore': the helper columns only exist if the plotting cells were run.
X = df.drop(columns=score_cols + helper_cols, errors='ignore')
y = df[score_cols]

In [None]:
# One-hot encode the categorical features: each listed column is replaced by
# one indicator column per category.
# X is overwritten here, so re-running this cell requires first re-running
# the cell that builds X (the listed columns no longer exist afterwards).
categorical_cols = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]
X = pd.get_dummies(X, columns=categorical_cols)
X.head()

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

---

<p style="font-family: Arial; line-height: 1.3; font-size: 27px; font-weight: bold; letter-spacing: 2px; text-align: center; color: #23527c">Modelling
</p>

In [None]:
# KNN hyper-parameter sweep: fit one model per neighbour count k = 2..100 and
# record its mean squared error on the test set. mse[0] therefore belongs to
# k=2, mse[1] to k=3, and so on.
# NOTE(review): the targets are numeric scores and the metric is MSE, yet a
# classifier is used - confirm whether KNeighborsRegressor was intended.
mse = []
for k in range(2, 101):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    test_predictions = model.predict(X_test)
    mse.append(mean_squared_error(y_test, test_predictions))

In [None]:
# Plot the test MSE against the actual neighbour count. The sweep starts at
# k=2, so mse[0] belongs to k=2; plotting the bare list would put it at x=0
# and shift every k read off the chart by two.
k_values = range(2, 2 + len(mse))

plt.figure(figsize=(15, 10))
plt.plot(k_values, mse)
plt.xlabel("n_neighbors (k)")
plt.ylabel("Test MSE")
plt.title("KNN test MSE for each k")
plt.show()

In [None]:
# Fit the final KNN model with the chosen neighbour count.
# NOTE(review): the targets are numeric scores, yet a classifier is used -
# confirm whether KNeighborsRegressor was intended.
model = KNeighborsClassifier(n_neighbors=80)
model.fit(X_train, y_train)

# Predict once and reuse the result. The predictions are given y_test's index
# on purpose: y_test keeps the shuffled row labels of the original frame, and
# Series passed to seaborn are matched by index label, so a fresh 0..n-1 index
# would pair predictions with the wrong students and drop unmatched rows.
pred = pd.DataFrame(model.predict(X_test), columns=y_test.columns, index=y_test.index)

# Kept in its own variable rather than appended to `mse`, so the k-sweep
# results are not polluted and this cell can be re-run safely.
final_mse = mean_squared_error(y_test, pred)

# Predicted vs. actual score, one panel per subject; the diagonal marks a
# perfect prediction.
fig, axes = plt.subplots(1, 3, figsize=(25, 10))
for ax, subject in zip(axes, y_test.columns):
    sns.scatterplot(x=pred[subject], y=y_test[subject], ax=ax)
    ax.plot([40, 100], [40, 100])
    ax.set_xlabel(f"Predicted {subject}")
    ax.set_ylabel(f"Actual {subject}")
plt.show()

### Please upvote my notebook if you like it! :D