In [None]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import math
import pandas as pd
import os
from os import listdir

# Understanding Data

In [None]:
# Read the raw exam-performance table from the Kaggle input directory.
csv_path = "../input/students-performance-in-exams/StudentsPerformance.csv"
df = pd.read_csv(csv_path)

# Keep an independent copy of the table as loaded from the CSV.
original_df = df.copy(deep=True)

# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Number of distinct values in every column.
distinct_counts = df.nunique()
# The individual labels used for the parents' education level.
education_levels = df["parental level of education"].unique()

print(distinct_counts)
print(education_levels)

In [None]:
# One pie chart per categorical column: (column, caption, colour keywords).
PIE_PANELS = [
    ("gender", "gender", {"colors": ["gold", "skyblue"]}),
    ("lunch", "lunch", {"cmap": "Pastel1"}),
    ("parental level of education", "Parent's education", {"cmap": "Paired"}),
    ("test preparation course", "Test preparation", {"colors": ["tomato", "#34568B"]}),
]

fig, ax = plt.subplots(2, 2, figsize=(15, 15))

# Plot from original_df (the untouched copy of the CSV) so the wedge labels
# stay human-readable even if this cell is re-run after df has been encoded.
# ax.flat walks the 2x2 grid row by row, matching the order of PIE_PANELS.
for panel_ax, (column, caption, color_kwargs) in zip(ax.flat, PIE_PANELS):
    original_df.groupby(column).size().plot(
        kind="pie",
        autopct="%.2f%%",
        textprops={"fontsize": 15},
        ax=panel_ax,
        **color_kwargs,
    )
    panel_ax.set_xlabel(caption, fontsize=20)
    # Clear the y label that the pie plot would otherwise show.
    panel_ax.set_ylabel("")

In [None]:
# Integer code for every label of every categorical column, keyed by column.
# NOTE(review): the parental-education codes are not in ascending order of
# attainment (associate's degree is coded 5, above master's degree at 4), and
# race/ethnicity is a nominal variable; correlation coefficients computed on
# these two columns should be read with care -- confirm the intended ordering.
CATEGORY_CODES = {
    "gender": {"female": 1, "male": 0},
    "lunch": {"standard": 1, "free/reduced": 0},
    "race/ethnicity": {
        "group A": 0,
        "group B": 1,
        "group C": 2,
        "group D": 3,
        "group E": 4,
    },
    "parental level of education": {
        "some high school": 0,
        "high school": 1,
        "some college": 2,
        "bachelor's degree": 3,
        "master's degree": 4,
        "associate's degree": 5,
    },
    "test preparation course": {"none": 0, "completed": 1},
}

# Build the encoded frame from the untouched copy and assign the result back.
# Calling `df[col].replace(..., inplace=True)` mutates a temporary column
# selection, which does not update `df` under pandas Copy-on-Write; deriving
# from original_df also makes this cell safe to re-run.
df = original_df.assign(
    **{
        column: original_df[column].map(codes)
        for column, codes in CATEGORY_CODES.items()
    }
)

# Series.map turns labels without a code into NaN; fail loudly instead of
# letting them silently drop out of the correlation analysis.
encoded_columns = list(CATEGORY_CODES)
unmapped = (df[encoded_columns].isna() & original_df[encoded_columns].notna()).sum()
assert not unmapped.any(), f"labels without a code:\n{unmapped[unmapped > 0]}"

df

# Data Analysis

In [None]:
# Pairwise Pearson correlation between all columns of the encoded frame.
df_corr = df.corr(method="pearson")
df_corr

In [None]:
# Annotated heatmap of the correlation matrix on a dedicated 15x10 figure.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(15, 10))
sns.heatmap(df_corr, vmin=-1, cmap="coolwarm", annot=True, ax=heatmap_ax)

### In the heatmap above, there is no noteworthy correlation between the scores and any of the other columns. The highest such correlation coefficient is 0.35, between lunch and math score, and 0.35 is still considered a weak correlation. However, the scores themselves are very strongly correlated with each other: a student who scores well in writing will almost certainly also score well in reading. The relationship between math and the other subjects is weaker, but a coefficient of 0.82 still indicates a strong correlation.

In [None]:
# Score distribution by gender, one stacked panel per subject.
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue`; left unchanged here to keep the rendering identical -- confirm
# against the seaborn version in use.
fig, ax = plt.subplots(3, 1, figsize=(10, 10))

subject_panels = (
    ("reading score", "Reading"),
    ("math score", "Math"),
    ("writing score", "Writing"),
)
for panel_ax, (score_column, panel_title) in zip(ax, subject_panels):
    boxes = sns.boxplot(
        data=original_df,
        y="gender",
        x=score_column,
        palette=["m", "g"],
        ax=panel_ax,
    )
    boxes.set_title(panel_title)

fig.tight_layout()

# The panel titles already name the subject, so blank the x-axis labels.
reading_ax, math_ax, writing_ax = ax
reading_ax.set_xlabel("")
math_ax.set_xlabel("")
writing_ax.set_xlabel("")

### From the graphs above, we see that in writing and reading, females score on average a little higher than males, by approximately 7 points. In math, however, the male median score is higher than the female median score.

In [None]:
def draw_countplot(col, data=None, tick_step=2):
    """Draw a count plot of one column with a thinned-out x axis.

    Parameters
    ----------
    col : str
        Name of the column whose values are counted. Its values must be
        convertible to ``float`` because the tick labels are rounded up to
        whole numbers.
    data : pandas.DataFrame, optional
        Frame to plot. Defaults to the notebook-level ``df``.
    tick_step : int, optional
        Keep every ``tick_step``-th tick on the x axis (default 2, i.e. every
        other bar is labelled).

    Raises
    ------
    ValueError
        If ``tick_step`` is smaller than 1.
    """
    if tick_step < 1:
        raise ValueError("tick_step must be a positive integer")
    if data is None:
        # Resolved at call time so the function follows the current df.
        data = df

    plt.figure(figsize=(15, 10))
    g = sns.countplot(data=data, x=col)

    # Read the labels once, then keep only every tick_step-th position so the
    # axis stays legible when there are many distinct values.
    all_labels = [tick.get_text() for tick in g.get_xticklabels()]
    kept_positions = list(range(0, len(all_labels), tick_step))
    kept_labels = [str(math.ceil(float(all_labels[pos]))) for pos in kept_positions]

    g.set_xticks(kept_positions)
    g.set_xticklabels(kept_labels, rotation=0)
    plt.show()


for score_column in ("math score", "reading score", "writing score"):
    draw_countplot(score_column)