In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load the exam results; the path is relative to the notebook's working directory.
DATA_PATH = '../input/students-performance-in-exams/StudentsPerformance.csv'
df = pd.read_csv(DATA_PATH)

# Combined score: element-wise sum of the three individual exam scores.
math_col, reading_col, writing_col = 'math score', 'reading score', 'writing score'
df['total score'] = df[math_col] + df[reading_col] + df[writing_col]

# Preview the first rows, including the new 'total score' column.
df.head()

In [None]:
# Number of missing entries in each column.
df.isna().sum()

* There are no missing values in the dataset.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric score columns.
df.describe()

* There are 1000 students in this dataset.
* Students performed at similar levels across the three test categories.


In [None]:
# Count students per gender once and reuse the result for both the labels and the wedge sizes.
gender_counts = df['gender'].value_counts()
labels = gender_counts.index
values = gender_counts.values

# Pie chart of the gender split, annotated with percentages.
plt.figure(figsize=(5, 5))
plt.pie(values, labels=labels, autopct='%1.1f%%', startangle=90)
plt.title('Gender', color='black', fontsize=15)
plt.show()

* There are roughly the same number of male and female students in the dataset, with slightly more female than male students.

In [None]:
# One box plot per score column, each split by gender, laid out side by side.
fig, axes = plt.subplots(1, 4, figsize=(15, 5))
fig.subplots_adjust(wspace=1)

score_panels = [
    ('Math Score', 'math score'),
    ('Reading Score', 'reading score'),
    ('Writing Score', 'writing score'),
    ('Total Score', 'total score'),
]

for panel_ax, (panel_title, score_column) in zip(axes, score_panels):
    panel_ax.set_title(panel_title)
    sns.boxplot(ax=panel_ax, x=df['gender'], y=df[score_column])

* On average, female students performed better than male students on every test except the math test.
* There are more outliers amongst the female students than the male students.

In [None]:
# Pairwise scatter plots of the numeric columns, coloured by gender.
sns.pairplot(data=df, hue='gender')

* There is a linear relationship between performance on one exam and performance on another for both male and female students.

In [None]:
# Encode gender as a 0/1 indicator so it can take part in the correlation matrix.
df['is male'] = (df['gender'] == 'male').astype(int)

# Correlate numeric columns only: the frame also holds string columns
# (gender, lunch, ...), and DataFrame.corr() raises on those in pandas >= 2.0.
# Selecting by dtype also keeps the result the same if this cell is re-run
# after later cells have added non-numeric helper columns.
numeric_corr = df.select_dtypes(include='number').corr()

sns.heatmap(numeric_corr, annot=True)
plt.title('Correlation between Test Performances and Gender')
plt.show()

* There are high correlations between performance on one test and performance on another. This suggests that students who perform well on one exam will perform well on others.
* Overall, there is a low correlation between gender and test performance. There is a small positive correlation between a student being male and math score, and small negative correlations between a student being male and reading, writing, and total score. This suggests that gender has little influence over test performance.


In [None]:
# Number of students per test-preparation status, split by gender.
# Variables are passed by keyword: since seaborn 0.12 the only positional
# argument of countplot is `data`, so a positional Series is no longer read as x.
sns.countplot(data=df, x='test preparation course', hue='gender')

In [None]:
# Within-gender proportions of each test-preparation status.
cnts = df.groupby(['gender'])['test preparation course'].value_counts(normalize=True)

# Look up the 'completed' share for each gender and convert it to a percentage.
female_completed_pct = cnts.loc[('female', 'completed')] * 100
male_completed_pct = cnts.loc[('male', 'completed')] * 100

print("The percent of female students who completed the test preparation course is: {:.2f}%".format(female_completed_pct))
print("The percent of male students who completed the test preparation course is: {:.2f}%".format(male_completed_pct))



* A similar percentage of male and female students completed the test preparation course. Therefore, it is unlikely that completion of the test preparation course caused a gender difference in test performance.

In [None]:
# Flag students whose parents hold a completed college degree.
# 'some college' is deliberately not listed: it does not denote a finished degree.
COLLEGE_DEGREES = ["associate's degree", "bachelor's degree", "master's degree"]
df['parents obtained college degree'] = df['parental level of education'].isin(COLLEGE_DEGREES)

# Number of students with / without a degree-holding parent, split by gender.
# Variables are passed by keyword: since seaborn 0.12 the only positional
# argument of countplot is `data`, so a positional Series is no longer read as x.
sns.countplot(data=df, x='parents obtained college degree', hue='gender')

In [None]:
# Within-gender proportions of students with / without a degree-holding parent.
cnts = df.groupby(['gender'])['parents obtained college degree'].value_counts(normalize=True)

# Look up the True share for each gender and convert it to a percentage.
female_degree_pct = cnts.loc[('female', True)] * 100
male_degree_pct = cnts.loc[('male', True)] * 100

print("The percent of female students whose parents completed a college degree is: {:.2f}%".format(female_degree_pct))
print("The percent of male students whose parents completed a college degree is: {:.2f}%".format(male_degree_pct))

* More female students had a parent obtain a college degree than male students.

In [None]:
# Total score by parental degree status, one panel per gender.
fig, axes = plt.subplots(1, 2, figsize=(15,5))
fig.subplots_adjust(wspace=1)

gender_panels = [('Female Students', 'female'), ('Male Students', 'male')]

for ax, (title, gender) in zip(axes, gender_panels):
    # Restrict each panel to the gender named in its title; plotting the full
    # frame on both axes would draw two identical, mislabelled charts.
    students = df[df['gender'] == gender]
    ax.set_title(title)
    # A fixed category order keeps the two panels directly comparable.
    sns.boxplot(ax=ax, data=students, x='parents obtained college degree',
                y='total score', order=[False, True])

* Students who had a parent with a college degree had a higher average total score than students without a parent with a college degree.
* Since more female students had a parent with a college degree than male students, parental education could be a factor in why female students had a better test performance overall.

In [None]:
# Bar chart of the number of students per lunch type, split by gender.
# Variables are passed by keyword: since seaborn 0.12 the only positional
# argument of countplot is `data`, so a positional Series is no longer read as x.
sns.countplot(data=df, x='lunch', hue='gender')

In [None]:
# Within-gender proportions of each lunch type.
cnts = df.groupby(['gender'])['lunch'].value_counts(normalize=True)

# Look up the 'free/reduced' share for each gender and convert it to a percentage.
female_reduced_pct = cnts.loc[('female', 'free/reduced')] * 100
male_reduced_pct = cnts.loc[('male', 'free/reduced')] * 100

print("The percent of female students who have free/reduced lunch is: {:.2f}%".format(female_reduced_pct))
print("The percent of male students who have free/reduced lunch is: {:.2f}%".format(male_reduced_pct))

* More female students have free/reduced lunch than male students.

In [None]:
# Total score by lunch type, one panel per gender.
fig, axes = plt.subplots(1, 2, figsize=(15,5))
fig.subplots_adjust(wspace=1)

gender_panels = [('Female Students', 'female'), ('Male Students', 'male')]
# Use the same category order in both panels so they are directly comparable.
lunch_order = sorted(df['lunch'].unique())

for ax, (title, gender) in zip(axes, gender_panels):
    # Restrict each panel to the gender named in its title; plotting the full
    # frame on both axes would draw two identical, mislabelled charts.
    students = df[df['gender'] == gender]
    ax.set_title(title)
    sns.boxplot(ax=ax, data=students, x='lunch', y='total score', order=lunch_order)

* Students who had free/reduced lunch had a lower average total score on the exam than students with a standard lunch.
* Even though a higher percent of female students had free/reduced lunch than male students, they had a higher average total score. This suggests that another factor had more influence on the gender difference in test performance.

### Top Ten Students by Total Exam Performance

In [None]:
# Rank all students by total score, highest first, and show the first ten rows.
ranked_by_total = df.sort_values(by='total score', ascending=False)
ranked_by_total.head(10)


* Three students scored the maximum amount of points, one male and two female.
* 7/10 top scorers are female.