In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the student exam-performance records from the Kaggle input directory.
csv_path = '../input/students-performance-in-exams/StudentsPerformance.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows to see which columns are available.
df.head(n=5)

In [None]:
# Missing-value check: count the null entries in each column.
df.isna().sum()

## Data Preparation

In [None]:
# Derive 'Percentage': the mean of the three exam scores, rounded to two decimals.
total_score = df['math score'] + df['writing score'] + df['reading score']
df['Percentage'] = total_score.div(3).round(2)

In [None]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

# Exploring Data

In [None]:
# Preview the first five rows of the frame.
df.head(n=5)

## Which Gender performs better than the other?

In [None]:
# Mean 'Percentage' per gender (barplot aggregates with the mean by default).
# ci=None is seaborn's documented way to switch error bars off. The previous
# ci=False is not None, so seaborn can still bootstrap and draw a degenerate
# zero-width interval. (On seaborn >= 0.12 the equivalent is errorbar=None.)
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=df, x='gender', y='Percentage', ci=None, ax=ax)
ax.set_xlabel('Gender', fontsize=15)
ax.set_ylabel('Percentage', fontsize=15)
ax.tick_params(axis='both', labelsize=12)
plt.show()


## Inference:
- Overall, we can see that female students perform better with respect to the percentage scored.

# Do test preparatory courses help the students to perform well in the examinations?

In [None]:
# Mean 'Percentage' by test-preparation status, with one bar per gender.
# ci=None is seaborn's documented way to switch error bars off. The previous
# ci=False is not None, so seaborn can still bootstrap and draw a degenerate
# zero-width interval. (On seaborn >= 0.12 the equivalent is errorbar=None.)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=df,
    x='test preparation course',
    y='Percentage',
    hue='gender',
    ci=None,
    ax=ax,
)
ax.set_xlabel('Test Preparation status', fontsize=15)
ax.set_ylabel('Percentage scored', fontsize=15)
ax.tick_params(axis='x', labelrotation=45)
plt.show()

## Inference:
- We can see that students who complete the test preparatory course perform better than students who do not.
- Females score noticeably better than males in both cases (with and without the preparatory course).

## Which race/ethnicity students performed the best?

In [None]:
# Distribution of 'Percentage' for each race/ethnicity group, split by gender.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='race/ethnicity', y='Percentage', hue='gender', data=df, ax=ax)
ax.set_xlabel('Race', fontsize=15)
ax.set_ylabel('Percentage', fontsize=15)
ax.tick_params(axis='both', labelsize=12)
plt.show()

## Inference:
- We can see that Group E has performed the best among all the race/ethnicity groups.
- An observation can also be made that most females are performing better than males.

In [None]:
# Preview the first five rows of the frame.
df.head(n=5)

## How do the groups perform against each other in individual examinations?

In [None]:
# Distribution of the overall 'Percentage' for each race/ethnicity group.
fig, ax = plt.subplots(figsize=(15, 7))
sns.boxplot(x='race/ethnicity', y='Percentage', data=df, ax=ax)
ax.set_title('Percentage to Group Comparison', fontsize=20)
ax.set_xlabel('Race', fontsize=15)
ax.set_ylabel('Percentage', fontsize=15)
ax.tick_params(axis='x', labelsize=12)
plt.show()

#### Mathematics exam

In [None]:
# Distribution of math scores for each race/ethnicity group, split by gender.
fig, ax = plt.subplots(figsize=(15, 7))
sns.boxplot(x='race/ethnicity', y='math score', hue='gender', data=df, ax=ax)
ax.set_title('Math Score Group Comparison', fontsize=20)
ax.set_xlabel('Race', fontsize=15)
ax.set_ylabel('Math Score', fontsize=15)
ax.tick_params(axis='x', labelsize=12)
plt.show()

## Inference:
- Males from all groups outperform the females in the mathematics exam.
- Group A students perform the worst with respect to the other groups.
- Group E students remain on top with the highest scores.

#### Writing exam

In [None]:
# Distribution of writing scores for each race/ethnicity group, split by gender.
fig, ax = plt.subplots(figsize=(15, 7))
sns.boxplot(x='race/ethnicity', y='writing score', hue='gender', data=df, ax=ax)
ax.set_title('Writing Score Group Comparison', fontsize=20)
ax.set_xlabel('Race', fontsize=15)
ax.set_ylabel('Writing Score', fontsize=15)
ax.tick_params(axis='x', labelsize=12)
plt.show()

## Inference:
- Females outperform males by a wide margin in every group.
- Group A students share the lowest scores alongside Group B and Group C.

#### Reading Exam

In [None]:
# Distribution of reading scores for each race/ethnicity group, split by gender.
fig, ax = plt.subplots(figsize=(15, 7))
sns.boxplot(x='race/ethnicity', y='reading score', hue='gender', data=df, ax=ax)
ax.set_title('Reading Score Group Comparison', fontsize=20)
ax.set_xlabel('Race', fontsize=15)
ax.set_ylabel('Reading Score', fontsize=15)
ax.tick_params(axis='x', labelsize=12)
plt.show()

## Inference:
- Females of all groups except group A score more consistently, mostly between 62 and 80.
- As in the Writing exam, males have underperformed relative to females.
- The highest marks are secured once again by group E followed by group D.

# How does the parental level of education impact the students' score?

In [None]:
# Mean math score for each parental education level.
# ci=None is seaborn's documented way to switch error bars off. The previous
# ci=False is not None, so seaborn can still bootstrap and draw a degenerate
# zero-width interval. (On seaborn >= 0.12 the equivalent is errorbar=None.)
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(data=df, x='parental level of education', y='math score', ci=None, ax=ax)
ax.set_title('Education level with respect to Math Score', fontsize=20)
ax.set_xlabel('Education level', fontsize=15)
ax.set_ylabel('Math Score', fontsize=15)
ax.tick_params(axis='both', labelsize=12)
plt.show()

## Inference:
- We can see that students whose parents hold a master's or a bachelor's degree perform the best in math, whereas students whose parents have a high-school level of education score relatively lower.

In [None]:
# Mean writing score for each parental education level.
# ci=None is seaborn's documented way to switch error bars off. The previous
# ci=False is not None, so seaborn can still bootstrap and draw a degenerate
# zero-width interval. (On seaborn >= 0.12 the equivalent is errorbar=None.)
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(data=df, x='parental level of education', y='writing score', ci=None, ax=ax)
ax.set_title('Education level with respect to Writing score', fontsize=20)
ax.set_xlabel('Education level', fontsize=15)
ax.set_ylabel('Writing Score', fontsize=15)
ax.tick_params(axis='both', labelsize=12)
plt.show()

## Inference:
- The result is similar to that for Mathematics: children of parents with a master's degree perform the best, followed by children of parents with a bachelor's degree.

In [None]:
# Mean reading score for each parental education level.
# ci=None is seaborn's documented way to switch error bars off. The previous
# ci=False is not None, so seaborn can still bootstrap and draw a degenerate
# zero-width interval. (On seaborn >= 0.12 the equivalent is errorbar=None.)
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(data=df, x='parental level of education', y='reading score', ci=None, ax=ax)
ax.set_title('Education level with respect to Reading score', fontsize=20)
ax.set_xlabel('Education level', fontsize=15)
ax.set_ylabel('Reading Score', fontsize=15)
ax.tick_params(axis='both', labelsize=12)
plt.show()

## Inference:
- The reading scores give a similar result, suggesting that the children of parents holding a master's degree perform the best regardless of subject.

In [None]:
# Preview the first five rows of the frame.
df.head(n=5)

# What is the reason due to which group A students underperform with respect to their peers?

In [None]:
# Count of students in each race/ethnicity group, coloured by test-preparation
# status, with one panel per gender.
prep_count_grid = sns.catplot(
    kind='count',
    data=df,
    x='race/ethnicity',
    hue='test preparation course',
    col='gender',
    col_wrap=2,
)
plt.show()

## Inference:
- We can see that, for both genders, Group A students usually do not take the preparatory course, and Group A also has fewer students than the other groups.