# Introduction
The purpose of this kernel is to understand how other variables (gender, ethnicity, parental education, lunch, exam preparation)
affect student performance.


![PIC](https://showmeinstitute.org/sites/default/files/pros-cons-of-standardized-tests-860x420.jpg)

In [None]:
# linear algebra
import numpy as np 

# data processing
import pandas as pd

# data visualization(for EDA)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')
sns.set(color_codes=True)

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the student exam results and display the first rows.
data_path = '../input/students-performance-in-exams/StudentsPerformance.csv'
df = pd.read_csv(data_path)
df.head()

In [None]:
# Add a combined column: math + reading + writing score for each student.
math_and_reading = df['math score'].add(df['reading score'])
df['Total score'] = math_and_reading.add(df['writing score'])
df.head()

In [None]:
# Preview the data: print the DataFrame summary produced by df.info().
df.info()

## Types Of Features

### Categorical Feature:
Categorical features in the dataset: gender, race/ethnicity, parental level of education, lunch, and test preparation course.



### Continuous Feature:

Continuous features in the dataset: reading score, writing score, and math score.

In [None]:
# One pie chart per two-valued categorical column: (column name, chart title).
pie_columns = [
    ('gender', 'Gender'),
    ('test preparation course', 'test preparation course'),
    ('lunch', 'lunch'),
]

f, ax = plt.subplots(1, len(pie_columns), figsize=(18, 10))

for axis, (column, title) in zip(ax, pie_columns):
    counts = df[column].value_counts()
    # Pull the second (smaller) slice slightly out of the pie.
    counts.plot.pie(
        explode=[0, 0.1],
        autopct='%1.1f%%',
        ax=axis,
        shadow=True,
        legend=True,
    )
    axis.set_title(title)
    # Drop the y label that pandas adds to the axes by default.
    axis.set_ylabel('')


## pie chart 1

Figure one shows that 51.8% of the students are female and 48.2% are male.

## pie chart 2

The number of students who completed the test preparation course is significantly lower than the number who did not.

## pie chart 3

Figure three shows that more students have a standard lunch than a free/reduced lunch.

In [None]:
# Pie charts for the two multi-valued categorical columns; the column name
# doubles as the chart title.
multi_value_columns = ['parental level of education', 'race/ethnicity']

f, ax = plt.subplots(1, len(multi_value_columns), figsize=(18, 10))

for axis, column in zip(ax, multi_value_columns):
    counts = df[column].value_counts()
    counts.plot.pie(autopct='%1.1f%%', ax=axis, shadow=True, legend=True)
    axis.set_title(column)
    # Drop the y label that pandas adds to the axes by default.
    axis.set_ylabel('')


# pie chart 1

The above plot shows that most of the parents attended some college or hold an associate's degree, and very few pursued higher studies.

# pie chart 2
The second pie chart shows that more than 50% of students belong to groups C and D, while only about one quarter belong to groups A and E.

In [None]:
# Mean score per subject for each gender (rows: gender, columns: subject).
subjects = ['math score', 'reading score', 'writing score']
dataset = df.groupby('gender')[subjects].mean()

# Y-axis ticks 0, 10, ..., 100. Later bar-chart cells reuse this variable.
score_label = np.arange(0, 110, 10)
male_means = dataset.loc['male'].tolist()
female_means = dataset.loc['female'].tolist()

# set width of bar
barWidth = 0.35

fig, ax = plt.subplots(figsize=(7, 8))

# One group of bars per subject; groups start 2 units apart on the x axis.
r1 = np.arange(0, len(subjects) * 2, 2)
r2 = r1 + barWidth

# Make the plot
barMale = ax.bar(r1, male_means, width=barWidth, label='Male means')
barFemale = ax.bar(r2, female_means, width=barWidth, label='Female means')

# Bars are centre-aligned on r1 and r2, so the midpoint of each pair is
# r1 + barWidth / 2; put the subject label there.
ax.set_xticks(r1 + barWidth / 2)
ax.set_xticklabels(subjects)

# inserting y axis label
ax.set_yticks(score_label)
ax.set_yticklabels(score_label)

# inserting legend
ax.legend()

def insert_data_labels(bars, axes=None):
    """Write each bar's height, rounded to a whole number, just above the bar.

    Parameters
    ----------
    bars : iterable
        Bar patches (for example the container returned by ``ax.bar``).
        Each item must provide ``get_height()``, ``get_x()`` and
        ``get_width()``.
    axes : optional
        Object whose ``annotate`` method draws the labels. When omitted, the
        notebook-level ``ax`` that is current at call time is used, which is
        what the existing call sites rely on.
    """
    target = ax if axes is None else axes
    for bar in bars:
        bar_height = bar.get_height()
        target.annotate(
            '{0:.0f}'.format(bar_height),
            # Anchor at the top centre of the bar ...
            xy=(bar.get_x() + bar.get_width() / 2, bar_height),
            # ... and shift the text 3 points upwards so it clears the bar.
            xytext=(0, 3),
            textcoords='offset points',
            ha='center',
            va='bottom',
        )


# Annotate both gender series with their values, then render the figure.
for gender_bars in (barMale, barFemale):
    insert_data_labels(gender_bars)

plt.show()

From the bar graph, we can see the math scores of boys are better than girls whereas girls perform better in reading and
writing.



In [None]:
# Mean score per subject, split by test-preparation status.
subjects = ['math score', 'reading score', 'writing score']
dataset = df.groupby('test preparation course')[subjects].mean()

none_means = dataset.loc['none'].tolist()
completed_means = dataset.loc['completed'].tolist()

# set width of bar
barWidth = 0.35

fig, ax = plt.subplots(figsize=(7, 8))

# One group of bars per subject; groups start 2 units apart on the x axis.
r1 = np.arange(0, len(subjects) * 2, 2)
r2 = r1 + barWidth

# Make the plot
barNone = ax.bar(r1, none_means, width=barWidth, label='none')
barcompleted = ax.bar(r2, completed_means, width=barWidth, label='completed')

# Bars are centre-aligned on r1 and r2, so the midpoint of each pair is
# r1 + barWidth / 2; put the subject label there.
ax.set_xticks(r1 + barWidth / 2)
ax.set_xticklabels(subjects)

# score_label (ticks 0..100 in steps of 10) comes from the gender bar-chart cell.
ax.set_yticks(score_label)
ax.set_yticklabels(score_label)

# inserting legend
ax.legend()
insert_data_labels(barNone)
insert_data_labels(barcompleted)

plt.show()

From the bar graph, it's clear that students who completed the test preparation course achieve higher average scores in every subject.

In [None]:
# Mean score per subject, split by lunch type.
subjects = ['math score', 'reading score', 'writing score']
dataset = df.groupby('lunch')[subjects].mean()

standard_means = dataset.loc['standard'].tolist()
free_means = dataset.loc['free/reduced'].tolist()

# set width of bar
barWidth = 0.35

fig, ax = plt.subplots(figsize=(7, 8))

# One group of bars per subject; groups start 2 units apart on the x axis.
r1 = np.arange(0, len(subjects) * 2, 2)
r2 = r1 + barWidth

# Make the plot
barstandard = ax.bar(r1, standard_means, width=barWidth, label='standard_mean')
barfreeandreduce = ax.bar(r2, free_means, width=barWidth, label='Free/reduced_mean')

# Bars are centre-aligned on r1 and r2, so the midpoint of each pair is
# r1 + barWidth / 2; put the subject label there.
ax.set_xticks(r1 + barWidth / 2)
ax.set_xticklabels(subjects)

# score_label (ticks 0..100 in steps of 10) comes from the gender bar-chart cell.
ax.set_yticks(score_label)
ax.set_yticklabels(score_label)

# inserting legend
ax.legend()

insert_data_labels(barstandard)
insert_data_labels(barfreeandreduce)

plt.show()

In all three subjects, students with a standard lunch score higher on average.

In [None]:
# Mean score per subject for each race/ethnicity group.
subjects = ['math score', 'reading score', 'writing score']
dataset = df.groupby('race/ethnicity')[subjects].mean()

GroupA_mean = dataset.loc['group A'].tolist()
GroupB_mean = dataset.loc['group B'].tolist()
GroupC_mean = dataset.loc['group C'].tolist()
GroupD_mean = dataset.loc['group D'].tolist()
GroupE_mean = dataset.loc['group E'].tolist()

# (legend label, per-subject means) in plotting order, left to right.
group_series = [
    ('GroupA_mean', GroupA_mean),
    ('GroupB_mean', GroupB_mean),
    ('GroupC_mean', GroupC_mean),
    ('GroupD_mean', GroupD_mean),
    ('GroupE_mean', GroupE_mean),
]

# set width of bar
barWidth = 0.35

fig, ax = plt.subplots(figsize=(7, 8))

# One group of bars per subject; groups start 2 units apart on the x axis.
r1 = np.arange(0, len(subjects) * 2, 2)

# Make the plot: each series is shifted one bar width to the right of the previous one.
group_bars = [
    ax.bar(r1 + position * barWidth, means, width=barWidth, label=label)
    for position, (label, means) in enumerate(group_series)
]

# Bars are centre-aligned, so the midpoint of each cluster lies halfway
# between the first and the last bar of that cluster.
ax.set_xticks(r1 + barWidth * (len(group_series) - 1) / 2)
ax.set_xticklabels(subjects)

# score_label (ticks 0..100 in steps of 10) comes from the gender bar-chart cell.
ax.set_yticks(score_label)
ax.set_yticklabels(score_label)

# inserting legend
ax.legend()
for bars in group_bars:
    insert_data_labels(bars)

plt.show()

From the figure, it's clear that all groups follow the same pattern across the three subjects: group E scores highest and group A scores lowest.

In [None]:
# Mean score per subject for each parental level of education.
subjects = ['math score', 'reading score', 'writing score']
dataset = df.groupby('parental level of education')[subjects].mean()

some_college_mean = dataset.loc['some college'].tolist()
associate_degree_mean = dataset.loc["associate's degree"].tolist()
some_high_school_mean = dataset.loc['some high school'].tolist()
high_school_mean = dataset.loc['high school'].tolist()
bachelor_degree_mean = dataset.loc["bachelor's degree"].tolist()
master_degree_mean = dataset.loc["master's degree"].tolist()

# (legend label, per-subject means) in plotting order, left to right.
# Each label is paired with its own education-level means; the bars must not
# reuse the race/ethnicity means computed in the previous chart.
education_series = [
    ('some_college', some_college_mean),
    ('associate_degree', associate_degree_mean),
    ('some_high_school', some_high_school_mean),
    ('high_school', high_school_mean),
    ('bachelor_degree', bachelor_degree_mean),
    ('master_degree', master_degree_mean),
]

# set width of bar
barWidth = 0.25

fig, ax = plt.subplots(figsize=(14, 8))

# One group of bars per subject; groups start 2 units apart on the x axis.
r1 = np.arange(0, len(subjects) * 2, 2)

# Make the plot: each series is shifted one bar width to the right of the previous one.
education_bars = [
    ax.bar(r1 + position * barWidth, means, width=barWidth, label=label)
    for position, (label, means) in enumerate(education_series)
]

# Bars are centre-aligned, so the midpoint of each cluster lies halfway
# between the first and the last bar of that cluster.
ax.set_xticks(r1 + barWidth * (len(education_series) - 1) / 2)
ax.set_xticklabels(subjects)

# score_label (ticks 0..100 in steps of 10) comes from the gender bar-chart cell.
ax.set_yticks(score_label)
ax.set_yticklabels(score_label)

# inserting legend
ax.legend()
for bars in education_bars:
    insert_data_labels(bars)

plt.show()

Just like race/ethnicity this bar graph also follows the same pattern in all subjects. In the figure, we can see that children whose parents went to high school tend to score lower in all three courses. In contrast, children whose parents hold bachelor's and master's degrees score well in reading, writing, and math.


# Total score

In [None]:

# Bar plot of the total score for each parental education level.
plt.figure(figsize=(12, 6))
education_chart = sns.barplot(
    data=df,
    x='parental level of education',
    y='Total score',
    palette='Set2',
)
education_chart.set_title("PARENTS LEVEL OF EDUCATION vs CHILDREN's TOTAL SCORE")
plt.tight_layout()

From the above plot, it's clear that when parents are better educated, for example when they hold a master's or bachelor's degree, their children tend to score well in all subjects.

In [None]:

# Bar plot of the total score for each race/ethnicity group.
plt.figure(figsize=(12, 6))
race_chart = sns.barplot(
    data=df,
    x='race/ethnicity',
    y='Total score',
    palette='Set2',
)
race_chart.set_title("Race and CHILDREN's TOTAL SCORE relation")
plt.tight_layout()

From the above plot, we can see that children in groups E and D have higher scores than those in the other three groups.

In [None]:
# Side-by-side bar plots of the total score: (grouping column, panel title).
panels = [
    ('gender', 'Gender vs score'),
    ('test preparation course', 'course preparation and total score'),
]

f, ax = plt.subplots(1, 2, figsize=(18, 6))

for axis, (column, title) in zip(ax, panels):
    sns.barplot(x=df[column], y='Total score', data=df, palette='Set2', ax=axis)
    axis.set_title(title)




# Bar graph 1

Figure 1 shows that female students achieved a higher total score on average than male students.


# Bar graph 2


In Figure 2, we can see that students who completed the test preparation course have a higher total score than those who did not.

In [None]:
# Correlate only the numeric columns. The frame also holds string columns
# (gender, lunch, ...), on which DataFrame.corr() raises in pandas >= 2.0.
corr = df.select_dtypes(include='number').corr()

# Boolean mask that hides the upper triangle, diagonal included, so each
# pairwise correlation is shown only once.
mask = np.triu(np.ones_like(corr, dtype=bool))

with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(8, 8))
    ax = sns.heatmap(corr, mask=mask, square=True, linewidths=.8, cmap='BrBG', annot=True)