In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import os
# Show which files are present in the input directory.
input_files = os.listdir("../input")
print(input_files)

In [None]:
# Read the CSV into the DataFrame `df` that the rest of the notebook works on.
df = pd.read_csv(filepath_or_buffer="../input/StudentsPerformance.csv")

In [None]:
# Peek at the first five rows to see the columns and typical values.
df.head(n=5)

In [None]:
# Summary statistics for the numeric columns; the per-column count also
# hints at whether any values are missing.
summary_stats = df.describe()
summary_stats

*Some observations that can be made from the `describe` output:*

1. In each subject, at least one student has scored 100
2. There are a few students who are possibly failing subjects
3. There are no missing values for the scores


In [None]:
# Number of missing entries in every column.
df.isna().sum()

Since the given dataframe has no missing values, we can start with EDA.

In [None]:
# Tally students per gender once, then show the tally as text and as a bar chart.
gender_counts = df['gender'].value_counts()
print(gender_counts)
gender_counts.plot.bar()

In [None]:
# Bar chart of the number of students in each race/ethnicity group.
ethnicity_counts = df['race/ethnicity'].value_counts()
ethnicity_counts.plot(kind='bar')

In [None]:
# Bar chart of the number of students per parental education level.
parent_edu_counts = df['parental level of education'].value_counts()
parent_edu_counts.plot(kind='bar')

In [None]:
# Histogram of math scores in 10 bins (raw counts, no density curve).
# `sns.distplot` is deprecated; `sns.histplot` is its replacement for plain
# histograms.
sns.histplot(x=df['math score'], bins=10)

In [None]:
# One panel per gender, overlaying the math and reading score densities.
# With no `hue` set, FacetGrid.map draws every layer in the same colour, so
# each subject gets an explicit colour and label to keep the curves apart.
g = sns.FacetGrid(df, col="gender")
g.map(sns.kdeplot, "math score", color="C0", label="math score")
g.map(sns.kdeplot, "reading score", color="C1", label="reading score")
# Each map() call relabels the x axis with its own variable name, which would
# leave only "reading score"; use a label that fits both curves.
g.set_axis_labels("score", "density")
g.add_legend()

In [None]:
# Pairwise relationships between the three subject scores.
score_columns = ['math score', 'reading score', 'writing score']
sns.pairplot(df[score_columns])

## Questions

#### After initial analysis we narrow our scope to the following questions
1. Understanding the % of females and males who are in the top 10/20 in each subject, to check for any gender bias
2. Correlation between parents' degree and marks
3. Correlation between test preparation and higher scores in subjects, or a particular increase in one subject


## 1. Understanding the % of females and males who are in the top 10/20 in each subject, to check for any gender bias


In [None]:
# Number of males and females among the 10 highest math scores.
top10_math = df.sort_values(by='math score', ascending=False).head(10)
top10_math['gender'].value_counts()

In [None]:
# Number of males and females among the 10 highest reading scores.
top10_reading = df.sort_values(by='reading score', ascending=False).head(10)
top10_reading['gender'].value_counts()

In [None]:
# Number of males and females among the 10 highest writing scores.
top10_writing = df.sort_values(by='writing score', ascending=False).head(10)
top10_writing['gender'].value_counts()

In [None]:
# Gender split among the 50 highest math scores, as a bar chart.
top50_math = df.sort_values(by='math score', ascending=False).head(50)
top50_math['gender'].value_counts().plot(kind='bar')

In [None]:
# Gender split among the 50 highest reading scores, as a bar chart.
top50_reading = df.sort_values(by='reading score', ascending=False).head(50)
top50_reading['gender'].value_counts().plot(kind='bar')

In [None]:
# Gender split among the 50 highest writing scores, as a bar chart.
top50_writing = df.sort_values(by='writing score', ascending=False).head(50)
top50_writing['gender'].value_counts().plot(kind='bar')

#### Initial inference -
There's a clear gender bias in the given dataset: for reading and writing, females score higher than males. For math, there's no dominant gender, as both are represented by a comparable number of candidates.

In [None]:
# Density of math scores, restricted to students who scored above 40.
math_above_40 = df.loc[df['math score'] > 40, 'math score']
sns.kdeplot(math_above_40)

In [None]:
# How many students achieved each math score above 90.
# The Series is passed as `x=` explicitly: the first positional parameter of
# `countplot` changed from `x` to `data` between seaborn releases, so a bare
# positional argument is interpreted differently depending on the version.
math_above_90 = df.loc[df['math score'] > 90, 'math score']
sns.countplot(x=math_above_90)

## 2. Correlation between parents' degree and marks

The main idea behind this analysis is to understand whether the parents' level of education in some way affects the marks scored by students. The analysis is also extended to understanding whether parents with a degree prefer their children to attend preparation classes.

In [None]:
# Initial EDA: group size plus lowest and highest math score for each
# parental education level. Swap 'math score' for 'reading score' or
# 'writing score' to get the same summary for the other subjects.
math_by_parent_edu = df.groupby('parental level of education')['math score']
math_by_parent_edu.aggregate([len, min, max])

From the given aggregate data we can draw a few key findings -
* The most obvious one is that the dominant category is associate's degree, with the maximum number of parents.
* Another important observation is the minimum marks scored by students. Students whose parents have completed a master's degree have the highest minimum scores (40, 42, 46) compared to the other groups. This doesn't help us conclude anything on its own, but it's an important observation.

In [None]:
# Same summary, restricted to the 50 highest math scores.
group_math = df.sort_values(by='math score', ascending=False).head(50)
group_math.groupby('parental level of education')['math score'].aggregate([len, min, max])

To understand overall correlation between marks and education we'll take the average of 3 scores.

In [None]:
# Average of the three subject scores, stored on df as a new 'avg' column
# (later cells read df['avg']).
sum_ = df['math score'].add(df['reading score']).add(df['writing score'])
df['avg'] = sum_.div(3)
df.head()

In [None]:
# Length, minimum and maximum of every column in df.
df.aggregate([len, min, max])

In [None]:
# Students averaging above 80: how many there are per parental education
# level, together with the lowest and highest average in each group.
high_avg = df.loc[df['avg'] > 80]
high_avg.groupby('parental level of education')['avg'].agg([len, min, max]).plot.bar()

Since 'len' (a raw count) is not a good metric for understanding the proportion of students, we'll find the percentage of students per category instead.

In [None]:
# Fraction of students in each parental-education group whose average is
# above 80, the threshold used by the rest of this section (this cell
# previously used 75, which did not match the inference drawn from it).
# The mean of a boolean mask is the share of True values, so a group with no
# high scorers shows up as 0 instead of dropping out of the ratio as NaN.
is_high_avg = df['avg'] > 80
is_high_avg.groupby(df['parental level of education']).mean().plot.bar()

#### Inference
It's quite evident that students whose parents have a master's degree (highly educated) have the highest proportion of average scores > 80.

In [None]:
# Share of students averaging above 80 within each parental education level:
# count of high scorers per level divided by the total count per level.
high_avg_counts = df[df['avg'] > 80].groupby('parental level of education')['avg'].agg([len])
group_sizes = df.groupby('parental level of education')['avg'].agg([len])
high_avg_counts / group_sizes

In [None]:
# Math score density, one panel per parental education level, laid out in a
# single column of panels.
g = sns.FacetGrid(data=df, col="parental level of education", col_wrap=1)
g.map(sns.kdeplot, "math score")

### Final Inference -

The education of parents tends to correlate with the marks of students (most clearly in the case of a Master's degree). The proportion of students whose average is greater than 80 follows a rough order: Master's degree (33%) -> Bachelor's degree (26%) -> Associate Degree (22%) -> College Degree (19%) -> Some High School (16%) -> High School (9%)


## 3. Correlation between test preparation and higher score in subjects, or particular increase in a subject


In [None]:
# Initial EDA: number of students under each test-preparation status.
prep_groups = df.groupby(by='test preparation course')
prep_groups.size()

In [None]:
# Parental education breakdown within each test-preparation group.
prep_groups = df.groupby(by='test preparation course')
prep_groups['parental level of education'].value_counts()

In [None]:
# Counts per (test-preparation status, parental education level) divided by
# the number of students at that parental education level.
prep_by_parent_edu = df.groupby('test preparation course')['parental level of education'].value_counts()
parent_edu_sizes = df.groupby('parental level of education').size()
prep_by_parent_edu / parent_edu_sizes

In [None]:
# Number of students per parental education level.
parent_edu_groups = df.groupby(by='parental level of education')
parent_edu_groups.size()

In [None]:
# Test-preparation status of the students whose average is above 85.
# Grouping by the same column that is being counted was redundant, so the
# column is counted directly.
df.loc[df['avg'] > 85, 'test preparation course'].value_counts()

In [None]:
# Test-preparation status among the 20 highest math scores.
# Grouping by the same column that is being counted was redundant, so the
# column is counted directly.
temp = df.sort_values(by='math score', ascending=False).iloc[:20]
temp['test preparation course'].value_counts()

In [None]:
# Test-preparation status among the 20 highest reading scores.
# Grouping by the same column that is being counted was redundant, so the
# column is counted directly.
temp = df.sort_values(by='reading score', ascending=False).iloc[:20]
temp['test preparation course'].value_counts()

In [None]:
# Test-preparation status among the 20 highest writing scores.
# Grouping by the same column that is being counted was redundant, so the
# column is counted directly.
temp = df.sort_values(by='writing score', ascending=False).iloc[:20]
temp['test preparation course'].value_counts()

### Inferences -
1. Most students didn't opt for the test preparation course. However, the top 20-50 students are dominated by those who did opt for it. This suggests that taking the course may improve marks/performance.
2. Also, parents with higher qualifications (Master's degree) tended not to send their children to the test preparation course. A possible reason is that they teach their children themselves or help resolve doubts when difficulties arise.