In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the students-performance CSV from the Kaggle input directory into `data`.
data = pd.read_csv('/kaggle/input/students-performance-in-exams/StudentsPerformance.csv')

In [None]:
# Preview the first rows of the loaded table.
data.head( )

In [None]:
# Print a summary of the frame: columns, dtypes and non-null counts.
data.info()

In [None]:
# Add a 'total score' column: the sum of the three subject scores for each student.
data['total score'] = (
    data['math score']
    + data['reading score']
    + data['writing score']
)

## Data visualization and Exploratory Data Analysis 

### Analysis based on race or ethnicity

In [None]:
# Count how many rows carry each race/ethnicity value.
data['race/ethnicity'].value_counts()

There are students from five different race/ethnicity groups.

In [None]:
# Bar chart of the number of students in each race/ethnicity group.
# The column is named via the `x` keyword: seaborn >= 0.12 treats the first
# positional argument as `data`, so a positional Series is no longer read as x.
# The title is set on the returned Axes, so this cell does not depend on
# `plt` having been imported by an earlier cell.
ax = sns.countplot(data=data, x='race/ethnicity')
ax.set_title('Number of people belonging to each race/ethnicity')

In [None]:
# Group the rows by race/ethnicity; later cells compute per-group statistics from this object.
ethnic_group = data.groupby(['race/ethnicity'])


In [None]:
# Highest total score within each race/ethnicity group.
ethnic_group['total score'].max()

Fun fact: one student from group E secured full marks in all the subjects.

In [None]:
# Mean total score per race/ethnicity group; kept in a variable because the
# bar-plot cell below plots it.
ethnic_group_mean = ethnic_group['total score'].mean()
print(ethnic_group_mean)

On average, a group E student scored about 30 marks more than a group A student.

In [None]:
import matplotlib.pyplot as plt

# Mean total score per ethnic group, one bar per group.
# The labels are paired positionally with the grouped means.
group_labels = ['Group A', 'Group B', 'Group C', 'Group D', 'Group E']
sns.barplot(x=group_labels, y=ethnic_group_mean)
plt.title('Performance of different ethnic group based on their total score')

In [None]:
# Per-group maths statistics: print the maximum, then the mean.
# The mean is also stored in `ethnic_group_math` for the bar plot that follows.
math_by_group = ethnic_group['math score']
ethnic_group_math = math_by_group.mean()
print(math_by_group.max())
print(ethnic_group_math)

In [None]:
# Mean maths score per ethnic group, one bar per group.
group_labels = ['Group A', 'Group B', 'Group C', 'Group D', 'Group E']
sns.barplot(x=group_labels, y=ethnic_group_math)
plt.title('Performance of different ethnic group based on their math score')

In [None]:
# Per-group reading statistics: maximum first, then mean.
reading_by_group = ethnic_group['reading score']
print(reading_by_group.max())
print(reading_by_group.mean())

In [None]:
# Mean reading score per ethnic group, one bar per group.
group_labels = ['Group A', 'Group B', 'Group C', 'Group D', 'Group E']
reading_means = ethnic_group['reading score'].mean()
sns.barplot(x=group_labels, y=reading_means)
plt.title('Performance of different ethnic group based on their reading score')

In [None]:
# Per-group writing statistics: maximum first, then mean.
writing_by_group = ethnic_group['writing score']
print(writing_by_group.max())
print(writing_by_group.mean())

In [None]:
# Mean writing score per ethnic group, one bar per group.
group_labels = ['Group A', 'Group B', 'Group C', 'Group D', 'Group E']
writing_means = ethnic_group['writing score'].mean()
sns.barplot(x=group_labels, y=writing_means)
plt.title('Performance of different ethnic group based on their writing score')

Insights from the illustrations:  
    1. Students of group A have performed the worst overall (considering all three subjects), so they need special attention. 
    2. Students of group E have performed the best overall (considering all three subjects). 
    3. Not a single student from group B has secured full marks in the subjects. 
    4. Group E has performed better in maths than in reading and writing, and the mean score for them is 74.
    5. All other groups have performed better in reading and writing than in maths.

Let's consider that, out of three hundred total marks, a student needs to score more than 120 marks to pass the exam (a total of 120 or less counts as a fail).

In [None]:
# Non-null count per column for the students whose total score is 120 or less.
at_or_below_cutoff = data['total score'].le(120)
data[at_or_below_cutoff].count()

In [None]:
# Show the rows of the students whose total score is 120 or less.
at_or_below_cutoff = data['total score'].le(120)
data[at_or_below_cutoff]

These are the students who have not performed satisfactorily in the examination and need to work hard for the next exam.

In [None]:
# Number of students with a total score of 120 or less, per race/ethnicity value.
at_or_below_cutoff = data['total score'].le(120)
data.loc[at_or_below_cutoff, 'race/ethnicity'].value_counts()


In [None]:
# Bar chart of the students with a total score of 120 or less, per race/ethnicity.
# The column is named via the `x` keyword: seaborn >= 0.12 treats the first
# positional argument as `data`, so a positional Series is no longer read as x.
failed_students = data.loc[data['total score'] <= 120]
sns.countplot(data=failed_students, x='race/ethnicity')
plt.title('Number of students failed from each race/ethnicity')

One possibility is that the largest number of students failed from group C simply because it has the most students overall. So let's calculate the percentage of students who failed for each ethnicity.

In [None]:
# Fail rates are computed from `data` instead of being typed in by hand, so the
# printed percentages always agree with the dataset and with each other.
failed_mask = data['total score'] <= 120  # same fail criterion as the filter cells above

# The mean of a boolean mask is the fraction of True values, i.e. the fail rate.
print('Percentage of total student failed ', round(failed_mask.mean() * 100, 2), '%')

fail_pct_by_race = (failed_mask.groupby(data['race/ethnicity']).mean() * 100).round(2)
for race_name, fail_pct in fail_pct_by_race.items():
    # Assumes values look like 'group A'; the last word is kept for the "race A" label.
    race_letter = str(race_name).split()[-1]
    print('Percentage of student failed for race', race_letter, fail_pct, '%')

Insights from the analysis: 
    1. Overall only 32 students failed and all the rest were able to score higher than the pass marks 
    2. The overall fail percentage is 3.2 %
    3. The largest number of failed students, 10, came from race group C
    4. Group B had the highest fail percentage among all groups, which is 4.21 %

### Analysis based on gender

In [None]:
# Bar chart of the number of students per gender, followed by the exact counts.
# The column is named via the `x` keyword: seaborn >= 0.12 treats the first
# positional argument as `data`, so a positional Series is no longer read as x.
sns.countplot(data=data, x='gender')
print(data['gender'].value_counts())

In [None]:
# Number of students per gender, split by race/ethnicity.
# Columns are named via keywords: seaborn >= 0.12 treats the first positional
# argument as `data`, so a positional Series is no longer read as x.
sns.countplot(data=data, x='gender', hue='race/ethnicity')

In [None]:
# Group the rows by gender; later cells compute per-gender statistics from this object.
gender_group = data.groupby('gender')

In [None]:
# Mean total score per gender; kept in a variable because the bar-plot cell below plots it.
gender_group_mean = gender_group['total score'].mean()
print(gender_group_mean)

In [None]:
# Mean total score per gender, one bar per gender.
# The labels are paired positionally with the grouped means.
gender_labels = ['female', 'male']
sns.barplot(x=gender_labels, y=gender_group_mean)
plt.title('Mean score of male and female')

In [None]:
# Number of students with a total score of 120 or less, per gender.
at_or_below_cutoff = data['total score'].le(120)
data.loc[at_or_below_cutoff, 'gender'].value_counts()

In [None]:
# Fail rates per gender are computed from `data` instead of being typed in by
# hand, so they always agree with the dataset.
failed_mask = data['total score'] <= 120  # same fail criterion as the filter cells above

# The mean of a boolean mask within each group is that group's fail rate.
fail_pct_by_gender = (failed_mask.groupby(data['gender']).mean() * 100).round(2)

# Assumes the gender column holds exactly the values 'female' and 'male'.
print('percentage of females who failed the test', fail_pct_by_gender['female'], '%')
print('percentage of males who failed the test', fail_pct_by_gender['male'], '%')

Insights from the analysis: 
    1. More females than males have taken the examination.
    2. The largest number of females were from group C ethnicity 
    3. The largest numbers of students were from group C and group D
    4. The average score of females was about 10 marks higher than that of males. 
    5. The percentage of failure for females was slightly higher than that of males. 

### Analysis based on parental qualification

In [None]:
# Count how many rows carry each parental education level.
data['parental level of education'].value_counts()

In [None]:
# Bar chart of the number of students per parental education level, on a
# wider figure so the category labels have more room.
# The column is named via the `x` keyword: seaborn >= 0.12 treats the first
# positional argument as `data`, so a positional Series is no longer read as x.
plt.figure(figsize=(10,5))
sns.countplot(data=data, x='parental level of education')

In [None]:
# Group the rows by parental education level; later cells compute per-level statistics from this object.
parent_education_group = data.groupby(['parental level of education'])

In [None]:
# Mean total score per parental education level.
parent_education_group_mean = parent_education_group['total score'].mean()
print(parent_education_group_mean )

In [None]:
# Mean total score per parental education level on a wide figure.
# The display labels are paired positionally with the grouped means.
qualification_labels = [
    'Associates degree',
    'bachelor degree',
    'high school',
    'masters degree',
    'some college',
    'some high school',
]
mean_total_by_qualification = parent_education_group['total score'].mean()
plt.figure(figsize=(10, 5))
sns.barplot(x=qualification_labels, y=mean_total_by_qualification)
plt.title('Performance based on the qualification of parents ')

In [None]:
# Number of students with a total score of 120 or less, per parental education level.
at_or_below_cutoff = data['total score'].le(120)
data.loc[at_or_below_cutoff, 'parental level of education'].value_counts()

Fun fact: not a single student whose parent has a master's degree failed the test.

In [None]:
# Bar chart of the students with a total score of 120 or less, per parental
# education level.
# The column is named via the `x` keyword: seaborn >= 0.12 treats the first
# positional argument as `data`, so a positional Series is no longer read as x.
plt.figure(figsize=(10,5))
failed_students = data.loc[data['total score'] <= 120]
sns.countplot(data=failed_students, x='parental level of education')

Insights from the data:
    1. There is some correlation between a student's performance and his/her parents' qualification
    2. On average, a student whose parent has a master's degree scores 30 marks more than a student whose parent completed high school
    3. The same pattern is followed in the case of securing the passing marks as well

### Analysis based on test preparation course

In [None]:
# Count how many rows carry each test-preparation-course value.
data['test preparation course'].value_counts()

In [None]:
# Group the rows by test preparation course; later cells compute per-group statistics from this object.
course_completion_group = data.groupby('test preparation course')


In [None]:
# Mean total score for each test-preparation-course value.
course_completion_group_mean = course_completion_group ['total score'].mean()
print(course_completion_group_mean)

In [None]:
# Mean total score for each test-preparation-course value, one bar per value.
# The labels are paired positionally with the grouped means.
course_labels = ['completed', 'none']
mean_total_by_course = course_completion_group['total score'].mean()
sns.barplot(x=course_labels, y=mean_total_by_course)
plt.title('Performance based on whether a student has completed the test preparation course or not')

In [None]:
# Number of students with a total score of 120 or less, per test-preparation-course value.
at_or_below_cutoff = data['total score'].le(120)
data.loc[at_or_below_cutoff, 'test preparation course'].value_counts()

In [None]:
# Bar chart of the students with a total score of 120 or less, per
# test-preparation-course value.
# The column is named via the `x` keyword: seaborn >= 0.12 treats the first
# positional argument as `data`, so a positional Series is no longer read as x.
failed_students = data.loc[data['total score'] <= 120]
sns.countplot(data=failed_students, x='test preparation course')

Insights from the analysis: 
    1. Most students have not taken the test preparation course.
    2. The students who have taken the test preparation course have performed better.
    3. Most of the students who failed the test had not taken the test preparation course. 

Thank you so much for being so patient if you have made it this far. I have tried to cover as many analyses as possible, but there can still be numerous more.

I hope it helped in some way or the other.

Feel free to upvote, comment or fork.
