In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Install the dataprep package, which the imports in the next cell rely on (dataprep.eda).
!pip install dataprep

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import dataprep 
from dataprep.eda import create_report
from dataprep.eda import plot
from dataprep.eda import plot_correlation
from dataprep.eda import plot_missing

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the student exam results into the working DataFrame and preview the first 10 rows.
csv_path = '/kaggle/input/students-performance-in-exams/StudentsPerformance.csv'
df = pd.read_csv(csv_path)
df.head(10)

In [None]:
# Per-student mean of the three subject scores, rounded to two decimals.
subject_total = df['math score'] + df['reading score'] + df['writing score']
df['average score'] = (subject_total / 3).round(2)
df.head()

In [None]:
# Check for missing data: number of null values in each column
df.isnull().sum()

In [None]:
# Summary statistics of df as computed by DataFrame.describe()
df.describe()

In [None]:
# Print the DataFrame.info() overview of df (columns, dtypes, non-null counts)
df.info()

**Student Profile Statistics**

In [None]:
# Students Gender: number of students in each gender group.
# NOTE: despite its name, `students_age` holds per-gender counts
# (non-null 'math score' entries in each gender group).
students_age = df.groupby('gender')['math score'].count()
plt.figure(figsize=(10,5))
# x and y must be passed by keyword: seaborn >= 0.12 rejects positional
# x/y arguments with a TypeError.
ax = sns.barplot(x=students_age.index, y=students_age.values)
ax.set_xlabel('Gender')
ax.set_ylabel('Population')
ax.set_title('Students Population grouped by Gender')
plt.show()

In [None]:
# Number of distinct race/ethnicity groups in the data
df['race/ethnicity'].nunique()

In [None]:
# Race / Ethnicity Distribution: share of students in each group.
race_distribution = df.groupby('race/ethnicity')['math score'].count()
pie, ax = plt.subplots(figsize=[10,6])
labels = race_distribution.keys()
# Size `explode` from the data rather than hard-coding the number of groups:
# matplotlib raises ValueError when len(explode) != len(x).
explode = [0.05] * len(race_distribution)
ax.pie(x = race_distribution, autopct = '%.1f%%', explode = explode, labels = labels, pctdistance = 0.5)
ax.set_title('Race/ Ethnicity Distribution ')
plt.show()

In [None]:
# Number of distinct parental education levels in the data
df['parental level of education'].nunique()

In [None]:
# Parental Level of Education Distribution: share of students per education level.
parental_distribution = df.groupby('parental level of education')['math score'].count()
pie, ax = plt.subplots(figsize=[10,6])
labels = parental_distribution.keys()
# Size `explode` from the data rather than hard-coding the number of levels:
# matplotlib raises ValueError when len(explode) != len(x).
explode = [0.05] * len(parental_distribution)
ax.pie(x = parental_distribution, autopct = '%.1f%%', explode = explode, labels = labels, pctdistance = 0.5)
ax.set_title('Parental Level of Education Distribution ')
plt.show()

In [None]:
# Distinct values that appear in the 'lunch' column
df['lunch'].unique()

In [None]:
# Lunch type distribution: share of students per value of the 'lunch' column.
lunch_distribution = df.groupby('lunch')['math score'].count()
pie, ax = plt.subplots(figsize=[10,6])
labels = lunch_distribution.keys()
# Size `explode` from the data rather than assuming exactly two lunch types:
# matplotlib raises ValueError when len(explode) != len(x).
explode = [0.05] * len(lunch_distribution)
ax.pie(x = lunch_distribution, autopct = '%.1f%%', explode = explode, labels = labels, pctdistance = 0.5)
ax.set_title('Distribution of How Students take Their Lunch')
plt.show()

In [None]:
# Test preparation course distribution: share of students per course status.
prep_distribution = df.groupby('test preparation course')['math score'].count()
pie, ax = plt.subplots(figsize=[10,6])
labels = prep_distribution.keys()
# Size `explode` from the data rather than assuming exactly two statuses:
# matplotlib raises ValueError when len(explode) != len(x).
explode = [0.05] * len(prep_distribution)
ax.pie(x = prep_distribution, autopct = '%.1f%%', explode = explode, labels = labels, pctdistance = 0.5)
ax.set_title('Distribution of How Students take Their Preparation Course')
plt.show()

In [None]:
# Create Class of Average Score: bucket 'average score' into ten labelled bands.
# pd.cut builds right-closed intervals, so every edge must be the UPPER bound of
# its label: [0, 10], (10, 20], ..., (90, 100]. With edges at 11, 21, ..., 91 an
# average of exactly 11 was filed under '0 - 10' and 91 under '81 - 90'.
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
labels = ['0 - 10','11 - 20','21 - 30','31 - 40','41 - 50','51 - 60','61 - 70','71 - 80','81 - 90','91 - 100']
# include_lowest=True closes the first interval on the left so a score of 0 is kept.
df['Average Score Range'] = pd.cut(df['average score'], bins, labels = labels, include_lowest = True)
df.head()

**How the Test Preparation Course Affected Students' Average Scores**

In [None]:
# Students per average-score band, split by test preparation course status.
# Bands are listed in the order returned by value_counts() (most frequent first).
plt.figure(figsize=(15, 7))
band_order = df['Average Score Range'].value_counts().index
ax = sns.countplot(
    data=df,
    y='Average Score Range',
    hue='test preparation course',
    order=band_order,
    palette='gnuplot',
)
plt.show()

In [None]:
# Create a pass/ fail column
# Rule implemented below: a student fails only when BOTH hold
#   * the average score is below 50, and
#   * at least one subject score is 45 or lower;
# every other student passes.
# NOTE(review): the original note described "average under 50 ... any score under 41";
# the code has always used `and` with a <= 45 threshold - confirm which is intended.
def f(row):
    """Return 'fail' or 'pass' for one student row.

    `row` must support lookup by the keys 'average score', 'math score',
    'reading score' and 'writing score'. The subject scores are only read
    when the average is below 50.
    """
    if not row['average score'] < 50:
        return 'pass'
    subjects = ('math score', 'reading score', 'writing score')
    has_weak_subject = any(row[subject] <= 45 for subject in subjects)
    return 'fail' if has_weak_subject else 'pass'
         
# Label every row with the pass/fail rule `f` defined earlier in this cell
df['Pass/ Fail'] = df.apply(f, axis = 1)
df.head()

In [None]:
# Percentage of pass/fail students among those who did NOT take the preparation course
pasfail_distribution1 = df[df['test preparation course'] == 'none'].groupby('Pass/ Fail')['math score'].count()
pie, ax = plt.subplots(figsize=[10,6])
labels = pasfail_distribution1.keys()
# Size `explode` from the data: the subset may contain only one outcome (e.g. no
# failures), and matplotlib raises ValueError when len(explode) != len(x).
explode = [0.05] * len(pasfail_distribution1)
ax.pie(x = pasfail_distribution1, autopct = '%.1f%%', explode = explode, labels = labels, pctdistance = 0.5)
ax.set_title('Distribution of Pass/ Fail Students Without take Test Preparation Course')
plt.show()

In [None]:
# Percentage of pass/fail students among those who completed the preparation course
pasfail_distribution2 = df[df['test preparation course'] == 'completed'].groupby('Pass/ Fail')['math score'].count()
pie, ax = plt.subplots(figsize=[10,6])
labels = pasfail_distribution2.keys()
# Size `explode` from the data: the subset may contain only one outcome (e.g. no
# failures), and matplotlib raises ValueError when len(explode) != len(x).
explode = [0.05] * len(pasfail_distribution2)
ax.pie(x = pasfail_distribution2, autopct = '%.1f%%', explode = explode, labels = labels, pctdistance = 0.5)
ax.set_title('Distribution of Pass/ Fail Students With take Test Preparation Course')
plt.show()

In [None]:
# Letter marks (A-E) for a single subject score
def f(row, column):
    """Return the letter mark for `row[column]`.

    Bands: below 30 -> 'E'; 30 to 45 -> 'D'; above 45 to 70 -> 'C';
    above 70 to 85 -> 'B'; anything else -> 'A'.
    """
    if row[column] < 30:
        return 'E'
    # Upper bounds are inclusive; the first band that fits wins.
    for upper_bound, letter in ((45, 'D'), (70, 'C'), (85, 'B')):
        if row[column] <= upper_bound:
            return letter
    return 'A'
         
# Letter mark for the math score, computed row by row with the mark function `f(row, column)`
df['math mark'] = df.apply(f, axis = 1, column = 'math score')
df.head()

In [None]:
# Letter mark for the reading score.
# NOTE: relies on `f` still being the two-argument mark function `f(row, column)`;
# the name `f` is redefined further down the notebook.
df['reading mark'] = df.apply(f, axis = 1, column = 'reading score')
df.head()

In [None]:
# Letter mark for the writing score.
# NOTE: relies on `f` still being the two-argument mark function `f(row, column)`;
# the name `f` is redefined further down the notebook.
df['writing mark'] = df.apply(f, axis = 1, column = 'writing score')
df.head()

In [None]:
# Math marks: number of students per mark, split by parental level of education.
# Marks are listed in the order returned by value_counts() (most frequent first).
plt.figure(figsize=(15, 7))
math_mark_order = df['math mark'].value_counts().index
ax = sns.countplot(
    data=df,
    y='math mark',
    hue='parental level of education',
    order=math_mark_order,
    palette='gnuplot',
)
plt.show()

In [None]:
# Plot Data for Reading Mark: students per reading mark, split by parental level of education
plt.figure(figsize=(15,7))
ax = sns.countplot(y = 'reading mark', data = df, hue = 'parental level of education', palette = 'gnuplot', 
                   order = df['reading mark'].value_counts().index)
plt.show()

In [None]:
# Plot Data for Writing Mark: students per writing mark, split by parental level of education
plt.figure(figsize=(15,7))
ax = sns.countplot(y = 'writing mark', data = df, hue = 'parental level of education', palette = 'gnuplot', 
                   order = df['writing mark'].value_counts().index)
plt.show()

In [None]:
def f(size,a):
    """Draw a 2x2 grid of box plots, one per score column of the global `df`.

    size -- figure size passed to plt.figure(figsize=...)
    a    -- grouping variable handed to seaborn as the x axis of every box plot
    """
    score_columns = ('math score', 'reading score', 'writing score', 'average score')
    plt.figure(figsize=size)
    # Subplot positions are 1-based, hence start=1.
    for position, score_col in enumerate(score_columns, start=1):
        plt.subplot(2, 2, position)
        sns.boxplot(data=df, x=a, y=score_col)
        plt.title(f'{score_col} variation')

In [None]:
# Box plots of the four score columns, grouped by test preparation course
f(size = (15,10),a = df['test preparation course'])

In [None]:
# Box plots of the four score columns, grouped by race/ethnicity
f(size = (15,10),a = df['race/ethnicity'])

In [None]:
# Box plots of the four score columns, grouped by gender
f(size = (15,10),a = df['gender'])

In [None]:
# Box plots of the four score columns, grouped by parental level of education
# (drawn on a larger 25x15 figure than the other groupings)
f(size = (25,15),a = df['parental level of education'])

In [None]:
# Box plots of the four score columns, grouped by lunch type
f(size = (15,10),a = df['lunch'])

In [None]:
# Preview the first rows of df
df.head()

In [None]:
def f(a,b):
    """Draw a count plot of column `a` of the global `df`, split by column `b`.

    a -- name of the column shown on the x axis; its categories are listed in
         the order returned by value_counts()
    b -- name of the column used as hue
    """
    plt.figure(figsize=(20, 10))
    category_order = df[a].value_counts().index
    sns.countplot(data=df, x=a, hue=b, order=category_order, palette='gnuplot')

In [None]:
# Student counts per gender, split by parental level of education
f(a = 'gender',b = 'parental level of education')

In [None]:
# Student counts per race/ethnicity group, split by parental level of education
f(a = 'race/ethnicity',b = 'parental level of education')

In [None]:
# Student counts per race/ethnicity group, split by gender
f(a = 'race/ethnicity',b = 'gender')

In [None]:
# Student counts per race/ethnicity group, split by average score range
f(a = 'race/ethnicity',b = 'Average Score Range')

In [None]:
# Student counts per race/ethnicity group, split by pass/fail label
f(a = 'race/ethnicity',b = 'Pass/ Fail')

In [None]:
# Student counts per race/ethnicity group, split by math mark
f(a = 'race/ethnicity',b = 'math mark')

In [None]:
# Student counts per race/ethnicity group, split by reading mark
f(a = 'race/ethnicity',b = 'reading mark')

In [None]:
# Student counts per race/ethnicity group, split by writing mark
f(a = 'race/ethnicity',b = 'writing mark')

In [None]:
# Student counts per parental level of education, split by writing mark
f(a = 'parental level of education',b = 'writing mark')

In [None]:
# Student counts per parental level of education, split by math mark
f(a = 'parental level of education',b = 'math mark')

In [None]:
# Student counts per parental level of education, split by reading mark
f(a = 'parental level of education',b = 'reading mark')

In [None]:
# Student counts per parental level of education, split by average score range
f(a = 'parental level of education',b = 'Average Score Range')

In [None]:
# dataprep.eda.plot for the 'math score' column
# (presumably a set of single-column distribution charts - confirm against the dataprep docs)
plot(df, 'math score')

In [None]:
# dataprep.eda.plot for the 'writing score' column
# (presumably a set of single-column distribution charts - confirm against the dataprep docs)
plot(df, 'writing score')

In [None]:
# dataprep.eda.plot for the 'reading score' column
# (presumably a set of single-column distribution charts - confirm against the dataprep docs)
plot(df, 'reading score')

In [None]:
# dataprep.eda.plot for the 'average score' column
# (presumably a set of single-column distribution charts - confirm against the dataprep docs)
plot(df, 'average score')

In [None]:
# Preview the first rows of df before aggregating it
df.head()

In [None]:
# Number of students for every (parental education, race/ethnicity, average score range)
# combination that actually occurs in the data.
# 'Average Score Range' is categorical, so `observed` is set explicitly: the default
# differs between pandas versions, and observed=False would also emit a zero-count
# row for every combination that never occurs.
group_keys = ['parental level of education', 'race/ethnicity', 'Average Score Range']
education_race = df.groupby(group_keys, observed=True).size().reset_index(name='count')
education_race.head()

In [None]:
# Treemap of student counts, nested as parental education -> race/ethnicity -> average score range.
hierarchy = ['parental level of education', 'race/ethnicity', 'Average Score Range']
fig = px.treemap(
    education_race,
    names='Average Score Range',
    values='count',
    path=hierarchy,
    color_discrete_sequence=px.colors.qualitative.Set1,
    width=1500,
    height=1000,
)
fig.show()