In [None]:
!pip install researchpy

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas.api.types import CategoricalDtype
import matplotlib.pyplot as plt
import researchpy as rp
from scipy.stats import kruskal
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

# Significance level that the p-values of the tests below are compared against.
alpha = 0.05

In [None]:
# Location of the exam-results CSV inside the Kaggle input directory.
data_path = "/kaggle/input/students-performance-in-exams/StudentsPerformance.csv"

df = pd.read_csv(data_path)
df.head()

In [None]:
# Print the column dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Average of the three subject scores, stored as a new column.
score_columns = ["math score", "reading score", "writing score"]
df["avg_score"] = sum(df[name] for name in score_columns) / len(score_columns)
df.head()

We have many categorical variables, but some are ordinal in nature. Let's tell pandas that.

In [None]:
# Numeric rank of every level; the key order doubles as the category order.
# NOTE(review): "associate's degree" is ranked below "some college" here —
# confirm that this ordering is intended.
education_levels = [
    "some high school",
    "high school",
    "associate's degree",
    "some college",
    "bachelor's degree",
    "master's degree",
]
paredu_mappings = {level: rank for rank, level in enumerate(education_levels, start=1)}
test_prep_mappings = {"none": 0, "completed": 1}
lunch_mappings = {"free/reduced": 0, "standard": 1}

# Convert each column to an ordered categorical whose categories follow the
# key order of its mapping.
ordered_columns = {
    "parental level of education": paredu_mappings,
    "test preparation course": test_prep_mappings,
    "lunch": lunch_mappings,
}
for column, mapping in ordered_columns.items():
    cat_type = CategoricalDtype(categories=list(mapping), ordered=True)
    df[column] = df[column].astype(cat_type)

df.head()

In [None]:
# Print the dtypes again to inspect the result of the categorical conversion.
df.info()

In [None]:
# Pairwise correlation of the numeric columns only. The text and categorical
# columns are excluded explicitly: DataFrame.corr() no longer drops them
# silently on pandas >= 2.0 and raises instead.
df.select_dtypes(include="number").corr()

It's no wonder that we see a strong correlation between the different scores: reading/math is 0.81 and reading/writing is 0.95.

## Are the scores normally distributed?

In [None]:
# 2x2 grid: one histogram per score column, filled row by row.
fig, axes = plt.subplots(2, 2, figsize=(20, 10))

panels = [
    ("avg_score", 'avg score'),
    ("math score", 'math score'),
    ("reading score", 'reading score'),
    ("writing score", 'writing score'),
]
for ax, (column, title) in zip(axes.flat, panels):
    df[column].hist(ax=ax)
    ax.set_title(title)

fig.suptitle('Scores histograms', fontsize=12)


They look normal'ish, although a bit skewed, but are they really?

In [None]:
# Shapiro-Wilk normality test. Under the null hypothesis the sample was drawn
# from a normal distribution, so a small p-value is evidence against normality.
from scipy.stats import shapiro

for score_name in ["avg_score", "math score", "reading score", "writing score"]:
    stat, p_value = shapiro(df[score_name])
    # interpret against the significance level `alpha` set in the setup cell
    if p_value > alpha:
        print(f'{score_name} looks Gaussian (fail to reject H0)')
    else:
        print(f'{score_name} does NOT look Gaussian (reject H0)')



## Map ordinals to numeric

Let's map the ordinals to numeric and look at their correlation.

In [None]:
# Look up the numeric rank of every ordered categorical and store it in a
# sibling "<column> ord" integer column.
ordinal_sources = {
    "parental level of education": paredu_mappings,
    "test preparation course": test_prep_mappings,
    "lunch": lunch_mappings,
}
for column, mapping in ordinal_sources.items():
    df[column + " ord"] = df[column].map(mapping).astype(int)

# Gender is not really ordinal but let's try it anyway: 1 for 'female', 0 otherwise
df["gender ord"] = df["gender"].eq('female').astype("int64")

df.info()

In [None]:
# Annotated heatmap of the correlations between the numeric columns. The
# non-numeric columns are dropped explicitly because DataFrame.corr() raises
# on them with pandas >= 2.0 instead of skipping them.
numeric_corr = df.select_dtypes(include="number").corr()
sns.heatmap(numeric_corr, annot=True)

Looking at the correlation between the new ordinal variables, we see, for example, that the parental level of education is not very correlated with average score (0.19) which is surprising. That might be because the interval of 1 between the various levels is actually not very representative. Let's take a closer look.



## Parental level of education

In [None]:
# Number of rows per parental education level. size() counts rows directly,
# whereas count().iloc[:, 1] reported the non-null count of whichever column
# happened to sit at position 1.
df.groupby("parental level of education").size().plot(kind="bar")

In [None]:
# Box plot of avg_score, one box per parental education level.
df.boxplot(
    column="avg_score",
    by="parental level of education",
    figsize=(20, 10),
)

They do seem to differ slightly.

In [None]:
# Mean of every score per parental education level, drawn as grouped bars.
# The columns are selected with a list: indexing a GroupBy with a bare tuple
# of names was deprecated and raises on pandas >= 2.0.
score_columns = ["avg_score", "math score", "reading score", "writing score"]
df.groupby(["parental level of education"])[score_columns].mean().plot.bar()
plt.show()

In [None]:
# One avg_score histogram per parental education level.
df.hist(
    column="avg_score",
    by="parental level of education",
    figsize=(20, 10),
)

Some distributions look a bit skewed to the left (levels 0, 1, 3, 4). The master's group seems to have a two-peaked (bimodal) distribution, but that might just be noise due to it being a smaller group (56).

In [None]:
# Descriptive statistics of avg_score within each parental education level.
avg_by_education = df.groupby("parental level of education")['avg_score']
rp.summary_cont(avg_by_education)

Another way of looking at it is to see if the various groups differ in their distribution of scores. Normally, we could use an ANOVA test, but the groups have different sizes and the average score is not normal, so we can instead use a non-parametric test like Kruskal-Wallis, which is similar to ANOVA.

In [None]:
# Distinct parental education levels, in order of first appearance in the data.
observed_levels = df["parental level of education"].unique()
list(observed_levels)

In [None]:
# Collect the avg_score values of every parental education level
group_col = "parental level of education"
edu_groups = list(df[group_col].unique())
edu_group_scores = [
    df.loc[df[group_col] == level, "avg_score"].values
    for level in edu_groups
]

# Kruskal-Wallis H-test across all levels at once
stat, p = kruskal(*edu_group_scores)
print('Statistics=%.3f, p=%.3f' % (stat, p))

# interpret against the significance level alpha
if not p > alpha:
    print('Parental education groups have different distributions of scores (reject H0) ')
else:
    print('Group have same distributions of scores (fail to reject H0)')

## Race/ethnicity

In [None]:
# Number of rows per race/ethnicity group. size() counts rows directly,
# whereas count().iloc[:, 1] reported the non-null count of whichever column
# happened to sit at position 1.
df.groupby("race/ethnicity").size().plot(kind="bar")

In [None]:
# Descriptive statistics of avg_score within each race/ethnicity group.
avg_by_ethnicity = df.groupby("race/ethnicity")['avg_score']
rp.summary_cont(avg_by_ethnicity)

In [None]:
# Mean of every score per race/ethnicity group, drawn as grouped bars.
# The columns are selected with a list: indexing a GroupBy with a bare tuple
# of names was deprecated and raises on pandas >= 2.0.
score_columns = ["avg_score", "math score", "reading score", "writing score"]
df.groupby(["race/ethnicity"])[score_columns].mean().plot.bar()
plt.show()

In [None]:
# Box plot of avg_score, one box per race/ethnicity group.
df.boxplot(
    column="avg_score",
    by="race/ethnicity",
    figsize=(20, 10),
)

The groups have different sizes, the average score is not normal, and the variable is nominal (categorical), so there is no meaningful way to measure a correlation here. We can do the same as before: compare the groups with each other and see if they differ significantly in their scores.

In [None]:
# Collect the avg_score values of every race/ethnicity group
group_col = "race/ethnicity"
eth_groups = list(df[group_col].unique())
eth_group_scores = [
    df.loc[df[group_col] == group, "avg_score"].values
    for group in eth_groups
]

# Kruskal-Wallis H-test across all groups at once
stat, p = kruskal(*eth_group_scores)
print('Statistics=%.3f, p=%.3f' % (stat, p))

# interpret against the significance level alpha
if not p > alpha:
    print('race/ethnicity groups have different distributions of scores (reject H0) ')
else:
    print('race/ethnicity groups have same distributions of scores (fail to reject H0)')

## Gender

In [None]:
# Number of rows per gender. size() counts rows directly, whereas
# count().iloc[:, 1] reported the non-null count of whichever column happened
# to sit at position 1.
df.groupby("gender").size().plot(kind="bar")

In [None]:
# Mean of every score per gender, drawn as grouped bars.
# The columns are selected with a list: indexing a GroupBy with a bare tuple
# of names was deprecated and raises on pandas >= 2.0.
score_columns = ["avg_score", "math score", "reading score", "writing score"]
df.groupby(["gender"])[score_columns].mean().plot.bar()
plt.show()

In [None]:
# One avg_score histogram per gender.
df.hist(
    column="avg_score",
    by="gender",
    figsize=(20, 10),
)

In [None]:
# Box plot of avg_score, one box per gender.
df.boxplot(
    column="avg_score",
    by="gender",
    figsize=(20, 10),
)

In [None]:
# Collect the avg_score values of every gender
group_col = "gender"
gender_groups = list(df[group_col].unique())
gender_group_scores = [
    df.loc[df[group_col] == group, "avg_score"].values
    for group in gender_groups
]

# Kruskal-Wallis H-test across the groups
stat, p = kruskal(*gender_group_scores)
print('Statistics=%.3f, p=%.3f' % (stat, p))

# interpret against the significance level alpha
if not p > alpha:
    print('gender groups have different distributions of scores (reject H0) ')
else:
    print('gender groups have same distributions of scores (fail to reject H0)')

## Test preparation

In [None]:
# Mean of every score per test-preparation status, drawn as grouped bars.
# The columns are selected with a list: indexing a GroupBy with a bare tuple
# of names was deprecated and raises on pandas >= 2.0.
score_columns = ["avg_score", "math score", "reading score", "writing score"]
df.groupby(["test preparation course"])[score_columns].mean().plot.bar()
plt.show()

In [None]:
# Collect the avg_score values of every test-preparation status
group_col = "test preparation course"
test_prep_groups = list(df[group_col].unique())
test_prep_group_scores = [
    df.loc[df[group_col] == status, "avg_score"].values
    for status in test_prep_groups
]

# Kruskal-Wallis H-test across the groups
stat, p = kruskal(*test_prep_group_scores)
print('Statistics=%.3f, p=%.3f' % (stat, p))

# interpret against the significance level alpha
if not p > alpha:
    print('test_prep_group_scores have different distributions of scores (reject H0) ')
else:
    print('test_prep_group_scores have same distributions of scores (fail to reject H0)')