In [None]:
import random

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.stats import weightstats as stests

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively walk the Kaggle input directory and print the full path of each file found.
for folder, _, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        full_path = os.path.join(folder, file_name)
        print(full_path)

In [None]:
# Load the exam-results CSV into the DataFrame used by every later cell.
csv_path = '/kaggle/input/students-performance-in-exams/StudentsPerformance.csv'
StudentsPerformance = pd.read_csv(csv_path)

# Examining Data #

In [None]:
# Preview the first five rows of the raw data.
StudentsPerformance.head(n=5)

In [None]:
# shape is (rows, columns): unpack it once and report both counts.
row_count, column_count = StudentsPerformance.shape[0], StudentsPerformance.shape[1]
print("The number of training examples(data points) = %i " % row_count)
print("The number of features = %i " % column_count)

In [None]:
# Print column names, non-null counts and dtypes (info() prints directly and returns None).
StudentsPerformance.info()

In [None]:
# Number of missing values in each column.
missing_mask = StudentsPerformance.isna()
missing_mask.sum()

No null values in any column.

In [None]:
# describe() summarises only the numeric columns by default, which is why only those columns appear.
StudentsPerformance.describe()

In [None]:
# Rows that exactly repeat an earlier row (duplicated() marks every repeat after the first occurrence).
duplicate = StudentsPerformance[StudentsPerformance.duplicated()]
# Report the number of duplicate ROWS; `.size` would return rows x columns instead.
len(duplicate)

*no duplicates :)*

In [None]:
 
# Side-by-side box plots of the three exam scores, used to eyeball outliers.
# Explicit Figure/Axes objects replace the pyplot state machine, and a single loop
# replaces three copy-pasted panels so the labels stay consistent.
fig, axes = plt.subplots(1, 3, figsize=(15, 6))
boxplot_panels = [
    ('math score', 'Math Score'),
    ('reading score', 'Reading Score'),
    ('writing score', 'Writing Score'),
]
for ax, (column, ylabel) in zip(axes, boxplot_panels):
    StudentsPerformance.boxplot(column=column, ax=ax)
    ax.set_title('')
    ax.set_ylabel(ylabel)
# Keep the y-axis labels from overlapping the neighbouring panels.
fig.tight_layout();


*Very few outliers*

In [None]:
# Disabled alternative: per-subject distribution plots, the first three labelled with a skewness value.
# NOTE(review): the first disabled line plots "writing score" but labels it with the skew of
# "math score" - fix the column before re-enabling.
#g = sns.displot(StudentsPerformance["writing score"], color="r", label="Skewness : %.2f"%(StudentsPerformance["math score"].skew()))
#g = sns.displot(StudentsPerformance["reading score"], color="g", label="Skewness : %.2f"%(StudentsPerformance["reading score"].skew()))
#g = sns.displot(StudentsPerformance["writing score"], color="b", label="Skewness : %.2f"%(StudentsPerformance["writing score"].skew()))

#sns.displot(StudentsPerformance["math score"])
#sns.displot(StudentsPerformance["reading score"])
#sns.displot(StudentsPerformance["writing score"])

In [None]:
# One histogram per exam subject, placed in the first three cells of a 3x3 grid.
f = plt.figure(figsize=(25, 20))
histogram_specs = [('math score', "r"), ('reading score', "g"), ('writing score', "b")]
for grid_position, (score_column, bar_colour) in enumerate(histogram_specs, start=1):
    panel = f.add_subplot(3, 3, grid_position)
    sns.histplot(data=StudentsPerformance, x=score_column, color=bar_colour, ax=panel)

*looks good (no outliers)*

In [None]:
# Three-sigma rule: report, for each exam score, the bounds beyond which a value lies more than
# 3 standard deviations from the column mean.
# Columns are selected by name: reducing the whole frame with DataFrame.mean()/std() would also
# try to reduce the string columns (a TypeError on pandas >= 2.0), and picking results with
# [0]/[1]/[2] depends on column order and on deprecated positional Series indexing.
SCORE_COLUMNS = ['math score', 'reading score', 'writing score']
SIGMA_MULTIPLIER = 3

for score_column in SCORE_COLUMNS:
    mean = StudentsPerformance[score_column].mean()
    std = StudentsPerformance[score_column].std()
    Upper_boundary = mean + SIGMA_MULTIPLIER * std
    Lower_boundary = mean - SIGMA_MULTIPLIER * std
    print('{name} outliers are values < {lowerboundary} or > {upperboundary}'.format(
        name=score_column, lowerboundary=Lower_boundary, upperboundary=Upper_boundary))

In [None]:
# Disabled spot check: would select the math scores greater than 111.
# NOTE(review): 111 is presumably the rounded upper 3-sigma bound for math score - confirm before re-enabling.
#outliers0 = StudentsPerformance[StudentsPerformance['math score']>111]['math score']
#outliers0

*Data is clean*

# Feature Engineering #
Adding new features from numerical values


In [None]:
# Overall percentage across the three exams; the divisor 300 assumes each exam is marked out of 100.
StudentsPerformance['Total Score %'] = (
    (StudentsPerformance['math score']
     + StudentsPerformance['reading score']
     + StudentsPerformance['writing score']) / 300
) * 100

# A student passes when the overall percentage reaches the pass mark.
# np.where is vectorised over the column, replacing a row-wise apply whose lambda
# parameter shadowed the DataFrame's own name.
PASS_MARK_PERCENT = 50
StudentsPerformance['status'] = np.where(
    StudentsPerformance['Total Score %'] < PASS_MARK_PERCENT, 'failed', 'passed'
)

StudentsPerformance.head()

In [None]:
# Count how many students fall into each value of the derived 'status' column.
StudentsPerformance['status'].value_counts()

# Getting Data Ready #

In [None]:
# Names of the columns stored with object dtype; these are the candidates for numeric encoding.
is_object_column = StudentsPerformance.dtypes == object
StudentsPerformance.columns[is_object_column]


In [None]:
# Distinct labels present in the race/ethnicity column, in order of first appearance.
race_column = StudentsPerformance['race/ethnicity']
race_column.unique()

In [None]:
# Integer codes for the categorical columns used by the hypothesis tests below.
cleanup_nums = {
    "race/ethnicity": {"group A": 0, "group B": 1, "group C": 2, "group D": 3, "group E": 4},
    "gender": {"male": 0, "female": 1},
    "status": {"passed": 1, "failed": 0},
}

# Reassign instead of mutating in place, and call infer_objects() so the encoded columns end up
# with a numeric dtype even on pandas versions where replace() no longer downcasts object columns.
# Re-running this cell is safe: values that are already encoded match none of the string keys.
StudentsPerformance = StudentsPerformance.replace(cleanup_nums).infer_objects()

StudentsPerformance.head()


# Hypothesis Testing

In [None]:

# Null Hypothesis (H0): there is no difference in the pass/fail status between female and male students.

RESAMPLE_SEED = 42   # fixed seed so Restart & Run All reproduces the same resamples
N_RESAMPLES = 50     # number of resampled pass rates collected per gender
RESAMPLE_SIZE = 50   # students drawn without replacement for each resample

random.seed(RESAMPLE_SEED)

# gender is encoded as 0 = male, 1 = female.
male = StudentsPerformance[StudentsPerformance['gender'] == 0]
female = StudentsPerformance[StudentsPerformance['gender'] == 1]

# Build the status lists once instead of on every loop iteration.
male_status = list(male['status'])
female_status = list(female['status'])

# Each entry is the mean of the 0/1 status over one resample, i.e. that resample's pass rate.
# NOTE(review): random.sample raises ValueError if a group has fewer than RESAMPLE_SIZE rows.
m_mean_samples = []
f_mean_samples = []
for _resample in range(N_RESAMPLES):
    m_mean_samples.append(np.mean(random.sample(male_status, RESAMPLE_SIZE)))
    f_mean_samples.append(np.mean(random.sample(female_status, RESAMPLE_SIZE)))

In [None]:
# Two-sample t-test on the resampled pass rates.
# The male and female resamples are drawn independently of each other, so the independent-samples
# test is used; the paired test (ttest_rel) would treat the i-th male draw and the i-th female draw
# as a matched pair, which they are not.
ttest, pval = stats.ttest_ind(m_mean_samples, f_mean_samples)
print(float(pval))
if pval < 0.05:
    print("reject null hypothesis")
else:
    # A non-significant result does not prove H0, so it is "not rejected" rather than "accepted".
    print("fail to reject null hypothesis")

In [None]:
# Two-sided two-sample z-test on the same resampled pass rates (H0: difference in means is 0).
ztest, pval1 = stests.ztest(m_mean_samples, f_mean_samples, value=0, alternative='two-sided')
# Show the p-value, as the t-test cell does, so the decision below can be checked.
print(float(pval1))
if pval1 < 0.05:
    print("reject null hypothesis")
else:
    # A non-significant result does not prove H0, so it is "not rejected" rather than "accepted".
    print("fail to reject null hypothesis")

In [None]:
# Bar height is the mean of the 0/1 status for each gender code, i.e. the pass rate.
sns.barplot(data=StudentsPerformance, x='gender', y='status');

*There's a slight difference between the pass rates of the two genders.*

In [None]:

# Null Hypothesis (H0): there is no difference in the pass/fail status among students of different races.
# Alternative Hypothesis (HA): there is a difference in the pass/fail status among students of different races.

RESAMPLE_SEED = 42   # fixed seed so Restart & Run All reproduces the same resamples
N_RESAMPLES = 50     # number of resampled pass rates collected per group
RESAMPLE_SIZE = 50   # students drawn without replacement for each resample

random.seed(RESAMPLE_SEED)

# race/ethnicity is encoded as 0..4 for "group A".."group E".
RACE_CODES = {'Group A': 0, 'Group B': 1, 'Group C': 2, 'Group D': 3, 'Group E': 4}

# For every group, collect N_RESAMPLES pass rates, each the mean 0/1 status of one resample.
# One loop replaces five copy-pasted filter/sample blocks.
# NOTE(review): random.sample raises ValueError if a group has fewer than RESAMPLE_SIZE rows.
group_mean_samples = {}
for group_label, race_code in RACE_CODES.items():
    in_group = StudentsPerformance['race/ethnicity'] == race_code
    group_status = list(StudentsPerformance.loc[in_group, 'status'])
    group_mean_samples[group_label] = [
        np.mean(random.sample(group_status, RESAMPLE_SIZE)) for _ in range(N_RESAMPLES)
    ]

# One-way ANOVA across the five groups' resampled pass rates.
F, p = stats.f_oneway(*group_mean_samples.values())

print("p-value for significance is: ", p)
if p < 0.05:
    print("reject null hypothesis")
else:
    # A non-significant result does not prove H0, so it is "not rejected" rather than "accepted".
    print("fail to reject null hypothesis")

In [None]:
# Bar height is the mean of the 0/1 status for each race/ethnicity code, i.e. the pass rate.
sns.barplot(data=StudentsPerformance, x='race/ethnicity', y='status');


*About 90% of the students in race Group E passed, whereas about 83% of the students in race Group A passed.*