In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import random
from scipy import stats
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Loading Data**

In [None]:
# Read the students-performance CSV from the Kaggle input directory into a DataFrame
# that every later cell refers to as `data`.
data = pd.read_csv('/kaggle/input/students-performance-in-exams/StudentsPerformance.csv')

**Examining Data**

In [None]:
# DataFrame.shape is (rows, columns): rows are data points, columns are features.
n_rows, n_cols = data.shape
print(f'number of features: {n_cols}')
print(f'number of data points: {n_rows}')

In [None]:
# Preview the first few rows to see the column names and typical values.
data.head()

In [None]:
# Summary statistics for every column; include='all' also covers the non-numeric ones.
data.describe(include = 'all')

In [None]:
# Print each column's dtype and non-null count.
data.info()

**Missing Values**

In [None]:
# Count the missing (null) entries in each column.
data.isnull().sum()

There are no missing values in any of the features.

**Duplicates**

In [None]:
# Rows that exactly repeat an earlier row (DataFrame.duplicated marks every
# occurrence after the first).
duplicate = data[data.duplicated()]
# Report the number of duplicated ROWS. `.size` would return rows x columns,
# which overstates the count whenever duplicates exist.
len(duplicate)

There are no duplicates in the data.

**Outliers**

In [None]:
# Draw one boxplot per exam score, side by side, to look for outliers.
# Explicit axes are used instead of the pyplot "current axes" state machine so
# each plot is unambiguously tied to its panel.
score_columns = ['math score', 'reading score', 'writing score']
fig, axes = plt.subplots(1, len(score_columns), figsize=(15, 6))

for ax, column in zip(axes, score_columns):
    data.boxplot(column=column, ax=ax)
    ax.set_title('')
    ax.set_ylabel(column)

# Render the figure without leaving a stray Text(...) repr as the cell output.
plt.show()

There are outliers in each of the following features: math score, reading score, and writing score.

**How clean do you think the dataset is?**

I think the data is about 70% clean, because 5 out of the 8 features are of unsuitable data types and 3 of the features contain outliers. Also, I think that the feature lunch may be an inconsistent column.

**Construct a new feature from the available numerical features**

I think we can combine the math score, reading score, and writing score into one feature by taking their average.

In [None]:
# Combine the three exam scores into a single per-student feature: their mean.
total_score = data['math score'] + data['reading score'] + data['writing score']
data['Average Score'] = total_score / 3
data.head()

**Use hypothesis testing and visualizations to show whether gender and race affect writing, reading, and math scores (you can use the constructed feature in step 2)**

**Null Hypothesis( H0 ):** There is no difference in the math scores between female and male students.

**Alternative Hypothesis( HA ):** There is a difference in the math scores between female and male students.

In [None]:
# Split the students by gender; these are two independent groups of different people.
males = data[data['gender'] == 'male']
females = data[data['gender'] == 'female']

# Welch's two-sample t-test on the raw math scores.
# - A paired test (ttest_rel) does not apply: observations in the two groups are
#   not matched to each other.
# - Testing means of repeated random subsamples would treat resampled values as
#   independent observations and make the p-value irreproducible, so the test
#   runs directly on the observed scores.
ttest, pval = stats.ttest_ind(males['math score'], females['math score'], equal_var=False)
print(float(pval))
if pval < 0.05:
    print("reject null hypothesis")
else:
    # A large p-value is lack of evidence against H0, not proof that H0 is true.
    print("fail to reject null hypothesis")

# Bar heights are the group means of the math score.
sns.barplot(x=data['gender'], y=data['math score']);

**Null Hypothesis( H0 ):** There is no difference in the reading scores between female and male students.

**Alternative Hypothesis( HA ):** There is a difference in the reading scores between female and male students.

In [None]:
# Split the students by gender; these are two independent groups of different people.
males1 = data[data['gender'] == 'male']
females1 = data[data['gender'] == 'female']

# Welch's two-sample t-test on the raw reading scores.
# - A paired test (ttest_rel) does not apply: observations in the two groups are
#   not matched to each other.
# - Testing means of repeated random subsamples would treat resampled values as
#   independent observations and make the p-value irreproducible, so the test
#   runs directly on the observed scores.
ttest1, pval1 = stats.ttest_ind(males1['reading score'], females1['reading score'], equal_var=False)
print(float(pval1))
if pval1 < 0.05:
    print("reject null hypothesis")
else:
    # A large p-value is lack of evidence against H0, not proof that H0 is true.
    print("fail to reject null hypothesis")

# Bar heights are the group means of the reading score.
sns.barplot(x=data['gender'], y=data['reading score']);

**Null Hypothesis( H0 ):** There is no difference in the writing scores between female and male students.

**Alternative Hypothesis( HA ):** There is a difference in the writing scores between female and male students.

In [None]:
# Split the students by gender; these are two independent groups of different people.
males2 = data[data['gender'] == 'male']
females2 = data[data['gender'] == 'female']

# Welch's two-sample t-test on the raw writing scores.
# - A paired test (ttest_rel) does not apply: observations in the two groups are
#   not matched to each other.
# - Testing means of repeated random subsamples would treat resampled values as
#   independent observations and make the p-value irreproducible, so the test
#   runs directly on the observed scores.
ttest2, pval2 = stats.ttest_ind(males2['writing score'], females2['writing score'], equal_var=False)
print(float(pval2))
if pval2 < 0.05:
    print("reject null hypothesis")
else:
    # A large p-value is lack of evidence against H0, not proof that H0 is true.
    print("fail to reject null hypothesis")

# Bar heights are the group means of the writing score.
sns.barplot(x=data['gender'], y=data['writing score']);

**Null Hypothesis( H0 ):** There is no difference in the math scores between students of different races.

**Alternative Hypothesis( HA ):** There is a difference in the math scores between students of different races.

In [None]:
# One Series of raw math scores per race/ethnicity group.
# groupby picks up every group label present in the data, so nothing is
# hard-coded and no group is silently left out.
race_math_scores = [scores for _, scores in data.groupby('race/ethnicity')['math score']]

# One-way ANOVA on the observed scores. Running it on means of repeated random
# subsamples would treat resampled values as independent observations, give a
# different p-value on every run, and fail outright for any group with fewer
# rows than the subsample size.
# Note: ttest4 holds the ANOVA F statistic (name kept from the original cell).
ttest4, pval4 = stats.f_oneway(*race_math_scores)
print(float(pval4))
if pval4 < 0.05:
    print("reject null hypothesis")
else:
    # A large p-value is lack of evidence against H0, not proof that H0 is true.
    print("fail to reject null hypothesis")

# Bar heights are the group means of the math score.
sns.barplot(x=data['race/ethnicity'], y=data['math score']);

**Null Hypothesis( H0 ):** There is no difference in the writing scores between students of different races.

**Alternative Hypothesis( HA ):** There is a difference in the writing scores between students of different races.

In [None]:
# One Series of raw writing scores per race/ethnicity group.
# groupby picks up every group label present in the data, so nothing is
# hard-coded and no group is silently left out.
race_writing_scores = [scores for _, scores in data.groupby('race/ethnicity')['writing score']]

# One-way ANOVA on the observed scores. Running it on means of repeated random
# subsamples would treat resampled values as independent observations, give a
# different p-value on every run, and fail outright for any group with fewer
# rows than the subsample size.
# Note: ttest5 holds the ANOVA F statistic (name kept from the original cell).
ttest5, pval5 = stats.f_oneway(*race_writing_scores)
print(float(pval5))
if pval5 < 0.05:
    print("reject null hypothesis")
else:
    # A large p-value is lack of evidence against H0, not proof that H0 is true.
    print("fail to reject null hypothesis")

# Bar heights are the group means of the writing score.
sns.barplot(x=data['race/ethnicity'], y=data['writing score']);

**Null Hypothesis( H0 ):** There is no difference in the reading scores between students of different races.

**Alternative Hypothesis( HA ):** There is a difference in the reading scores between students of different races.

In [None]:
# One Series of raw reading scores per race/ethnicity group.
# groupby picks up every group label present in the data, so nothing is
# hard-coded and no group is silently left out.
race_reading_scores = [scores for _, scores in data.groupby('race/ethnicity')['reading score']]

# One-way ANOVA on the observed scores. Running it on means of repeated random
# subsamples would treat resampled values as independent observations, give a
# different p-value on every run, and fail outright for any group with fewer
# rows than the subsample size.
# Note: ttest6 holds the ANOVA F statistic (name kept from the original cell).
ttest6, pval6 = stats.f_oneway(*race_reading_scores)
print(float(pval6))
if pval6 < 0.05:
    print("reject null hypothesis")
else:
    # A large p-value is lack of evidence against H0, not proof that H0 is true.
    print("fail to reject null hypothesis")

# Bar heights are the group means of the reading score.
sns.barplot(x=data['race/ethnicity'], y=data['reading score']);