In [1]:
# Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from scipy import stats
from statsmodels.formula.api import ols
import statsmodels.api as sm

In [2]:
df = pd.read_csv('../data/cleaned_student_data.csv')
df.head()

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


## **Using One-way ANOVA, Comparing the mean marks of students from three different test.**

    Hypothesis state:
        Null hypothesis : there is no significance difference between the different test score means.
        Altenative hypothsis : There is significance difference between the different test score means.

    Significance level = 0.05

In [3]:
math_score = df['math_score']
reading_score = df['reading_score']
writing_score = df['writing_score']

f_stats, p_value = stats.f_oneway(math_score, reading_score, writing_score)
print(f'F-stats : {f_stats:.2f}')
print(f'P-value : {p_value:.4f}\n')

if p_value < 0.05:
    print(f'Since {p_value:0.4f} < 0.05, Reject Null hypothesis')
    print('There is significance difference between the different test score means.')
else:
    print(f'Since {p_value:.4f} > 0.05, Fail to reject Null hypothesis')
    print('There is no significance difference between the different test score means.')

F-stats : 10.82
P-value : 0.0000

Since 0.0000 < 0.05, Reject Null hypothesis
There is significance difference between the different test score means.


# **Chi-Square Tests – Goodness of Fit**

    There were 1000 students sit in the test. We expected equal participation from each enthincity.

    Hypothesis state:
        Null hypothesis : there is no significance difference between observed and expected values.
        Altenative hypothsis : There is significance difference between observed and expected values.

    Significance level = 0.05

In [4]:
by_ethnicity = df.groupby('race/ethnicity').agg(count=('race/ethnicity', 'count'))
observed = by_ethnicity['count']
expected = [1000//len(by_ethnicity.index)]*len(by_ethnicity.index) #Since we expect equal participation from each ethnicity group

chi2, p_value = stats.chisquare(f_obs=observed, f_exp=expected)
print(f'Chi-Square Goodness of Fit Test = {chi2}')
print(f'P-value : {p_value:.4f}\n')

if p_value < 0.05:
    print(f'Since {p_value:0.4f} < 0.05, Reject Null hypothesis')
    print('There is significance difference between observed and expected values')
else:
    print(f'Since {p_value:.4f} > 0.05, Fail to reject Null hypothesis')
    print('There is no significance difference between observed and expected values')

Chi-Square Goodness of Fit Test = 170.13
P-value : 0.0000

Since 0.0000 < 0.05, Reject Null hypothesis
There is significance difference between observed and expected values


# **Two-Way ANOVA**

    We want to know whether course completion affects Average score in different test across Gender.
    Design a mock dataset and perform a Two-Way ANOVA.

In [5]:
# Only male 
course_comp_male = df.query("gender == 'male'")
by_course_completion_male = course_comp_male.groupby('test_preparation_course').mean(numeric_only=True).stack()

# Only female
course_comp_female = df.query("gender == 'female'")
by_course_completion_female = course_comp_female.groupby('test_preparation_course').mean(numeric_only=True).stack()


new_df = pd.DataFrame({
    'Gender': np.repeat(['Male', 'Female'], 6),
    'Course_completion': np.tile(np.repeat(['Completed', 'Not completed'], 3), 2),
    'Average_Score': list(np.round(by_course_completion_male, 2)) + list(np.round(by_course_completion_female, 2))
})
new_df

Unnamed: 0,Gender,Course_completion,Average_Score
0,Male,Completed,72.34
1,Male,Completed,70.21
2,Male,Completed,69.79
3,Male,Not completed,66.69
4,Male,Not completed,62.8
5,Male,Not completed,59.65
6,Female,Completed,67.2
7,Female,Completed,77.38
8,Female,Completed,78.79
9,Female,Not completed,61.67


In [6]:
# Two-Way ANOVA
model = ols('Average_Score ~ C(Gender) + C(Course_completion) + C(Gender):C(Course_completion)', data=new_df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table

Unnamed: 0,sum_sq,df,F,PR(>F)
C(Gender),42.262533,1.0,2.25765,0.171358
C(Course_completion),175.873633,1.0,9.39511,0.015461
C(Gender):C(Course_completion),0.017633,1.0,0.000942,0.976267
Residual,149.7576,8.0,,


    Gender:
        P-value(0.1713) > 0.05, There is no significant effect of Gender on Average score of different test.

    Course completion:
        P-value(0.0154) < 0.05, There is significant effect of Course completion on Average score of different test.

    Gender/Course_completion :
        P-value(0.976) > 0.05, There is no significant interaction effect between Gender and Course completion.

    This means the effect of Course completion on average score does not depend on Gender.

### We want to know whether course completion and Lunch type affect Average score across different test.

In [7]:
# Free meal
free_meal = df.query("lunch == 'free/reduced'")
by_course_completion1 = free_meal.groupby('test_preparation_course').mean(numeric_only=True).stack()

# Standard meal
standard = df.query("lunch == 'standard'")
by_course_completion2 = standard.groupby('test_preparation_course').mean(numeric_only=True).stack()


new_df = pd.DataFrame({
    'Lunch': np.repeat(['Free meal', 'Standard meal'], 6),
    'Course_completion': np.tile(np.repeat(['Completed', 'Not completed'], 3), 2),
    'Average_Score': list(np.round(by_course_completion1, 2)) + list(np.round(by_course_completion2, 2))
})
new_df

Unnamed: 0,Lunch,Course_completion,Average_Score
0,Free meal,Completed,63.05
1,Free meal,Completed,69.87
2,Free meal,Completed,70.35
3,Free meal,Not completed,56.51
4,Free meal,Not completed,61.6
5,Free meal,Not completed,58.74
6,Standard meal,Completed,73.53
7,Standard meal,Completed,76.22
8,Standard meal,Completed,76.77
9,Standard meal,Not completed,68.13


In [8]:
# Two-Way ANOVA
model = ols('Average_Score ~ C(Lunch) + C(Course_completion) + C(Lunch):C(Course_completion)', data=new_df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table

Unnamed: 0,sum_sq,df,F,PR(>F)
C(Lunch),219.393008,1.0,32.702638,0.000445
C(Course_completion),192.240075,1.0,28.655233,0.000683
C(Lunch):C(Course_completion),1.928008,1.0,0.287388,0.606479
Residual,53.6698,8.0,,


    Lunch:
        P-value(0.0004) < 0.05, There is no significant effect of Lunch on Average score of different test.
    
    Course completion:
        P-value(0.0006) < 0.05, There is significant effect of Course completion on Average score of different test.
    
    Gender/Course_completion :
        P-value(0.6064) > 0.05, There is no significant interaction effect between Lunch and Course completion.
    
    This means the effect of Course completion on average score does not depend on Gender.



# **One-sample T-test**

    Hypothesis state:
        Null hypothesis : there is no significance difference between the different test score means.
        Altenative hypothsis : There is significance difference between the different test score means.
    
    Significance level = 0.05

### **Maths score**

In [9]:
maths_population_mean = df['math_score'].mean()
np.random.seed(1)
maths_sample = np.random.choice(df['math_score'], size=25, replace=True)

t_stats, p_value = stats.ttest_1samp(maths_sample, maths_population_mean)
print(f'T-stats : {t_stats:0.2f}')
print(f'P-value : {p_value:0.4f}\n')

if p_value < 0.05:
    print(f'Since {p_value:0.4f} < 0.05, Reject Null hypothesis')
    print('There is significance difference between the population mean and sample mean')
else:
    print(f'Since {p_value:.4f} > 0.05, Fail to reject Null hypothesis')
    print('There is no significance difference between the population mean and sample mean')

T-stats : -0.07
P-value : 0.9472

Since 0.9472 > 0.05, Fail to reject Null hypothesis
There is no significance difference between the population mean and sample mean


### **Reading score**

In [10]:
readig_population_mean = df['reading_score'].mean()
np.random.seed(1)
reading_sample = np.random.choice(df['reading_score'], size=25, replace=True)
t_stats, p_value = stats.ttest_1samp(maths_sample, readig_population_mean)
print(f'T-stats : {t_stats:0.2f}')
print(f'P-value : {p_value:0.4f}\n')

if p_value < 0.05:
    print(f'Since {p_value:0.4f} < 0.05, Reject Null hypothesis')
    print('There is significance difference between the population mean and sample mean')
else:
    print(f'Since {p_value:.4f} > 0.05, Fail to reject Null hypothesis')
    print('There is no significance difference between the population mean and sample mean')

T-stats : -1.05
P-value : 0.3031

Since 0.3031 > 0.05, Fail to reject Null hypothesis
There is no significance difference between the population mean and sample mean


### **writing_score**

In [11]:
writing_population_mean = df['writing_score'].mean()
np.random.seed(1)
writing_sample = np.random.choice(df['writing_score'], size=25, replace=True)
t_stats, p_value = stats.ttest_1samp(maths_sample, writing_population_mean)
print(f'T-stats : {t_stats:0.2f}')
print(f'P-value : {p_value:0.4f}\n')

if p_value < 0.05:
    print(f'Since {p_value:0.4f} < 0.05, Reject Null hypothesis')
    print('There is significance difference between the population mean and sample mean')
else:
    print(f'Since {p_value:.4f} > 0.05, Fail to reject Null hypothesis')
    print('There is no significance difference between the population mean and sample mean')

T-stats : -0.70
P-value : 0.4934

Since 0.4934 > 0.05, Fail to reject Null hypothesis
There is no significance difference between the population mean and sample mean


# **Two-sample T-test**

    Hypothesis state:
        Null hypothesis : there is no significance difference between two sample.
        Altenative hypothsis : There is significance difference between two sample.
    
    Significance level = 0.05



In [12]:
np.random.seed(1)
reading_sample = np.random.choice(df['reading_score'], size=25, replace=True) # For reading score
writing_sample = np.random.choice(df['writing_score'], size=25, replace=True) # For writing score

t_stats, p_value = stats.ttest_ind(reading_sample, writing_sample)
print(f'T-stats : {t_stats:0.2f}')
print(f'P-value : {p_value:0.4f}\n')

if p_value < 0.05:
    print(f'Since {p_value:0.4f} < 0.05, Reject Null hypothesis')
    print('There is significance difference between the reading average score and writing average score')
else:
    print(f'Since {p_value:.4f} > 0.05, Fail to reject Null hypothesis')
    print('There is no significance difference between the reading average score and writing average score')

T-stats : -0.30
P-value : 0.7631

Since 0.7631 > 0.05, Fail to reject Null hypothesis
There is no significance difference between the reading average score and writing average score
