In [1]:
import numpy as np 
import pandas as pd


In [2]:
Students_data = pd.read_excel('studentdemo.xlsx')
paired_data = pd.read_csv('pairdtestdata.csv')

In [3]:
Students_data.head()

Unnamed: 0,sex,age,address,internet,romantic,Dalc,absences,G1,G2,G3
0,F,18,U,no,no,1,6,5,6,6
1,F,17,U,yes,no,1,4,5,5,6
2,F,15,U,yes,no,2,10,7,8,10
3,F,15,U,yes,yes,1,2,15,14,15
4,F,16,U,no,no,1,4,6,10,10


In [4]:
Students_data.describe()

Unnamed: 0,age,Dalc,absences,G1,G2,G3
count,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,1.481013,5.708861,10.908861,10.713924,10.41519
std,1.276043,0.890741,8.003096,3.319195,3.761505,4.581443
min,15.0,1.0,0.0,3.0,0.0,0.0
25%,16.0,1.0,0.0,8.0,9.0,8.0
50%,17.0,1.0,4.0,11.0,11.0,11.0
75%,18.0,2.0,8.0,13.0,13.0,14.0
max,22.0,5.0,75.0,19.0,19.0,20.0


In [5]:
from scipy.stats import ttest_1samp
## library for one sample t-test

# INFERENTIAL STATISTICS
# HYPOTHESIS TESTING USING 
#### Pvalue 
#### Critical value 
#### Confidence interval

## one sample t-test using pvalue
To test whether the sample mean and the hypothesized mean are equal or different 
Null hypothesis - no relationship or no difference thus have the same mean 
Alternative hypothesis - relationship between variables thus different means
If pvalue is less than 0.05 then we reject the null hypothesis otherwise we fail to reject 
if the sample mean is not significantly different tothe hypothesized then we reject the null hypothesis 

In [6]:
tscore, pvalue = ttest_1samp(Students_data.age, popmean=17)
print("t Statistic:  {:.4f}".format(tscore))
print("P Value:  {:.4f}".format(pvalue))

if pvalue<0.05:
    print('The pvalue is less than alpha = 0.05, reject the null hypothesis')
else:
        print('Since pvalue is greater than 0.05, there isno sufficient evidence to reject the null hypothesis')


t Statistic:  -4.7317
P Value:  0.0000
The pvalue is less than alpha = 0.05, reject the null hypothesis


In [7]:
## library for independent t-test
from scipy.stats import ttest_ind

In [8]:
Students_data.groupby('sex').mean()

Unnamed: 0_level_0,age,Dalc,absences,G1,G2,G3
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
F,16.730769,1.254808,6.216346,10.620192,10.389423,9.966346
M,16.657754,1.73262,5.144385,11.229947,11.074866,10.914439


## Two sample t-test(independent) using pvalue
even if male students had higher mean average in G1 compared to their female counterparts, the difference is not statistically significant 

In [9]:
G1_male = Students_data.G1[Students_data.sex=='M']
G1_female = Students_data.G1[Students_data.sex=='F']

tscore, pvalue = ttest_ind(G1_male, G1_female, equal_var = False)
print("t Statistic: {:.4f}".format(tscore))
print("p value: {:.4f}".format(pvalue))

if pvalue<0.05:
    print('The pvalue is less than alpha = 0.05, reject the null hypothesis')
else:
        print('Since pvalue is greater than 0.05, there isno sufficient evidence to reject the null hypothesis')



t Statistic: 1.8237
p value: 0.0690
Since pvalue is greater than 0.05, there isno sufficient evidence to reject the null hypothesis


## paired sample t-test using pvalue
mean difference in the weight of chicks before and two weeks after they are subject to new feed is statistically significant. Therefore we have enough evidence to reject the null hypothesis 

In [10]:
## library for paired sample t-test
from scipy.stats import ttest_rel 

In [11]:
tscore, pvalue = ttest_rel(paired_data['Weight Before'], paired_data['Weight After'])
print("t Statistic: {:.4f}".format(tscore))
print("P Value: {:.4f}".format(pvalue))

if pvalue<0.05:
    print('The pvalue is less than alpha = 0.05, reject the null hypothesis')
else:
        print('Since pvalue is greater than 0.05, there isno sufficient evidence to reject the null hypothesis')


t Statistic: -18.8433
P Value: 0.0000
The pvalue is less than alpha = 0.05, reject the null hypothesis


# Decision based on critical value to test hypothesis
We look at the value of the test statistics. If the absolute value of the test statistics (tscore) is greater than the t_critical value, we reject the null hypothesis 
We will use 0.975 to find the critical value 

In [12]:
from scipy.stats import t ###library to get the critical value 
import math

## One sample t-test using critical value

In [13]:
tscore, pvalue = ttest_1samp(Students_data.age, popmean=17)
print("t Statistic: {:.4f}".format(tscore))
print("P Value: {:.4f}".format(pvalue))

t_critical_value = t.ppf(q=0.975, df=len(Students_data.age)-1)
print("T critical value: {:.4f}".format(t_critical_value))
print()
if np.abs(tscore)>t_critical_value:
    print('The absolute of T_Statistics is greater than T_critical value, reject the null hypothesis')
else:
        print('Since absolute of T_Statistics is less than T_critical value, there is no sufficient evidence to reject the null hypothesis')


t Statistic: -4.7317
P Value: 0.0000
T critical value: 1.9660

The absolute of T_Statistics is greater than T_critical value, reject the null hypothesis


## Independent two sample t-test
   ### Mean difference in G1

In [14]:
G1_male = Students_data.G1[Students_data.sex=='M']
G1_female = Students_data.G1[Students_data.sex=='F']

tscore, pvalue = ttest_ind(G1_male, G1_female, equal_var = False)
print("t Statistic: {:.4f}".format(tscore))
print("p value: {:.4f}".format(pvalue))

t_critical_value = t.ppf(q=0.975, df=len(G1_male)+len(G1_female)-2)
print("T critical value: {:.4f}".format(t_critical_value))
print()
if np.abs(tscore)>t_critical_value:
    print('The absolute of T_Statistics is greater than T_critical value, reject the null hypothesis')
else:
        print('Since absolute of T_Statistics is less than T_critical value, there is no sufficient evidence to reject the null hypothesis')


t Statistic: 1.8237
p value: 0.0690
T critical value: 1.9660

Since absolute of T_Statistics is less than T_critical value, there is no sufficient evidence to reject the null hypothesis


###  Mean difference in G2

In [15]:
G2_male = Students_data.G2[Students_data.sex=='M']
G2_female = Students_data.G2[Students_data.sex=='F']

tscore, pvalue = ttest_ind(G2_male, G2_female, equal_var = False)
print("t Statistic: {:.4f}".format(tscore))
print("p value: {:.4f}".format(pvalue))

t_critical_value = t.ppf(q=0.975, df=len(G2_male)+len(G2_female)-2)
print("T critical value: {:.4f}".format(t_critical_value))
print()
if np.abs(tscore)>t_critical_value:
    print('The absolute of T_Statistics is greater than T_critical value, reject the null hypothesis')
else:
        print('Since absolute of T_Statistics is less than T_critical value, there is no sufficient evidence to reject the null hypothesis')


t Statistic: 1.8077
p value: 0.0714
T critical value: 1.9660

Since absolute of T_Statistics is less than T_critical value, there is no sufficient evidence to reject the null hypothesis


# Confidence interval method in hypothesis testing 
## one sample t-test using confidence interval
we will reject the null hyposthesis because the hypothesized mean 17 is not within the confidence interval

In [16]:
print("Hypothesized mean age = 17")

## calculate the critical value 
t_critical_value = t.ppf(q=0.975, df=394)
print("T critical value: {:.4f}".format(t_critical_value))

## Standard deviation
std_dv = Students_data.age.std(ddof=1)
print("The standard deviation: {:.4f}".format(std_dv))

##Standard error
std_error = std_dv/math.sqrt(len(Students_data.age))
print("The stanard error: {:.4f}".format(std_error))

##Calculate Margin of error
error_margin =t_critical_value*std_error
print("The margin of Error: {:.4f}".format(error_margin))

##Confidence interval
confidence_interval = (Students_data.age.mean() - error_margin,
                      Students_data.age.mean() + error_margin)
print("The confidence interval is: ", confidence_interval)
print()
print("Reject the null hypothesis if the hypothesized mean does not fall within the confidence interval")

## we will reject the null hyposthesis because the hypothesized mean 17 is not within the confidence interval

Hypothesized mean age = 17
T critical value: 1.9660
The standard deviation: 1.2760
The stanard error: 0.0642
The margin of Error: 0.1262
The confidence interval is:  (16.569975930412973, 16.822429132878163)

Reject the null hypothesis if the hypothesized mean does not fall within the confidence interval


## Independent sample t-test (confidence interval)
we will fail to reject the null hypothesis as the difference in mean 0.610 is is less than differnce in confidence level 0.655,
and the pvalue is more than 0.05 

In [17]:
G1_male = Students_data.G1[Students_data.sex=='M']
G1_female = Students_data.G1[Students_data.sex=='F']

#sample size of male and  female
N_male = len(G1_male)
N_female = len(G1_female)
degree_of_freedom = N_male + N_female - 2

tscore, pvalue = ttest_ind(G1_male, G1_female)
print("t Statistic: {:.4f}".format(tscore))
print("p value: {:.4f}".format(pvalue))
t_critical_value = t.ppf(q=0.975, df = degree_of_freedom)

## calculate the mean difference and 95% confidence interval
std_male = G1_male.std(ddof=1)
std_female = G1_female.std(ddof=1)

pooled_std = math.sqrt( ((N_male-1)*(std_male)**2 + (N_female-1)*(std_female)**2)/ degree_of_freedom)
std_error = (pooled_std/math.sqrt(N_female + N_male))

diff_mean = G1_male.mean() - G1_female.mean()
MoE = t_critical_value * std_error

print("Sample sizes: \n\tsample size of male = {}\n\tsample size of female = {}".format(N_male, N_female))
print("Means in G1: \n\tMean of male in G1 = {:.3f}\n\tMean of female in G1 = {:.3f}".format(G1_male.mean(), G1_female.mean()))
print("Standard deviation in G1: \n\tStd of male in G1 = {:.3f}\n\tStd of female in G1 = {:.3f}".format(std_male, std_female))
print("Pooled std, Standard error and Margin of error: \n\tPooled Std Deviation = {:.3f}\n\tStandard Error = {:.3f}\n\tMargin of Error = {:.3f}".format(pooled_std, std_error, MoE))
print("The results of the independent t-test are: \n\tt-value = {:.3f}\n\tp_value = {:.3f}\n\tt-critical value = {:.3f}".format(tscore, pvalue, t_critical_value))
print("\nThe difference between groups is {:.3f} [{:.3f} to {:.3f}] (mean [95% CI])".format(diff_mean, diff_mean - MoE, diff_mean + MoE))

## we will fail to reject the null hypothesis as the difference in mean 0.610 is less than differnce in confidence level 
## and the pvalue is more than 0.05 

t Statistic: 1.8284
p value: 0.0683
Sample sizes: 
	sample size of male = 187
	sample size of female = 208
Means in G1: 
	Mean of male in G1 = 11.230
	Mean of female in G1 = 10.620
Standard deviation in G1: 
	Std of male in G1 = 3.393
	Std of female in G1 = 3.233
Pooled std, Standard error and Margin of error: 
	Pooled Std Deviation = 3.309
	Standard Error = 0.167
	Margin of Error = 0.327
The results of the independent t-test are: 
	t-value = 1.828
	p_value = 0.068
	t-critical value = 1.966

The difference between groups is 0.610 [0.282 to 0.937] (mean [95% CI])


In [18]:
## we fail to reject the null hypothesis as the difference in confidence interval is more than the difference between groups
0.937 - 0.282
## NB if the differnce between groups 0.610 is greater than the difference in confidence interval 
## then we reject the null hypothesis

0.655

# Using the pingouin library to test hypothesis

In [19]:
## library for pingouin 
import pingouin as pg

## One sample t-test using pingouin

In [24]:
result = pg.ttest(Students_data.age, 17, correction = False)
display(result)

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-4.731703,394,two-sided,3e-06,"[16.57, 16.82]",0.238078,2798.407,0.997111


In [26]:
## Mean of the students age 
Students_data.age.mean()

16.696202531645568

### One sided t-test(less)
##### Null hypothesis: Mean_age >= 17
##### Alternative hypothesis : Mean_age < 17

In [22]:
## Reject the null hypothesis
result = pg.ttest(Students_data.age, 17, correction = False, alternative='less')
display(result)

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-4.731703,394,less,2e-06,"[-inf, 16.8]",0.238078,5596.814,0.99896


### One sided t-test(greater)
##### Null hypothesis:  Mean_age <= 17
##### Alternative hypothesis : Mean_age > 17

In [23]:
## Fail to reject the null hypothesis
result = pg.ttest(Students_data.age, 17, correction = False, alternative='greater')
display(result)

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-4.731703,394,greater,0.999998,"[16.59, inf]",0.238078,0.0,0.0


## Two sample t-test using pingouin

In [51]:
G1_male = Students_data.G1[Students_data.sex == 'M']
G1_female = Students_data.G1[Students_data.sex == 'F']

print('Mean difference: {:.4f}'.format(G1_male.mean()-G1_female.mean()))
result = pg.ttest(G1_male, G1_female, correction='auto')
display(result)

Mean difference: 0.6098


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.82366,383.788441,two-sided,0.068981,"[-0.05, 1.27]",0.184251,0.552,0.445962


# CORELATION Between G1,G2 and G3 
## using Pearson r

In [56]:
from scipy.stats import spearmanr
from scipy.stats import pearsonr
from scipy.stats import kendalltau

In [57]:
cor, p_value = pearsonr(Students_data.G1, Students_data.G2)
print("correlation coefficient for G1 and G2: {:.4f}".format(cor))
print("P Value for G1 and G2: {:.4f}".format(p_value))
if p_value<0.05:
    print('The p_value is less than alpha = 0.05, correlation is statistically significant')
else:
        print('Correlation not statistically significant')
        
print('*************G1 and G2***********')
        
cor, p_value = pearsonr(Students_data.G1, Students_data.G3)
print("correlation coefficient for G1 and G3: {:.4f}".format(cor))
print("P Value for G1 and G3: {:.4f}".format(p_value))
if p_value<0.05:
    print('The p_value is less than alpha = 0.05, correlation is statistically significant')
else:
        print('Correlation not statistically significant')
        
print('*************G2 and G3***********')
        
cor, p_value = pearsonr(Students_data.G2, Students_data.G3)
print("correlation coefficient for G2 and G3: {:.4f}".format(cor))
print("P Value for G2 and G3: {:.4f}".format(p_value))

        

        



correlation coefficient for G1 and G2: 0.8521
P Value for G1 and G2: 0.0000
The p_value is less than alpha = 0.05, correlation is statistically significant
*************G1 and G2***********
correlation coefficient for G1 and G3: 0.8015
P Value for G1 and G3: 0.0000
The p_value is less than alpha = 0.05, correlation is statistically significant
*************G2 and G3***********
correlation coefficient for G2 and G3: 0.9049
P Value for G2 and G3: 0.0000


## Using spearmanr

In [58]:
cor, p_value = spearmanr(Students_data.G1, Students_data.G2)
print("correlation coefficient for G1 and G2: {:.4f}".format(cor))
print("P Value for G1 and G2: {:.4f}".format(p_value))
if p_value<0.05:
    print('The p_value is less than alpha = 0.05, correlation is statistically significant')
else:
        print('Correlation not statistically significant')
        
print('*************G1 and G2***********')
        
cor, p_value = spearmanr(Students_data.G1, Students_data.G3)
print("correlation coefficient for G1 and G3: {:.4f}".format(cor))
print("P Value for G1 and G3: {:.4f}".format(p_value))
if p_value<0.05:
    print('The p_value is less than alpha = 0.05, correlation is statistically significant')
else:
        print('Correlation not statistically significant')
        
print('*************G2 and G3***********')
        
cor, p_value = spearmanr(Students_data.G2, Students_data.G3)
print("correlation coefficient for G2 and G3: {:.4f}".format(cor))
print("P Value for G2 and G3: {:.4f}".format(p_value))


correlation coefficient for G1 and G2: 0.8948
P Value for G1 and G2: 0.0000
The p_value is less than alpha = 0.05, correlation is statistically significant
*************G1 and G2***********
correlation coefficient for G1 and G3: 0.8780
P Value for G1 and G3: 0.0000
The p_value is less than alpha = 0.05, correlation is statistically significant
*************G2 and G3***********
correlation coefficient for G2 and G3: 0.9571
P Value for G2 and G3: 0.0000


## Using Kendall Tau

In [59]:
cor, p_value = kendalltau(Students_data.G1, Students_data.G2)
print("correlation coefficient for G1 and G2: {:.4f}".format(cor))
print("P Value for G1 and G2: {:.4f}".format(p_value))
if p_value<0.05:
    print('The p_value is less than alpha = 0.05, correlation is statistically significant')
else:
        print('Correlation not statistically significant')
        
print('*************G1 and G2***********')
        
cor, p_value = kendalltau(Students_data.G1, Students_data.G3)
print("correlation coefficient for G1 and G3: {:.4f}".format(cor))
print("P Value for G1 and G3: {:.4f}".format(p_value))
if p_value<0.05:
    print('The p_value is less than alpha = 0.05, correlation is statistically significant')
else:
        print('Correlation not statistically significant')
        
print('*************G2 and G3***********')
        
cor, p_value = kendalltau(Students_data.G2, Students_data.G3)
print("correlation coefficient for G2 and G3: {:.4f}".format(cor))
print("P Value for G2 and G3: {:.4f}".format(p_value))


correlation coefficient for G1 and G2: 0.7735
P Value for G1 and G2: 0.0000
The p_value is less than alpha = 0.05, correlation is statistically significant
*************G1 and G2***********
correlation coefficient for G1 and G3: 0.7463
P Value for G1 and G3: 0.0000
The p_value is less than alpha = 0.05, correlation is statistically significant
*************G2 and G3***********
correlation coefficient for G2 and G3: 0.8835
P Value for G2 and G3: 0.0000


# Regression using stats model

In [60]:
import statsmodels.api as sm

In [61]:
mod = sm.OLS(Students_data.G3, Students_data.G1)
res = mod.fit()
print(res.summary())

                                 OLS Regression Results                                
Dep. Variable:                     G3   R-squared (uncentered):                   0.940
Model:                            OLS   Adj. R-squared (uncentered):              0.940
Method:                 Least Squares   F-statistic:                              6212.
Date:                Fri, 27 Aug 2021   Prob (F-statistic):                   2.54e-243
Time:                        22:45:31   Log-Likelihood:                         -964.10
No. Observations:                 395   AIC:                                      1930.
Df Residuals:                     394   BIC:                                      1934.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

# Multiple Linear Regression

In [63]:
predictors = Students_data[['G1','G2']]
response = Students_data.G3

mult_mod = sm.OLS(response, predictors)
mult_res = mult_mod.fit()
print(mult_res.summary())

                                 OLS Regression Results                                
Dep. Variable:                     G3   R-squared (uncentered):                   0.969
Model:                            OLS   Adj. R-squared (uncentered):              0.969
Method:                 Least Squares   F-statistic:                              6151.
Date:                Fri, 27 Aug 2021   Prob (F-statistic):                   2.71e-297
Time:                        22:51:30   Log-Likelihood:                         -834.58
No. Observations:                 395   AIC:                                      1673.
Df Residuals:                     393   BIC:                                      1681.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------