# Statistical Test

## 1. Chi-squared Test


##### Purpose: To test whether there is an association between two qualitative (categorical) features.

In [13]:
import seaborn as sns
import pandas as pd
import scipy.stats as stats

In [2]:
# List the names of the example datasets that seaborn can load.
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'exercise',
 'flights',
 'fmri',
 'gammas',
 'geyser',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'tips',
 'titanic']

In [5]:
# Load seaborn's 'tips' example dataset and display it.
tips_data = sns.load_dataset('tips')
tips_data

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


### 1. Initial Investigation

In [6]:
# (number of rows, number of columns) of the tips data.
tips_data.shape

(244, 7)

In [7]:
# Distinct values of the categorical `day` column.
tips_data["day"].unique()

['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Sun', 'Sat', 'Thur', 'Fri']

In [8]:
# Distinct values of the categorical `time` column.
tips_data["time"].unique()

['Dinner', 'Lunch']
Categories (2, object): ['Dinner', 'Lunch']

In [10]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
tips_data.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [32]:
# Contingency table of observed counts: one row per sex, one column per smoker status.
smoker_sex_data = pd.crosstab(index=tips_data["sex"], columns=tips_data["smoker"])
smoker_sex_data

smoker,Yes,No
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,60,97
Female,33,54


In [26]:
# Null hypothesis (H0): there is no association between smoker status and sex.
# Alternative hypothesis (Ha): there is an association between smoker status and sex.

# Decision rule: if the p-value is below 0.05, we reject H0.

In [33]:
# Chi-squared test of independence on the sex x smoker contingency table.
# chi2_contingency returns the statistic, the p-value, the degrees of freedom
# and the table of expected counts under independence. With one degree of
# freedom SciPy applies Yates' continuity correction by default.
chi_value, pval, df, expected_value = stats.chi2_contingency(smoker_sex_data)
print(
    'Chi-squared value: {}.\nPval: {}.\nDegree of Freedom: {}.\nExpected Value: \n{}.'.format(
        chi_value, pval, df, expected_value
    )
)

Chi-squared value: 0.008763290531773594.
Pval: 0.925417020494423.
Degree of Freedom: 1.
Expected Value: 
[[59.84016393 97.15983607]
 [33.15983607 53.84016393]].


In [30]:
# Contingency table of observed counts: one row per sex, one column per meal time.
time_sex_data = pd.crosstab(index=tips_data["sex"], columns=tips_data["time"])
time_sex_data

time,Lunch,Dinner
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,33,124
Female,35,52


In [31]:
# Chi-squared test of independence on the sex x time contingency table.
# The four returned values are the statistic, the p-value, the degrees of
# freedom and the expected counts under independence. With one degree of
# freedom SciPy applies Yates' continuity correction by default.
chi_value, pval, df, expected_value = stats.chi2_contingency(time_sex_data)
print(
    'Chi-squared value: {}.\nPval: {}.\nDegree of Freedom: {}.\nExpected Value: \n{}.'.format(
        chi_value, pval, df, expected_value
    )
)

Chi-squared value: 9.343808982970623.
Pval: 0.002237400118075248.
Degree of Freedom: 1.
Expected Value: 
[[ 43.75409836 113.24590164]
 [ 24.24590164  62.75409836]].


## ------------------------------------------------------------------------------------------------------------

## 2. T - test

##### Purpose: To test whether there is a significant difference between the averages (means) of two groups.

* 1 Sample t test
* 2 Sample t test
* Paired t test

### 1 Sample t test

In [34]:
# Full set of test scores; later cells treat this list as the population.
test_scores = [
    60, 40, 100, 80, 70, 20, 50,
    60, 20, 60, 69, 40, 10, 50,
    60, 40, 60, 20, 30, 30, 50,
]
len(test_scores)

21

In [37]:
import numpy as np

In [38]:
np.mean(test_scores) # Population mean: the full list of scores is treated as the population

48.523809523809526

In [43]:
# Draw a sample of 8 scores from the population (with replacement, which is
# NumPy's default for `choice`). A fixed seed makes the sample -- and every
# result derived from it -- identical on each "Restart & Run All"; without
# it the t-test conclusion could change from run to run.
random_test_scores = np.random.default_rng(seed=42).choice(test_scores, 8)
random_test_scores

array([ 60,  40,  60,  40,  80,  30,  20, 100])

In [48]:
# Mean of the sampled scores.
np.mean(random_test_scores)

53.75

In [45]:
from scipy.stats import ttest_1samp

In [47]:
# One-sample t-test of the sample mean against the population mean.
# The population mean is computed from `test_scores` instead of being typed
# in as a hand-rounded literal (48.52), so it cannot drift out of sync with
# the data.
t_score, pval = ttest_1samp(random_test_scores, np.mean(test_scores))

In [None]:
#H0 : There is no significant difference between the sample's average test score and the population average.
#Ha : There is a significant difference between the sample's average test score and the population average.

In [49]:
# Decision rule at the 5% significance level: reject H0 when the p-value is at most 0.05.
print(
    "We reject the null hypothesis saying that there is a signifcant difference im the average marks."
    if pval <= 0.05
    else "We do not reject the null hypothesis saying that there is no signifcant difference in the average marks."
)

We do not reject the null hypothesis saying that there is no signifcant difference in the average marks.


### 2 Sample t test

In [50]:
from scipy.stats import ttest_ind

In [54]:
# H0: the DS 11 AM batch and the 3 PM batch have the same average marks.
# Ha: the average marks of the DS 11 AM batch and the 3 PM batch differ.
test_scores_11am = [
    100, 90, 80, 50, 70, 40, 20, 10, 10, 10,
    10, 10, 10, 10, 10, 5, 5, 5, 5, 5,
]
test_scores_3pm = [
    100, 90, 90, 90, 80, 100, 80, 90, 90, 100,
    80, 90, 100, 100, 100, 100, 99, 85, 99, 100,
]

20

In [56]:
# Average marks of the 11 AM batch.
np.mean(test_scores_11am)

27.75

In [57]:
# Average marks of the 3 PM batch.
np.mean(test_scores_3pm)

93.15

In [63]:
# Independent two-sample t-test on the two batches. SciPy's default assumes
# equal population variances (pooled-variance Student's t-test).
# NOTE(review): the two batches look very different in spread; consider
# equal_var=False (Welch's t-test) -- confirm before changing.
t_score, pval = ttest_ind(a=test_scores_11am, b=test_scores_3pm)
pval

7.81437993692361e-11

In [62]:
# Report the outcome at the 5% significance level.
if not pval <= 0.05:
    # p-value above the threshold: H0 stands.
    print("We do not reject the null hypothesis saying that there is no signifcant difference in the average marks.")
else:
    print("We reject the null hypothesis saying that there is a signifcant difference im the average marks.")

We reject the null hypothesis saying that there is a signifcant difference im the average marks.


In [64]:
#Dhruvi
# H0: Class A and Class B have the same average marks in the Maths exam.
# Ha: The average Maths marks of Class A and Class B differ.
ClassA = [50, 80, 35, 65, 88, 79, 56, 77, 89, 98]
ClassB = [98, 77, 26, 35, 42, 99, 89, 75, 65, 32]

# Independent two-sample t-test, decided at the 5% significance level.
t_score, pval = ttest_ind(a=ClassA, b=ClassB)
print(
    "We reject the null hypothesis saying that there is a signifcant difference im the average marks."
    if pval <= 0.05
    else "We do not reject the null hypothesis saying that there is no signifcant difference in the average marks."
)

We do not reject the null hypothesis saying that there is no signifcant difference in the average marks.


In [67]:
#Farzan
# H0: Both bikes have the same average mileage.
# Ha: The average mileage of the two bikes differs.
bike_a = [25.5, 25.6, 27, 26.5]
bike_b = [35, 35.5, 37, 34.5]

# Independent two-sample t-test; `test_after` receives the t statistic.
test_after, pval = ttest_ind(a=bike_a, b=bike_b)

if not pval <= 0.05:
    print('We do not reject the Null Hypothesis')
else:
    print('We reject the Null Hypothesis saying that there is a significant diff. in the avg of milage of two bikes')


We reject the Null Hypothesis saying that there is a significant diff. in the avg of milage of two bikes


In [69]:
#Namrata
# H0: The chemical and mechanical branches have the same average number of students.
# Ha: The average number of students differs between the chemical and mechanical branches.
chem = [40, 30, 30, 50, 60, 50, 30, 30, 30, 20, 40, 70, 60, 40, 70]
Mec = [40, 50, 60, 50, 80, 30, 60, 30, 30, 30, 20, 40, 30, 30]

# Independent two-sample t-test; `num` receives the t statistic.
num, pval = ttest_ind(a=chem, b=Mec)
print(
    "we reject the null hypothesis saying that there is significant difference in number of students in Chemical and mechanical branch"
    if pval <= 0.05
    else "we do not reject the null hypothesis"
)

we do not reject the null hypothesis


In [76]:
#Snehal
# The null hypothesis is always the "no effect" statement; the original
# comments had H0 and Ha the wrong way round.
#H0 = There is no significant difference in the average Temperature of Delhi and Gurgaon
#Ha = There is a significant difference in the average Temperature of Delhi and Gurgaon
Delhi_temp = [45,36,42,30,39,47]
Gurgaon_temp = [34,45,38,35,40,29]

# Independent two-sample t-test, decided at the 5% significance level.
t_score, pval = ttest_ind(Delhi_temp,Gurgaon_temp)
if pval<=0.05:
    print("We reject the null hypothesis saying that there is a signifcant difference in the average Temperature of Delhi and Gurgaon.")
else:
    print("We do not reject the null hypothesis saying that there is no signifcant difference in the average Temperature of Delhi and Gurgaon.")


We do not reject the null hypothesis saying that there is no signifcant difference in the average Temperature of Delhi and Gurgaon.


### Paired t test

In [75]:
#H0: The average weight before and after the weight loss program is the same (no reduction).
#Ha: The average weight before the program is greater than the average weight after it,
#    i.e. the program produces a significant weight reduction.

from scipy.stats import ttest_rel

# Weights of the same 10 people, measured before and after the program
# (position i in both lists refers to the same person).
pre_weightloss_program = [140,120,100,120,110,100,90,90,80,100]
post_weightloss_program = [80,60,70,50,80,80,70,60,75,80]

# One-sided paired t-test: alternative="greater" tests mean(pre - post) > 0.
# A two-sided test would also reject H0 if people had *gained* weight, and
# the "program is working" conclusion below would then be wrong.
t_score, pval = ttest_rel(pre_weightloss_program, post_weightloss_program, alternative="greater")
if pval<=0.05:
    print("We reject the null hypothesis saying that there is a signifcant difference im the average weights before and after the weightloss program. My training program is working.")
else:
    print("We do not reject the null hypothesis saying that there is no signifcant difference in the average weights before and after the weightloss program")

We reject the null hypothesis saying that there is a signifcant difference im the average weights before and after the weightloss program. My training program is working.


## ANOVA

In [77]:
from scipy.stats import f_oneway

In [78]:
# H0 = There is no significant difference in the average number of people having pizza from dominos, mojjo and ovenstory.
# Ha = There is a significant difference in the average number of people having pizza from dominos, mojjo and ovenstory.
dom = [20,10,30,10,10,10]
mojjo = [50,60,60,10,10,10]
oven = [90,90,50,60,50,90]

# One-way ANOVA across the three groups; `num` receives the F statistic.
num,pval = f_oneway(dom,mojjo,oven)

if pval<=0.05:
    print("we reject the null hypothesis saying that there is significant difference in average no.of pizzas ordered from dom,mojjo,oven")
else:
    # Not rejecting H0 means no significant difference was found; the
    # original message said "significant difference" here, which
    # contradicted the decision it reported.
    print("we do not reject the null hypothesis saying that there is no significant difference in average no.of  pizzas ordered from dom,mojjo,oven")


we reject the null hypothesis saying that there is significant difference in average no.of pizzas ordered from dom,mojjo,oven


## 1 Sample Proportion Test

## 2 Sample proportion Test