# One- sample t-test
This test will tell us if mean of the sample and population are same or different.

In [1]:
ages = [10,55,54,74,16,45,63,95,23,28,34,36,10,9,74,88,44,67,40,35,69,5,78,44,66,55,71,30,37,41,25,27,41,36]

In [2]:
len(ages)

34

In [3]:
import numpy as np
mean = np.mean(ages)
print(mean)

44.85294117647059


In [4]:
# Let's take sample
sample_size =20
age_sample = np.random.choice(ages,sample_size)

In [5]:
age_sample

array([36, 16, 36, 71, 78, 71, 36, 23, 66, 41, 44, 41, 16, 25, 63, 30, 66,
       71, 27, 41])

In [6]:
from scipy.stats import ttest_1samp

In [7]:
ttest, p_value = ttest_1samp(age_sample,44) # popmean = The mean age of our population

In [8]:
print(p_value)

0.8439532002078209


In [9]:
if p_value <0.05:
    print('We are rejecting Null Hypothesis')
else:
    print('We are accepting Null Hypothesis')

We are accepting Null Hypothesis


Let's try with larger sample size:

In [10]:
# With 100 sample size
sample_size = 100
age_sample = np.random.choice(ages,sample_size)

In [11]:
ttest,p_value = ttest_1samp(age_sample,44)

In [12]:
print(p_value)

0.10733325571163867


In [13]:
if p_value <0.05:
    print('We are rejecting Null Hypothesis')
else:
    print('We are accepting Null Hypothesis')

We are accepting Null Hypothesis


In [14]:
# With 10000 sample size
sample_size = 10000
age_sample = np.random.choice(ages,sample_size)

In [15]:
ttest,p_value = ttest_1samp(age_sample,44)

In [16]:
print(p_value)

8.738538718270296e-05


In [17]:
if p_value <0.05:
    print('We are rejecting Null Hypothesis')
else:
    print('We are accepting Null Hypothesis')

We are rejecting Null Hypothesis


#### As we can see the sample size incraesed the p_value reduced and we got our verdict of rejecting the Null hypothesis

## Some more exmaple

In [18]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import math
np.random.seed(6)# number generator
school_height = stats.poisson.rvs(loc = 18, mu =140, size = 1000) # loc is left most value in bell curve
classA_height = stats.poisson.rvs(loc = 18, mu =155, size = 60)

In [19]:
classA_height.mean()

175.45

In [20]:
ttest, p_value = stats.ttest_1samp(a = classA_height, popmean = school_height.mean())

In [21]:
print(p_value)

3.210528554315708e-16


In [22]:
school_height.mean()

158.338

In [23]:
if p_value <0.05:
    print('We are rejecting Null Hypothesis')
else:
    print('We are accepting Null Hypothesis')

We are rejecting Null Hypothesis


# Two-sampled t-tes
 It is used to determine whether the means of two groups are significantly different from each other.

In [24]:
np.random.seed(12)
classB_height = stats.poisson.rvs(loc=18, mu = 135,size = 60 )

In [25]:
classB_height.mean()

153.11666666666667

In [26]:
ttest, p_value = stats.ttest_ind(a= classA_height, b=classB_height, equal_var = False) # equal_var = False, doesn't assume equal population variance

In [27]:
print(p_value)

6.153333069521077e-18


In [28]:
if p_value <0.05:
    print('We are rejecting Null Hypothesis')
else:
    print('We are accepting Null Hypothesis')

We are rejecting Null Hypothesis


# Paired T-test
When you want to check how different samples from same group are

In [29]:
weight1 = [45,41,60,66,42,28,29,25,30,33,38,47,50,52]
weight2 = weight1+stats.norm.rvs(scale = 5, loc = -1.25,size = 14)

In [30]:
print(weight1)
print(weight2)

[45, 41, 60, 66, 42, 28, 29, 25, 30, 33, 38, 47, 50, 52]
[58.95843103 36.61959389 66.27950456 61.81331956 47.57926457 32.91022437
 30.00444617 20.54295091 21.86201983 36.57873174 30.3299827  39.3771395
 56.36420881 58.05941216]


In [31]:
weight = pd.DataFrame({'weight1': np.array(weight1),'weight2':np.array(weight2),'weight difference': np.array(weight2)-np.array(weight1)})

In [32]:
weight

Unnamed: 0,weight1,weight2,weight difference
0,45,58.958431,13.958431
1,41,36.619594,-4.380406
2,60,66.279505,6.279505
3,66,61.81332,-4.18668
4,42,47.579265,5.579265
5,28,32.910224,4.910224
6,29,30.004446,1.004446
7,25,20.542951,-4.457049
8,30,21.86202,-8.13798
9,33,36.578732,3.578732


In [33]:
ttest, p_value = stats.ttest_rel(a = weight1, b = weight2)

In [34]:
print(p_value)

0.6674197543184104


In [35]:
if p_value <0.05:
    print('We are rejecting Null Hypothesis')
else:
    print('We are accepting Null Hypothesis')

We are accepting Null Hypothesis


# Chi Squared test
This is applied for two categorical variables from a single population. It is used to determine whether there is significant association between two variables.

In [38]:
import seaborn as sns

In [39]:
df = sns.load_dataset('tips')

In [40]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [41]:
df_table = pd.crosstab(df['sex'],df['smoker'])# they may assume homogeneity of variance across groups being compared.

In [42]:
df_table

smoker,Yes,No
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,60,97
Female,33,54


In [43]:
observed_value = df_table.values
print('Observed Value:\n', observed_value )

Observed Value:
 [[60 97]
 [33 54]]


In [70]:
val = stats.chi2_contingency(df_table)

In [71]:
val # We get the expected value

Chi2ContingencyResult(statistic=0.0, pvalue=1.0, dof=1, expected_freq=array([[59.84016393, 97.15983607],
       [33.15983607, 53.84016393]]))

In [72]:
expected_values = val[3]

In [73]:
expected_values # saving the array of expected values by initiliazing an object

array([[59.84016393, 97.15983607],
       [33.15983607, 53.84016393]])

In [74]:
no_of_rows = len(df.iloc[0:2,0])
no_of_columns = len(df.iloc[0,0:2])
ddof = (no_of_rows-1)* (no_of_columns-1) # degree of freedpm

print('Degree of Freedom:',ddof)

Degree of Freedom: 1


In [75]:
from scipy.stats import chi2
chi_square = sum([(o-e)**2./e for o,e in zip(observed_value, expected_values)])
chi_square_stat = chi_square[0]+chi_square[1]

The zip() function in Python is used to combine two or more iterable dictionaries into a single iterable, where corresponding elements from the input iterable are paired together as tuples.

In [76]:
print('Chi Square Statistics: ',chi_square_stat)

Chi Square Statistics:  0.001934818536627623


In [80]:
alpha = 0.05
critical_value = chi2.ppf(q = 1-alpha,df = ddof) # ppf is opposite of cdf
print('Critical value: ',critical_value)

Critical value:  3.841458820694124


In [83]:
#p-value
p_value = 1-chi2.cdf(x= chi_square_stat,df = ddof)
print('Degree of Freedom:',ddof)
print('Significance level:',alpha)
print('P-Value',p_value)

Degree of Freedom: 1
Significance level: 0.05
P-Value 0.964915107315732


In [87]:
# Use critical-value or p-value, both works
if chi_square_stat>=critical_value:
    print('We are rejecting Null Hypothesis and that there is a relationship between two categorical features')
else:
    print('We are accepting Null Hypothesis and that there is NO relationship between two categorical features')
if p_value <=alpha:
    print('We are rejecting Null Hypothesis and that there is a relationship between two categorical features')
else:
    print('We are accepting Null Hypothesis and that there is NO relationship between two categorical features')

We are accepting Null Hypothesis and that there is NO relationship between two categorical features
We are accepting Null Hypothesis and that there is NO relationship between two categorical features
