In [155]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from scipy import stats
import statsmodels

## Steps to hypothesis testing

step1 : Create the hypothesis(null&alternate) <br>
step2 : choose appropriate statistical test <br>
step3 : set Alpha or Type 1 error for the experiment <br>
step4 : Get the data and samples <br>
step5 : Analyse <br>
step6 : state the decision <br>

## <font color='red'>Question 1

## Creating Null Hypothesis and alternate hypothesis
### Ho(null hypothesis)---> μ1 = μ2
### H1(alternate hypothesis) --> μ1≠ μ2

In [4]:
df = pd.read_csv('cutlets.csv')

In [7]:
df.describe()

Unnamed: 0,Unit A,Unit B
count,35.0,35.0
mean,7.019091,6.964297
std,0.288408,0.343401
min,6.4376,6.038
25%,6.8315,6.7536
50%,6.9438,6.9399
75%,7.28055,7.195
max,7.5169,7.5459


### It is a two sided test

In [40]:
# for 95 percent confidence interval the z critical value is + or - 1.96
from statsmodels.stats.weightstats import ztest as ztest
ztest(df['Unit A'],df['Unit B'],value=0)

(0.7228688704678061, 0.46976045023906077)

### we comparing p-value with '$\alpha$' (significance level) [two tailed test]

### if P-value is greater than '$\alpha$=0.025' we fail to reject the null hypothesis
### if P-value is lesser than '$\alpha$=0.025' we reject null hypothesis

In [43]:
alpha = 0.025
p_value = 0.72
print('significance_level:',alpha,'p_value',p_value)
if p_value <= alpha:
    print('We reject Null Hypothesis there is a significance difference between two Units A and B')
else:
    print('We fail to reject Null hypothesis')

significance_level: 0.025 p_value 0.72
We fail to reject Null hypothesis


### Conclusion: Where the p-value is greater than significance value 0.025, then we fail to reject the null hypothesis and the test was not statistically significant

## <font color='red'>Question 2

In [45]:
data = pd.read_csv('LabTAT.csv')

In [48]:
data.head(10)

Unnamed: 0,Laboratory 1,Laboratory 2,Laboratory 3,Laboratory 4
0,185.35,165.53,176.7,166.13
1,170.49,185.91,198.45,160.79
2,192.77,194.92,201.23,185.18
3,177.33,183.0,199.61,176.42
4,193.41,169.57,204.63,152.6
5,179.45,197.0,181.51,161.12
6,191.37,166.36,214.21,154.02
7,166.81,169.6,183.43,163.25
8,158.81,175.36,191.6,152.79
9,165.88,198.68,208.43,161.98


## We are doing ANOVA (Analysis of variance) test which is used to check if the means of two or more groups are statistically different from eachother
### Ho(null hypothesis) --> $\mu$1 = $\mu$2 = $\mu$3 =$\mu$4
### H1(alternate hypothesis)--> atleast one mean is different

In [49]:
from scipy.stats import f_oneway

In [50]:
f_oneway(data['Laboratory 1'],data['Laboratory 2'],data['Laboratory 3'],data['Laboratory 4'])

F_onewayResult(statistic=118.70421654401437, pvalue=2.1156708949992414e-57)

### The F statistic and p-value turn out to be equal to 118.70 and 2.11 respectively.
### Conclusion: Hence p-value is greater than 0.05 significant value hence then we fail to reject null hypothesis there is a no significance difference between TAT of reports of the laboratories

## <font color='red'>Question 3
### Creating null and alternate hypothesis
#### Ho(Null Hypothesis) : There is no similarity between  male and female ratios
#### H1(alternate Hypothesis) : There is similarity between male and female ratios

In [156]:
df = pd.read_csv('BuyerRatio.csv')

In [157]:
df.head()

Unnamed: 0,Observed Values,East,West,North,South
0,Males,50,142,131,70
1,Females,435,1523,1356,750


In [158]:
observed_values = df.values
observed_values = observed_values[0:2,1:5]

In [159]:
from scipy.stats import chi2

In [161]:
val = stats.chi2_contingency(observed_values)


(1.595945538661058,
 0.6603094907091882,
 3,
 array([[  42.76531299,  146.81287862,  131.11756787,   72.30424052],
        [ 442.23468701, 1518.18712138, 1355.88243213,  747.69575948]]))

In [114]:
Expected_values = val[3]

In [124]:
no_of_rows = 2
no_of_columns = 4
dof = (no_of_rows-1)*(no_of_columns-1)
alpha = 0.05

In [121]:
chi_square = sum([(o-e)**2/e for o,e in zip(observed_values,Expected_values)])
chi_square_statistic = chi_square[0]+chi_square[1]+chi_square[2]+chi_square[3]
chi_square_statistic

1.5959455386610577

In [125]:
critical_value = chi2.ppf(q=1-alpha,df=dof)
critical_value

7.814727903251179

In [127]:
if chi_square_statistic > critical_value:
    print('Reject H0,There is dependency between male and female ratios')
else:
    print('Retain H0, There is no dependency between male and female ratios and are similar across regions')

Retain H0, There is no dependency between male and female ratios and are similar across regions


## <font color='red'>Question 4

#### Null-Hypothesis---> defectives  are not varies by center
#### Alternate-Hypothesis----> defectives varies by center

In [140]:
dataset = pd.read_csv('Costomer+OrderForm.csv')

In [141]:
dataset

Unnamed: 0,Phillippines,Indonesia,Malta,India
0,Error Free,Error Free,Defective,Error Free
1,Error Free,Error Free,Error Free,Defective
2,Error Free,Defective,Defective,Error Free
3,Error Free,Error Free,Error Free,Error Free
4,Error Free,Error Free,Defective,Error Free
...,...,...,...,...
295,Error Free,Error Free,Error Free,Error Free
296,Error Free,Error Free,Error Free,Error Free
297,Error Free,Error Free,Defective,Error Free
298,Error Free,Error Free,Error Free,Error Free


In [142]:
print(dataset['Phillippines'].value_counts(),dataset['Indonesia'].value_counts(),dataset['Malta'].value_counts(),dataset['India'].value_counts())


Error Free    271
Defective      29
Name: Phillippines, dtype: int64 Error Free    267
Defective      33
Name: Indonesia, dtype: int64 Error Free    269
Defective      31
Name: Malta, dtype: int64 Error Free    280
Defective      20
Name: India, dtype: int64


In [148]:
from scipy import stats
observed_values = [[271,267,269,280],[29,33,31,20]]
print(observed_values)

[[271, 267, 269, 280], [29, 33, 31, 20]]


In [164]:
stats,p,dof,Expected_values = stats.chi2_contingency(observed_values)
print('Statistics: ',stats,"\n",'P-value: ',p,"\n",'Degreesoffreedom: ',dof,"\n",'Expected values: ',Expected_values)



Statistics:  1.595945538661058 
 P-value:  0.6603094907091882 
 Degreesoffreedom:  3 
 Expected values:  [[  42.76531299  146.81287862  131.11756787   72.30424052]
 [ 442.23468701 1518.18712138 1355.88243213  747.69575948]]


In [168]:
alpha = 0.05
critical_value = chi2.ppf(q=1-alpha,df=dof)
if abs(stats) > critical_value:
    print('Reject Null Hypothesis, defectives are varies by center')
else:
    print('Retain Null Hypothesis, defectives are not varied by center')

Retain Null Hypothesis, defectives are not varied by center
