In [221]:
import statsmodels.api as sm
from scipy import stats 
import pandas as pd 
import numpy as np

# Q 1] determine whether there is any significant difference in the diameter of the cutlet between two units

In [222]:
# Null hypothesis
# H0 : Diameter of cutlets between two units are equal
# Alternate hypothesis
# H1 : Diameter of cutlets between two units are not equal

In [223]:
# load dataset
data1 = pd.read_csv('Cutlets.csv')

In [224]:
# dataset have 35 observations/samples
# and 2 columns 
data1.shape

(35, 2)

In [225]:
data1.head()

Unnamed: 0,Unit A,Unit B
0,6.809,6.7703
1,6.4376,7.5093
2,6.9157,6.73
3,7.3012,6.7878
4,7.4488,7.1522


In [226]:
# data is normally distributed
data1.skew()

Unit A   -0.123684
Unit B   -0.379945
dtype: float64

In [227]:
# checking for homoscedasticity
data1.var()
# variances are considerably same 

Unit A    0.083179
Unit B    0.117924
dtype: float64

In [228]:
# 5% significance_level(alpha) 
alpha = 0.05

In [229]:
# all the required conditions to perform statistical test are satisfied
# use independent two sample t-test
t_value, p_value, _ = sm.stats.ttest_ind(data1['Unit A'],data1['Unit B'])

In [230]:
print(f'alpha       : {alpha}')
print(f'T statistic : {t_value}')
print(f'P value     : {p_value}')

if p_value > alpha:
    print('We fail to reject null hypothesis H0\n')
    print('we can conclude that:\nDiameter of cutlets between two units are equal')
else :
    print('We reject null hypothesis H0\n')
    print('we can conclude that:\nDiameter of cutlets between two units are not equal')

alpha       : 0.05
T statistic : 0.7228688704678063
P value     : 0.47223947245995
We fail to reject null hypothesis H0

we can conclude that:
Diameter of cutlets between two units are equal


### since p_value(0.47) > alpha(0.05) we can conclude, Diameter of cutlets between two units are equal

# Q 2] determine whether there is any difference in the average Turn Around Time (TAT) of reports of the laboratories on their preferred list

In [231]:
# Null hypothesis
# H0 : average Turn Around Time (TAT) of reports are equal
# Alternate hypothesis 
# H1 : not all average Turn Around Time (TAT) of reports are equal

In [232]:
# load dataset
data2 = pd.read_csv('LabTAT.csv')

In [233]:
data2.head()

Unnamed: 0,Laboratory 1,Laboratory 2,Laboratory 3,Laboratory 4
0,185.35,165.53,176.7,166.13
1,170.49,185.91,198.45,160.79
2,192.77,194.92,201.23,185.18
3,177.33,183.0,199.61,176.42
4,193.41,169.57,204.63,152.6


In [234]:
# dataset have 120 observations/samples
# and 4 columns 
data2.shape

(120, 4)

In [235]:
# data is normally distributed
data2.skew()

Laboratory 1   -0.177091
Laboratory 2    0.040659
Laboratory 3    0.133584
Laboratory 4   -0.104030
dtype: float64

In [236]:
# checking for homoscedasticity
data2.var()
# variances are considerably same

Laboratory 1    173.543568
Laboratory 2    223.715251
Laboratory 3    273.539607
Laboratory 4    227.559632
dtype: float64

In [237]:
# 5% significance_level(alpha)
alpha = 0.05

In [238]:
# all conditions required to perform statistical test are satisfied
# use one_way anova-test 
f_value, p_value = stats.f_oneway(data2.iloc[:,0],
                                      data2.iloc[:,1],
                                      data2.iloc[:,2],
                                      data2.iloc[:,3])

In [239]:
print(f'alpha       : {alpha}')
print(f'T statistic : {f_value}')
print(f'P value     : {p_value}')

if p_value > alpha:
    print('We fail to reject null hypothesis H0\n')
    print('we can conclude that:\naverage Turn Around Time (TAT) of reports are equal')
else :
    print('We reject null hypothesis H0\n')
    print('we can conclude that:\nnot all average Turn Around Time (TAT) of reports are equal')

alpha       : 0.05
T statistic : 118.70421654401437
P value     : 2.1156708949992414e-57
We reject null hypothesis H0

we can conclude that:
not all average Turn Around Time (TAT) of reports are equal


### since p_value(2.12e-17) < alpha(0.05), we can conclude, not all average Turn Around Time (TAT) of reports are equal

# Q 3] Find if male-female buyer rations are similar across regions

In [240]:
# Null hypothesis
# H0 : male-female buyer ration are similar across regions
# Alternate hypothesis
# H1 : not all male-female buyer ration are similar across regions

In [241]:
# load dataset 
data3 = pd.read_csv('BuyerRatio.csv')

In [242]:
data3

Unnamed: 0,Observed Values,East,West,North,South
0,Males,50,142,131,70
1,Females,435,1523,1356,750


In [243]:
# slicing data
sliceData3 = data3.values[:,1:]
sliceData3

array([[50, 142, 131, 70],
       [435, 1523, 1356, 750]], dtype=object)

In [244]:
# 5% significance_level(alpha)
alpha = 0.05

In [245]:
# use one-way chi-square-test 
chi2_value, p_value, _,exp = stats.chi2_contingency(sliceData3)

In [246]:
print(f'alpha       : {alpha}')
print(f'Chi statistic : {f_value}')
print(f'P value     : {p_value}')

if p_value > alpha:
    print('We fail to reject null hypothesis H0\n')
    print('we can conclude that:\nmale-female buyer ration are similar across regions.')
else :
    print('We reject null hypothesis H0\n')
    print('we can conclude that:\nnot all male-female buyer ration are similar across regions.')

alpha       : 0.05
Chi statistic : 118.70421654401437
P value     : 0.6603094907091882
We fail to reject null hypothesis H0

we can conclude that:
male-female buyer ration are similar across regions.


### since p_value(0.66) > alpha(0.05), we can conclude, male-female buyer ration are similar across regions

# Q 4] check whether the defective %  varies by centre

In [247]:
# Null hypothesis 
# H0 : defect % does not varies by center
# Alternate hypothesis
# H1 : defect % does varies by center

In [248]:
# load datset 
data4 = pd.read_csv('Costomer+OrderForm.csv')

In [249]:
data4.head()

Unnamed: 0,Phillippines,Indonesia,Malta,India
0,Error Free,Error Free,Defective,Error Free
1,Error Free,Error Free,Error Free,Defective
2,Error Free,Defective,Defective,Error Free
3,Error Free,Error Free,Error Free,Error Free
4,Error Free,Error Free,Defective,Error Free


In [250]:
# dataset have 300 observations/samples
# and 4 columns 
data4.shape

(300, 4)

In [251]:
# dataset contains categorical data
data4.dtypes

Phillippines    object
Indonesia       object
Malta           object
India           object
dtype: object

In [252]:
# dataset contains no null values
data4.isnull().sum()

Phillippines    0
Indonesia       0
Malta           0
India           0
dtype: int64

In [253]:
# calculating frquencies of errorFree customer forms
# and defective cutsomer forms 
errorFree = []
defective = []
for i in range(len(data4.columns)):
    x = data4.iloc[:,i].value_counts()
    errorFree.append(x[0])
    defective.append(x[1])
freqArray = np.array([errorFree,defective])

In [254]:
freqArray

array([[271, 267, 269, 280],
       [ 29,  33,  31,  20]], dtype=int64)

In [255]:
# 5% significance_level(alpha)
alpha = 0.05

In [256]:
# if percentage of defective order forms varies 
# then ration of errorFree&defective order forms 
# will also vary 
# use one-way chi-squared 
chi2_value, p_value,_,exp = stats.chi2_contingency(freqArray)

In [257]:
print(f'alpha       : {alpha}')
print(f'Chi statistic : {chi2_value}')
print(f'P value     : {p_value}')

if p_value > alpha:
    print('We fail to reject null hypothesis H0\n')
    print('we can conclude that:\ndefect % does not vary by center')
else :
    print('We reject null hypothesis H0\n')
    print('we can conclude that:\ndefect % varies by center')

alpha       : 0.05
Chi statistic : 3.858960685820355
P value     : 0.2771020991233135
We fail to reject null hypothesis H0

we can conclude that:
defect % does not vary by center


### since p_value(0.28) > alpha(0.05), we can conclude, defect % does not vary by center