### Hypothesis Testing Case Study

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
%matplotlib inline
import math

In [2]:
pwd()

'C:\\Users\\gaura\\OneDrive\\Desktop\\Class4,5 - Stats'

In [3]:
cust=pd.read_excel("cust_seg.xlsx")

In [4]:
cust.head(5)

Unnamed: 0,custid,sex,AqChannel,region,Marital_status,segment,pre_usage,Post_usage_1month,Latest_mon_usage,post_usage_2ndmonth
0,70,0,4,1,1,1,57,52,49.2,57.2
1,121,1,4,2,1,3,68,59,63.6,64.9
2,86,0,4,3,1,1,44,33,64.8,36.3
3,141,0,4,3,1,3,63,44,56.4,48.4
4,172,0,4,2,1,2,47,52,68.4,57.2


In [6]:
len(cust.columns)

10

In [13]:
cust.Latest_mon_usage.mean()

63.17400000000001

In [8]:
cust.Latest_mon_usage.std()

11.242137352892753

### One Sample T-Test


In [12]:
stats.ttest_1samp(a= cust.Latest_mon_usage,popmean= 63.17)  # sample data

Ttest_1sampResult(statistic=0.00503183164545989, pvalue=0.9959902367799631)

In [None]:
# If P-value is less than .05 we will reject the Null hypothesis 
# python always do 2-tailed .. 
# divide p-value by 2 then we can say it is 1-tailed test


In [14]:
cust.Latest_mon_usage.mean()

63.17400000000001

### Two Sample T-Test (Paired)

In [15]:
print (cust.pre_usage.mean())
print (cust.post_usage_2ndmonth.mean())

52.23
58.05250000000003


In [17]:
#stats.ttest_rel(a = before, b = after)
stats.ttest_rel(a=cust.post_usage_2ndmonth,
                b=cust.pre_usage)    # Assume samples have equal variance?

Ttest_relResult(statistic=8.866832246938742, pvalue=4.295733828012836e-16)

In [None]:
#Reject the Null hypothesis
# - A<B
# + B>A

### Two sample T-Test (Independent)

In [18]:
Males_spend = cust.Post_usage_1month[cust.sex==0]
FeMales_spend = cust.Post_usage_1month[cust.sex==1]

In [19]:
print (FeMales_spend.head(3))
print (Males_spend.head(3))

1     59
92    62
93    44
Name: Post_usage_1month, dtype: int64
0    52
2    33
3    44
Name: Post_usage_1month, dtype: int64


In [20]:
print (Males_spend.mean())
print (FeMales_spend.mean())

50.120879120879124
54.99082568807339


In [21]:
print (Males_spend.std())
print (FeMales_spend.std())

10.305160697259263
8.13371516959346


In [22]:
stats.ttest_ind(a= Males_spend, b= FeMales_spend, equal_var=False)    # Assume samples have equal variance?

Ttest_indResult(statistic=-3.6564080478875276, pvalue=0.00034088493594266187)

In [None]:
we will reject the null hypothesis -- neg signB>A

In [23]:
stats.ttest_ind(a= Males_spend,
                b= FeMales_spend,
                equal_var=True)    # Assume samples have equal variance?

Ttest_indResult(statistic=-3.7340738531536797, pvalue=0.00024625461203549315)

In [24]:
stats.f_oneway(Males_spend, FeMales_spend)

F_onewayResult(statistic=13.94330754080599, pvalue=0.0002462546120354903)

### ANOVA

In [25]:
cust.segment.value_counts()

2    105
3     50
1     45
Name: segment, dtype: int64

In [26]:
s1 = cust.Latest_mon_usage[cust.segment==1]
s2 = cust.Latest_mon_usage[cust.segment==2]
s3 = cust.Latest_mon_usage[cust.segment==3]

# Perform the ANOVA
stats.f_oneway(s1, s2, s3)

F_onewayResult(statistic=29.279283801321778, pvalue=7.36401083352674e-12)

In [27]:
cust.Latest_mon_usage.mean()
cust.Latest_mon_usage.std()

11.242137352892753

In [31]:
print (s1.mean(), s2.mean(),s3.mean())

60.026666666666685 68.08000000000003 55.703999999999986


### Chi-squre Test

In [34]:
t=pd.crosstab(cust.segment, cust.region, margins = True)
t

region,1,2,3,All
segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,16,20,9,45
2,19,44,42,105
3,12,31,7,50
All,47,95,58,200


In [35]:
stats.chi2_contingency(observed= t)

(16.60444164948934,
 0.055282939487992365,
 9,
 array([[ 10.575,  21.375,  13.05 ,  45.   ],
        [ 24.675,  49.875,  30.45 , 105.   ],
        [ 11.75 ,  23.75 ,  14.5  ,  50.   ],
        [ 47.   ,  95.   ,  58.   , 200.   ]]))

In [None]:
#We are goint to accept the Null hypothesis
# CHi-square test always do right tailed test
#If we check the independent to each other just reversed the whole test 

### Correlation

In [37]:
# Ho: Two variable are independent to each other 
print (np.corrcoef(cust.Latest_mon_usage, cust.Post_usage_1month))

[[1.         0.61744926]
 [0.61744926 1.        ]]


In [38]:
print (stats.stats.pearsonr(cust.Latest_mon_usage, cust.Post_usage_1month))

(0.6174492644854919, 2.0866647416871388e-22)


### Simple regression

In [None]:
sm.OLS?

In [None]:
 statsmodels.tools.add_constant?

In [None]:
import statsmodels.api as sm
Y = cust.Latest_mon_usage
X = sm.add_constant(cust.Post_usage_1month )
mod = sm.OLS(Y, X)
res = mod.fit()
print res.summary()