## 6.1 Data for Comparing Groups

In [None]:
%config InlineBackend.figure_format = 'svg'
import pandas as pd
# Display configuration for the notebook: allow up to 500 rows and 500
# columns in rendered frames, and set the text display width to 70
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 70)

In [None]:
import pandas as pd
# Load the segment data from a remote CSV and preview the first rows.
# NOTE(review): the alternative Colab loader in this notebook reads its
# CSV with index_col=0. If this URL serves the same file, the saved index
# will show up here as an extra unnamed column -- confirm.
seg_df = pd.read_csv('http://bit.ly/PMR-ch5')
seg_df.head()

In [None]:
from google.colab import files
import pandas as pd

# Alternative loader for Google Colab: prompt for a file upload, then
# read the CSV by its fixed file name. This rebinds seg_df, replacing
# the frame loaded from the URL in the previous cell.
# NOTE(review): `uploaded` is not used below; the read presumably relies
# on the upload placing a file with exactly this name in the working
# directory -- confirm.
uploaded = files.upload()
seg_df = pd.read_csv('segment_dataframe_Python_intro_Ch5.csv', index_col=0)
seg_df.head()

## 6.2 Testing Group Frequencies: scipy.stats.chisquare()

In [None]:
import numpy as np
# Build a toy sample of 95 group labels: labels 0, 1 and 2 occur 25
# times each and label 3 occurs 20 times
tmp = np.repeat(np.arange(4), repeats=[25, 25, 25, 20])
tmp

In [None]:
# Tabulate the sample: distinct labels and how often each one occurs
tmp_values, tmp_counts = np.unique(tmp, return_counts=True)
tmp_counts

In [None]:
from scipy import stats
# Chi-square test on the observed counts (no expected frequencies given)
stats.chisquare(tmp_counts)

In [None]:
# Repeat the test with the last group reduced to 10 observations and
# print the counts, the test result, and the equal-split expected counts
tmp_values, tmp_counts = np.unique(
    np.repeat(np.arange(4), [25, 25, 25, 10]), return_counts=True)
print(tmp_counts)
print(stats.chisquare(tmp_counts))
print('Expected values: {}'.format(np.full(4, tmp_counts.sum() / 4)))

In [None]:
# Same proportions with one fifth of the observations
tmp_counts_small = tmp_counts / 5
print(tmp_counts_small)
print(stats.chisquare(tmp_counts_small))
print('Expected values: {}'.format(
    np.full(4, tmp_counts_small.sum() / 4)))

In [None]:
# Much larger sample: ten times the counts plus 1900 extra per group
tmp_counts_large = 1900 + 10 * tmp_counts
print(tmp_counts_large)
print(stats.chisquare(tmp_counts_large))
print('Expected values: {}'.format(
    np.full(4, tmp_counts_large.sum() / 4)))

In [None]:
# Count the observations in each segment, then run the chi-square test
# on those counts
segment_values, segment_counts = np.unique(
    seg_df['Segment'], return_counts=True)
print(segment_counts)
stats.chisquare(segment_counts)

In [None]:
# The same per-segment counts, obtained directly from pandas
seg_df.Segment.value_counts()

In [None]:
# Chi-square test applied to the pandas per-segment counts
stats.chisquare(seg_df.Segment.value_counts())

In [None]:
# Two-way table: subscribe (rows) by own_home (columns)
pd.crosstab(seg_df.subscribe, columns=seg_df.own_home)

In [None]:
# Chi-square contingency test on the subscribe-by-own_home table
stats.chi2_contingency(pd.crosstab(seg_df.subscribe,
                                   columns=seg_df.own_home))

In [None]:
# Keep the contingency table for reuse, then print each element of the
# test result on its own labelled line
sub_by_home = pd.crosstab(index=seg_df['subscribe'],
                          columns=seg_df['own_home'])
print(
    'chisq_stat: {0}\np_value: {1}\ndof: {2}\nexpected_values: {3}'.format(
        *stats.chi2_contingency(sub_by_home)))

In [None]:
# Same report with correction=False passed to chi2_contingency
print('chisq_stat: {0}\np_value: {1}\ndof: {2}\nexpected_values: {3}'
      .format(*stats.chi2_contingency(sub_by_home,
                                      correction=False)))

## 6.3 Testing Observed Proportions: binom_test()

In [None]:
from statsmodels.stats import proportion as sms_proportion
# Binomial test: 12 successes in 20 observations against proportion 0.5
sms_proportion.binom_test(count=12, nobs=20, prop=0.5)

In [None]:
# Confidence interval for the proportion 12/20 at alpha=0.05
sms_proportion.proportion_confint(count=12, nobs=20, alpha=0.05)

In [None]:
# Same observed proportion with ten times the observations (120 of 200)
sms_proportion.binom_test(count=120, nobs=200, prop=0.5)

In [None]:
# Confidence interval for the proportion 120/200 at alpha=0.05
sms_proportion.proportion_confint(count=120, nobs=200, alpha=0.05)

In [None]:
# Total probability of 8 through 12 successes in 20 trials with p=0.5
stats.binom.pmf([8, 9, 10, 11, 12], p=0.5, n=20).sum()

In [None]:
# Same sum, with the outcomes 8..12 written as a range
stats.binom.pmf(range(8,13), p=0.5, n=20).sum()

In [None]:
# Interval for 12 of 20 using the 'agresti_coull' method
sms_proportion.proportion_confint(12, 20, method='agresti_coull')

In [None]:
# Interval for 0 of 20 (no observed successes) using 'agresti_coull'
sms_proportion.proportion_confint(0, 20, method='agresti_coull')

## 6.4 Testing Group Means: scipy.stats.ttest_ind()

In [None]:
from scipy import stats
import matplotlib.pyplot as plt

# Split income into two groups using own_home as a boolean row mask
income_own_home = seg_df.loc[seg_df['own_home'], 'income']
income_dont_own_home = seg_df.loc[~seg_df['own_home'], 'income']

# Exploratory plots: overall and per-group histograms, a boxplot by
# ownership, and a normal probability plot of income on a fresh figure
seg_df['income'].hist() # Not shown
income_own_home.hist(alpha=0.5) # Not shown
income_dont_own_home.hist(alpha=0.5) # Not shown
seg_df.boxplot(column='income', by='own_home') # Not shown
plt.figure()
_ = stats.probplot(seg_df['income'], dist='norm', plot=plt) # Not shown

In [None]:
# Mean and standard deviation of income for the non-owner group
income_dont_own_home.mean(), income_dont_own_home.std()

In [None]:
# Mean and standard deviation of income for the owner group
income_own_home.mean(), income_own_home.std()

In [None]:
# Two-sample t-test of income between the groups; equal_var=False
# requests the unequal-variance (Welch) form of the test
stats.ttest_ind(income_dont_own_home, income_own_home,
                equal_var=False)

In [None]:
# 95% confidence interval for the difference in mean income
# (non-owners minus owners), consistent with the Welch t-test above.
count_dont_own_home = income_dont_own_home.shape[0]
count_own_home = income_own_home.shape[0]
sem_dont_own_home = stats.sem(income_dont_own_home)
sem_own_home = stats.sem(income_own_home)

# Standard error of the difference between two independent means: the
# squared standard errors of the groups add. (A weighted average of the
# squared standard errors understates this and gives an interval that
# is too narrow.)
sem_diff = np.sqrt(sem_dont_own_home**2 + sem_own_home**2)

# Welch-Satterthwaite approximation to the degrees of freedom, matching
# the unequal-variance assumption of the t-test
dof = sem_diff**4 / (sem_dont_own_home**4 / (count_dont_own_home - 1)
                     + sem_own_home**4 / (count_own_home - 1))

# The confidence level is passed positionally: SciPy renamed this
# keyword from `alpha` to `confidence`, so neither name works on every
# SciPy version
stats.t.interval(0.95,
                 df=dof,
                 loc=income_dont_own_home.mean()
                     - income_own_home.mean(),
                 scale=sem_diff)

In [None]:
def ttest(a, b):
  '''Print group statistics, a Welch's t-test and a 95% CI for a - b.

  Args:
    a: first group of observations; must provide .mean(), .std() and
      .shape (e.g. a pandas Series).
    b: second group of observations, same requirements as a.

  Returns:
    None. All results are printed.
  '''
  # Get means and standard deviation of each group
  mean_a = a.mean()
  mean_b = b.mean()

  std_a = a.std()
  std_b = b.std()

  print('Group a - mean: {0}  standard deviation: {1}'
        .format(mean_a, std_a))
  print('Group b - mean: {0}  standard deviation: {1}\n'
        .format(mean_b, std_b))

  # Run a Welch's t-test between the groups
  ttest_out = stats.ttest_ind(a, b, equal_var=False)
  print("Welch's t-test statistic: {0}\np-value: {1}\n"
        .format(ttest_out.statistic, ttest_out.pvalue))

  # Find the 95% confidence interval using the scipy.stats.t.interval
  # function. The difference in means is the location of the
  # distribution (loc parameter). The scale is the standard error of
  # the difference between two independent means, i.e. the square root
  # of the sum of the squared standard errors of the groups.
  count_a = a.shape[0]
  count_b = b.shape[0]
  sem_a = stats.sem(a)
  sem_b = stats.sem(b)
  sem_diff = np.sqrt(sem_a**2 + sem_b**2)

  # Welch-Satterthwaite degrees of freedom, so that the interval agrees
  # with the unequal-variance t-test printed above
  dof = sem_diff**4 / (sem_a**4 / (count_a - 1)
                       + sem_b**4 / (count_b - 1))

  # The confidence level is passed positionally: SciPy renamed this
  # keyword from `alpha` to `confidence`
  ci_low, ci_high = stats.t.interval(0.95, df=dof,
                                     loc=mean_a - mean_b,
                                     scale=sem_diff)
  # Convert to plain floats so the printed tuple does not depend on how
  # the installed NumPy version renders its scalar types
  print('95% confidence interval of the mean difference between a and'
        ' b:\n{0}'
        .format((float(ci_low), float(ci_high))))
  

In [None]:
# Compare income of non-owners (group a) with owners (group b)
ttest(income_dont_own_home, income_own_home)

In [None]:
# Repeat the comparison using only rows whose Segment is 'travelers'
traveler_subset = seg_df.loc[seg_df.Segment == 'travelers']
ttest(traveler_subset.income[~traveler_subset.own_home],
      traveler_subset.income[traveler_subset.own_home])

## 6.5 Testing Multiple Group Means: Analysis of Variance (ANOVA)

In [None]:
# One-way ANOVA on the same two income groups
stats.f_oneway(income_dont_own_home, income_own_home)

In [None]:
import statsmodels.formula.api as smf
from statsmodels.stats import anova as sms_anova

In [None]:
# Fit income as a function of own_home, then show its ANOVA table
income_home_lm = smf.ols('income ~ own_home', data=seg_df).fit()
sms_anova.anova_lm(income_home_lm)

In [None]:
# Fit income as a function of Segment, then show its ANOVA table
income_segment_lm = smf.ols('income ~ Segment', data=seg_df).fit()
sms_anova.anova_lm(income_segment_lm)

In [None]:
# Model with both Segment and own_home as additive terms
income_home_segment_lm = smf.ols('income ~ Segment + own_home',
                                 data=seg_df).fit()
sms_anova.anova_lm(income_home_segment_lm)

In [None]:
# Model using the `*` formula operator between Segment and own_home.
# Note: this rebinds income_home_segment_lm, replacing the additive
# model fitted in the previous cell.
income_home_segment_lm = smf.ols('income ~ Segment * own_home',
                                 data=seg_df).fit()
sms_anova.anova_lm(income_home_segment_lm)

In [None]:
# Pass two fitted models to anova_lm to compare them: Segment alone
# versus Segment plus own_home
sms_anova.anova_lm(smf.ols('income ~ Segment', data=seg_df).fit(),
                   smf.ols('income ~ Segment + own_home',
                           data=seg_df).fit(),
                   typ=1)

In [None]:
# Refit the Segment-only model and display its full summary
income_segment_lm = smf.ols('income ~ Segment', data=seg_df).fit()
income_segment_lm.summary()

In [None]:
# Fitted coefficients of the Segment-only model
income_segment_lm.params

In [None]:
# Confidence intervals for those coefficients
income_segment_lm.conf_int()

In [None]:
# Refit with `-1` in the formula, which removes the intercept term so
# that each Segment level gets its own coefficient; the following cells
# use these coefficients as the per-segment means
income_segment_lm_adjusted = smf.ols('income ~ -1 + Segment',
                                     data=seg_df).fit()
income_segment_lm_adjusted.summary()

In [None]:
# Coefficients of the no-intercept model, kept as `means` for plotting
means = income_segment_lm_adjusted.params
means

In [None]:
# Confidence intervals of those coefficients; the plotting cells below
# read column 0 as the lower bound and column 1 as the upper bound
ci = income_segment_lm_adjusted.conf_int()
ci

In [None]:
# Horizontal bar chart of the means; error bar length is the distance
# from each mean to its upper confidence bound
means.plot(kind='barh', xerr=ci[1]-means, color='0.7')

In [None]:
import matplotlib.pyplot as plt
# Draw each confidence interval as a thin horizontal bar running from
# its lower to its upper bound, with the mean marked as a red dot.
# The row positions are derived from len(means) everywhere, so the bars,
# tick labels and dots stay aligned if the number of segments changes.
plt.figure(figsize=(8,4))
plt.barh(y=range(len(means)), left=ci[0], width=ci[1]-ci[0],
         height=0.2, color='0.4')
plt.xlabel('income ($)')
plt.yticks(range(len(means)), ci.index)
plt.plot(means, range(len(means)), 'ro')
plt.xlim((0, 68000))

In [None]:
def plot_confidence_intervals(centers, conf_ints, zero_line=False):
  '''Plot centers and confidence intervals

  Opens a new figure and draws one row per entry, ordered by ascending
  center: a horizontal bar spanning the confidence interval and a red
  dot at the center. Rows are labelled with the index of conf_ints.

  Args:
    centers: pandas Series of point estimates.
    conf_ints: pandas DataFrame in the same row order as centers, whose
      first column is the lower bound and second column the upper bound.
    zero_line: if True, draw a dashed vertical line at x=0 and set the
      x-limits so that the line and every interval are visible.

  Returns:
    None. The plot is drawn on the current matplotlib figure.
  '''
  plt.figure(figsize=(8,4))
  sort_index = np.argsort(centers.values)
  # argsort yields integer positions, so reorder positionally with
  # .iloc; plain [] indexing on a Series is label-based
  centers = centers.iloc[sort_index]
  conf_ints = conf_ints.iloc[sort_index]
  lower = conf_ints.iloc[:, 0]
  upper = conf_ints.iloc[:, 1]
  positions = range(len(centers))
  plt.barh(y=positions, left=lower, width=upper - lower,
           height=0.2, color='0.4')
  plt.yticks(positions, conf_ints.index)
  plt.plot(centers, positions, 'ro')
  if zero_line:
    plt.plot([0,0],[-.5, len(centers) - 0.5], 'gray',
             linestyle='dashed')
    # Keep a small margin around zero, and widen either limit when an
    # interval extends beyond it, so intervals below zero are not clipped
    plt.xlim((min(-.05, 1.1 * lower.min()),
              max(.05, 1.1 * upper.max())))
  plt.ylim((-.5, len(centers) - 0.5))

In [None]:
# Plot the segment means with their intervals and a reference line at 0
plot_confidence_intervals(means, ci, zero_line=True)
plt.xlabel('income')