# DASC 512 - 21 - Two-Sample Tests of Centrality

***

In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.graphics.api as smg

sns.set_style('whitegrid')

In [2]:
# Let's start by creating a DataFrame for the data in the slides
sales = pd.DataFrame({'Year':([2007] * 5 + [2008] * 6), 'Sales':[21742,13457,25690,17500,42389,15473,41989,28795,19300,22317,27315]})

In [3]:
sales

Unnamed: 0,Year,Sales
0,2007,21742
1,2007,13457
2,2007,25690
3,2007,17500
4,2007,42389
5,2008,15473
6,2008,41989
7,2008,28795
8,2008,19300
9,2008,22317


# Using Python 

### Two-sample paired t-test

In [4]:
sales07 = [13457, 42389, 25690, 17500, 21742]
sales08 = [15473, 41989, 28795, 19300, 22317]
a = sales07
b = sales08
alternative = 'less'

t, p_val = stats.ttest_rel(a=a, b=b, alternative=alternative)
print(f'The test statistic is t={t:.2f} and the p-value is p={p_val:.2f}.')

The test statistic is t=-2.34 and the p-value is p=0.04.


In [8]:
# Equivalent: Do a one-sample t-test on the differences.
d = np.subtract(a,b)
t, p_val = stats.ttest_1samp(a=d, popmean=0, alternative=alternative)
print(f'The test statistic is t={t:.2f} and the p-value is p={p_val:.2f}.')

The test statistic is t=-2.34 and the p-value is p=0.04.


### Two-Sample t-test, without pooled variance

In [41]:
a = sales[sales['Year']==2007]['Sales']
b = sales[sales['Year']==2008]['Sales']
equal_var = False
alternative='less'  # Left-tailed test

t, p_val = stats.ttest_ind(a=a, b=b, equal_var = equal_var, alternative=alternative)
print(f'The test statistic is t={t:.2f} and the p-value is p={p_val:.2f}.')

The test statistic is t=-0.27 and the p-value is p=0.40.


### Two-Sample t-test, with pooled variance

In [42]:
a = sales[sales['Year']==2007]['Sales']
b = sales[sales['Year']==2008]['Sales']
equal_var = True
alternative='less'  # Left-tailed test

t, p_val = stats.ttest_ind(a=a, b=b, equal_var = equal_var, alternative=alternative)
print(f'The test statistic is t={t:.2f} and the p-value is p={p_val:.2f}.')

The test statistic is t=-0.28 and the p-value is p=0.39.


### Mann-Whitney U Test

In [39]:
x = sales[sales['Year']==2007]['Sales']
y = sales[sales['Year']==2008]['Sales']
u, p_val = stats.mannwhitneyu(x=a, y=b, alternative='less')
print(f'The test statistic is U={u:.0f} and the p-value is p={p_val:.2f}.')

The test statistic is U=12 and the p-value is p=0.33.


***

# Optional: Manual Calculations

If you'd like to see these tests performed manually, see below. Feel free to skip this section if it adds confusion for you. I used to teach this portion, but I'm leaning farther to application using Python instead.

### Two-Sample t-Test without Pooled Variance

In [26]:
# Calculate means
mean07 = sales[sales['Year']==2007]['Sales'].mean()
mean08 = sales[sales['Year']==2008]['Sales'].mean()
print(f'2007: {mean07:.2f} \n2008: {mean08:.2f}')

2007: 24155.60 
2008: 25864.83


In [27]:
# Calculate variance
var07 = sales[sales['Year']==2007]['Sales'].var()
var08 = sales[sales['Year']==2008]['Sales'].var()
print(f'2007: {var07:.2f} \n2008: {var08:.2f}')

2007: 124848444.30 
2008: 86870393.77


In [28]:
np.sqrt(var08)

9320.428840276967

In [29]:
# Calculate standard error
n1 = 5
n2 = 6
se = np.sqrt(var07/n1 + var08/n2)
se

6280.771275974879

In [30]:
# Calculate degrees of freedom
numer = se ** 4
denom = ((var07 / n1) ** 2) / (n1-1) + ((var08 / n2) ** 2) / (n2-1)
df = np.floor(numer/denom)
df

7.0

In [31]:
# Test statistic
t = (mean07 - mean08) / se
t

-0.2721374904816976

In [32]:
# Calculate p-value
p_val = stats.t.cdf(t, df=df)
p_val

0.3966842279934425

There is insufficient evidence to reject the null hypothesis.

**What if we had use pooled variance?**

### 2-Sample t-Test with Pooled Variance

In [33]:
# Calculate standard error, pooled variance
sp_squared = ((n1-1)*var07 + (n2-1)*var08) / (n1 + n2 - 2)
se_pooled = np.sqrt(sp_squared * (1/n1 + 1/n2))
se_pooled

6167.778640395064

In [34]:
# Degrees of freedom, pooled variance
numer = se_pooled ** 4
denom = ((var07 / n1) ** 2) / (n1-1) + ((var08 / n2) ** 2) / (n2-1)
df_pooled = np.floor(numer/denom)
df_pooled

7.0

In [35]:
# Test statistic
t_pooled = (mean07 - mean08) / se_pooled
t_pooled

-0.2771230021354742

In [36]:
# Calculate p-value
p_val_pooled = stats.t.cdf(t_pooled, df=df_pooled)
p_val_pooled

0.3948453911681864

**That didn't make much difference. What if we had paired data?**

### Paired t-Test

In [37]:
sales07 = [13457, 42389, 25690, 17500, 21742]
sales08 = [15473, 41989, 28795, 19300, 22317]
diffs = np.subtract(sales07, sales08)
diffs

array([-2016,   400, -3105, -1800,  -575])

In [38]:
t, p_val = stats.ttest_1samp(a=diffs, popmean=0, alternative='less')
print(f'Now the test statistic is t={t:.2f} and the p-value is p={p_val:.2f}!')

Now the test statistic is t=-2.34 and the p-value is p=0.04!


We now have sufficient evidence to reject the null hypothesis. Sales have increased. All this with one fewer data point which supported the alternative hypothesis!

**Using the paired t-test for paired data makes an enormous difference!**