# Research Methods 
## 2012 Olympics Hypothesis Testing

**04. December 2017**

Fabian Karl & Robert Brown


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy import stats
import scipy.optimize as opt

import matplotlib.pyplot as plt

# Let pandas render up to 2000 columns so wide frames are not truncated.
pd.set_option('display.max_columns', 2000)

In [None]:
# Read the London 2012 table; thousands=',' makes pandas parse values such as
# "1,234" as numbers. The 'Unnamed: 44' column is dropped right after loading
# (presumably an empty artifact column in the CSV -- TODO confirm).
df = pd.read_csv('london-2012.csv', thousands=',').drop(['Unnamed: 44'], axis=1)
df.head(10)

First, let's create a few simple plots and investigate linear correlations between a few variables using the Fisher transform.

In [None]:
def fisher_transform(r_xy):
    """Return the Fisher z-transform 0.5 * ln((1 + r) / (1 - r)) of r_xy.

    r_xy is a correlation coefficient (a scalar or anything that supports
    the arithmetic below together with np.log).
    """
    ratio = (1 + r_xy) / (1 - r_xy)
    return 0.5 * np.log(ratio)

In [None]:
# Scatter plot of 'NOC SIZE' against 'Total', followed by a two-sided
# significance test of their Pearson correlation.
# Rows with a missing value in either column are removed first, because
# np.corrcoef returns NaN as soon as one input value is NaN.
xy = df[['NOC SIZE', 'Total']].dropna()
n_obs = len(xy)

fig, ax = plt.subplots()
ax.scatter(xy['NOC SIZE'], xy['Total'])
ax.set_xlabel('NOC SIZE')
ax.set_ylabel('Total')
plt.show()

r = np.corrcoef(xy['NOC SIZE'], xy['Total'])[0][1]
z = fisher_transform(r)
# Under H0 (no correlation) the Fisher z value is approximately normal with
# standard error 1/sqrt(n - 3), so it has to be rescaled before it is compared
# with the standard normal. abs() keeps the two-sided p value <= 1 when r < 0.
z_score = z * np.sqrt(n_obs - 3)
print('p value: {0}'.format(2*stats.norm.sf(abs(z_score))))

In [None]:
# Scatter plot of 'GDP.2011' against 'Total', followed by a two-sided
# significance test of their Pearson correlation.
# Rows with a missing value in either column are removed first, because
# np.corrcoef returns NaN as soon as one input value is NaN.
xy = df[['GDP.2011', 'Total']].dropna()
n_obs = len(xy)

fig, ax = plt.subplots()
ax.scatter(xy['GDP.2011'], xy['Total'])
ax.set_xlabel('GDP.2011')
ax.set_ylabel('Total')
plt.show()

r = np.corrcoef(xy['GDP.2011'], xy['Total'])[0][1]
z = fisher_transform(r)
# Under H0 (no correlation) the Fisher z value is approximately normal with
# standard error 1/sqrt(n - 3), so it has to be rescaled before it is compared
# with the standard normal. abs() keeps the two-sided p value <= 1 when r < 0.
z_score = z * np.sqrt(n_obs - 3)
print('p value: {0}'.format(2*stats.norm.sf(abs(z_score))))

GDP seems to have a similarly important linear relationship with medals won.

In [None]:
# Scatter plot of 'pop.2010' against 'Total', followed by a two-sided
# significance test of their Pearson correlation.
# Rows with a missing value in either column are removed first, because
# np.corrcoef returns NaN as soon as one input value is NaN.
xy = df[['pop.2010', 'Total']].dropna()
n_obs = len(xy)

fig, ax = plt.subplots()
ax.scatter(xy['pop.2010'], xy['Total'])
ax.set_xlabel('pop.2010')
ax.set_ylabel('Total')
plt.show()

r = np.corrcoef(xy['pop.2010'], xy['Total'])[0][1]
z = fisher_transform(r)
# Under H0 (no correlation) the Fisher z value is approximately normal with
# standard error 1/sqrt(n - 3), so it has to be rescaled before it is compared
# with the standard normal. abs() keeps the two-sided p value <= 1 when r < 0.
z_score = z * np.sqrt(n_obs - 3)
print('p value: {0}'.format(2*stats.norm.sf(abs(z_score))))

In [None]:
# Labels of the numeric columns of df. numeric_only=True is passed explicitly
# because recent pandas versions raise on non-numeric columns instead of
# silently skipping them.
rows = df.median(numeric_only=True).keys()
def centeral_limit_theorem_samples(split_with, N = 30, M = 1000, data = None):
    """Draw repeated sample means from the two halves of a median split.

    The frame is split into the rows whose `split_with` value is at most the
    column median ('below') and the rows whose value is strictly greater
    ('above'). Rows where `split_with` is NaN satisfy neither comparison and
    end up in neither group. From each group, M samples of N rows are drawn
    (without replacement within a sample) and the per-column mean of every
    sample is recorded.

    Parameters
    ----------
    split_with : column label used for the median split.
    N : number of rows per sample.
    M : number of samples drawn from each group.
    data : DataFrame to sample from; defaults to the notebook-level `df`.

    Returns
    -------
    samples : dict with keys 'above' / 'below', each a list of M Series
        holding the numeric-column means of one sample.
    mu : dict with keys 'above' / 'below', each mapping a numeric column
        label to the list of its M sample means.

    Raises
    ------
    ValueError (from pandas) if N is larger than the number of rows in a group.
    """
    if data is None:
        # Backward-compatible default: the frame loaded earlier in the notebook.
        data = df

    # Only numeric columns have a mean; requesting them explicitly avoids the
    # TypeError recent pandas versions raise on string columns.
    columns = data.median(numeric_only=True).keys()

    median = data[split_with].median()
    groups = {
        'above': data[data[split_with] > median],
        'below': data[data[split_with] <= median],
    }

    samples = {'above': [], 'below': []}
    for _ in range(M):
        samples['above'].append(groups['above'].sample(N).mean(numeric_only=True))
        samples['below'].append(groups['below'].sample(N).mean(numeric_only=True))

    # Regroup the sample means per column: mu[side][column] -> list of M means.
    mu = {'above': {}, 'below': {}}
    for side in ('above', 'below'):
        for column in columns:
            mu[side][column] = [sample[column] for sample in samples[side]]
    return samples, mu

In [None]:
# Split the rows at the median of `split_feature`, then for each feature
# compare the distributions of sample means of the two halves.
split_feature = 'GDP.2011'
samples, mu = centeral_limit_theorem_samples(split_feature, N = 25, M = 100)

for feature in ['Total', 'NOC SIZE', 'Athlete rank score']:
    a = mu['above'][feature]
    b = mu['below'][feature]

    fig = plt.figure(figsize=(8, 8))
    axes = fig.add_subplot(111)

    # Shared bin edges so both histograms are drawn on the same grid.
    bins = np.histogram(np.hstack((a, b)), bins=15)[1]
    axes.hist(a, bins, label = '{0} above median'.format(split_feature))
    axes.hist(b, bins, label = '{0} below median'.format(split_feature), alpha=0.5)
    axes.set_title(feature)
    axes.legend(loc=2)
    plt.show()      # plt.show() does not take a figure argument
    plt.close(fig)  # avoid accumulating open figures across loop iterations

    # Welch's t-test (unequal variances) on the two sets of sample means.
    res = stats.ttest_ind(a, b, equal_var=False)
    print('p value: {0}'.format(res.pvalue))


These are all unbelievably significant (literally). Let's find a super biased feature and see if we can even reject a null hypothesis.

In [None]:
# Same comparison as for the GDP split, now splitting at the median of
# 'pop.2010' and looking at 'Pop rank' only.
split_feature = 'pop.2010'
samples, mu = centeral_limit_theorem_samples(split_feature, N = 25, M = 100)

for feature in ['Pop rank']:
    a = mu['above'][feature]
    b = mu['below'][feature]

    fig = plt.figure(figsize=(8, 8))
    axes = fig.add_subplot(111)

    # Shared bin edges so both histograms are drawn on the same grid.
    bins = np.histogram(np.hstack((a, b)), bins=15)[1]
    axes.hist(a, bins, label = '{0} above median'.format(split_feature))
    axes.hist(b, bins, label = '{0} below median'.format(split_feature), alpha=0.5)
    axes.set_title(feature)
    axes.legend(loc=2)
    plt.show()      # plt.show() does not take a figure argument
    plt.close(fig)  # avoid accumulating open figures across loop iterations

    # Welch's t-test (unequal variances) on the two sets of sample means.
    res = stats.ttest_ind(a, b, equal_var=False)
    print(res.pvalue)

In [None]:
# For several numbers of samples M, compare the 'above median' sample means
# with a copy of themselves shifted by three standard deviations.
split_feature = 'pop.2010'
# NOTE(review): the original cell never assigned `a`; it silently reused the
# 'Pop rank' list left over from the previous cell, so the freshly drawn `mu`
# was ignored and every M tested the same data. 'Pop rank' is kept as the
# feature here -- confirm this is the intended one.
feature = 'Pop rank'
for M in [25, 100, 500]:
    samples, mu = centeral_limit_theorem_samples(split_feature, M = M)

    a = np.asarray(mu['above'][feature])
    # An array (not a lazy map object) so np.hstack and ttest_ind can use it
    # on Python 3 as well.
    b = a + 3*np.std(a)

    fig = plt.figure(figsize=(8, 8))
    axes = fig.add_subplot(111)

    # Shared bin edges so both histograms are drawn on the same grid.
    bins = np.histogram(np.hstack((a, b)), bins=15)[1]
    axes.hist(a, bins)
    axes.hist(b, bins, alpha=0.5)
    axes.set_title('M = {0}'.format(M))
    plt.show()      # plt.show() does not take a figure argument
    plt.close(fig)  # avoid accumulating open figures across loop iterations

    # Welch's t-test (unequal variances) between the means and their shifted copy.
    res = stats.ttest_ind(a, b, equal_var=False)
    print(res.pvalue)