### Exercise 1B
# Classroom
Statistical Fromulars

In [1]:
import numpy as np
import scipy.stats as sc
import pandas as pd

In [84]:
def pearson_corr(vec_x, vec_y):
    
    n = len(vec_x)
    av_x = sum(vec_x)/n
    av_y = sum(vec_y)/n
    
    upper_sum = 0
    lower_sum_x = 0
    lower_sum_y = 0
    
    for i in range(n):
        upper_sum = upper_sum + ((vec_x[i]-av_x) * (vec_y[i]-av_y))
        lower_sum_x = lower_sum_x + (vec_x[i]-av_x)**2
        lower_sum_y = lower_sum_y + (vec_y[i]-av_y)**2
        
    return upper_sum / (lower_sum_x**0.5 * lower_sum_y**0.5) 

In [85]:
vec_x = [1,2,3,4,5,6,7]
vec_y = [10,11,34,12,2,8,7]

print(sc.pearsonr(vec_x, vec_y)[0])
print(pearson_corr(vec_x, vec_y))
theta_1 = sc.linregress(vec_x, vec_y)[0]
theta_0 = sc.linregress(vec_x, vec_y)[1]
print('h = {} * {}x'.format(theta_0, theta_1))

-0.3538739286378901
-0.3538739286378901
h = 18.714285714285715 * -1.6785714285714284x


## Chi Squared

In [41]:
def add_sums_to_dataframe(df_raw):
    '''
    Add the horizontal and vertical sums to a dataframe which holds just the observed values.
    '''
    df_raw['h_sum'] = df_raw.sum(axis=1)
    df_v_sum = pd.DataFrame(columns=['v_sum'], data=df_raw.sum(axis=0)).transpose()
    df_raw = df_raw.append(df_v_sum)
    return df_raw

In [145]:
def calculate_expected_dataframe(df_obs):
    '''
    Calculates the expected values to a dataframe with observed values.
    The dateframe of observed values has to include vertical and horizontal sums.
    '''
    index = df_obs.index
    columns = df_obs.columns
    
    # Copy the observed values dataframe as a template.
    df_exp = df_obs.copy()
    # This is needed in case the observed values are integers. We will get floats for the expected ones.
    for column in columns:
        df_exp[column] = pd.to_numeric(df_exp[column], downcast='float')
    
    # Sort out the sum row and column labels for easier access.
    v_sum_row = index[len(index) - 1]
    h_sum_column = columns[len(columns) - 1]
    total_sum = df_obs.at[v_sum_row,h_sum_column]

    # Now calculate the expected values and set them to the expected values dataframe.
    for column in columns[:-1]:
        for row in index[:-1]:
            exp_value = float(df_obs.at[row,h_sum_column] * df_obs.at[v_sum_row,column] / total_sum)
            df_exp.at[row,column] = exp_value

    return df_exp    

In [101]:
def chi_squared(df_obs, df_exp):
    '''
    Calculated chi squared for a dataframe of observed values and a dataframe of expected values.
    '''
    chi_squared = 0
    
    for row in df_obs.index[:2]:
        for column in df_obs.columns[:5]:
            h_obs = df_obs.at[row, column]
            h_exp = df_exp.at[row, column]
            h_diff = h_obs - h_exp
            chi_squared = chi_squared + h_diff**2 / h_exp
            
    return chi_squared         

**Exercise Data Example** provided in class

In [149]:
# Create row and colum identifiers for the base dataframe.
columns = ['CDU/CSU','AfD','SPD','Gruene','Linke']
index = ['yes','no']

# Fill in the data and add a row for the vertical sum.
row_yes = np.array([312, 218, 231, 151, 111])
row_no = np.array([603, 693, 724, 794, 894])

# Combine the data into a matrix and finally into the dataframe.
data = [row_yes, row_no]
df_obs = pd.DataFrame(index=index, columns=columns, data=data)

# Add the horizontal and vertical dataframe sums.
df_obs = add_sums_to_dataframe(df_obs)
df_obs

Unnamed: 0,CDU/CSU,AfD,SPD,Gruene,Linke,h_sum
yes,312,218,231,151,111,1023
no,603,693,724,794,894,3708
v_sum,915,911,955,945,1005,4731


In [150]:
# Calculate the dataframe for the expected values.
df_exp = calculate_expected_dataframe(df_obs)
df_exp

Unnamed: 0,CDU/CSU,AfD,SPD,Gruene,Linke,h_sum
yes,197.853516,196.988586,206.502853,204.340515,217.314514,1023.0
no,717.146484,714.011414,748.497131,740.659485,787.685486,3708.0
v_sum,915.0,911.0,955.0,945.0,1005.0,4731.0


In [151]:
# Finally calculate chi squared.
print(chi_squared(df_obs, df_exp))

174.71537664998036


In [152]:
print(chi_squared(df_obs, df_exp))

174.71537664998036


**Check functionality** by comparing with the example from the lecture 5 page 22.

In [113]:
obs = np.array([[8,5],[42,20]])
columns = ['yes','no']
index = ['smoker','nonsmoker']
df_obs = pd.DataFrame(index=index, columns=columns, data=obs)
df_obs = add_sums_to_dataframe(df_obs)
df_obs

Unnamed: 0,yes,no,h_sum
smoker,8,5,13
nonsmoker,42,20,62
v_sum,50,25,75


In [143]:
df_exp = calculate_expected_dataframe(df_obs)
df_exp

Unnamed: 0,yes,no,h_sum
smoker,8.666667,4.333333,13.0
nonsmoker,41.333332,20.666666,62.0
v_sum,50.0,25.0,75.0


In [146]:
chi2 = chi_squared(df_obs, df_exp)
print('Chi2: {}'.format(chi2))
print('Ok, correct!')

Chi2: 0.186104213711693
Ok, correct!
