In [40]:
import numpy as np
import pandas as pd
from sklearn.metrics import matthews_corrcoef
from scipy.stats import chi2_contingency
import scipy.stats as sps

### Task 1

In [41]:
# Load the tab-separated water dataset from a path relative to the working directory.
water_data = pd.read_csv('./water.txt', sep='\t')

In [42]:
# Preview the first rows to check how the columns were parsed.
water_data.head()

Unnamed: 0,location,town,mortality,hardness
0,South,Bath,1247,105
1,North,Birkenhead,1668,17
2,South,Birmingham,1466,5
3,North,Blackburn,1800,14
4,North,Blackpool,1609,18


In [43]:
# Keep only the two numeric columns being correlated. Selecting by label
# (instead of by position) stays correct if the column order ever changes.
corr_data = water_data[['mortality', 'hardness']]

In [44]:
# Correlation matrix using DataFrame.corr's default method (Pearson).
corr_data.corr()

Unnamed: 0,mortality,hardness
mortality,1.0,-0.654849
hardness,-0.654849,1.0


### Task 2

In [45]:
# Spearman rank correlation for the same pair of columns.
corr_data.corr(method='spearman')

Unnamed: 0,mortality,hardness
mortality,1.0,-0.631665
hardness,-0.631665,1.0


### Task 3

In [46]:
# Split the two numeric columns by region. Rows and columns are selected in a
# single label-based .loc call rather than chaining a boolean mask with a
# positional .iloc slice, so the result does not depend on column order.
s_data = water_data.loc[water_data['location'] == 'South', ['mortality', 'hardness']]
n_data = water_data.loc[water_data['location'] == 'North', ['mortality', 'hardness']]

In [47]:
# Default (Pearson) correlation restricted to rows with location == 'South'.
s_data.corr()

Unnamed: 0,mortality,hardness
mortality,1.0,-0.602153
hardness,-0.602153,1.0


In [48]:
# Default (Pearson) correlation restricted to rows with location == 'North'.
n_data.corr()

Unnamed: 0,mortality,hardness
mortality,1.0,-0.368598
hardness,-0.368598,1.0


### Task 4

In [49]:
def calculate_corr(a, b, c, d, method='matthews'):
    """Correlation coefficient for the 2x2 table of counts [[a, b], [c, d]].

    Parameters
    ----------
    a, b : numbers
        Counts in the first row of the table.
    c, d : numbers
        Counts in the second row of the table.
    method : str
        Which coefficient to compute. Only 'matthews' is supported.

    Returns
    -------
    float
        (a*d - b*c) / sqrt((a+b) * (a+c) * (b+d) * (c+d)).
        When the denominator is zero (some row or column total is zero) the
        coefficient is undefined; 0.0 is returned instead of NaN.

    Raises
    ------
    ValueError
        If ``method`` is not a supported coefficient name.
    """
    # Fail loudly on an unknown method instead of silently returning None.
    if method != 'matthews':
        raise ValueError("method not recognized\n"
                         "should be 'matthews'")

    denominator = np.sqrt((a + b) * (a + c) * (b + d) * (c + d))
    # A zero row/column total makes the ratio 0/0; report "no association".
    if denominator == 0:
        return 0.0
    return (a * d - b * c) / denominator

In [50]:
# Cells of the 2x2 table, handed to calculate_corr in the order (a, b, c, d).
a, b, c, d = 718, 203, 515, 239
mat_corr = calculate_corr(a, b, c, d, method='matthews')

In [51]:
# Display the coefficient computed by calculate_corr.
mat_corr

0.10900237458678963

### Task 5

In [52]:
# Chi-square test of independence on the same 2x2 table [[a, b], [c, d]].
# NOTE(review): scipy's chi2_contingency defaults to correction=True, which
# applies Yates' continuity correction to 2x2 tables — confirm that is intended.
chi2, p, dof, expected = chi2_contingency([[a, b], [c, d]])

In [53]:
# p-value returned by the chi-square test on the 2x2 table.
p

1.0558987006638725e-05

### Task 6

In [54]:
def proportions_confint_diff_ind(sample1, sample2, alpha = 0.05):
    """Normal-approximation confidence interval for a difference of proportions.

    Each sample is a sized sequence of numeric values (0/1 indicators in this
    notebook); its proportion is sum(sample) / len(sample).

    Parameters
    ----------
    sample1, sample2 : sequences
        The two samples, treated as independent.
    alpha : float
        Significance level; the interval has nominal coverage 1 - alpha.

    Returns
    -------
    tuple
        (left_boundary, right_boundary) of the interval for p1 - p2.
    """
    n1, n2 = len(sample1), len(sample2)
    p1 = float(sum(sample1)) / n1
    p2 = float(sum(sample2)) / n2

    # Two-sided normal quantile and the unpooled standard error, computed once.
    z = sps.norm.ppf(1 - alpha / 2.)
    diff = p1 - p2
    half_width = z * np.sqrt(p1 * (1 - p1)/ n1 + p2 * (1 - p2)/ n2)

    return (diff - half_width, diff + half_width)

In [55]:
# 0/1 indicator samples: 203 ones out of 921 for women, 239 ones out of 754 for men.
women = np.concatenate([np.ones(203), np.zeros(921 - 203)])
men = np.concatenate([np.ones(239), np.zeros(754 - 239)])

In [56]:
# Interval for the difference in proportions (men minus women) at the default alpha = 0.05.
proportions_confint_diff_ind(men, women)

(0.053905233215813156, 0.13922183141523897)

### Task 7

In [84]:
def proportions_diff_z_stat_ind(sample1, sample2):
    """Z statistic for the difference of proportions of two independent samples.

    Each sample is a sized sequence of numeric values (0/1 indicators in this
    notebook). The standard error uses the pooled proportion of both samples.

    Returns
    -------
    float
        (p1 - p2) / sqrt(P * (1 - P) * (1/n1 + 1/n2)), with P the pooled proportion.
    """
    size1, size2 = len(sample1), len(sample2)
    share1 = float(sum(sample1)) / size1
    share2 = float(sum(sample2)) / size2

    # Pooled proportion across both samples.
    pooled = float(share1 * size1 + share2 * size2) / (size1 + size2)
    std_err = np.sqrt(pooled * (1 - pooled) * (1. / size1 + 1. / size2))

    return (share1 - share2) / std_err

In [85]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    """p-value of a z statistic under the standard normal distribution.

    Parameters
    ----------
    z_stat : float or array-like
        The observed z statistic.
    alternative : str
        'two-sided', 'less' or 'greater'.

    Returns
    -------
    float or ndarray
        The p-value for the requested alternative.

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the three recognized names.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    # Upper tails use the survival function: computing 1 - cdf(z) cancels
    # catastrophically and collapses to exactly 0.0 for large z.
    if alternative == 'two-sided':
        return 2 * sps.norm.sf(np.abs(z_stat))

    if alternative == 'less':
        return sps.norm.cdf(z_stat)

    return sps.norm.sf(z_stat)

In [86]:
# Rebuild the 0/1 indicator samples used by the z-test below:
# 203 ones out of 921 for women, 239 ones out of 754 for men.
women = np.concatenate([np.ones(203), np.zeros(921 - 203)])
men = np.concatenate([np.ones(239), np.zeros(754 - 239)])

In [88]:
# p-value for the men-vs-women difference in proportions, using the default 'two-sided' alternative.
print("p-value: %f" % proportions_diff_z_test(proportions_diff_z_stat_ind(men, women)))

p-value: 0.000008


### Task 8

In [73]:
# 3x3 table of observed counts, one table row per line.
crosstab = np.array([
    [197, 111, 33],
    [382, 685, 331],
    [110, 342, 333],
])

In [74]:
# Chi-square test of independence on the 3x3 table. crosstab is already an
# ndarray, so it is passed straight through. This rebinds chi2, p, dof and expected.
chi2, p, dof, expected = chi2_contingency(crosstab)

In [75]:
# Chi-square statistic for the 3x3 table.
chi2

293.68311039689746

### Task 9

In [76]:
# p-value of the chi-square test on crosstab (the name `p` was rebound by that call).
p

2.4964299580093467e-62

### Task 10

In [77]:
def cramers_stat(confusion_matrix):
    """Cramér's V association measure for a two-way table of counts.

    Parameters
    ----------
    confusion_matrix : array-like, 2-D
        Observed counts. Nested lists are accepted as well as ndarrays; the
        input is converted with np.asarray before use.

    Returns
    -------
    float
        sqrt(chi2 / (n * (min(rows, cols) - 1))), where chi2 is the statistic
        returned by chi2_contingency and n is the total count.

    Notes
    -----
    NOTE(review): chi2_contingency is called with its defaults, which for a
    2x2 table include a continuity correction — confirm this is intended.
    """
    # Coerce once so .sum() and .shape work for plain lists too.
    table = np.asarray(confusion_matrix)
    chi2 = chi2_contingency(table)[0]
    n = table.sum()
    return np.sqrt(chi2 / (n * (np.min(table.shape) - 1)))

In [78]:
# Cramér's V for the 3x3 crosstab defined above.
coeff_V_kramer = cramers_stat(crosstab)

In [79]:
# Show the computed Cramér's V.
print(coeff_V_kramer)

0.24120139345
