# Final Quiz (1st week) - Confidence Intervals

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as sps
from statsmodels.stats.weightstats import *
from statsmodels.stats.proportion import proportion_confint
from statsmodels.stats.proportion import samplesize_confint_proportion
from matplotlib import pyplot as plt
from math import ceil
%matplotlib inline

# Task 1

In [6]:
# NOTE(review): interactive IPython help lookup left over from exploration; it produces no result used below and can be dropped from the final notebook.
?sps.norm

In [16]:
# Draw 100000 values from the standard normal distribution (loc=0, scale=1).
# NOTE(review): no random seed is set before this draw, so the recorded output
# will not reproduce exactly on a fresh run.
sample = sps.norm.rvs(size=100000, loc=0, scale=1)
# NOTE(review): np.sum(sample) is a sum of real-valued normal draws, yet it is
# passed in the first (success count) position of proportion_confint. That sum
# can be negative or non-integer, so the interval below is not a meaningful
# proportion estimate. alpha=0.5 also requests a 50% interval. Confirm what
# Task 1 actually asks for before relying on this cell.
proportion_confint(np.sum(sample), len(sample), alpha=0.5)

(0.0015716218007514205, 0.0017451982865812664)

# Tasks 5, 6, 7

In [26]:
# Encode each study arm as a 0/1 float vector: the leading entries are the
# participants recorded as 1 (events), all remaining entries are 0.
patients = {
    arm: np.concatenate((np.ones(n_events), np.zeros(n_total - n_events)))
    for arm, (n_events, n_total) in (
        ('aspirin', (104, 11037)),
        ('placebo', (189, 11034)),
    )
}

In [27]:
def proportions_confint_diff_ind(sample1, sample2, alpha=0.05):
    """Normal-approximation confidence interval for the difference of two proportions.

    Treats ``sample1`` and ``sample2`` as independent samples of 0/1 outcomes
    and returns the interval for ``p1 - p2`` built as
    ``(p1 - p2) +/- z * sqrt(p1*(1-p1)/n1 + p2*(1-p2)/n2)``, where ``z`` is the
    ``1 - alpha/2`` quantile of the standard normal distribution.

    Parameters
    ----------
    sample1, sample2 : array-like of 0/1 values
        Observed outcomes; lists and NumPy arrays are both accepted.
    alpha : float, default 0.05
        Significance level; the interval has nominal coverage ``1 - alpha``.

    Returns
    -------
    tuple
        ``(left_boundary, right_boundary)`` of the interval for ``p1 - p2``.

    Raises
    ------
    ValueError
        If either sample is empty (the proportion would be undefined).
    """
    first = np.asarray(sample1, dtype=float)
    second = np.asarray(sample2, dtype=float)
    if len(first) == 0 or len(second) == 0:
        raise ValueError("both samples must be non-empty")

    z = sps.norm.ppf(1 - alpha / 2.)
    # Vectorized means instead of Python-level sum() over NumPy elements.
    p1 = float(first.sum()) / len(first)
    p2 = float(second.sum()) / len(second)

    # The half-width is shared by both boundaries, so compute it once.
    margin = z * np.sqrt(p1 * (1 - p1) / len(first) + p2 * (1 - p2) / len(second))
    difference = p1 - p2

    return (difference - margin, difference + margin)

In [28]:
# Point estimate of the difference in event proportions:
# 189 of 11034 (placebo arm) minus 104 of 11037 (aspirin arm).
placebo_share = 189 / 11034
aspirin_share = 104 / 11037
print(placebo_share - aspirin_share)

0.0077060239760047815


In [29]:
# Interval for (placebo proportion - aspirin proportion) at the function's default alpha.
ci_low, ci_high = proportions_confint_diff_ind(patients['placebo'], patients['aspirin'])
print("confidence interval: [%f, %f]" % (ci_low, ci_high))

confidence interval: [0.004688, 0.010724]


In [30]:
# Event proportion in each arm.
placebo_proba, aspirin_proba = 189 / 11034, 104 / 11037

# Odds are p / (1 - p); the printed value is the placebo-to-aspirin odds ratio.
placebo_odds, aspirin_odds = (
    proba / (1 - proba) for proba in (placebo_proba, aspirin_proba)
)

print(placebo_odds / aspirin_odds)

1.8320539419087138


# Task 8

In [31]:
def get_bootstrap_samples(data, n_samples):
    """Draw bootstrap resamples (sampling with replacement) from ``data``.

    Parameters
    ----------
    data : array-like
        Observations to resample. Converted with ``np.asarray`` so that plain
        Python lists work too (fancy indexing a list with an index array fails).
    n_samples : int
        Number of resamples to draw.

    Returns
    -------
    numpy.ndarray
        Array whose first two dimensions are ``(n_samples, len(data))``; each
        row is one resample of the same size as ``data``.

    Notes
    -----
    Indices come from the global ``np.random`` state, so call
    ``np.random.seed`` beforehand for reproducible output.
    """
    data = np.asarray(data)
    indices = np.random.randint(0, len(data), (n_samples, len(data)))
    return data[indices]

In [32]:
def stat_intervals(stat, alpha):
    """Percentile interval of ``stat`` with ``alpha / 2`` cut from each tail.

    Returns the two-element result of ``np.percentile`` evaluated at the
    ``100 * alpha / 2`` and ``100 * (1 - alpha / 2)`` percentiles.
    """
    lower_pct = 100 * alpha / 2.
    upper_pct = 100 * (1 - alpha / 2.)
    return np.percentile(stat, [lower_pct, upper_pct])

In [33]:
np.random.seed(0)


def _bootstrap_odds(sample, n_samples=1000):
    """Return the odds (events / non-events) of each bootstrap resample of a 0/1 sample."""
    resamples = get_bootstrap_samples(sample, n_samples)
    # Row sums count the events in each resample; computed once per row.
    events = resamples.sum(axis=1)
    return list(events / (resamples.shape[1] - events))


# Aspirin is resampled before placebo so the draws from the seeded generator
# are consumed in the same order as before.
aspirin_chance_scores = _bootstrap_odds(patients['aspirin'])
placebo_chance_scores = _bootstrap_odds(patients['placebo'])

# The statistic is sum / (len - sum), i.e. odds rather than a probability,
# so the labels say "odds".
print("95% confidence interval for the aspirin infarct odds:", stat_intervals(aspirin_chance_scores, 0.05))
print("95% confidence interval for the placebo infarct odds:", stat_intervals(placebo_chance_scores, 0.05))

95% confidence interval for the aspirin infarct proba:  [ 0.00757714  0.0113626 ]
95% confidence interval for the placebo infarct proba: [ 0.01499402  0.01996672]


In [34]:
# Pair the i-th aspirin bootstrap score with the i-th placebo one and take placebo / aspirin.
chance_rel_scores = [
    placebo_score / aspirin_score
    for aspirin_score, placebo_score in zip(aspirin_chance_scores, placebo_chance_scores)
]

In [35]:
# chance_rel_scores holds placebo / aspirin ratios of the bootstrap scores, so this
# interval describes that ratio; the previous "difference between medians" label was wrong.
print("95% confidence interval for the odds ratio (placebo / aspirin):", stat_intervals(chance_rel_scores, 0.05))

95% confidence interval for the difference between medians [ 1.44419465  2.34321168]
