In [1]:
import numpy as np
import pandas as pd
from sklearn import *
from functools import partial
import scipy
from statsmodels.stats.weightstats import *
from statsmodels.stats.proportion import proportion_confint



In [2]:
# Shorthand: round a number to four decimal places.
round4 = partial(round, ndigits=4)

In [3]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    """Return the p-value of a z-statistic under the standard normal law.

    Parameters
    ----------
    z_stat : scalar or array
        Observed value(s) of the z-statistic.
    alternative : {'two-sided', 'less', 'greater'}
        Which tail(s) of the standard normal distribution to integrate.

    Returns
    -------
    The tail probability: both tails beyond ``|z_stat|`` for 'two-sided',
    the lower tail for 'less', the upper tail for 'greater'.

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the three supported values.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    std_normal = scipy.stats.norm

    # Upper tails use the survival function sf(z) rather than 1 - cdf(z):
    # for large |z| the cdf rounds to exactly 1.0 and the subtraction
    # collapses to 0.0, losing the (tiny but non-zero) p-value.
    if alternative == 'two-sided':
        return 2 * std_normal.sf(np.abs(z_stat))

    if alternative == 'less':
        return std_normal.cdf(z_stat)

    # alternative == 'greater'
    return std_normal.sf(z_stat)

In [4]:
def proportions_diff_z_stat_ind(sample1, sample2):
    """Z-statistic for the difference of proportions of two samples.

    Each sample is a sized iterable of numbers; its proportion is
    ``sum(sample) / len(sample)``. The statistic divides the difference of
    the two proportions by a standard error built from the pooled proportion.
    """
    size1, size2 = len(sample1), len(sample2)

    share1 = float(sum(sample1)) / size1
    share2 = float(sum(sample2)) / size2

    # Pooled proportion over both samples.
    pooled = float(share1 * size1 + share2 * size2) / (size1 + size2)
    std_error = np.sqrt(pooled * (1 - pooled) * (1. / size1 + 1. / size2))

    return (share1 - share2) / std_error

In [5]:
def proportions_diff_confint_ind(sample1, sample2, alpha = 0.05):
    """Normal-approximation interval for the difference of two proportions.

    Returns ``(left, right)``: the difference of the sample proportions
    minus/plus the standard normal ``1 - alpha/2`` quantile times the
    unpooled standard error.
    """
    quantile = scipy.stats.norm.ppf(1 - alpha / 2.)

    size1, size2 = len(sample1), len(sample2)
    share1 = float(sum(sample1)) / size1
    share2 = float(sum(sample2)) / size2

    # Half-width of the interval; shared by both boundaries.
    half_width = quantile * np.sqrt(share1 * (1 - share1) / size1
                                    + share2 * (1 - share2) / size2)
    diff = share1 - share2

    return (diff - half_width, diff + half_width)

In [6]:
def proportions_diff_z_stat_rel(sample1, sample2):
    """Z-statistic for the difference of proportions of two paired samples.

    ``sample1`` and ``sample2`` are iterated in lockstep; only discordant
    pairs contribute: ``f`` counts pairs equal to (1, 0) and ``g`` counts
    pairs equal to (0, 1). Returns ``(f - g) / sqrt(f + g - (f - g)**2 / n)``
    where ``n`` is the number of pairs.
    """
    # Materialise the pairs: on Python 3 zip() is a one-shot iterator, so it
    # has no len() and would be empty on the second counting pass.
    pairs = list(zip(sample1, sample2))
    n = len(pairs)

    f = sum(1 for first, second in pairs if first == 1 and second == 0)
    g = sum(1 for first, second in pairs if first == 0 and second == 1)

    return float(f - g) / np.sqrt(f + g - float((f - g)**2) / n )

In [7]:
def proportions_diff_confint_rel(sample1, sample2, alpha = 0.05):
    """Normal-approximation interval for the difference of paired proportions.

    ``sample1`` and ``sample2`` are iterated in lockstep; ``f`` counts pairs
    equal to (1, 0) and ``g`` counts pairs equal to (0, 1). Returns
    ``(left, right)`` centred on ``(f - g) / n`` with half-width
    ``z * sqrt((f + g) / n**2 - (f - g)**2 / n**3)``, where ``z`` is the
    standard normal ``1 - alpha/2`` quantile and ``n`` the number of pairs.
    """
    z = scipy.stats.norm.ppf(1 - alpha / 2.)

    # Materialise the pairs: on Python 3 zip() is a one-shot iterator, so it
    # has no len() and would be empty on the second counting pass.
    pairs = list(zip(sample1, sample2))
    n = len(pairs)

    f = sum(1 for first, second in pairs if first == 1 and second == 0)
    g = sum(1 for first, second in pairs if first == 0 and second == 1)

    centre = float(f - g) / n
    half_width = z * np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)

    return (centre - half_width, centre + half_width)

In [8]:
# Two group sizes and the corresponding proportions (10/34 and 4/16).
n1, n2 = 34, 16

p1 = 10.0 / 34.0
p2 = 4.0 / 16.0

In [9]:
# Pooled proportion of both groups, then the z-statistic of p1 - p2
# using the pooled standard error.
P = (p1 * n1 + p2 * n2) / (n1 + n2)
z_st = (p1 - p2) / np.sqrt(P * (1 - P) * (1. / n1 + 1. / n2))

In [10]:
# One-sided ('greater') p-value for z_st, rounded to four decimals.
round4(proportions_diff_z_test(z_st, alternative='greater'))

0.3729

In [11]:
# Load the tab-separated banknotes table; row 0 of the file is the header.
frame = pd.read_csv('banknotes.txt', sep='\t', header=0)

frame.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,real
0,214.8,131.0,131.1,9.0,9.7,141.0,1
1,214.6,129.7,129.7,8.1,9.5,141.7,1
2,214.8,129.7,129.7,8.7,9.6,142.2,1
3,214.8,129.7,129.6,7.5,10.4,142.0,1
4,215.0,129.6,129.7,10.4,7.7,141.8,1


In [12]:
# Feature matrix from columns X1..X6; target vector from the 'real' column.
X = frame[['X1', 'X2', 'X3', 'X4', 'X5', 'X6']].values
y = frame['real'].values

In [13]:
# Hold out 50 rows for testing with a fixed seed.
# sklearn.cross_validation was removed in scikit-learn 0.20; the same
# train_test_split function is provided by sklearn.model_selection.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=50, random_state=1)

In [14]:
# Columns 0-2 form the first feature subset, columns 3-5 the second.
X_1train, X_2train = X_train[:, :3], X_train[:, 3:6]
X_1test, X_2test = X_test[:, :3], X_test[:, 3:6]

In [15]:
# One default-configured logistic regression per feature subset:
# fit on the training rows, then predict labels for the test rows.
log_regressor1 = linear_model.LogisticRegression()
log_regressor1.fit(X_1train, y_train)

log_regressor2 = linear_model.LogisticRegression()
log_regressor2.fit(X_2train, y_train)

lr_1 = log_regressor1.predict(X_1test)
lr_2 = log_regressor2.predict(X_2test)

In [16]:
# Paired z-statistic on the per-row misclassification indicators
# of the second and first classifier.
z = proportions_diff_z_stat_rel(lr_2 != y_test, lr_1 != y_test)

In [17]:
# Two-sided p-value for the paired z-statistic.
proportions_diff_z_test(z, alternative='two-sided')

0.0032969384555543435

In [18]:
# Interval bounds for the paired difference, each rounded to four decimals.
# list(...) forces evaluation: on Python 3 map() is lazy and the cell would
# otherwise display a map object instead of the numbers.
list(map(round4, proportions_diff_confint_rel(lr_2 != y_test, lr_1 != y_test)))

[-0.3001, -0.0599]

In [46]:
# Z is the difference X - y scaled by D * sqrt(1 / n).
# Note: X and y are rebound to scalars here, replacing the dataset arrays
# assigned to the same names in the earlier data-preparation cell.
X, y = 541.5, 525.0
D, n = 100.0, 100.0
Z = (X - y) / (D * np.sqrt(1 / n))

In [47]:
# One-sided ('greater') p-value for Z, rounded to four decimals.
round4(proportions_diff_z_test(Z, alternative='greater'))

0.0495