# ТЕСТ непараметрические критерии

In [1]:
import numpy as np
import pandas as pd
import itertools

from scipy import stats
from statsmodels.stats.descriptivestats import sign_test
from statsmodels.stats.weightstats import zconfint
from statsmodels.stats.weightstats import *

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


## вопрос 4

- вычисляем критерий Уилкоксона

In [3]:
# Sample of 10 observations used for the Wilcoxon test below.
data = np.array([49, 58, 75, 110, 112, 132, 151, 276, 281, 362])

In [4]:
# Hypothesised location the sample is tested against.
m0 = 200
# Called with a single argument, stats.wilcoxon runs the signed-rank test on
# the differences data - m0 (null hypothesis: symmetric about zero).
wilcRes = stats.wilcoxon(data - m0)
wilcRes

WilcoxonResult(statistic=17.0, pvalue=0.28450269791120752)

In [5]:
# p-value of the Wilcoxon test, rounded to 4 decimal places.
round(wilcRes.pvalue, 4)

0.2845

## вопрос 5

- вычисляем критерий Манна-Уитни

In [6]:
# Two independent samples (12 and 9 observations) compared with the
# Mann-Whitney test below.
data_F12 = [22,22,15,13,19,19,18,20,21,13,13,15]
data_F9 = [17,18,18,15,12,4,14,15,10]

In [7]:
# NOTE(review): no `alternative` is passed, so whether the reported p-value is
# one- or two-sided depends on the installed SciPy version's default - confirm
# it matches the alternative the question asks for.
mannwhitn = stats.mannwhitneyu(data_F12, data_F9)
mannwhitn

MannwhitneyuResult(statistic=27.0, pvalue=0.029004992720873729)

In [8]:
# p-value of the Mann-Whitney test, rounded to 4 decimal places.
round(mannwhitn.pvalue, 4)

0.029

## вопрос 6

- бутстреп
- независимые выборки
- считаем 95% доверительный интервал для разности средних

In [43]:
# Load the tab-separated data set; the path is relative to the working directory.
data6 = pd.read_csv('data/challenger.txt', sep='\t')

In [44]:
# Number of rows, then a preview of the first rows.
# The parenthesised single-argument form prints the same text under both
# Python 2 (print statement) and Python 3 (print function).
print(len(data6))
data6.head()

23


Unnamed: 0.1,Unnamed: 0,Temperature,Incident
0,Apr12.81,18.9,0
1,Nov12.81,21.1,1
2,Mar22.82,20.6,0
3,Nov11.82,20.0,0
4,Apr04.83,19.4,0


In [63]:
def get_bootstrap_samples(data, n_samples):
    """Draw bootstrap pseudo-samples from ``data``.

    Parameters
    ----------
    data : array-like
        Observed sample. It is converted with ``np.asarray`` so that plain
        Python lists can be resampled as well as NumPy arrays.
    n_samples : int
        Number of pseudo-samples to generate.

    Returns
    -------
    numpy.ndarray
        ``n_samples`` resamples of ``data``, one per row; each has
        ``len(data)`` elements drawn with replacement along the first axis.

    Notes
    -----
    Indices come from the global NumPy RNG (``np.random.randint``); seed it
    beforehand for reproducible results.
    """
    # Fancy indexing with a 2-D integer array needs an ndarray, not a list.
    data = np.asarray(data)
    indices = np.random.randint(0, len(data), (n_samples, len(data)))
    return data[indices]

In [46]:
def stat_intervals(stat, alpha):
    """Return the percentile interval of ``stat`` at significance level ``alpha``.

    The result is the pair of percentiles ``100 * alpha / 2`` and
    ``100 * (1 - alpha / 2)`` of ``stat``, as computed by ``np.percentile``.
    """
    lower_pct = 100 * alpha / 2.
    upper_pct = 100 * (1 - alpha / 2.)
    return np.percentile(stat, [lower_pct, upper_pct])

In [47]:
# Temperature values split by the Incident column (0 vs 1), as NumPy arrays.
data6_0 = data6[data6['Incident'] == 0]['Temperature'].values
data6_1 = data6[data6['Incident'] == 1]['Temperature'].values

In [48]:
# Display the temperatures of the Incident == 0 group.
data6_0

array([ 18.9,  20.6,  20. ,  19.4,  22.2,  22.8,  21.1,  25.6,  19.4,
        19.4,  23.9,  21.1,  27.2,  24.4,  26.1,  24.4])

In [49]:
# Fix the global NumPy RNG seed so the bootstrap draws are reproducible.
np.random.seed(0)
# 1000 bootstrap pseudo-samples of the Incident == 0 temperatures.
bootstrap_0 = get_bootstrap_samples(data6_0, 1000)

In [50]:
# 1000 bootstrap pseudo-samples of the Incident == 1 temperatures. There is no
# reseed in this cell, so the draws continue from the current RNG state.
bootstrap_1 = get_bootstrap_samples(data6_1, 1000)

In [52]:
# Mean temperature of every bootstrap pseudo-sample, per group.
# Built as lists so the results can be iterated more than once and do not
# depend on `map` returning a list (it returns a one-shot iterator on Python 3).
incident_0_mean_T_scores = [np.mean(sample) for sample in bootstrap_0]
incident_1_mean_T_scores = [np.mean(sample) for sample in bootstrap_1]

In [54]:
# Pairwise difference of bootstrap means (Incident == 0 minus Incident == 1).
# A list comprehension with tuple unpacking replaces map + lambda indexing and
# always yields a list, which is what np.percentile needs downstream.
delta_mean_scores = [mean_0 - mean_1 for mean_0, mean_1
                     in zip(incident_0_mean_T_scores, incident_1_mean_T_scores)]

In [55]:
stat_intervals = stat_intervals(delta_mean_scores, 0.05)
stat_intervals

array([ 1.42299107,  7.93861607])

In [56]:
round(stat_intervals[0], 4)

1.423

## вопрос 7

- вычисляем перестановочный критерий
- две независимые выборки
- двусторонняя альтернатива

In [57]:
def permutation_t_stat_ind(sample1, sample2):
    """Return the mean of ``sample1`` minus the mean of ``sample2``."""
    mean_first = np.mean(sample1)
    mean_second = np.mean(sample2)
    return mean_first - mean_second

In [58]:
def get_random_combinations(n1, n2, max_combinations):
    """Generate random splits of the indices ``0 .. n1 + n2 - 1`` into two groups.

    Parameters
    ----------
    n1 : int
        Size of the first group.
    n2 : int
        Size of the second group.
    max_combinations : int
        Upper bound on the number of splits. The identity ordering is always
        included and ``max_combinations - 1`` random shuffles are added.

    Returns
    -------
    list of (tuple, tuple)
        For each distinct permutation, its first ``n1`` indices and its
        remaining ``n2`` indices. Permutations are collected in a set, so
        duplicates collapse: the list may hold fewer than ``max_combinations``
        entries, and it follows the set's iteration order.

    Notes
    -----
    Shuffling uses the global NumPy RNG; seed it for reproducible results.
    """
    # np.random.shuffle works in place, so it needs a real list: on Python 3
    # a bare range() is immutable and cannot be shuffled.
    index = list(range(n1 + n2))
    indices = set([tuple(index)])
    for _ in range(max_combinations - 1):
        np.random.shuffle(index)
        indices.add(tuple(index))
    # A separate loop name avoids shadowing the working list `index`.
    return [(perm[:n1], perm[n1:]) for perm in indices]

In [59]:
def permutation_zero_dist_ind(sample1, sample2, max_combinations = None):
    """Build the permutation distribution of the difference of means.

    The two samples are pooled and the pooled indices are split into a group
    of ``len(sample1)`` and a group holding the rest. For every split the mean
    of the first group minus the mean of the second group is recorded.

    Parameters
    ----------
    sample1, sample2 : array-like
        The two independent samples.
    max_combinations : int, optional
        If truthy, splits come from ``get_random_combinations`` with this
        bound. Otherwise (``None`` or ``0``) every combination of
        ``len(sample1)`` indices is enumerated.

    Returns
    -------
    list
        One mean difference per split.
    """
    pooled = np.hstack((sample1, sample2))
    size1 = len(sample1)
    total = len(pooled)

    if not max_combinations:
        # Exhaustive mode: every choice of first-group indices, each paired
        # with the indices it leaves out (kept in ascending order).
        splits = []
        for chosen in itertools.combinations(range(total), size1):
            remaining = [k for k in range(total) if k not in chosen]
            splits.append((list(chosen), remaining))
    else:
        splits = get_random_combinations(size1, len(sample2), max_combinations)

    return [pooled[list(first)].mean() - pooled[list(second)].mean()
            for first, second in splits]

In [60]:
def permutation_test(sample1, sample2, max_combinations = None, alternative = 'two-sided'):
    """Permutation test p-value for the difference of means of two samples.

    Parameters
    ----------
    sample1, sample2 : array-like
        The two independent samples.
    max_combinations : int, optional
        Passed through to ``permutation_zero_dist_ind``.
    alternative : {'two-sided', 'less', 'greater'}
        Which values of the permutation distribution count as at least as
        extreme as the observed statistic.

    Returns
    -------
    float
        Share of the permutation distribution that is at least as extreme as
        the observed difference of means.

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the three recognised values.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    observed = permutation_t_stat_ind(sample1, sample2)
    null_values = permutation_zero_dist_ind(sample1, sample2, max_combinations)

    if alternative == 'less':
        extreme = [v for v in null_values if v <= observed]
    elif alternative == 'greater':
        extreme = [v for v in null_values if v >= observed]
    else:
        # 'two-sided': compare magnitudes.
        extreme = [v for v in null_values if abs(v) >= abs(observed)]

    return float(len(extreme)) / len(null_values)

In [61]:
# Reseed the global NumPy RNG so the random permutations are reproducible.
np.random.seed(0)
# A single pre-formatted argument prints the same text under both the
# Python 2 print statement and the Python 3 print function.
print('p-value: %s' % permutation_test(data6_0, data6_1, max_combinations=10000))

p-value: 0.007
