In [17]:
import pandas as pd
import numpy as np
from itertools import product
from scipy.stats import binomtest, mannwhitneyu, wilcoxon, ttest_1samp
import statsmodels.stats.weightstats as sts

In [18]:
%%markdown
# verizon

# verizon


In [19]:
df = pd.read_csv('../datasets/verizon.txt', sep='\t')

# Task 1: calculate the difference in mean repair times between CLEC and ILEC customers.
# Provide the answer rounded to 1 decimal place.
clec_df = df[df['Group'] == 'CLEC']
ilec_df = df[df['Group'] == 'ILEC']

# numeric_only=True keeps the string 'Group' column out of the reduction:
# without it pandas warns on older versions and raises TypeError on pandas >= 2.0.
clec_df.mean(numeric_only=True) - ilec_df.mean(numeric_only=True)

  clec_df.mean() - ilec_df.mean()


Time    8.09752
dtype: float64

In [20]:
# Test the hypothesis that the average repair time for CLEC customers is 8 hours or less
# against the alternative that it is greater than 8 hours.
# Use the sign test; provide the p-value rounded to 4 decimal points.

# Number of CLEC observations strictly above the 8-hour threshold.
n_clec_above_8 = int((clec_df['Time'] > 8).sum())
sign_test_clec = binomtest(n_clec_above_8, len(clec_df), 0.5, alternative='greater')
np.round(sign_test_clec.pvalue, 4)

0.105

In [21]:
# Compare the averages of the two samples.
# Use a t-test of equal means against the alternative that the mean repair time
# for CLEC customers is higher. Provide the p-value rounded to 4 decimal points.

# usevar='unequal' requests the unequal-variance form of the test;
# element [1] of the returned tuple is the p-value.
welch_result = sts.ttest_ind(
    clec_df.Time,
    ilec_df.Time,
    alternative='larger',
    usevar='unequal',
)
np.round(welch_result[1], 4)

0.0299

In [23]:
# Rank test for equal averages: calculate the p-value, rounded to 5 decimal points.

mw_result = mannwhitneyu(clec_df['Time'], ilec_df['Time'], alternative='greater')
mw_result.pvalue.round(5)

0.00046

In [24]:
# Great, let's proceed to the permutation test with the difference of sample means as a statistic. What is its p-value? Round the answer to 4 decimal points. The sample is too big to go through all the permutations – let's use 10000 of them. To get the same result as us, use the functions from the example notebook, and set random seed = 0 before calling the permutation_test_2s function.

import numpy as np

def permutation_t_stat_2s(sample1, sample2):
    """Return the two-sample statistic: mean of ``sample1`` minus mean of ``sample2``."""
    mean_first = np.mean(sample1)
    mean_second = np.mean(sample2)
    return mean_first - mean_second

def get_random_combinations(n1, n2, max_permutations):
    """Draw random splits of the indices ``0 .. n1 + n2 - 1`` into two groups.

    The identity ordering is always included. The index array is then shuffled
    in place ``max_permutations - 1`` times with ``np.random.shuffle`` (global
    NumPy random state) and every ordering is stored in a set, so repeated
    orderings collapse and fewer than ``max_permutations`` splits may be returned.

    Returns a list of ``(first, second)`` pairs, where ``first`` holds the
    leading ``n1`` indices of an ordering and ``second`` holds the rest.
    """
    order = np.arange(n1 + n2)
    seen_orderings = {tuple(order)}
    for _ in range(max_permutations - 1):
        np.random.shuffle(order)
        seen_orderings.add(tuple(order))

    splits = []
    for ordering in seen_orderings:
        splits.append((ordering[:n1], ordering[n1:]))
    return splits

def permutation_null_dist_2s(sample1, sample2, max_permutations = None):
    """Build the permutation null distribution of the two-sample statistic.

    The two samples are pooled and repeatedly re-split into groups of sizes
    ``len(sample1)`` and ``len(sample2)``; ``permutation_t_stat_2s`` is
    evaluated on every split.

    Parameters:
        sample1, sample2: the observed samples (anything ``np.hstack`` accepts).
        max_permutations: if truthy, splits come from
            ``get_random_combinations`` (random, uses the global NumPy random
            state); otherwise every way of choosing ``len(sample1)`` indices
            out of the pooled sample is enumerated.

    Returns:
        list of statistic values, one per split.
    """
    # Imported locally: the notebook's import cell only brings in
    # ``itertools.product``, so a bare ``itertools.combinations`` was a NameError.
    from itertools import combinations

    pooled_sample = np.hstack((sample1, sample2))
    n1 = len(sample1)
    n2 = len(sample2)
    n = n1 + n2

    if max_permutations:
        indices = get_random_combinations(n1, n2, max_permutations)
    else:
        indices = []
        for first in combinations(range(n), n1):
            chosen = set(first)  # set lookup keeps the complement linear in n
            second = [i for i in range(n) if i not in chosen]
            indices.append((first, second))

    distr = [permutation_t_stat_2s(pooled_sample[list(first)], pooled_sample[list(second)])
             for first, second in indices]
    return distr

def permutation_test_2s(sample1, sample2, max_permutations = None, alternative = 'two-sided', return_distr = False):
    """Two-sample permutation test on the difference of sample means.

    Parameters:
        sample1, sample2: the observed samples.
        max_permutations: forwarded to ``permutation_null_dist_2s``.
        alternative: 'two-sided', 'less' or 'greater'.
        return_distr: when true, the null distribution is included in the result.

    Returns:
        dict with 't' (observed statistic) and 'p' (share of null-distribution
        values at least as extreme as 't' in the direction of ``alternative``),
        plus 'null_distr' when ``return_distr`` is set.

    Raises:
        ValueError: if ``alternative`` is not one of the three accepted values.
    """
    accepted = ('two-sided', 'less', 'greater')
    if alternative not in accepted:
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    observed = permutation_t_stat_2s(sample1, sample2)
    null_values = permutation_null_dist_2s(sample1, sample2, max_permutations)

    if alternative == 'two-sided':
        def at_least_as_extreme(value):
            return abs(value) >= abs(observed)
    elif alternative == 'less':
        def at_least_as_extreme(value):
            return value <= observed
    else:  # alternative == 'greater'
        def at_least_as_extreme(value):
            return value >= observed

    extreme_total = sum([1. if at_least_as_extreme(value) else 0. for value in null_values])
    p_value = extreme_total / len(null_values)

    outcome = {'t': observed, 'p': p_value}
    if return_distr:
        outcome['null_distr'] = null_values
    return outcome


# Fix the global NumPy seed so the random permutations are reproducible.
np.random.seed(0)
perm_result_2s = permutation_test_2s(
    clec_df['Time'],
    ilec_df['Time'],
    max_permutations=10000,
    alternative='greater',
)
np.round(perm_result_2s['p'], 4)

0.0179

In [25]:
%%markdown
# software reliability



# software reliability


In [26]:
# Do failures on average happen more often than every 500 CPU seconds? Let's test the following hypothesis:

# 𝐻0: average time between failures is not greater than 500 CPU seconds
# 𝐻1: average time between failures is greater than 500 CPU seconds

# First, let's use Student's t-test. What is its p-value? Round the answer to 4 decimal points.

df = pd.read_csv('../datasets/failure_times.txt', sep='\t', header=None)
df.columns = ['time']

# 'dif' is each row's time minus the previous row's time; the first row has no
# predecessor, so it becomes NaN and is dropped.
df = df.assign(shifted=df['time'].shift(1))
df = df.assign(dif=df['time'] - df['shifted']).dropna()

student_test = ttest_1samp(df['dif'], 500, alternative='greater')
print(f"Students t-test res: {np.round(student_test.pvalue, 4)}")


Students t-test res: 0.0406


In [27]:
# How many observations in the sample are above 500?
dif = int((df['dif'] > 500).sum())
print(f"Num obs w dif > 500: {dif}")

Num obs w dif > 500: 49


In [28]:
# What is the p-value of the sign test? Round the answer to 4 decimal points.
# The success probability is left at binomtest's default.
sign_test = binomtest(dif, len(df), alternative = 'greater')
print(f"Sign test res: {np.round(sign_test.pvalue, 4)}")


Sign test res: 0.9995


In [29]:
# Wilcoxon signed-rank test on the differences shifted by the hypothesised value 500.
signed_rank = wilcoxon(df.dif - 500, alternative='greater')
print(f"Signed rank test res: {np.round(signed_rank.pvalue, 4)}")

Signed rank test res: 0.8632


In [30]:
# Great, let's proceed to the permutation test with the sum of the (centered) sample as a statistic. What is its p-value? Round the answer to 4 decimal points.

# The sample is too big to go through all the permutations – let's use 10000 of them. 
# To get the same result as us, use the functions from the example notebook, and set random seed = 0 before calling the permutation_test_1s function.

def permutation_t_stat_1s(sample, mean):
    """Return the one-sample statistic: the sum of ``sample - mean``."""
    centered = sample - mean
    # Builtin sum (not np.sum) is kept so the accumulation order is unchanged.
    return sum(centered)

def permutation_null_distr_1s(sample, mean, max_permutations = None):
    """Build the sign-flip null distribution of the one-sample statistic.

    The sample is centered at ``mean`` and multiplied element-wise by vectors
    of -1/+1 signs; ``permutation_t_stat_1s`` is evaluated on each result.

    Parameters:
        sample: the observed sample (must support ``sample - mean``).
        mean: the hypothesised value the sample is centered at.
        max_permutations: if truthy, that many sign vectors are drawn with
            ``np.random.randint`` (global NumPy random state) and duplicates
            are collapsed through a set; otherwise every sign vector of
            length ``len(sample)`` is enumerated.

    Returns:
        list of statistic values, one per distinct sign vector.
    """
    deviations = sample - mean
    n_obs = len(sample)

    if max_permutations:
        draws = np.random.randint(2, size = (max_permutations, n_obs))
        # Map the 0/1 draws to -1/+1 and drop repeated sign vectors.
        sign_patterns = {tuple(row) for row in 2 * draws - 1}
    else:
        sign_patterns = product([-1, 1], repeat = n_obs)

    distr = []
    for pattern in sign_patterns:
        flipped = deviations * np.array(pattern)
        distr.append(permutation_t_stat_1s(flipped, 0))
    return distr

def permutation_test_1s(sample, mean, max_permutations = None, alternative = 'two-sided', return_distr = False):
    """One-sample permutation test on the sum of the centered sample.

    Parameters:
        sample: the observed sample.
        mean: the hypothesised value the sample is centered at.
        max_permutations: forwarded to ``permutation_null_distr_1s``.
        alternative: 'two-sided', 'less' or 'greater'.
        return_distr: when true, the null distribution is included in the result.

    Returns:
        dict with 't' (observed statistic) and 'p' (share of null-distribution
        values at least as extreme as 't' in the direction of ``alternative``),
        plus 'null_distr' when ``return_distr`` is set.

    Raises:
        ValueError: if ``alternative`` is not one of the three accepted values.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    observed_sum = permutation_t_stat_1s(sample, mean)
    null_values = permutation_null_distr_1s(sample, mean, max_permutations)

    # One predicate per alternative; validation above guarantees the key exists.
    extremity_checks = {
        'two-sided': lambda value: abs(value) >= abs(observed_sum),
        'less': lambda value: value <= observed_sum,
        'greater': lambda value: value >= observed_sum,
    }
    is_extreme = extremity_checks[alternative]

    indicator = [1. if is_extreme(value) else 0. for value in null_values]
    p_value = sum(indicator) / len(null_values)

    report = {'t': observed_sum, 'p': p_value}
    if return_distr:
        report['null_distr'] = null_values
    return report

# Fix the global NumPy seed so the random sign vectors are reproducible.
np.random.seed(0)

perm_result_1s = permutation_test_1s(
    np.array(df.dif),
    mean = 500,
    max_permutations=10000,
    alternative = 'greater',
)
print(f"Permutation test res: {perm_result_1s['p']}")

Permutation test res: 0.0366
