# Hypothesis test

In [1]:
# set up path to import my python scripts
import sys
import os
# Project root is the parent of the notebook's working directory.
src_path = os.path.abspath(os.path.join('..'))
# Directory that holds the importable project scripts.
src_dir = os.path.join(src_path, "src")
# Guard on the exact entry that gets appended: checking the parent directory
# (as before) never matched, so every re-run of this cell added a duplicate.
if src_dir not in sys.path:
    sys.path.append(src_dir)
    
%load_ext autoreload
%autoreload 2
import prep_data as prep
import main

In [2]:
# Notebook configuration
# Root URL under which the data files are listed
base_url = "http://kopiko.ifa.hawaii.edu/weather/archivedata/"

# (lower, upper) bounds treated as reasonable for each column
acceptable_ranges = dict(
    temperature=(-273, 40),
    humidity=(0, 100),
    wind_speed=(0, 100),
    visibility=(0, 100000),
    precipitation=(0, 100),
    dewpoint=(-273, 40),
)

# ('Green', 'Red') weather thresholds per quantity.
# The plan is to move these into a config file in the future.
thresholds = dict(
    humidity=(75, 85),
    wind_sust=(10, 12),
    wind_gust=(15, 15),
    visibility=(50000, 40000),
    precipitation=(0, 0),
    dewpoint_delta=(6, 3),
)

In [3]:
# Run from jupyter: this path is handed to the helpers as base_path / save_path
data_path_for_notebook = '../data/'

# Years that are not prepped for now: 1993 is empty, and the 2020-2021
# files are known to be incorrectly formatted.
years_to_skip = ('1993', '2020', '2021')

# Fetch the URL of every data file, then prep each year that is still missing
csv_urls = prep.get_csv_file_links(base_url)
for url in csv_urls:
    file_name = url.split('/')[-1]
    year = file_name.split('.')[0]
    # Short-circuits: the existence check only runs for years that are not skipped
    needs_prep = year not in years_to_skip and not main.prepped_data_exists(
        year, base_path=data_path_for_notebook
    )
    if needs_prep:
        main.get_and_prep_data(
            url,
            acceptable_ranges,
            thresholds,
            save_results=True,
            save_path=data_path_for_notebook,
        )

In [23]:
# Gather the daily status hours of every year into a single frame
status_hours_all_years = prep.combine_status_hour_dfs(base_path=data_path_for_notebook)
# presumably rescales each day's hours to sum to 24 - confirm in prep_data
df = prep.normalize_daily_hours_to_24(status_hours_all_years)
# Called for its effect on df; the return value is not assigned
prep.add_month_year_columns(df)



Total rows          : 8965
-----------------------------
Number of NaNs per column:
Green               :  562
Yellow              : 1625
Red                 : 1717
month               :    0
year                :    0
None


In [25]:
# Display the combined frame as the cell output
df

Unnamed: 0_level_0,Green,Yellow,Red,month,year
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1994-09-11,21.600000,0.000000,2.400000,Sep,1994
1994-09-12,23.000000,0.000000,1.000000,Sep,1994
1994-09-13,22.500000,0.000000,1.500000,Sep,1994
1994-09-14,24.000000,0.000000,0.000000,Sep,1994
1994-09-15,23.741935,0.000000,0.258065,Sep,1994
...,...,...,...,...,...
2019-12-27,0.597499,0.054318,23.348183,Dec,2019
2019-12-28,4.137437,3.982820,15.879742,Dec,2019
2019-12-29,3.484241,5.154728,15.361032,Dec,2019
2019-12-30,4.839494,5.268781,13.891724,Dec,2019


## Hypothesis Testing Steps

1. Scientific Question: Does one month have a higher average number of daily green-weather hours than the others?

2. $H_0$: There is no significant difference in the average daily green-weather hours between months.

3. $H_a$: For each pair of months (12 choose 2 = 66 pairwise comparisons), there is a significant difference in the daily green-weather hours (two-tailed test).

4. Create a Probabilistic Model of the Situation Assuming the Null Hypothesis is True

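5. Decide how Surprised You Need to Be to Reject Your Skeptical Assumption - apply a Bonferroni correction across the 66 comparisons  
  $\alpha=0.5$, so each pairwise test uses $\alpha_{adj} = \alpha / 66 \approx 0.00758$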

6. Collect Your Data - done

7. Calculate the Probability of Finding a Result Equally or More Extreme than Actually Observed Assuming the Null Hypothesis is True

8. Compare the p-value to Your Stated Rejection Threshold


In [98]:
from itertools import combinations
import scipy.stats as stats
import pandas as pd
import hypothesis_test as ht

In [96]:
# Month abbreviations in calendar order
months = "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec".split()

# Order the months from the highest mean green hours to the lowest so the
# pairwise results are easier to read
monthly_means = ht.get_monthly_means(df)
months_sorted_by_mean = ht.sort_dict_keys_by_values(monthly_means)

In [89]:

# Every unordered pair of months, in the sorted-by-mean order
combos = [*combinations(months_sorted_by_mean, 2)]
num_combos = len(combos)

# NOTE(review): 0.5 is far above the conventional 0.05 level - confirm this is intended.
alpha = 0.5
# Probability of at least one false positive if each test were run at alpha
# (formula treats the tests as independent)
FWER = 1 - pow(1 - alpha, num_combos)
# Bonferroni correction: divide alpha evenly across the comparisons
alpha_adj = alpha / num_combos

print(f'The family-wise error rate for alpha={alpha} and {num_combos} combinations is: {FWER}')
print(f'Apply a Bonferroni correction and use an adjusted alpha of {alpha_adj:.5f}')

The family-wise error rate for alpha=0.5 and 66 combinations is: 1.0
Apply a Bonferroni correction and use an adjusted alpha of 0.00758


In [90]:
# Two-sided Mann-Whitney U test on the Green hours of every month pair;
# only the p-value is kept, keyed by the (month, month) tuple.
# NOTE(review): if Green still contains NaNs here, confirm how the installed
# SciPy version treats them in mannwhitneyu.
combos_dict = {}
for first_month, second_month in combos:
    green_first = df.loc[df['month'] == first_month, 'Green']
    green_second = df.loc[df['month'] == second_month, 'Green']
    _, p_value = stats.mannwhitneyu(green_first, green_second, alternative="two-sided")
    combos_dict[(first_month, second_month)] = p_value

combos_dict


{('Sep', 'Aug'): 0.8094050371102686,
 ('Sep', 'Jun'): 0.9469387692846619,
 ('Sep', 'May'): 0.7917637819349043,
 ('Sep', 'Apr'): 0.19680686773311984,
 ('Sep', 'Jul'): 0.16698680401476962,
 ('Sep', 'Oct'): 0.08917339206373914,
 ('Sep', 'Jan'): 0.0003150300038435985,
 ('Sep', 'Nov'): 0.00042139461624090435,
 ('Sep', 'Mar'): 2.9303033808749776e-05,
 ('Sep', 'Feb'): 1.4810294349422488e-05,
 ('Sep', 'Dec'): 1.0229834421796834e-05,
 ('Aug', 'Jun'): 0.8654693977502486,
 ('Aug', 'May'): 0.9895917466208674,
 ('Aug', 'Apr'): 0.3407955960088104,
 ('Aug', 'Jul'): 0.26182658104123013,
 ('Aug', 'Oct'): 0.14089332019661865,
 ('Aug', 'Jan'): 0.0010834622750542017,
 ('Aug', 'Nov'): 0.0012390427816967471,
 ('Aug', 'Mar'): 0.0001132730103914881,
 ('Aug', 'Feb'): 7.039037913429713e-05,
 ('Aug', 'Dec'): 4.209621334190676e-05,
 ('Jun', 'May'): 0.8783227413308499,
 ('Jun', 'Apr'): 0.24449787594584205,
 ('Jun', 'Jul'): 0.20943896148383168,
 ('Jun', 'Oct'): 0.11732747780089621,
 ('Jun', 'Jan'): 0.00072527878419

In [99]:
# Run the project helper over every month pair, passing the Bonferroni-adjusted
# alpha and flagging it as already adjusted.
# presumably the same Mann-Whitney U test as the manual loop - confirm in hypothesis_test
results = ht.mwu_test_month_combos(df,combos,alpha=alpha_adj,is_alpha_adjusted=True)
results

Unnamed: 0,month_1,month_2,month_1_mean,month_2_mean,mean_diff,p_value,is_significant
0,Sep,Aug,13.773459,13.678097,0.095362,0.809405,0.0
1,Sep,Jun,13.773459,13.636574,0.136886,0.946939,0.0
2,Sep,May,13.773459,13.419626,0.353833,0.791764,0.0
3,Sep,Apr,13.773459,13.204426,0.569034,0.196807,0.0
4,Sep,Jul,13.773459,13.114836,0.658624,0.166987,0.0
...,...,...,...,...,...,...,...
61,Nov,Feb,12.312481,11.951620,0.360860,0.228493,0.0
62,Nov,Dec,12.312481,11.851589,0.460891,0.188630,0.0
63,Mar,Feb,12.002681,11.951620,0.051060,0.803983,0.0
64,Mar,Dec,12.002681,11.851589,0.151091,0.743219,0.0


In [93]:
# Show the first 60 rows of the pairwise results
results.head(60)

Unnamed: 0,month_1,month_2,month_1_mean,month_2_mean,mean_diff,p_value,is_significant
0,Sep,Aug,13.773459,13.678097,0.095362,0.809405,0.0
1,Sep,Jun,13.773459,13.636574,0.136886,0.946939,0.0
2,Sep,May,13.773459,13.419626,0.353833,0.791764,0.0
3,Sep,Apr,13.773459,13.204426,0.569034,0.196807,0.0
4,Sep,Jul,13.773459,13.114836,0.658624,0.166987,0.0
5,Sep,Oct,13.773459,12.939287,0.834173,0.089173,0.0
6,Sep,Jan,13.773459,12.477913,1.295546,0.000315,1.0
7,Sep,Nov,13.773459,12.312481,1.460979,0.000421,1.0
8,Sep,Mar,13.773459,12.002681,1.770779,2.9e-05,1.0
9,Sep,Feb,13.773459,11.95162,1.821839,1.5e-05,1.0


In [52]:
# First 60 rows after sorting by month_1_mean, largest first
results.sort_values('month_1_mean',ascending=False).head(60)

Unnamed: 0,month_1,month_2,month_1_mean,month_2_mean,mean_diff,p_value,is_significant
62,Sep,Dec,13.773459,11.851589,1.92187,1e-05,1.0
61,Sep,Nov,13.773459,12.312481,1.460979,0.000421,1.0
60,Sep,Oct,13.773459,12.939287,0.834173,0.089173,0.0
59,Aug,Dec,13.678097,11.851589,1.826508,4.2e-05,1.0
58,Aug,Nov,13.678097,12.312481,1.365616,0.001239,1.0
57,Aug,Oct,13.678097,12.939287,0.73881,0.140893,0.0
56,Aug,Sep,13.678097,13.773459,-0.095362,0.809405,0.0
49,Jun,Nov,13.636574,12.312481,1.324093,0.001207,1.0
50,Jun,Dec,13.636574,11.851589,1.784984,3.3e-05,1.0
45,Jun,Jul,13.636574,13.114836,0.521738,0.209439,0.0


In [53]:
# Last 10 rows after sorting by month_1_mean, largest first
results.sort_values('month_1_mean',ascending=False).tail(10)

Unnamed: 0,month_1,month_2,month_1_mean,month_2_mean,mean_diff,p_value,is_significant
19,Feb,Nov,11.95162,12.312481,-0.36086,0.228493,0.0
20,Feb,Dec,11.95162,11.851589,0.100031,0.989733,0.0
18,Feb,Oct,11.95162,12.939287,-0.987666,0.005823,1.0
11,Feb,Mar,11.95162,12.002681,-0.05106,0.803983,0.0
12,Feb,Apr,11.95162,13.204426,-1.252805,0.00356,1.0
13,Feb,May,11.95162,13.419626,-1.468006,0.000122,1.0
14,Feb,Jun,11.95162,13.636574,-1.684953,5.7e-05,1.0
16,Feb,Aug,11.95162,13.678097,-1.726477,7e-05,1.0
17,Feb,Sep,11.95162,13.773459,-1.821839,1.5e-05,1.0
15,Feb,Jul,11.95162,13.114836,-1.163215,0.004151,1.0


Try to make sense of it all




In [62]:
# For each calendar month, print the significant comparisons it takes part in
# (on either side of the pair), separated by a blank line.
for month in months:
    involves_month = (results['month_1'] == month) | (results['month_2'] == month)
    flagged_significant = results['is_significant'] == True
    significant_rows = results[involves_month & flagged_significant]
    print()
    print(significant_rows.drop('is_significant', axis=1))


  month_1 month_2  month_1_mean  month_2_mean  mean_diff   p_value
3     Jan     May     12.477913     13.419626  -0.941713  0.001527
4     Jan     Jun     12.477913     13.636574  -1.158660  0.000725
6     Jan     Aug     12.477913     13.678097  -1.200184  0.001083
7     Jan     Sep     12.477913     13.773459  -1.295546  0.000315

   month_1 month_2  month_1_mean  month_2_mean  mean_diff   p_value
12     Feb     Apr      11.95162     13.204426  -1.252805  0.003560
13     Feb     May      11.95162     13.419626  -1.468006  0.000122
14     Feb     Jun      11.95162     13.636574  -1.684953  0.000057
15     Feb     Jul      11.95162     13.114836  -1.163215  0.004151
16     Feb     Aug      11.95162     13.678097  -1.726477  0.000070
17     Feb     Sep      11.95162     13.773459  -1.821839  0.000015
18     Feb     Oct      11.95162     12.939287  -0.987666  0.005823

   month_1 month_2  month_1_mean  month_2_mean  mean_diff   p_value
21     Mar     Apr     12.002681     13.204426  -1

In [81]:
# Scratch values: climb from 1 to 7, then fall back down to 2 (12 entries)
test_values = list(range(1, 8)) + list(range(6, 1, -1))
test_values

[1, 2, 3, 4, 5, 6, 7, 6, 5, 4, 3, 2]

In [82]:
# Pair each month with its scratch value, in calendar order
test_dict = {month_name: value for month_name, value in zip(months, test_values)}

In [83]:
# Inspect the scratch mapping
test_dict

{'Jan': 1,
 'Feb': 2,
 'Mar': 3,
 'Apr': 4,
 'May': 5,
 'Jun': 6,
 'Jul': 7,
 'Aug': 6,
 'Sep': 5,
 'Oct': 4,
 'Nov': 3,
 'Dec': 2}

In [84]:
# List the keys of the scratch mapping
test_dict.keys()

dict_keys(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])

In [86]:
# Keys ordered by their values, largest first; sorted() is stable, so keys
# with equal values keep their original relative order
sorted(test_dict,key=test_dict.get,reverse=True)

['Jul',
 'Jun',
 'Aug',
 'May',
 'Sep',
 'Apr',
 'Oct',
 'Mar',
 'Nov',
 'Feb',
 'Dec',
 'Jan']

In [76]:
import operator