In [1]:
#dependencies
import random
from datetime import datetime as dt

import pandas as pd
from scipy import stats

In [2]:
# Read the measurements CSV, which carries prcp (precipitation) and
# tobs (temperature observation) readings, then preview the first rows.
measurements_path = 'resources/hawaii_measurements.csv'
df = pd.read_csv(measurements_path)
df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [3]:
# Parse the string dates into datetime values so the rows can later be
# filtered by month; the converted column is displayed to confirm the dtype.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates
df['date']

0       2010-01-01
1       2010-01-02
2       2010-01-03
3       2010-01-04
4       2010-01-06
           ...    
19545   2017-08-19
19546   2017-08-20
19547   2017-08-21
19548   2017-08-22
19549   2017-08-23
Name: date, Length: 19550, dtype: datetime64[ns]

In [4]:
# Move the date column into the index. set_index removes the column from the
# frame by default, so no separate drop step is needed.
df = df.set_index('date')
df.head()

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.0,63
2010-01-03,USC00519397,0.0,74
2010-01-04,USC00519397,0.0,76
2010-01-06,USC00519397,,73


### Compare June and December data across all years 

In [5]:
# Keep every June (month 6) and December (month 12) row, pooled over all years.
obs_month = df.index.month
june_df = df[obs_month == 6]
dec_df = df[obs_month == 12]

In [6]:
# Mean of the June temperature observations (tobs column).
june_tobs = june_df['tobs']
june_mean = june_tobs.mean()
june_mean

74.94411764705882

In [7]:
# Mean of the December temperature observations (tobs column).
dec_tobs = dec_df['tobs']
dec_mean = dec_tobs.mean()
dec_mean

71.04152933421226

In [8]:
# Plain Python lists of the temperature observations for each month.
june_temps = list(june_df['tobs'])
dec_temps = list(dec_df['tobs'])

In [9]:
# Show how many observations each month's list holds (June first, then December).
tuple(len(temps) for temps in (june_temps, dec_temps))

(1700, 1517)

In [10]:
# A paired t-test (stats.ttest_rel) needs two arrays of the same length, so a
# random subset of the June temps is drawn to match the number of December temps.
# - The sample size is taken from len(dec_temps) rather than a hard-coded count,
#   so it stays correct if the underlying data changes.
# - A seeded generator makes Restart & Run All reproduce the same subset.
# random.sample raises ValueError if dec_temps is longer than june_temps.
# NOTE(review): the June/December pairs produced this way are arbitrary, not
# matched observations -- confirm that a paired test is what is wanted here.
SAMPLE_SEED = 42
june_temps_sel = random.Random(SAMPLE_SEED).sample(june_temps, len(dec_temps))

In [11]:
# Repeat the paired t-test on many random June subsets to check whether the
# choice of subset changes the significance of the result.
# A locally seeded generator makes the cell give the same answer on every run,
# including when the cell is re-executed on its own.
# NOTE(review): June and December readings look like independent samples, so
# stats.ttest_ind(june_temps, dec_temps, equal_var=False) would avoid the
# sub-sampling entirely -- confirm which test the analysis intends.
N_TRIALS = 10000   # number of random June subsets to test
ALPHA = 0.05       # significance threshold applied to each p-value
trial_rng = random.Random(42)

test_dict = {}  # trial number -> p-value, recorded for significant trials only
for trial in range(N_TRIALS):
    # draw as many June temps as there are December temps (ttest_rel needs equal lengths)
    june_temps_sel = trial_rng.sample(june_temps, len(dec_temps))
    results = stats.ttest_rel(june_temps_sel, dec_temps)
    if results.pvalue <= ALPHA:
        test_dict[trial] = results.pvalue

n_significant = len(test_dict)
print(f'percentage of statistically significant results: {(n_significant/N_TRIALS) * 100}% ({n_significant} out of {N_TRIALS} tests)')

percentage of statistically significant results: 100.0% (10000 out of 10000 tests)


### Analysis