# Bonus: Temperature Analysis I

In [28]:
import pandas as pd
from datetime import datetime as dt

In [29]:
# "tobs" is "temperature observations"
# Load the station measurements from the CSV file and preview the first five rows
df = pd.read_csv(filepath_or_buffer='hawaii_measurements.csv')
df.head(n=5)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [30]:
# Parse the string dates into pandas datetime values, then display the converted column
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates
df['date']

0       2010-01-01
1       2010-01-02
2       2010-01-03
3       2010-01-04
4       2010-01-06
           ...    
19545   2017-08-19
19546   2017-08-20
19547   2017-08-21
19548   2017-08-22
19549   2017-08-23
Name: date, Length: 19550, dtype: datetime64[ns]

In [31]:
# Preview every row whose date falls in June (month number 6)
df.loc[df['date'].dt.month.eq(6)]

Unnamed: 0,station,date,prcp,tobs
133,USC00519397,2010-06-01,0.00,78
134,USC00519397,2010-06-02,0.01,76
135,USC00519397,2010-06-03,0.00,78
136,USC00519397,2010-06-04,0.00,76
137,USC00519397,2010-06-05,0.00,77
...,...,...,...,...
19492,USC00516128,2017-06-26,0.02,79
19493,USC00516128,2017-06-27,0.10,74
19494,USC00516128,2017-06-28,0.02,74
19495,USC00516128,2017-06-29,0.04,76


In [33]:
# Use the date as the row index; set_index removes it from the columns by default
df = df.set_index('date')
df.head()

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.0,63
2010-01-03,USC00519397,0.0,74
2010-01-04,USC00519397,0.0,76
2010-01-06,USC00519397,,73


### Compare June and December data across all years 

In [34]:
from scipy import stats

In [41]:
# Split the date-indexed observations into June and December subsets
obs_month = df.index.month
june_df = df.loc[obs_month == 6]
dec_df = df.loc[obs_month == 12]

In [53]:
# For each distinct June timestamp keep the text after the "YYYY-" prefix of its
# string form, then reduce to the set of distinct values
jun_days = [str(stamp)[5:] for stamp in june_df.index.unique()]
set(jun_days)

{'06-01 00:00:00',
 '06-02 00:00:00',
 '06-03 00:00:00',
 '06-04 00:00:00',
 '06-05 00:00:00',
 '06-06 00:00:00',
 '06-07 00:00:00',
 '06-08 00:00:00',
 '06-09 00:00:00',
 '06-10 00:00:00',
 '06-11 00:00:00',
 '06-12 00:00:00',
 '06-13 00:00:00',
 '06-14 00:00:00',
 '06-15 00:00:00',
 '06-16 00:00:00',
 '06-17 00:00:00',
 '06-18 00:00:00',
 '06-19 00:00:00',
 '06-20 00:00:00',
 '06-21 00:00:00',
 '06-22 00:00:00',
 '06-23 00:00:00',
 '06-24 00:00:00',
 '06-25 00:00:00',
 '06-26 00:00:00',
 '06-27 00:00:00',
 '06-28 00:00:00',
 '06-29 00:00:00',
 '06-30 00:00:00'}

In [54]:
# For each distinct December timestamp keep the text after the "YYYY-" prefix of
# its string form, then reduce to the set of distinct values
dec_days = [str(stamp)[5:] for stamp in dec_df.index.unique()]
set(dec_days)

{'12-01 00:00:00',
 '12-02 00:00:00',
 '12-03 00:00:00',
 '12-04 00:00:00',
 '12-05 00:00:00',
 '12-06 00:00:00',
 '12-07 00:00:00',
 '12-08 00:00:00',
 '12-09 00:00:00',
 '12-10 00:00:00',
 '12-11 00:00:00',
 '12-12 00:00:00',
 '12-13 00:00:00',
 '12-14 00:00:00',
 '12-15 00:00:00',
 '12-16 00:00:00',
 '12-17 00:00:00',
 '12-18 00:00:00',
 '12-19 00:00:00',
 '12-20 00:00:00',
 '12-21 00:00:00',
 '12-22 00:00:00',
 '12-23 00:00:00',
 '12-24 00:00:00',
 '12-25 00:00:00',
 '12-26 00:00:00',
 '12-27 00:00:00',
 '12-28 00:00:00',
 '12-29 00:00:00',
 '12-30 00:00:00',
 '12-31 00:00:00'}

In [43]:
# Mean of the "tobs" temperature observations in the June subset
june_tobs = june_df['tobs']
june_mean = june_tobs.mean()
june_mean

74.94411764705882

In [44]:
# Mean of the "tobs" temperature observations in the December subset
dec_tobs = dec_df['tobs']
dec_mean = dec_tobs.mean()
dec_mean

71.04152933421226

In [45]:
# Materialise each month's temperature readings as a plain Python list
june_temps = june_df['tobs'].tolist()
dec_temps = dec_df['tobs'].tolist()

In [47]:
# Show how many observations each month contributes: (June count, December count)
tuple(len(temps) for temps in (june_temps, dec_temps))

(1700, 1517)

In [64]:
# A paired t-test (stats.ttest_rel) needs two samples of equal length, so draw a
# random subsample of the June readings that matches the December count.
# NOTE(review): the drawn June values are matched to December readings by list
# position only, not by station or date -- confirm a paired test is appropriate.
import random

# Fixed seed so the subsample (and every later draw from `random`) is the same
# on each fresh run of the notebook.
SAMPLE_SEED = 42
random.seed(SAMPLE_SEED)

# Size the draw from the data rather than hard-coding the December count.
# random.sample raises ValueError if June has fewer observations than December.
june_temps_sel = random.sample(june_temps, len(dec_temps))

In [67]:
# Paired (related-samples) t-test: June subsample against the December readings
results = stats.ttest_rel(a=june_temps_sel, b=dec_temps)
results

Ttest_relResult(statistic=31.603140199323516, pvalue=8.043001768915528e-169)

In [74]:
# Repeat the paired t-test on fresh random June subsamples to check whether the
# choice of subsample changes the statistical significance of the result.
n_trials = 1000  # number of independent resampling runs
alpha = 0.05     # significance threshold applied to each p-value

# Maps trial number -> p-value for every trial that met the threshold.
test_dict = {}
for x in range(n_trials):
    # Note: `june_temps_sel` and `results` are reassigned on every pass, so
    # after this cell they hold the values from the final trial.
    # The draw is sized from the data so both samples always have equal length.
    june_temps_sel = random.sample(june_temps, len(dec_temps))
    results = stats.ttest_rel(june_temps_sel, dec_temps)
    if results.pvalue <= alpha:
        test_dict[x] = results.pvalue
print(f'percentage of statistically significant results: {(len(test_dict)/n_trials) * 100}% ({len(test_dict)} out of {n_trials} tests)')

percentage of statistically significant results: 100.0% (1000 out of 1000 tests)


In [57]:
# Independent two-sample t-test on the full June and December observation lists
# (scipy's default equal_var=True, i.e. the pooled-variance form of the test)
stats.ttest_ind(a=june_temps, b=dec_temps)

Ttest_indResult(statistic=31.60372399000329, pvalue=3.9025129038616655e-191)

### Analysis