# Bonus: Temperature Analysis I

In [1]:
import pandas as pd
from datetime import datetime as dt

In [4]:
# "tobs" is "temperature observations"
# Load the station measurements CSV and preview its first five rows
df = pd.read_csv(filepath_or_buffer='Resources/hawaii_measurements.csv')
df.head(n=5)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [5]:
# Convert the date column from 'YYYY-MM-DD' strings to datetime values.
# pd.to_datetime is vectorized and, unlike dt.strptime, also accepts values
# that are already datetimes, so re-running this cell does not raise a TypeError.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.00,63
2,USC00519397,2010-01-03,0.00,74
3,USC00519397,2010-01-04,0.00,76
4,USC00519397,2010-01-06,,73
...,...,...,...,...
19545,USC00516128,2017-08-19,0.09,71
19546,USC00516128,2017-08-20,,78
19547,USC00516128,2017-08-21,0.56,76
19548,USC00516128,2017-08-22,0.50,76


In [6]:
# Preview the frame with the date column as its index.
# set_index() returns a new DataFrame and the result is not assigned here, so
# df itself is unchanged; the month filters further down still read df['date']
# as a regular column.
df.set_index('date')

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.00,63
2010-01-03,USC00519397,0.00,74
2010-01-04,USC00519397,0.00,76
2010-01-06,USC00519397,,73
...,...,...,...
2017-08-19,USC00516128,0.09,71
2017-08-20,USC00516128,,78
2017-08-21,USC00516128,0.56,76
2017-08-22,USC00516128,0.50,76


In [7]:
# Preview the frame without the date column.
# drop() returns a new DataFrame and the result is not assigned here, so df
# still contains 'date', which the month filters further down depend on.
df.drop(columns=['date'])

Unnamed: 0,station,prcp,tobs
0,USC00519397,0.08,65
1,USC00519397,0.00,63
2,USC00519397,0.00,74
3,USC00519397,0.00,76
4,USC00519397,,73
...,...,...,...
19545,USC00516128,0.09,71
19546,USC00516128,,78
19547,USC00516128,0.56,76
19548,USC00516128,0.50,76


### Compare June and December data across all years 

In [8]:
from scipy import stats
import statistics as st

In [9]:
# Keep only the June (month 6) and December (month 12) observations,
# then display the December subset
june_df, dec_df = (
    df[pd.DatetimeIndex(df['date']).month == month_number]
    for month_number in (6, 12)
)
dec_df

Unnamed: 0,station,date,prcp,tobs
305,USC00519397,2010-12-01,0.04,76
306,USC00519397,2010-12-03,0.00,74
307,USC00519397,2010-12-04,0.00,74
308,USC00519397,2010-12-06,0.00,64
309,USC00519397,2010-12-07,0.00,64
...,...,...,...,...
19323,USC00516128,2016-12-27,0.14,71
19324,USC00516128,2016-12-28,0.14,71
19325,USC00516128,2016-12-29,1.03,69
19326,USC00516128,2016-12-30,2.37,65


In [10]:
# Identify the average temperature for June, rounded to two decimals.
# Series.mean() is vectorized and skips missing values by default, whereas
# statistics.mean walks the column element by element in Python.
avg_temp_june = round(june_df['tobs'].mean(), 2)
print(f'The average temperature is {avg_temp_june} F')

The average temperature is 74.94 F


In [11]:
# Identify the average temperature for December, rounded to two decimals.
# Series.mean() is vectorized and skips missing values by default, whereas
# statistics.mean walks the column element by element in Python.
avg_temp_dec = round(dec_df['tobs'].mean(), 2)
print(f'The average temperature is {avg_temp_dec} F')

The average temperature is 71.04 F


In [13]:
# Extract the temperature observations ('tobs') for each month
june_temp, dec_temp = (month_df['tobs'] for month_df in (june_df, dec_df))

In [21]:
# Run an independent (unpaired) two-sample t-test on the June and December
# temperatures. A paired test (stats.ttest_rel) needs equal-length, matched
# samples, which these two month subsets are not guaranteed to be.
# equal_var=True selects the standard pooled-variance Student's t-test.
stats.ttest_ind(a=june_temp, b=dec_temp, equal_var=True)

Ttest_indResult(statistic=31.60372399000329, pvalue=3.9025129038616655e-191)

### Analysis

### The p-value of 3.9e-191 from the independent two-sample t-test is vastly lower than
### 0.05, so the difference between the mean June temperature (74.94 F) and the mean
### December temperature (71.04 F) is statistically significant. We therefore reject
### the null hypothesis that the two months share the same mean temperature and
### conclude that temperatures in June and December differ.