# Bonus: Temperature Analysis I

In [13]:
import pandas as pd
from datetime import datetime as dt
from sqlalchemy import func

In [32]:
# "tobs" is "temperature observations"
# Read the raw station measurements into a DataFrame and preview the first rows
tobs_df = pd.read_csv(filepath_or_buffer='Resources/hawaii_measurements.csv')
tobs_df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.00,63
2,USC00519397,2010-01-03,0.00,74
3,USC00519397,2010-01-04,0.00,76
4,USC00519397,2010-01-06,,73
...,...,...,...,...
19545,USC00516128,2017-08-19,0.09,71
19546,USC00516128,2017-08-20,,78
19547,USC00516128,2017-08-21,0.56,76
19548,USC00516128,2017-08-22,0.50,76


In [45]:
# Convert the 'date' column from strings to datetime64 values.
# The dates are ISO formatted (YYYY-MM-DD), so the format is given explicitly;
# this replaces `infer_datetime_format`, which is deprecated in pandas 2.x
# and emits a warning there.
tobs_df['date'] = pd.to_datetime(tobs_df['date'], format='%Y-%m-%d')

# Confirm that the 'date' column now has a datetime64 dtype
tobs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19550 entries, 0 to 19549
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   station  19550 non-null  object        
 1   date     19550 non-null  datetime64[ns]
 2   prcp     18103 non-null  float64       
 3   tobs     19550 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 611.1+ KB


In [37]:
# Index the observations by date. drop=False keeps 'date' as a regular
# column as well, so the frame has the dates both as index and as column.
hawaii_df = tobs_df.set_index('date', drop=False)
hawaii_df

Unnamed: 0_level_0,station,date,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01,USC00519397,2010-01-01,0.08,65
2010-01-02,USC00519397,2010-01-02,0.00,63
2010-01-03,USC00519397,2010-01-03,0.00,74
2010-01-04,USC00519397,2010-01-04,0.00,76
2010-01-06,USC00519397,2010-01-06,,73
...,...,...,...,...
2017-08-19,USC00516128,2017-08-19,0.09,71
2017-08-20,USC00516128,2017-08-20,,78
2017-08-21,USC00516128,2017-08-21,0.56,76
2017-08-22,USC00516128,2017-08-22,0.50,76


In [39]:
# Drop the redundant 'date' column; the dates remain available as the index.
# errors='ignore' keeps this cell safe to re-run: without it, a second run
# raises KeyError because the column has already been removed.
hawaii_df = hawaii_df.drop(columns='date', errors='ignore')
hawaii_df.head()

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.0,63
2010-01-03,USC00519397,0.0,74
2010-01-04,USC00519397,0.0,76
2010-01-06,USC00519397,,73


### Compare June and December data across all years 

In [8]:
from scipy import stats

In [54]:
# Add a numeric 'month' column taken from each observation's date.
# The June and December selections below filter on this column instead of
# filtering on a datetime index.
tobs_df['month'] = pd.DatetimeIndex(data=tobs_df['date']).month
tobs_df

Unnamed: 0,station,date,prcp,tobs,month
0,USC00519397,2010-01-01,0.08,65,1
1,USC00519397,2010-01-02,0.00,63,1
2,USC00519397,2010-01-03,0.00,74,1
3,USC00519397,2010-01-04,0.00,76,1
4,USC00519397,2010-01-06,,73,1
...,...,...,...,...,...
19545,USC00516128,2017-08-19,0.09,71,8
19546,USC00516128,2017-08-20,,78,8
19547,USC00516128,2017-08-21,0.56,76,8
19548,USC00516128,2017-08-22,0.50,76,8


In [55]:
# Keep only the observations recorded in June (month 6), for all years
june_df = tobs_df[tobs_df['month'] == 6]
june_df

Unnamed: 0,station,date,prcp,tobs,month
133,USC00519397,2010-06-01,0.00,78,6
134,USC00519397,2010-06-02,0.01,76,6
135,USC00519397,2010-06-03,0.00,78,6
136,USC00519397,2010-06-04,0.00,76,6
137,USC00519397,2010-06-05,0.00,77,6
...,...,...,...,...,...
19492,USC00516128,2017-06-26,0.02,79,6
19493,USC00516128,2017-06-27,0.10,74,6
19494,USC00516128,2017-06-28,0.02,74,6
19495,USC00516128,2017-06-29,0.04,76,6


In [56]:
# Keep only the observations recorded in December (month 12), for all years
december_df = tobs_df[tobs_df['month'] == 12]
december_df

Unnamed: 0,station,date,prcp,tobs,month
305,USC00519397,2010-12-01,0.04,76,12
306,USC00519397,2010-12-03,0.00,74,12
307,USC00519397,2010-12-04,0.00,74,12
308,USC00519397,2010-12-06,0.00,64,12
309,USC00519397,2010-12-07,0.00,64,12
...,...,...,...,...,...
19323,USC00516128,2016-12-27,0.14,71,12
19324,USC00516128,2016-12-28,0.14,71,12
19325,USC00516128,2016-12-29,1.03,69,12
19326,USC00516128,2016-12-30,2.37,65,12


In [57]:
# Identify the average temperature for June
average_june = june_df['tobs'].mean()
print(average_june)

74.94411764705882


In [64]:
# Identify the average temperature for December
average_december= december_df['tobs'].mean()
print(average_december)

71.04152933421226


In [61]:
# Create collections of temperature data
junetemp = june_df.tobs
junetemp

133      78
134      76
135      78
136      76
137      77
         ..
19492    79
19493    74
19494    74
19495    76
19496    75
Name: tobs, Length: 1700, dtype: int64

In [63]:
# Collect the December temperature readings as a Series for the t-test
decembertemp = december_df['tobs']
decembertemp

305      76
306      74
307      74
308      64
309      64
         ..
19323    71
19324    71
19325    69
19326    65
19327    65
Name: tobs, Length: 1517, dtype: int64

In [65]:
# Run an independent (unpaired) two-sample t-test on June vs. December temperatures.
# stats.ttest_ind treats the two samples as independent. A paired test
# (stats.ttest_rel) cannot be applied to these Series because they differ
# in length (1700 June vs. 1517 December observations).
# equal_var=True is the scipy default (pooled-variance Student's t-test);
# it is spelled out here so the assumption is visible to the reader.
stats.ttest_ind(junetemp, decembertemp, equal_var=True)

Ttest_indResult(statistic=31.60372399000329, pvalue=3.9025129038616655e-191)

### Analysis

In [None]:
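# A paired t-test might seem natural because the data comes from the same population, just different months. However, the June and December
# samples have different sizes (1700 vs. 1517 observations), so an independent (unpaired) t-test was run instead. The p-value is extremely small
# (about 3.9e-191), so the difference between the means is statistically significant. In practical terms the temperature difference is only about
# 3.9 degrees, so whether you visit in June or December, the weather will surely be nice!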