## Temperature Analysis 1

In [1]:
# Dependencies
import pandas as pd
from datetime import datetime as dt

In [2]:
# Load the raw Hawaii measurements CSV into a DataFrame.
# "tobs" is "temperature observations"
df = pd.read_csv('../Resources/hawaii_measurements.csv')
# Preview the first rows to confirm the file loaded as expected.
df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


## Data Exportation
- Convert the date column format from string to datetime.
- Set the date column as the DataFrame index.
- Drop the date column.

In [3]:
# View each column's dtype before the date conversion
# (the `date` column is still a plain object/string column here).
df.dtypes

station     object
date        object
prcp       float64
tobs         int64
dtype: object

In [4]:
# Parse the string dates into pandas datetime values, keeping the column in place
df = df.assign(date=pd.to_datetime(df['date']))

In [5]:
# View each column's dtype after the conversion to confirm
# that `date` is now a datetime column.
df.dtypes

station            object
date       datetime64[ns]
prcp              float64
tobs                int64
dtype: object

In [6]:
# Promote the date column to the DataFrame index (rebinding instead of mutating in place)
df = df.set_index('date')

In [7]:
# Preview the dataframe now that `date` is the index rather than a column
df.head()

Unnamed: 0_level_0,station,prcp,tobs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,USC00519397,0.08,65
2010-01-02,USC00519397,0.0,63
2010-01-03,USC00519397,0.0,74
2010-01-04,USC00519397,0.0,76
2010-01-06,USC00519397,,73


In [8]:
# Save the date-indexed dataframe for later.
# The `date` index is written out as the first column of the CSV.
df.to_csv('../Resources/new_hawaii_measurements.csv')

In [9]:
# Drop the date column: `date` is the index at this point, so resetting the
# index with drop=True discards it and restores a default integer index.
df = df.reset_index(drop=True)

In [10]:
# Preview the dataframe after the date column has been dropped
df.head()

Unnamed: 0,station,prcp,tobs
0,USC00519397,0.08,65
1,USC00519397,0.0,63
2,USC00519397,0.0,74
3,USC00519397,0.0,76
4,USC00519397,,73


### Compare June and December data across all years
- Identify the average temperature in June at all stations across all available years in the dataset. Do the same for the temperature in December.
- Use the t-test to determine whether the difference in means, if any, is statistically significant. Will you use a paired t-test or an unpaired t-test? Why?

In [11]:
# Additional dependencies
from scipy import stats
import numpy as np
from numpy import mean

In [12]:
# Import the dataframe saved earlier in this notebook.
# Dates come back from the CSV as plain strings and must be parsed again.
new_df = pd.read_csv('../Resources/new_hawaii_measurements.csv')
new_df.head()

Unnamed: 0,date,station,prcp,tobs
0,2010-01-01,USC00519397,0.08,65
1,2010-01-02,USC00519397,0.0,63
2,2010-01-03,USC00519397,0.0,74
3,2010-01-04,USC00519397,0.0,76
4,2010-01-06,USC00519397,,73


In [13]:
# Check column dtypes of the reloaded frame before converting `date`
new_df.dtypes

date        object
station     object
prcp       float64
tobs         int64
dtype: object

In [14]:
# Display the reloaded frame (pandas truncates the middle rows in the output)
new_df

Unnamed: 0,date,station,prcp,tobs
0,2010-01-01,USC00519397,0.08,65
1,2010-01-02,USC00519397,0.00,63
2,2010-01-03,USC00519397,0.00,74
3,2010-01-04,USC00519397,0.00,76
4,2010-01-06,USC00519397,,73
...,...,...,...,...
19545,2017-08-19,USC00516128,0.09,71
19546,2017-08-20,USC00516128,,78
19547,2017-08-21,USC00516128,0.56,76
19548,2017-08-22,USC00516128,0.50,76


In [15]:
# Parse the reloaded string dates into datetime values so the `.dt` accessor can be used
new_df = new_df.assign(date=pd.to_datetime(new_df['date']))

In [16]:
# Confirm that `date` is now a datetime column
new_df.dtypes

date       datetime64[ns]
station            object
prcp              float64
tobs                int64
dtype: object

In [17]:
# Keep only the June (month 6) and December (month 12) observations across
# all years, renumbering each subset's rows from zero.
june_tobs = new_df.loc[new_df['date'].dt.month.eq(6)].reset_index(drop=True)
dec_tobs = new_df.loc[new_df['date'].dt.month.eq(12)].reset_index(drop=True)

In [18]:
# Identify the average temperature for June, per station.
# numeric_only=True is passed explicitly so only the numeric columns
# (prcp, tobs) are averaged; newer pandas releases no longer drop
# non-numeric columns such as the datetime `date` column by default.
june_avg = june_tobs.groupby('station').mean(numeric_only=True)
june_avg

Unnamed: 0_level_0,prcp,tobs
station,Unnamed: 1_level_1,Unnamed: 2_level_1
USC00511918,0.015157,74.139394
USC00513117,0.118248,74.050847
USC00514830,0.114192,76.005376
USC00516128,0.495748,71.93722
USC00517948,0.057975,76.655405
USC00518838,0.094615,73.394737
USC00519281,0.151525,73.271186
USC00519397,0.022661,77.559322
USC00519523,0.050044,76.668103


In [19]:
# Identify the average temperature for December, per station.
# numeric_only=True is passed explicitly so only the numeric columns
# (prcp, tobs) are averaged; newer pandas releases no longer drop
# non-numeric columns such as the datetime `date` column by default.
dec_avg = dec_tobs.groupby('station').mean(numeric_only=True)
dec_avg

Unnamed: 0_level_0,prcp,tobs
station,Unnamed: 1_level_1,Unnamed: 2_level_1
USC00511918,0.138146,69.684211
USC00513117,0.203241,71.069444
USC00514830,0.154966,73.224719
USC00516128,0.507005,69.291262
USC00517948,0.152727,71.834862
USC00518838,0.638182,72.421053
USC00519281,0.244931,69.903226
USC00519397,0.075314,71.109524
USC00519523,0.16201,72.433333


In [20]:
# Create collections of temperature data
# Hold the per-station June and December averages together in a list.
frames = [june_avg, dec_avg]
# Stack both frames vertically; `result` is not referenced again in the visible cells.
result = pd.concat(frames)
# Prints the list of the two separate frames (not the concatenated `result`).
print(frames)

[                 prcp       tobs
station                         
USC00511918  0.015157  74.139394
USC00513117  0.118248  74.050847
USC00514830  0.114192  76.005376
USC00516128  0.495748  71.937220
USC00517948  0.057975  76.655405
USC00518838  0.094615  73.394737
USC00519281  0.151525  73.271186
USC00519397  0.022661  77.559322
USC00519523  0.050044  76.668103,                  prcp       tobs
station                         
USC00511918  0.138146  69.684211
USC00513117  0.203241  71.069444
USC00514830  0.154966  73.224719
USC00516128  0.507005  69.291262
USC00517948  0.152727  71.834862
USC00518838  0.638182  72.421053
USC00519281  0.244931  69.903226
USC00519397  0.075314  71.109524
USC00519523  0.162010  72.433333]


In [21]:
# Run paired t-test on the per-station June vs. December mean temperatures.
# ttest_rel pairs observations purely by position and ignores index labels,
# so align both series on their station index first; the inner join keeps
# only stations that have an average in both months.
june_paired, dec_paired = june_avg['tobs'].align(dec_avg['tobs'], join='inner')
stats.ttest_rel(june_paired, dec_paired)

Ttest_relResult(statistic=6.95696617044294, pvalue=0.00011759380231523222)

### Analysis

- A paired t-test was used to compare the June and December averages because both sets of measurements come from the same stations, so each station's June mean is naturally paired with its December mean. If I were comparing two months from different stations (independent groups), an unpaired t-test would have been the better choice.

- Since the p-value (≈ 0.00012) is far below the conventional 0.05 threshold, the result is statistically significant, which suggests a meaningful difference between June and December temperatures.