In [33]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the raw measurement and station CSVs into DataFrames
measurements_csv = 'Resources/hawaii_measurements.csv'
stations_csv = 'Resources/hawaii_stations.csv'

measurements_df = pd.read_csv(measurements_csv)
stations_df = pd.read_csv(stations_csv)

In [45]:
# Inspect the measurements data
print(len(measurements_df))
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
measurements_df.info()
print(measurements_df.head())

19550
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19550 entries, 0 to 19549
Data columns (total 4 columns):
station    19550 non-null object
date       19550 non-null object
prcp       18103 non-null float64
tobs       19550 non-null int64
dtypes: float64(1), int64(1), object(2)
memory usage: 611.0+ KB
None
       station        date  prcp  tobs
0  USC00519397  2010-01-01  0.08    65
1  USC00519397  2010-01-02  0.00    63
2  USC00519397  2010-01-03  0.00    74
3  USC00519397  2010-01-04  0.00    76
4  USC00519397  2010-01-06   NaN    73


In [None]:
# The other columns are complete, but prcp has 1,447 NaNs (19,550 rows vs. 18,103 non-null)
# Decide how to handle the missing prcp values before going further

In [46]:
# First, check whether the prcp recording rate is consistent across stations
by_station = measurements_df.groupby('station')

# count() tallies the non-null entries per column, so prcp / date is the share
# of each station's rows that actually carry a precipitation reading.
prcp_rate_df = by_station.count()
# Vectorized Series division (index-aligned) instead of a Python-level loop
prcp_rate_df['prcp_rate'] = prcp_rate_df['prcp'] / prcp_rate_df['date']

prcp_rate_df

Unnamed: 0_level_0,date,prcp,tobs,prcp_rate
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
USC00511918,1979,1932,1979,0.976251
USC00513117,2709,2696,2709,0.995201
USC00514830,2202,1937,2202,0.879655
USC00516128,2612,2484,2612,0.950995
USC00517948,1372,683,1372,0.497813
USC00518838,511,342,511,0.669276
USC00519281,2772,2772,2772,1.0
USC00519397,2724,2685,2724,0.985683
USC00519523,2669,2572,2669,0.963657


In [47]:
# Two stations have really poor prcp recording rates:
## USC00517948 (~50%) and USC00518838 (~67%)
# Dropping all the NaN rows wouldn't be ideal
## because it might skew the data from those particular stations
# Let's see whether the missing values can reasonably be treated as 0

In [49]:
# Frequency of each recorded precipitation value across all stations
measurements_df['prcp'].value_counts()

0.00     8185
0.01     1198
0.02      966
0.03      707
0.04      483
0.05      466
0.06      375
0.08      312
0.07      308
0.10      297
0.09      230
0.12      217
0.11      179
0.13      179
0.14      169
0.15      154
0.16      139
0.20      119
0.17      113
0.19      112
0.18      106
0.22      104
0.30       88
0.23       88
0.21       82
0.25       79
0.26       72
0.24       66
0.29       60
0.27       60
         ... 
1.86        1
2.35        1
4.88        1
5.96        1
3.18        1
2.36        1
2.47        1
5.71        1
3.47        1
2.37        1
8.81        1
3.23        1
11.53       1
2.79        1
3.68        1
2.81        1
2.42        1
4.95        1
2.96        1
2.49        1
5.35        1
2.63        1
2.71        1
3.38        1
3.03        1
3.99        1
3.46        1
6.83        1
3.44        1
4.68        1
Name: prcp, Length: 342, dtype: int64

In [50]:
# 8185 out of the 18103 recorded prcp values are 0.00
# That's about 45%, a significant amount
# Let's see if the same holds for the stations with low prcp recording rates

In [59]:
# Check whether 0.00 makes up the vast majority of the recorded prcp values
# at station USC00517948
is_usc00517948 = measurements_df['station'] == 'USC00517948'
measurements_df.loc[is_usc00517948, 'prcp'].value_counts()

0.00    454
0.02     45
0.05     20
0.04     19
0.01     17
0.03     16
0.08     12
0.10     10
0.06      9
0.07      6
0.26      5
0.12      5
0.14      5
0.20      5
0.09      3
0.25      3
0.17      3
0.16      3
0.28      3
0.27      2
0.22      2
0.44      2
0.13      2
1.50      2
0.23      2
0.35      2
0.50      2
0.48      1
0.86      1
0.40      1
2.40      1
0.38      1
1.48      1
1.75      1
1.71      1
0.33      1
2.80      1
0.32      1
1.60      1
0.78      1
0.15      1
1.65      1
0.30      1
0.63      1
0.72      1
0.70      1
0.47      1
0.57      1
1.09      1
0.11      1
1.80      1
Name: prcp, dtype: int64

In [60]:
# Check whether 0.00 makes up the vast majority of the recorded prcp values
# at station USC00518838
is_usc00518838 = measurements_df['station'] == 'USC00518838'
measurements_df.loc[is_usc00518838, 'prcp'].value_counts()

0.00    86
0.01    35
0.02    29
0.03    24
0.04    13
0.10    10
0.05     9
0.06     9
0.08     8
0.11     6
0.30     6
0.20     5
0.07     5
0.25     5
0.13     4
0.09     4
0.12     4
0.53     3
0.21     3
0.87     3
0.16     3
0.15     3
0.26     2
0.41     2
0.34     2
0.36     2
0.35     2
0.14     2
0.40     2
0.60     2
        ..
0.84     1
0.64     1
0.74     1
1.80     1
0.82     1
0.89     1
0.19     1
0.90     1
0.46     1
1.10     1
6.30     1
0.52     1
1.26     1
0.80     1
0.70     1
1.15     1
0.54     1
0.73     1
0.29     1
0.23     1
2.12     1
1.42     1
0.78     1
0.39     1
1.43     1
2.70     1
1.03     1
2.10     1
2.40     1
0.44     1
Name: prcp, Length: 76, dtype: int64

In [69]:
# Having checked what share of the recorded prcp values are 0.00,
## I can't really say that NaN should be changed to 0.00.
## I've decided to drop all rows where prcp is NaN just to be safe.
### We're assuming that any record without a prcp value is unreliable.

# subset=['prcp'] restricts the drop to the column the decision above is about
# (a bare dropna() would also discard rows with a NaN in any other column).
# The result is rebound to the same name instead of mutating with inplace=True,
# so later cells that read measurements_df still see the cleaned frame.
measurements_df = (
    measurements_df
    .dropna(subset=['prcp'])
    .reset_index(drop=True)
)

# Sanity check: no missing precipitation values may remain after cleaning
assert measurements_df['prcp'].notna().all()

# Export to csv
measurements_df.to_csv('Resources/clean_hawaii_measurements.csv', index=False)

In [26]:
# Inspect the stations data
# (the whole frame is displayed; the rendered table has one row per station, 9 in total)
stations_df

Unnamed: 0,station,name,latitude,longitude,elevation
0,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
3,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
4,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6
5,USC00519523,"WAIMANALO EXPERIMENTAL FARM, HI US",21.33556,-157.71139,19.5
6,USC00519281,"WAIHEE 837.5, HI US",21.45167,-157.84889,32.9
7,USC00511918,"HONOLULU OBSERVATORY 702.2, HI US",21.3152,-157.9992,0.9
8,USC00516128,"MANOA LYON ARBO 785.2, HI US",21.3331,-157.8025,152.4


In [70]:
# There are no missing values in the stations data
# We can use it as is

In [71]:
# Spot-check the cleaned measurements; preview the first rows only rather than
# rendering the whole frame
measurements_df.head(10)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.00,63
2,USC00519397,2010-01-03,0.00,74
3,USC00519397,2010-01-04,0.00,76
4,USC00519397,2010-01-07,0.06,70
5,USC00519397,2010-01-08,0.00,64
6,USC00519397,2010-01-09,0.00,68
7,USC00519397,2010-01-10,0.00,73
8,USC00519397,2010-01-11,0.01,64
9,USC00519397,2010-01-12,0.00,61
