In [1]:
                    #MODULE IMPORT
import pandas as pd
import numpy as np

In [2]:
                    #CONVERT CSV FILES INTO DATAFRAMES
# Load both source CSV files into dataframes.
# logDF: descriptive data for each station (station number, name,
# latitude, longitude, elevation).
logDF = pd.read_csv("hawaii_stations.csv")

# metDF: the meteorological readings collected by the stations (station
# number, sampling date, precipitation, observed temperature).
metDF = pd.read_csv("hawaii_measurements.csv")

# Preview the first ten rows of metDF
metDF.head(n=10)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73
5,USC00519397,2010-01-07,0.06,70
6,USC00519397,2010-01-08,0.0,64
7,USC00519397,2010-01-09,0.0,68
8,USC00519397,2010-01-10,0.0,73
9,USC00519397,2010-01-11,0.01,64


In [3]:
# Display the whole logDF dataframe; as the last expression of the cell it
# is rendered as the cell's output.
logDF

Unnamed: 0,station,name,latitude,longitude,elevation
0,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
3,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
4,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6
5,USC00519523,"WAIMANALO EXPERIMENTAL FARM, HI US",21.33556,-157.71139,19.5
6,USC00519281,"WAIHEE 837.5, HI US",21.45167,-157.84889,32.9
7,USC00511918,"HONOLULU OBSERVATORY 702.2, HI US",21.3152,-157.9992,0.9
8,USC00516128,"MANOA LYON ARBO 785.2, HI US",21.3331,-157.8025,152.4


In [5]:
                    #CLEANING UP DATAFRAMES
# The preview of metDF already shows NaN values in the precipitation
# (prcp) column.
# The much simpler logDF dataframe, which holds the descriptive data of
# the nine Hawaii stations, doesn't require any cleaning.
# For metDF, tallying the non-missing entries of every column reveals
# which columns have gaps.

metDF.notna().sum()

station    19550
date       19550
prcp       18103
tobs       19550
dtype: int64

In [6]:
# The count above shows that the prcp column of the metDF dataframe does
# contain NaN values, whilst the tobs column has the expected number of
# valid rows.
# One way to clean metDF would be to drop every row that contains a NaN.
# However, since the precipitation on one day is not likely to be too
# different from that of the neighbouring days, the gaps are filled with
# the function "interpolate" instead.
# For instance, applying interpolate to the series [0,1,NaN,3] would give
# [0,1,2,3].
#
# The interpolation is applied to the prcp column only, one station at a
# time: rows of different stations share the same frame, so interpolating
# the whole frame at once could fill a gap using a neighbouring row that
# belongs to another station.
# limit_direction="both" also fills gaps at the very beginning of a
# station's record, so that no NaN is left behind.

cleanMetDF = metDF.copy()
cleanMetDF["prcp"] = metDF.groupby("station")["prcp"].transform(
    lambda station_prcp: station_prcp.interpolate(limit_direction="both")
)
cleanMetDF.head(10)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,0.03,73
5,USC00519397,2010-01-07,0.06,70
6,USC00519397,2010-01-08,0.0,64
7,USC00519397,2010-01-09,0.0,68
8,USC00519397,2010-01-10,0.0,73
9,USC00519397,2010-01-11,0.01,64


In [7]:
# Sanity check on the cleaned dataframe: the tally of non-missing entries
# should now be the same for every column.

cleanMetDF.notna().sum()

station    19550
date       19550
prcp       19550
tobs       19550
dtype: int64

In [8]:
# Save the cleaned measurements to a new CSV file; the homework
# instructions ask for it to be called "clean_measurements".
# index=True (the pandas default) writes the row index as an unnamed
# first column of the file.
cleanMetDF.to_csv(path_or_buf="clean_measurements.csv", index=True)

In [9]:
# logDF needed no cleaning, but the homework instructions still suggest
# saving it to a new CSV file as well.
# index=True (the pandas default) writes the row index as an unnamed
# first column of the file.
logDF.to_csv(path_or_buf="clean_stations.csv", index=True)