# Data Engineering
## Inspect and clean the data

In [1]:
# Dependencies
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Load the raw measurements CSV into a dataframe and preview its first rows
measurements = "raw_data/hawaii_measurements.csv"
measurements_df = pd.read_csv(filepath_or_buffer=measurements)
measurements_df.head(n=5)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [3]:
# Total number of missing (NaN) cells across the whole measurements dataframe
measurements_df.isna().to_numpy().sum()

1447

In [4]:
# Count the non-null values in each column; a column whose count is lower
# than the others is the one holding the missing entries
measurements_df.notna().sum()

station    19550
date       19550
prcp       18103
tobs       19550
dtype: int64

In [5]:
# Fill the missing precipitation readings with 0.
# Only `prcp` contains NaNs (see the per-column counts above), so the fill is
# scoped to that column: a blanket `fillna(0)` would also write an integer 0
# into `station`/`date` if they ever had gaps, hiding bad rows instead of
# letting the check below report them.
# NOTE(review): this treats "no reading" as "no rain", which lowers any
# precipitation average computed later -- confirm this is intended.
measurements_df = measurements_df.fillna({"prcp": 0})
print(f"NaNs Remaining: {measurements_df.isnull().values.sum()}")

NaNs Remaining: 0


In [6]:
# Check the data type of each column in the measurements dataframe
# (per the output below, `date` is held as a generic object column, not a datetime)
measurements_df.dtypes

station     object
date        object
prcp       float64
tobs         int64
dtype: object

In [7]:
# Preview the first rows of the cleaned measurements dataframe
measurements_df.head(n=5)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,0.0,73


In [8]:
# Export the cleaned measurements to CSV for the analysis step.
# Create the target folder first: `to_csv` raises OSError when the parent
# directory does not exist (e.g. on a fresh checkout), which would stop a
# Restart & Run All at this cell.
clean_measurements_path = Path("ResourcesForAnalysis/clean_hawaii_measurements.csv")
clean_measurements_path.parent.mkdir(parents=True, exist_ok=True)
# index=False keeps the positional row index out of the exported file
measurements_df.to_csv(clean_measurements_path, index=False)

In [9]:
# Load the raw station metadata CSV into a dataframe and preview its first rows
stations = "raw_data/hawaii_stations.csv"
stations_df = pd.read_csv(filepath_or_buffer=stations)
stations_df.head(n=5)

Unnamed: 0,station,name,latitude,longitude,elevation
0,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
3,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
4,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6


In [10]:
# Total number of missing (NaN) cells across the whole stations dataframe
stations_df.isna().to_numpy().sum()

0

In [11]:
# Export the stations dataframe to CSV for the analysis step. It had no
# missing values, so it is written out as loaded.
# Create the target folder first: `to_csv` raises OSError when the parent
# directory does not exist (e.g. on a fresh checkout), which would stop a
# Restart & Run All at this cell.
clean_stations_path = Path("ResourcesForAnalysis/clean_hawaii_stations.csv")
clean_stations_path.parent.mkdir(parents=True, exist_ok=True)
# index=False keeps the positional row index out of the exported file
stations_df.to_csv(clean_stations_path, index=False)