### Imports

In [2]:
# Import Matplotlib and select the `nbagg` notebook backend
import matplotlib
matplotlib.use('nbagg')
from matplotlib import style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and the bare 'seaborn' name was later removed. Prefer the new name when this
# installation provides it and fall back to the old one on older releases.
style.use('seaborn-v0_8' if 'seaborn-v0_8' in style.available else 'seaborn')
import matplotlib.pyplot as plt
import pandas as pd

### Measurements Data Cleanup

In [3]:
# Load the raw measurements CSV into a DataFrame and preview its first rows
measurements_csv = "Resources/hawaii_measurements.csv"
measurements = pd.read_csv(measurements_csv)
measurements.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [5]:
# Inspect the dtype pandas inferred for each column
measurements.dtypes

station     object
date        object
prcp       float64
tobs         int64
dtype: object

In [6]:
# Count the non-null entries in each column; a column with a lower count than
# the others contains missing values
measurements.count()
#measurements.value_counts()

station    19550
date       19550
prcp       18103
tobs       19550
dtype: int64

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
measurements.describe()

Unnamed: 0,prcp,tobs
count,18103.0,19550.0
mean,0.160644,73.097954
std,0.468746,4.523527
min,0.0,53.0
25%,0.0,70.0
50%,0.01,73.0
75%,0.11,76.0
max,11.53,87.0


In [24]:
# List the column labels of the measurements DataFrame
measurements.columns

Index(['station', 'date', 'prcp', 'tobs'], dtype='object')

In [9]:
# Check the shape (rows, columns) of the measurements DataFrame
measurements.shape

(19550, 4)

In [10]:
# Flag every row that exactly repeats an earlier row, then total the flags
measurement_dupes = measurements.duplicated()
measurement_dupes.sum()

0

In [11]:
# Tally the missing values in each column
## for a single grand total instead, use: df.isna().values.sum()
measurements.isna().sum()

station       0
date          0
prcp       1447
tobs          0
dtype: int64

In [12]:
# Drop every row that contains at least one null value
hi_measurements = measurements.dropna(axis=0, how="any")

# Alternative (not used): keep those rows and replace the NaN values with 0.00:
#hi_measurements = measurements.fillna(0.00)

In [13]:
# Recheck the shape of the DataFrame after dropping the rows with null values
hi_measurements.shape

(18103, 4)

In [26]:
# Save the null-free measurements to a new .csv file.
# index=False keeps the DataFrame's row index out of the file: after dropna()
# that index is just the leftover (gappy) row numbers, and writing it would add
# an unnamed column that comes back as "Unnamed: 0" when the file is re-read.
hi_measurements.to_csv("Resources/cleaned_hi_measurements.csv", index=False)

### Stations Data Cleanup

In [4]:
# Load the stations CSV into a DataFrame and preview its first rows
stations_csv = "Resources/hawaii_stations.csv"
stations = pd.read_csv(stations_csv)
stations.head()

Unnamed: 0,station,name,latitude,longitude,elevation
0,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
3,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
4,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6


In [17]:
# Inspect the dtype pandas inferred for each column
stations.dtypes

station       object
name          object
latitude     float64
longitude    float64
elevation    float64
dtype: object

In [18]:
# Count the non-null entries in each column
stations.count()

station      9
name         9
latitude     9
longitude    9
elevation    9
dtype: int64

In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
stations.describe()

Unnamed: 0,latitude,longitude,elevation
count,9.0,9.0,9.0
mean,21.393826,-157.867098,60.977778
std,0.086442,0.103873,103.465547
min,21.2716,-158.0111,0.9
25%,21.3331,-157.9751,7.0
50%,21.3934,-157.8374,14.6
75%,21.45167,-157.8025,32.9
max,21.5213,-157.71139,306.6


In [25]:
# List the column labels of the stations DataFrame
stations.columns

Index(['station', 'name', 'latitude', 'longitude', 'elevation'], dtype='object')

In [20]:
# Check the shape (rows, columns) of the stations DataFrame
stations.shape

(9, 5)

In [22]:
# Flag every row that exactly repeats an earlier row, then total the flags
station_dupes = stations.duplicated()
station_dupes.sum()

0

In [23]:
# Tally the missing values in each column
## for a single grand total instead, use: df.isna().values.sum()
stations.isna().sum()

station      0
name         0
latitude     0
longitude    0
elevation    0
dtype: int64

In [None]:
# No cleaned CSV is written for stations: the checks above found no duplicate
# rows and no null values, so the data is unchanged from the original file.
# If an export is ever needed, this line would write it:
#stations.to_csv("Resources/clean_hi_stations.csv", index=False)