# Data Extraction, Cleaning and Compilation

Performed by Jonthan Owens

- The following is performed in three parts:
    1. Extraction of weather information from weather_daily.csv and assessment of the data
        
        Source: Applied Climate Information System (ACIS), NOAA Regional Climate Centers (RCCs)
        
    2. Cleaning of data

    3. Compilation into a clean data file, weather_daily_clean.csv.


In [1]:
# Import necessary libraries
from pathlib import Path
import csv
import pandas as pd


## 1. Data Extraction of Weather Information and Assessment


In [2]:
# Capture data path
csv_path = Path('./raw_data/weather_daily.csv')

# Read in the data; parse_dates=True asks pandas to try parsing the index as dates.
# The infer_datetime_format argument is intentionally omitted: it is deprecated
# since pandas 2.0, where it has no effect and only triggers a warning.
weather_data = pd.read_csv(csv_path, parse_dates=True)
weather_data.head()


Unnamed: 0,Unnamed: 1,OMAHA EPPLEY AIRFIELD
2000-10-26,T,62.0
2000-10-27,0.00,54.5
2000-10-28,T,56.5
2000-10-29,0.68,56.5
2000-10-30,0.00,60.0


In [3]:
# Move the unnamed index levels into regular columns (reset_index exposes them
# as 'level_0' and 'level_1'), then give every column a descriptive name
weather_data = weather_data.reset_index().rename(
    columns={
        'level_0': 'date',
        'level_1': 'precipitation',
        'OMAHA EPPLEY AIRFIELD': 'average_temperature',
    }
)
weather_data.head()


Unnamed: 0,date,precipitation,average_temperature
0,2000-10-26,T,62.0
1,2000-10-27,0.00,54.5
2,2000-10-28,T,56.5
3,2000-10-29,0.68,56.5
4,2000-10-30,0.00,60.0


In [4]:
# Check the data type pandas assigned to each column; an 'object' dtype on a
# column expected to be numeric signals non-numeric placeholder values
weather_data.dtypes


date                   datetime64[ns]
precipitation                  object
average_temperature            object
dtype: object

In [5]:
# Tally the null (NaN/NaT) entries found in each column
weather_data.isna().sum()


date                   0
precipitation          0
average_temperature    0
dtype: int64

In [6]:
# Count the non-null entries per column; differing counts would reveal
# columns that are missing data relative to the others
weather_data.count()


date                   7308
precipitation          7308
average_temperature    7308
dtype: int64

In [7]:
# Per the source documentation, missing data is recorded as 'M';
# count the rows whose average temperature carries that placeholder
weather_data[weather_data['average_temperature'].eq('M')].count()


date                   2
precipitation          2
average_temperature    2
dtype: int64

In [8]:
# Per the source documentation, trace amounts of precipitation are recorded
# as 'T'; count the rows whose precipitation carries that placeholder
weather_data[weather_data['precipitation'].eq('T')].count()


date                   1099
precipitation          1099
average_temperature    1099
dtype: int64

## 2. Cleaning of Data


In [9]:
# Collect the index labels of rows whose average temperature is 'M' (missing)
# and build the cleaned frame without them; weather_data itself is left intact
index_rows = weather_data.loc[weather_data['average_temperature'].eq('M')].index
clean_weather_data = weather_data.drop(index=index_rows)


In [10]:
# Confirm no rows with an 'M' average temperature remain in the cleaned frame
clean_weather_data[clean_weather_data['average_temperature'].eq('M')].count()


date                   0
precipitation          0
average_temperature    0
dtype: int64

In [11]:
# Substitute the 'T' (trace) marker with 0.0001 so trace precipitation stays
# distinguishable from 0 (no precipitation) once the column becomes numeric
clean_weather_data['precipitation'] = clean_weather_data['precipitation'].replace(
    {'T': 0.0001}
)
clean_weather_data.head()

Unnamed: 0,date,precipitation,average_temperature
0,2000-10-26,0.0001,62.0
1,2000-10-27,0.0,54.5
2,2000-10-28,0.0001,56.5
3,2000-10-29,0.68,56.5
4,2000-10-30,0.0,60.0


In [12]:
# Confirm no rows still hold the 'T' marker in the precipitation column
clean_weather_data[clean_weather_data['precipitation'].eq('T')].count()


date                   0
precipitation          0
average_temperature    0
dtype: int64

In [13]:
# Count the rows now holding the 0.0001 trace substitute; this should equal
# the number of 'T' rows counted before the replacement
clean_weather_data[clean_weather_data['precipitation'].eq(0.0001)].count()


date                   1099
precipitation          1099
average_temperature    1099
dtype: int64

In [14]:
# Confirm which columns still have the 'object' dtype and therefore need
# to be converted to a numeric type
clean_weather_data.dtypes

date                   datetime64[ns]
precipitation                  object
average_temperature            object
dtype: object

In [15]:
# Cast both measurement columns from object to float in a single astype call;
# columns not listed in the mapping keep their current dtype
clean_weather_data = clean_weather_data.astype(
    {
        'precipitation': float,
        'average_temperature': float,
    }
)


In [16]:
# Verify the type changes: precipitation and average_temperature should
# no longer be reported as 'object'
clean_weather_data.dtypes


date                   datetime64[ns]
precipitation                 float64
average_temperature           float64
dtype: object

In [17]:
# Promote the date column to the index and preview the cleaned dataframe
clean_weather_data = clean_weather_data.set_index('date')
clean_weather_data.head()


Unnamed: 0_level_0,precipitation,average_temperature
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2000-10-26,0.0001,62.0
2000-10-27,0.0,54.5
2000-10-28,0.0001,56.5
2000-10-29,0.68,56.5
2000-10-30,0.0,60.0


## 3. Compilation Into A Clean Data File


In [18]:
# Create output path for the cleaned data file (relative to the working directory)
csv_output_path = Path('./clean_data/weather_daily_clean.csv')


In [19]:
# Write data to csv. The output directory is created first because
# DataFrame.to_csv does not create missing parent directories and would
# otherwise raise an OSError on a fresh checkout without ./clean_data.
csv_output_path.parent.mkdir(parents=True, exist_ok=True)
clean_weather_data.to_csv(csv_output_path)
