# Darksky Cleaner

In [1]:
%matplotlib inline

import matplotlib
import numpy as np
from scipy import stats
import scipy
import math
import matplotlib.pyplot as plt
import pandas as pd
from glob import glob
import datetime

import seaborn as sns
# Plot styling: white background with grid lines, and seaborn's "poster"
# context preset for the scaling of plot elements.
sns.set_style('whitegrid')
sns.set_context("poster")

In [2]:
# Widen the notebook's main container to 95% of the browser window by
# injecting a CSS rule into the page.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [3]:
# Zip codes of the covered regions, kept in a fixed order.
# City names are those returned by zip_to_landmark.
ordered_zipcodes = [
    94107,  # San Francisco
    95113,  # San Jose
    94041,  # Mountain View
    94301,  # Palo Alto
    94063,  # Redwood City
]

# Three-letter month abbreviations, January first.
month_labels = [
    'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
    'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC',
]

# Weekday names, Monday first: full names and three-letter abbreviations.
day_labels_full = [
    'MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY',
    'FRIDAY', 'SATURDAY', 'SUNDAY',
]
day_labels = [
    'MON', 'TUE', 'WED', 'THU',
    'FRI', 'SAT', 'SUN',
]

In [4]:
def zip_to_landmark(zip_code):
    '''Return the landmark (city) name for a given zip code.

    Recognised zip codes are 94107, 94063, 94301, 94041 and 95113.
    Any other value returns False instead of raising.
    '''
    known_landmarks = (
        (94107, 'San Francisco'),
        (94063, 'Redwood City'),
        (94301, 'Palo Alto'),
        (94041, 'Mountain View'),
        (95113, 'San Jose'),
    )
    # Deliberately an == scan rather than a dict lookup: callers may pass
    # values that are not hashable (e.g. a one-element NumPy array), which
    # an equality test accepts but a dict key lookup would reject.
    for code, landmark in known_landmarks:
        if zip_code == code:
            return landmark
    return False

# Load Station Data For Identification
<p>Cross-reference the cleaned station data so that a `station_id` column can be assigned to each weather record.</p>

In [None]:
# Read the cleaned station table (first CSV column is the index, service
# dates parsed as datetimes), then keep only the identifier, coordinate and
# location columns.
station_columns = ['station_id', 'lat', 'long', 'zip_code', 'region']
stations = pd.read_csv(
    '../clean_data/bayareabikeshare/station_data_cleaned.csv',
    index_col=0,
    parse_dates=['first_service_date', 'last_service_date'],
)
stations = stations.loc[:, station_columns].copy()
stations.head()

Unnamed: 0,station_id,lat,long,zip_code,region
0,2,37.329732,-121.901782,95113,San Jose
1,3,37.330698,-121.888979,95113,San Jose
2,4,37.333988,-121.894902,95113,San Jose
3,5,37.331415,-121.8932,95113,San Jose
4,6,37.336721,-121.894074,95113,San Jose


# Load DarkSky Data

In [None]:
# Load every DarkSky CSV, attach station metadata by coordinate match, and
# combine everything into a single de-duplicated frame, `weather_df`.
print('Started Loading Weather Data...')
file_path_slug = '../source_data/darksky/*.csv'

# glob() returns paths in arbitrary order; sorting makes the row order of the
# combined frame reproducible between runs.
file_list = sorted(glob(file_path_slug))
num_files = len(file_list)
if num_files == 0:
    # Fail early with a clear message rather than letting pd.concat complain
    # about having nothing to concatenate.
    raise FileNotFoundError('No DarkSky files match %s' % file_path_slug)

# Loop invariants, computed once.
precision = 6                                # decimals kept on join coordinates
report_every = math.ceil(num_files / 10)     # progress message roughly every 10%

# The merge below joins on floating-point coordinates, so both sides must be
# rounded identically or rows would silently fail to match.
station_keys = stations.round({'lat': precision, 'long': precision})

chunks = []
for i, path in enumerate(file_list, start=1):

    chunk = pd.read_csv(path, index_col=0, parse_dates=['time_corrected'])
    chunk = chunk.round({'latitude': precision, 'longitude': precision})

    # Inner join: weather rows whose coordinates match no station are dropped.
    chunk = chunk.merge(station_keys, left_on=['latitude', 'longitude'], right_on=['lat', 'long'])

    # Missing values in every column are replaced with 0.
    chunk = chunk.fillna(0)

    chunks.append(chunk)

    if i == 1 or i % report_every == 0 or i == num_files:
        print('\t[%s] finished chunk %s of %s' % (datetime.datetime.now().time(), str(i).rjust(8), str(num_files).rjust(8)))

# Concatenate once at the end, drop exact duplicate rows and renumber the index.
weather_df = pd.concat(chunks).drop_duplicates().reset_index(drop=True)

print('Data Loaded Successfully!')

Started Loading Weather Data...
	[15:42:44.097470] finished chunk        1 of    66114


In [None]:
# Work on an independent copy so the cells below, which modify `weather`,
# leave the loaded `weather_df` unchanged.
weather = weather_df.copy()

In [None]:
# Preview the first rows of the merged weather/station frame.
weather.head()

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
weather.info()

## Encode 'precipType'
<ul>
    <li>any recorded precipitation type (e.g. rain) -> 1</li>
    <li>missing (nan) -> 0</li>
</ul>

In [None]:
# Encode precipType as a 0/1 flag: missing values and 0 become 0, any other
# value (e.g. 'rain') becomes 1.
#
# The result is assigned back through weather[...] instead of calling
# fillna(inplace=True) on the attribute-accessed column: under pandas'
# copy-on-write behaviour the chained in-place call only modifies a temporary,
# after which NaN (which is != 0) would be encoded as 1.
# The vectorized comparison also avoids a Python-level lambda call per row.
weather['precipType'] = weather['precipType'].fillna(0).ne(0).astype('int64')

In [None]:
# Descriptive statistics of the frame's columns.
weather.describe()

# Write to File by Region

In [None]:
# Write one cleaned CSV per region (zip code) found in the weather data.
for z in weather.zip_code.unique():
    region = zip_to_landmark(z)
    if not region:
        # zip_to_landmark returns False for a zip code it does not know.
        # Stop with a clear message instead of failing on False.lower().
        raise ValueError('No landmark known for zip code %r' % (z,))

    # File name is the landmark lower-cased with spaces removed,
    # e.g. 'San Francisco' -> 'sanfrancisco_darksky_cleaned.csv'.
    file_name = '%s_darksky_cleaned.csv' % region.lower().replace(' ', '')

    # Rows for this region only, renumbered from 0 before writing.
    df = weather[weather.zip_code == z].reset_index(drop=True)
    df.to_csv('../clean_data/darksky/' + file_name)
    

In [None]:
# Station Temperatures

In [None]:
# Plot the 1-day rolling mean of apparentTemperature for every station,
# one figure per station.
for sid in sorted(weather.station_id.unique()):

    # Time-indexed, chronologically sorted rows for this station; the
    # time-based rolling window ('1D') needs a sorted datetime index.
    df = (
        weather[weather.station_id == sid]
        .sort_values('time_corrected')
        .set_index('time_corrected')
    )

    # Pass a scalar zip code to zip_to_landmark rather than the array returned
    # by unique(). Assumes every row of a station carries the same zip code.
    landmark = zip_to_landmark(df.zip_code.iloc[0])

    fig, ax = plt.subplots(figsize=(24, 6))

    # Select the column BEFORE rolling: averaging the whole frame would also
    # try to aggregate non-numeric columns (which current pandas rejects) and
    # computes means that are never used.
    df['apparentTemperature'].rolling('1D').mean().plot(ax=ax)

    ax.set_title('%s - %s' % (landmark, sid))
    ax.set_ylabel('apparentTemperature (1-day rolling mean)')
    plt.show()

    # Release the figure so memory does not grow with the number of stations.
    plt.close(fig)
    