# Darksky Cleaner

In [1]:
%matplotlib inline

import datetime
import math
import os
from glob import glob

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
from scipy import stats

import seaborn as sns
# Global plot styling for every chart produced by this notebook:
# the 'whitegrid' style with the "poster" context.
# (The default seaborn theme call is left disabled.)
# sns.set()
sns.set_style('whitegrid')
sns.set_context("poster")

In [2]:
# Widen the notebook's main container to 95% of the window so that wide
# DataFrame previews are easier to read.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [3]:
# Full weekday names, Monday first; the three-letter labels are their prefixes.
day_labels_full = ['MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY']
day_labels = [day_name[:3] for day_name in day_labels_full]

# Three-letter month labels, January first.
month_labels = 'JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC'.split()

# Zip codes of the five regions, in the order: San Francisco, San Jose,
# Mountain View, Palo Alto, Redwood City.
ordered_zipcodes = [94107, 95113, 94041, 94301, 94063]

In [4]:
def zip_to_landmark(zip_code):
    ''' Return the landmark (city) name for the given zip code.

    zip_code may be a plain int/float, a numpy scalar, or a one-element
    array/Series (such as the result of ``.unique()`` on a single-valued
    column).

    Returns the landmark name as a string, or False when the zip code is
    not one of the five known regions.
    '''
    landmarks = {
        94107: 'San Francisco',
        94063: 'Redwood City',
        94301: 'Palo Alto',
        94041: 'Mountain View',
        95113: 'San Jose',
    }

    # Unwrap numpy scalars and one-element arrays into a plain Python number so
    # the value can be used as a dictionary key.
    if hasattr(zip_code, 'item'):
        zip_code = zip_code.item()

    try:
        # Equal ints and floats hash identically, so 94107.0 matches 94107.
        return landmarks.get(zip_code, False)
    except TypeError:
        # Unhashable input (e.g. a list) can never equal a zip code.
        return False

# Load Station Data For Identification
<p>Cross-reference the cleaned station data so that a `station_id` column can be assigned to each weather record.</p>

In [5]:
# Read the cleaned station table and keep only the identification and
# location columns that are used to match weather records to stations.
stations = (
    pd.read_csv(
        '../clean_data/bayareabikeshare/station_data_cleaned.csv',
        index_col=0,
        parse_dates=['first_service_date', 'last_service_date'],
    )
    .loc[:, ['station_id', 'lat', 'long', 'zip_code', 'region']]
    .copy()
)
stations.head()

Unnamed: 0,station_id,lat,long,zip_code,region
0,2,37.329732,-121.901782,95113,San Jose
1,3,37.330698,-121.888979,95113,San Jose
2,4,37.333988,-121.894902,95113,San Jose
3,5,37.331415,-121.8932,95113,San Jose
4,6,37.336721,-121.894074,95113,San Jose


# Load DarkSky Data

In [6]:
# Load every DarkSky CSV export into a single de-duplicated DataFrame.
print('Started Loading Weather Data...')
file_path_slug = '../source_data/darksky/*.csv'
# glob() returns paths in arbitrary, OS-dependent order; sorting makes the row
# order of the combined frame reproducible between runs.
file_list = sorted(glob(file_path_slug))

num_files = len(file_list)
if num_files == 0:
    # Fail early with a clear message instead of letting pd.concat complain
    # about an empty list further down.
    raise FileNotFoundError('No DarkSky CSV files found matching %s' % file_path_slug)

# Progress is reported for the first file, about every tenth of the files,
# and the last file.
progress_step = math.ceil(num_files / 10)
chunks = []

for i, file_name in enumerate(file_list, start=1):

    chunk = pd.read_csv(file_name, index_col=0, parse_dates=['time_corrected'])
    chunks.append(chunk)

    if i == 1 or i % progress_step == 0 or i == num_files:
        print('\t[%s] finished chunk %s of %s' % (datetime.datetime.now().time(), str(i).rjust(8), str(num_files).rjust(8)))

# Concatenate once after the loop, drop exact duplicate rows, and rebuild a
# clean 0..n-1 index.
weather_df = pd.concat(chunks).drop_duplicates().reset_index(drop=True)

print('Data Loaded Successfully!')

Started Loading Weather Data...
	[17:12:39.447108] finished chunk        1 of    66114
	[17:13:04.426319] finished chunk     6612 of    66114
	[17:13:28.875487] finished chunk    13224 of    66114
	[17:13:53.599949] finished chunk    19836 of    66114
	[17:14:18.041539] finished chunk    26448 of    66114
	[17:14:41.588892] finished chunk    33060 of    66114
	[17:15:06.079027] finished chunk    39672 of    66114
	[17:15:31.634472] finished chunk    46284 of    66114
	[17:15:56.373678] finished chunk    52896 of    66114
	[17:16:20.244850] finished chunk    59508 of    66114
	[17:16:44.787574] finished chunk    66114 of    66114
Data Loaded Successfully!


In [7]:
# Preview the first rows of the combined weather data.
weather_df.head()

Unnamed: 0,apparentTemperature,cloudCover,daily_icon,daily_summary,dewPoint,hourly_icon,hourly_summary,humidity,latitude,longitude,...,precipIntensity,precipProbability,precipType,pressure,temperature,time,time_corrected,visibility,windBearing,windSpeed
0,53.93,0.31,rain,"Heavy rain starting overnight, continuing unti...",50.44,partly-cloudy-night,Partly Cloudy,0.88,37.795392,-122.394203,...,0.0059,0.59,rain,1012.43,53.93,1418716800,2014-12-16 00:00:00,9.29,131.0,10.26
1,54.72,,rain,"Heavy rain starting overnight, continuing unti...",50.83,clear-night,Clear,0.87,37.795392,-122.394203,...,0.0059,0.59,rain,1012.13,54.72,1418720400,2014-12-16 01:00:00,8.41,146.0,8.84
2,53.94,,rain,"Heavy rain starting overnight, continuing unti...",50.07,clear-night,Clear,0.87,37.795392,-122.394203,...,0.0112,0.85,rain,1012.32,53.94,1418724000,2014-12-16 02:00:00,8.11,149.0,8.02
3,53.86,0.31,rain,"Heavy rain starting overnight, continuing unti...",51.22,rain,Rain,0.91,37.795392,-122.394203,...,0.1668,0.85,rain,1012.71,53.86,1418727600,2014-12-16 03:00:00,8.29,149.0,7.16
4,53.78,0.75,rain,"Heavy rain starting overnight, continuing unti...",51.19,rain,Rain,0.91,37.795392,-122.394203,...,0.1323,0.85,rain,1012.64,53.78,1418731200,2014-12-16 04:00:00,9.47,129.0,9.66


In [8]:
# Show column dtypes and non-null counts of the combined weather data.
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586743 entries, 0 to 1586742
Data columns (total 21 columns):
apparentTemperature    1586743 non-null float64
cloudCover             1363580 non-null float64
daily_icon             1586743 non-null object
daily_summary          1586743 non-null object
dewPoint               1586743 non-null float64
hourly_icon            1586743 non-null object
hourly_summary         1586743 non-null object
humidity               1586743 non-null float64
latitude               1586743 non-null float64
longitude              1586743 non-null float64
offset                 1586743 non-null int64
precipIntensity        1586743 non-null float64
precipProbability      1586743 non-null float64
precipType             88330 non-null object
pressure               1586743 non-null float64
temperature            1586743 non-null float64
time                   1586743 non-null int64
time_corrected         1586743 non-null datetime64[ns]
visibility             1586

In [9]:
# Left-join the station table onto the weather records by exact coordinate
# match (weather latitude/longitude against station lat/long); every weather
# row is kept.
weather = pd.merge(
    weather_df,
    stations,
    how='left',
    left_on=['latitude', 'longitude'],
    right_on=['lat', 'long'],
)

## Encode 'precipType'
<ul>
    <li>rain (or any other non-null precipitation type) -> 1</li>
    <li>nan -> 0</li>
</ul>

In [10]:
# Encode precipType as a binary flag: 0 where no precipitation type was
# recorded (NaN), 1 for any recorded type.
# An explicit column check replaces the former bare `except: pass`, which
# would also have hidden genuine errors. Testing "non-null and not 0" (rather
# than just "non-null") keeps the cell safe to re-run on a column that has
# already been encoded.
if 'precipType' in weather.columns:
    precip = weather['precipType']
    weather['precipType'] = (precip.notnull() & (precip != 0)).astype('int64')

In [11]:
# Summary statistics of the numeric columns after the merge and encoding.
weather.describe()

Unnamed: 0,apparentTemperature,cloudCover,dewPoint,humidity,latitude,longitude,offset,precipIntensity,precipProbability,precipType,pressure,temperature,time,visibility,windBearing,windSpeed,station_id,lat,long,zip_code
count,1586743.0,1363580.0,1586743.0,1586743.0,1586743.0,1586743.0,1586743.0,1586743.0,1586743.0,1586743.0,1586743.0,1586743.0,1586743.0,1586743.0,1586186.0,1586314.0,1586743.0,1586743.0,1586743.0,1586743.0
mean,59.26332,0.1512079,49.48934,0.7186864,37.62201,-122.2428,-7.332466,0.001299324,0.02583651,0.05566749,1016.222,59.36134,1424951000.0,9.153926,241.1017,5.830687,45.53355,37.62201,-122.2428,94315.03
std,7.598377,0.1818528,6.731419,0.1550477,0.2023085,0.2081051,0.4710971,0.01098214,0.132941,0.2292786,4.596603,7.391343,27181390.0,1.152798,85.41428,3.604809,23.59421,0.2023085,0.2081051,408.7736
min,25.89,0.0,7.27,0.12,37.32973,-122.419,-8.0,0.0,0.0,0.0,993.61,29.78,1377760000.0,0.23,0.0,0.0,2.0,37.32973,-122.419,94041.0
25%,54.62,0.0,46.31,0.62,37.39436,-122.4027,-8.0,0.0,0.0,0.0,1013.06,54.62,1401635000.0,8.89,205.0,3.03,28.0,37.39436,-122.4027,94107.0
50%,58.82,0.12,50.42,0.75,37.77865,-122.3942,-7.0,0.0,0.0,0.0,1015.75,58.82,1424938000.0,9.53,263.0,5.26,49.0,37.77865,-122.3942,94107.0
75%,63.4,0.24,54.35,0.84,37.78963,-122.0819,-7.0,0.0,0.0,0.0,1019.19,63.39,1447772000.0,9.88,303.0,7.95,66.0,37.78963,-122.0819,94301.0
max,96.61,1.0,65.6,1.0,37.80477,-121.8773,-7.0,0.6281,1.0,1.0,1031.96,96.61,1472710000.0,10.0,359.0,40.71,91.0,37.80477,-121.8773,95113.0


In [12]:
# Drop the weather-side coordinates; for matched rows the merged station
# columns `lat`/`long` hold the same values. errors='ignore' makes the cell
# safe to re-run after the columns have already been removed.
weather = weather.drop(['latitude', 'longitude'], axis=1, errors='ignore')

# Replace every remaining NaN with 0.
# NOTE(review): this zero-fills missing cloudCover / windBearing / windSpeed
# readings and would also hide station columns left empty by an unmatched
# merge row -- confirm that 0 is the intended placeholder.
weather = weather.fillna(0)
weather.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1586743 entries, 0 to 1586742
Data columns (total 24 columns):
apparentTemperature    1586743 non-null float64
cloudCover             1586743 non-null float64
daily_icon             1586743 non-null object
daily_summary          1586743 non-null object
dewPoint               1586743 non-null float64
hourly_icon            1586743 non-null object
hourly_summary         1586743 non-null object
humidity               1586743 non-null float64
offset                 1586743 non-null int64
precipIntensity        1586743 non-null float64
precipProbability      1586743 non-null float64
precipType             1586743 non-null int64
pressure               1586743 non-null float64
temperature            1586743 non-null float64
time                   1586743 non-null int64
time_corrected         1586743 non-null datetime64[ns]
visibility             1586743 non-null float64
windBearing            1586743 non-null float64
windSpeed              158

# Write to File by Region and by Station

In [13]:
# Write one cleaned CSV per region (zip code) and, inside a per-region folder,
# one cleaned CSV per station.
output_root = '../clean_data/darksky'

for z in weather.zip_code.unique():
    region = zip_to_landmark(z)
    if not region:
        # zip_to_landmark returns False for zip codes it does not know; there
        # is no region name to build a file name from, so skip those rows
        # instead of crashing on `False.lower()`.
        print('Skipping unrecognised zip code %s' % z)
        continue

    # Lower-case, space-free region name used in file and folder names.
    region_slug = region.lower().replace(' ', '')

    # to_csv does not create missing directories, so make sure the region
    # folder (and the output root above it) exists.
    region_dir = '%s/%s' % (output_root, region_slug)
    os.makedirs(region_dir, exist_ok=True)

    # get dataframe on just weather in region
    region_weather = weather[weather.zip_code == z].reset_index(drop=True)
    region_weather.to_csv('%s/%s_darksky_cleaned.csv' % (output_root, region_slug))

    for sid in region_weather.station_id.unique():

        station_weather = region_weather[region_weather.station_id == sid].reset_index(drop=True)
        station_slug = str(sid).lower().replace(' ', '')
        station_weather.to_csv('%s/station_%s_darksky_cleaned.csv' % (region_dir, station_slug))
    print('Finished %s' % region)

Finished San Francisco
Finished Redwood City
Finished Mountain View
Finished San Jose
Finished Palo Alto


# Regional Temperatures

In [24]:
# For each region, chart the apparent temperature over time (7-day rolling
# max/min and 30-day rolling median/mean of the hourly regional average) and
# save the figure as a PNG.
chart_dir = '../charts/darksky'
os.makedirs(chart_dir, exist_ok=True)  # savefig does not create missing directories

for zip_code in sorted(weather.zip_code.unique()):

    # Look the region name up from the scalar loop value, rather than from an
    # array of averaged zip codes taken after the aggregation.
    region = zip_to_landmark(zip_code)
    if not region:
        # Unknown zip codes have no region name to title or name the chart with.
        print('Skipping unrecognised zip code %s' % zip_code)
        continue

    # Average the readings of all stations in the region per timestamp, then
    # put them on a regular hourly grid. Only the plotted column is
    # aggregated: averaging the whole frame would include the text columns,
    # which newer pandas versions refuse to average.
    temps = (
        weather.loc[weather.zip_code == zip_code]
        .groupby('time_corrected')['apparentTemperature']
        .mean()
        .resample('1H')
        .mean()
    )

    ax = temps.rolling('7D').max().plot(linestyle='', marker='.', alpha=0.5, color='r', figsize=(24,6), label='Max')
    temps.rolling('7D').min().plot(linestyle='', marker='.', alpha=0.5, color='b', ax=ax, label='Min')
    temps.rolling('30D').median().plot(color='c', ax=ax, label='Median')
    temps.rolling('30D').mean().plot(color='g', ax=ax, label='Mean')

    title = '%s Weather' % (region)
    ax.set_title(title)
    ax.set_ylabel('Temperature (F)')
    ax.set_xlabel('')

    # Round the y-limits outward to the nearest multiple of 10 and put a tick
    # on every multiple of 10 in between.
    y_max = math.ceil(temps.max()/10)*10
    y_min = math.floor(temps.min()/10)*10
    ax.set_ylim([y_min, y_max])
    ax.set_yticks(list(range(y_min, y_max + 10, 10)))

    ax.legend(loc=4, frameon=True)
    plt.savefig('%s/%s_Weather.png' % (chart_dir, region))
    # Close the figure so the next region starts on a fresh one.
    plt.close('all')