# Darksky Cleaner

In [4]:
%matplotlib inline
import datetime
import os

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Global seaborn look for every chart in this notebook: the 'whitegrid' style
# combined with the "poster" context.
sns.set_style('whitegrid')
sns.set_context("poster")

In [5]:
from wranglingutils import time_marker as tm
from wranglingutils import csv_chunk_importer
from wranglingutils import zip_to_landmark

In [6]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated and fails on newer
# IPython releases.
from IPython.display import display, HTML

# Inject a CSS rule so the notebook container uses 95% of the browser width.
display(HTML("<style>.container { width:95% !important; }</style>"))

# Default matplotlib font size; individual charts override title/label sizes
# with the constants below.
font = {'size': 50}
matplotlib.rc('font', **font)

# Shared sizing constants for charts.
LABEL_FONT_SIZE = 15
TITLE_FONT_SIZE = 25
TICK_FONT_SIZE = LABEL_FONT_SIZE*0.8  # 80% of the axis-label size (a float)
FIG_SIZE = (15,6)                     # (width, height) passed to plt.subplots

In [7]:
# Weekday names, Monday first. The short labels are the first three letters
# of the full names.
day_labels_full = [
    'MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY',
    'FRIDAY', 'SATURDAY', 'SUNDAY',
]
day_labels = [name[:3] for name in day_labels_full]

# Three-letter month abbreviations, January first.
month_labels = 'JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC'.split()

# Zip codes in a fixed order.
# NOTE(review): presumably one zip code per service region, in display order — confirm.
ordered_zipcodes = [94107, 95113, 94041, 94301, 94063]

# Load Station Data For Identification
<p>Cross-reference the cleaned station data to assign a `station_id` column to each weather record.</p>

In [8]:
# Read the cleaned station table and keep only the identifying and location
# columns that are joined onto the weather records below.
station_columns = ['station_id', 'lat', 'long', 'zip_code', 'region']
stations = pd.read_csv(
    '../../clean_data/bayareabikeshare/station_data_cleaned.csv',
    index_col=0,
    parse_dates=['first_service_date', 'last_service_date'],
)
stations = stations[station_columns].copy()
stations.head()

Unnamed: 0,station_id,lat,long,zip_code,region
0,2,37.329732,-121.901782,95113,San Jose
1,3,37.330698,-121.888979,95113,San Jose
2,4,37.333988,-121.894902,95113,San Jose
3,5,37.331415,-121.8932,95113,San Jose
4,6,37.336721,-121.894074,95113,San Jose


# Load DarkSky Data

In [9]:
# Load the DarkSky CSVs matching the glob through the project helper.
# NOTE(review): presumably concatenates all files and drops duplicate rows
# when drop_dups=True — confirm against wranglingutils.csv_chunk_importer.
weather_df = csv_chunk_importer(
    drop_dups=True,
    file_path_slug='../../source_data/darksky/*.csv',
)

[17:20:10.373045] Started Loading Station Data...
[17:20:10.725880] Finished file! (1 of 66114)
[17:20:13.830030] Finished file! (1001 of 66114)
[17:20:16.665645] Finished file! (2001 of 66114)
[17:20:19.528711] Finished file! (3001 of 66114)
[17:20:22.445508] Finished file! (4001 of 66114)
[17:20:25.369432] Finished file! (5001 of 66114)
[17:20:28.198432] Finished file! (6001 of 66114)
[17:20:31.614249] Finished file! (7001 of 66114)
[17:20:34.688874] Finished file! (8001 of 66114)
[17:20:37.601852] Finished file! (9001 of 66114)
[17:20:40.478447] Finished file! (10001 of 66114)
[17:20:44.085477] Finished file! (11001 of 66114)
[17:20:47.842391] Finished file! (12001 of 66114)
[17:20:51.692309] Finished file! (13001 of 66114)
[17:20:55.404211] Finished file! (14001 of 66114)
[17:20:59.080181] Finished file! (15001 of 66114)
[17:21:03.331321] Finished file! (16001 of 66114)
[17:21:07.026338] Finished file! (17001 of 66114)
[17:21:10.748036] Finished file! (18001 of 66114)
[17:21:14.542

In [None]:
# NOTE(review): disabled legacy loader, kept for reference only. The active
# load is the `csv_chunk_importer(...)` call in the "Load DarkSky Data" section.
# If this is ever re-enabled it needs `glob` and `math` imported (neither is
# imported in the visible cells), and the initial `weather_df = pd.DataFrame()`
# is redundant because `pd.concat(chunks)` overwrites it.
# print('Started Loading Weather Data...')
# file_path_slug = '../../source_data/darksky/*.csv'
# file_list = glob(file_path_slug)

# weather_df = pd.DataFrame()

# num_files = len(file_list)
# chunks = []

# for i, file in enumerate(file_list):

#     chunk = pd.read_csv(file, index_col=0, parse_dates=['time_corrected'])
    
    
#     chunks.append(chunk)
    
#     if (i + 1) == 1 or (i + 1) % math.ceil(num_files/10) == 0 or (i + 1) == num_files:
#         print('\t[%s] finished chunk %s of %s' % (datetime.datetime.now().time(), str(i+1).rjust(8), str(num_files).rjust(8)))

    
# weather_df = pd.concat(chunks)

# weather_df.drop_duplicates(inplace=True)
# weather_df.reset_index(inplace=True, drop=True)

# print('Data Loaded Successfully!')

In [10]:
# Preview the first rows of the loaded weather data.
weather_df.head()

Unnamed: 0.1,Unnamed: 0,apparentTemperature,cloudCover,daily_icon,daily_summary,dewPoint,hourly_icon,hourly_summary,humidity,latitude,...,precipIntensity,precipProbability,precipType,pressure,temperature,time,time_corrected,visibility,windBearing,windSpeed
0,0,53.93,0.31,rain,"Heavy rain starting overnight, continuing unti...",50.44,partly-cloudy-night,Partly Cloudy,0.88,37.795392,...,0.0059,0.59,rain,1012.43,53.93,1418716800,2014-12-16 00:00:00,9.29,131.0,10.26
1,1,54.72,,rain,"Heavy rain starting overnight, continuing unti...",50.83,clear-night,Clear,0.87,37.795392,...,0.0059,0.59,rain,1012.13,54.72,1418720400,2014-12-16 01:00:00,8.41,146.0,8.84
2,2,53.94,,rain,"Heavy rain starting overnight, continuing unti...",50.07,clear-night,Clear,0.87,37.795392,...,0.0112,0.85,rain,1012.32,53.94,1418724000,2014-12-16 02:00:00,8.11,149.0,8.02
3,3,53.86,0.31,rain,"Heavy rain starting overnight, continuing unti...",51.22,rain,Rain,0.91,37.795392,...,0.1668,0.85,rain,1012.71,53.86,1418727600,2014-12-16 03:00:00,8.29,149.0,7.16
4,4,53.78,0.75,rain,"Heavy rain starting overnight, continuing unti...",51.19,rain,Rain,0.91,37.795392,...,0.1323,0.85,rain,1012.64,53.78,1418731200,2014-12-16 04:00:00,9.47,129.0,9.66


In [11]:
# Summarize column dtypes and non-null counts of the loaded weather data.
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586743 entries, 0 to 1586742
Data columns (total 22 columns):
Unnamed: 0             1586743 non-null int64
apparentTemperature    1586743 non-null float64
cloudCover             1363580 non-null float64
daily_icon             1586743 non-null object
daily_summary          1586743 non-null object
dewPoint               1586743 non-null float64
hourly_icon            1586743 non-null object
hourly_summary         1586743 non-null object
humidity               1586743 non-null float64
latitude               1586743 non-null float64
longitude              1586743 non-null float64
offset                 1586743 non-null int64
precipIntensity        1586743 non-null float64
precipProbability      1586743 non-null float64
precipType             88330 non-null object
pressure               1586743 non-null float64
temperature            1586743 non-null float64
time                   1586743 non-null int64
time_corrected         1586743 non-n

## Merge subset of `Station Data` to weather records

In [None]:
# Left-join the station columns onto every weather row by matching the
# weather coordinates against the station coordinates. Weather rows without
# a matching station are kept, with missing station columns.
# NOTE(review): the join keys are floats, so matching relies on both sources
# storing exactly the same coordinate values — confirm upstream.
weather = pd.merge(
    weather_df,
    stations,
    how='left',
    left_on=['latitude', 'longitude'],
    right_on=['lat', 'long'],
)

## Encode `precipType`
<p>Replace `precipType` with a boolean `is_raining` column: True when a precipitation type was recorded, False otherwise.</p>

In [None]:
# Collapse `precipType` into a boolean flag: True where a precipitation type
# was recorded, False where the value is missing.
#
# The result is assigned back explicitly rather than calling
# `fillna(..., inplace=True)` on `weather.precipType`. That in-place call
# operates on a Series obtained through attribute access; with pandas
# Copy-on-Write it no longer updates `weather`, which would leave the NaNs in
# place and make every row evaluate to True.
weather['precipType'] = weather['precipType'].notna()

# Rename the column to reflect its new meaning (its position is unchanged).
weather = weather.rename(columns={'precipType': 'is_raining'})

In [None]:
# Remove the weather-side coordinate columns (the station-side `lat`/`long`
# from the merge remain), then replace every remaining missing value with 0.
# NOTE(review): the blanket fillna(0) also zeroes missing `cloudCover` readings
# and any station columns left empty by the left merge — confirm that 0 is the
# intended placeholder for those.
weather = weather.drop(['latitude', 'longitude'], axis=1).fillna(0)
weather.info()

# Write to File by Region and by Station

In [None]:
# Write one cleaned CSV per zip code. Each file is named after the value
# returned by `zip_to_landmark` for that zip code (lower-cased, spaces removed).
output_dir = '../../clean_data/darksky'

# Create the target directory up front so `to_csv` does not fail when the
# folder does not exist yet (e.g. on a fresh checkout).
os.makedirs(output_dir, exist_ok=True)

for z in weather.zip_code.unique():
    region = zip_to_landmark(z)

    # Rows for this zip code only, re-indexed from 0. Boolean indexing already
    # returns a new frame, so no extra copy is needed before resetting.
    region_weather = weather[weather.zip_code == z].reset_index(drop=True)

    file_name = '%s_darksky_cleaned.csv' % region.lower().replace(' ', '')
    region_weather.to_csv(os.path.join(output_dir, file_name))

    tm('Finished {}'.format(region))

# Regional Temperatures

In [None]:
# Build one hourly series for the whole area:
#   1. average all records that share the same timestamp,
#   2. resample onto a regular 1-hour grid (hours without data become NaN).
# NOTE(review): assumes `time_corrected` is already a datetime column;
# resample and the time-based rolling windows below require a DatetimeIndex.
#
# `numeric_only=True` is required: the frame carries text columns (icons,
# summaries, region) and pandas >= 2.0 raises a TypeError when asked to
# average them instead of silently dropping them. groupby sorts by
# timestamp, so no explicit sort is needed beforehand.
df = (
    weather
    .set_index('time_corrected')
    .groupby(level=0)
    .mean(numeric_only=True)
)

# A Timedelta is used instead of the '1H' string alias, which is deprecated in
# recent pandas releases.
df = df.resample(pd.Timedelta(hours=1)).mean()

# Only the apparent temperature is plotted, so compute the 30-day rolling
# statistics on that single column rather than on every column.
hourly_temp = df['apparentTemperature']

fig, ax = plt.subplots(figsize=FIG_SIZE)
hourly_temp.rolling('30D').median().plot(color='c', ax=ax, label='Rolling Median')
hourly_temp.rolling('30D').mean().plot(color='g', ax=ax, label='Rolling Mean')

title = 'Bay Area Temperature Trends'
ax.set_title(title, size=TITLE_FONT_SIZE)
ax.set_ylabel('Temperature (F)', size=LABEL_FONT_SIZE)
ax.set_xlabel('')

# Fixed y-range with a tick and a faint dotted guide line every 5 units.
y_ticks = list(range(45, 75, 5))
ax.set_ylim([45, 70])
ax.set_yticks(y_ticks)

for y in y_ticks:
    ax.axhline(y, linestyle=':', color='k', alpha=0.15)

ax.legend(loc=4, frameon=True)
# Turn off the seaborn grid; the dotted guide lines above replace it.
ax.grid(False)

fig.savefig('../../charts/bay_area_temperature_trends.png')
plt.show()
plt.close(fig)