# Collect More Detailed Weather Information from DarkSky.net

# Example request

In [17]:
%matplotlib inline

import os
import datetime
from glob import glob
import math


import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns; sns.set()

sns.set_style('whitegrid')
sns.set_context("poster")

In [18]:
# `IPython.display` is the public home of display/HTML; importing them from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Stretch the notebook container to 95% of the browser width.
display(HTML("<style>.container { width:95% !important; }</style>"))

# Global matplotlib font size.
font = {'size'   : 50}
matplotlib.rc('font', **font)

# Font-size constants (presumably for plot titles, axis labels and ticks;
# they are not referenced in the cells shown here).
TITLE_FONT_SIZE = 25
LABEL_FONT_SIZE = 20
TICK_FONT_SIZE  = 15

In [19]:
# Read the DarkSky API key from the environment (None when the variable is unset).
# The key is a secret: do not print it or leave it in a saved cell output.
DARKSKY_KEY = os.getenv('DARKSKY_KEY')

In [20]:
# Load the cleaned trip data in chunks, printing progress roughly every 10%.
# All trips are kept here: the commuter-only filter in a later cell is commented out.

print('[%s] Loading Trip Data Data...' % datetime.datetime.now().time())

trip_data_file = '../clean_data/bayareabikeshare/trip_data_extended_cleaned.csv'

# Chunk settings
chunksize = 10000

# Count data rows to size the progress report. The context manager closes the
# file handle, and the header line (read_csv's default) is not counted as a row.
with open(trip_data_file, 'r') as line_source:
    num_rows = sum(1 for _ in line_source) - 1
num_chunks = max(1, math.ceil(num_rows / chunksize))
progress_step = math.ceil(num_chunks / 10)

# Import the file in chunks; passing `chunksize` already makes read_csv return an iterator.
chunks = []
chunk_reader = pd.read_csv(trip_data_file, chunksize=chunksize, index_col=0,
                           parse_dates=['start_date', 'end_date'])
for chunk_counter, chunk in enumerate(chunk_reader, start=1):
    chunks.append(chunk)

    # Report the first chunk, every ~10% step and the last chunk.
    if chunk_counter == 1 or chunk_counter % progress_step == 0 or chunk_counter == num_chunks:
        print('\t\t[%s] finished chunk %s of %s' % (datetime.datetime.now().time(), chunk_counter, num_chunks))

trips_df = pd.concat(chunks)
trips_df['user_type'] = trips_df['user_type'].astype('category')

trips_df = trips_df.drop_duplicates()

print('[%s] Complete!' % datetime.datetime.now().time())

[16:22:49.217114] Loading Trip Data Data...
		[16:22:50.626481] finished chunk 1 of 100
		[16:22:52.011201] finished chunk 10 of 100
		[16:22:53.267235] finished chunk 20 of 100
		[16:22:54.589008] finished chunk 30 of 100
		[16:22:55.703038] finished chunk 40 of 100
		[16:22:56.856349] finished chunk 50 of 100
		[16:22:57.948871] finished chunk 60 of 100
		[16:22:59.383392] finished chunk 70 of 100
		[16:23:00.500553] finished chunk 80 of 100
		[16:23:02.249855] finished chunk 90 of 100
		[16:23:03.342765] finished chunk 100 of 100
[16:23:08.790247] Complete!


In [21]:
# Work on an independent copy so the loaded `trips_df` stays untouched.
trip_data = trips_df.copy(deep=True)

In [22]:
# subscribers = trip_data[trip_data.user_type == 'Subscriber'].copy()
# weekday_subscribers = subscribers[(subscribers.start_date.dt.dayofweek < 5) | (subscribers.end_date.dt.dayofweek < 5)].copy()

# commuters = weekday_subscribers[((weekday_subscribers.start_date.dt.hour >=  7) & (weekday_subscribers.end_date.dt.hour < 10)) | 
#                                 ((weekday_subscribers.start_date.dt.hour >= 16) & (weekday_subscribers.end_date.dt.hour < 19))].copy()


# commuters.reset_index(inplace=True, drop=True)
# commuters.info()

# Count Trips by Date and Terminal ID

In [23]:
# Number of trips started per calendar day at each start terminal.
trip_dates = (
    trip_data
    .groupby([trip_data.start_date.dt.date, 'start_terminal'])['trip_id']
    .count()
    .rename_axis(['start_date', 'start_terminal'])
    .reset_index(name='trip_count')
    .astype({'start_terminal': 'int', 'trip_count': 'int'})
)

# The grouping key holds plain `date` objects; convert back to datetime64.
trip_dates['start_date'] = pd.to_datetime(trip_dates['start_date'])

trip_dates.head(5)

Unnamed: 0,start_date,start_terminal,trip_count
0,2013-08-29,2,5
1,2013-08-29,3,9
2,2013-08-29,4,3
3,2013-08-29,5,3
4,2013-08-29,6,4


In [24]:
# Number of (date, terminal) rows.
len(trip_dates)

63132

# Import and Merge Station Data

In [139]:
# Load the cleaned station data in chunks, printing progress roughly every 10%.
print('[%s] Loading Station Data Data...' % datetime.datetime.now().time())

station_data_file = '../clean_data/bayareabikeshare/station_data_cleaned.csv'

# Chunk settings
chunksize = 10000

# Count data rows to size the progress report. The context manager closes the
# file handle, and the header line (read_csv's default) is not counted as a row.
with open(station_data_file, 'r') as line_source:
    num_rows = sum(1 for _ in line_source) - 1
num_chunks = max(1, math.ceil(num_rows / chunksize))
progress_step = math.ceil(num_chunks / 10)

# Import the file in chunks; passing `chunksize` already makes read_csv return an iterator.
chunks = []
chunk_reader = pd.read_csv(station_data_file, chunksize=chunksize, index_col=0,
                           parse_dates=['first_service_date', 'last_service_date'])
for chunk_counter, chunk in enumerate(chunk_reader, start=1):
    chunks.append(chunk)

    # Report the first chunk, every ~10% step and the last chunk.
    if chunk_counter == 1 or chunk_counter % progress_step == 0 or chunk_counter == num_chunks:
        print('\t\t[%s] finished chunk %s of %s' % (datetime.datetime.now().time(), chunk_counter, num_chunks))

stations_df = pd.concat(chunks)

stations_df = stations_df.drop_duplicates()

print('[%s] Complete!' % datetime.datetime.now().time())

[22:32:34.426558] Loading Station Data Data...
		[22:32:34.439725] finished chunk 1 of 1
[22:32:34.446762] Complete!


In [140]:
# Work on an independent copy so the loaded `stations_df` stays untouched.
stations = stations_df.copy(deep=True)

In [141]:
# Preview the first three station rows.
stations.head(n=3)

Unnamed: 0,station_id,name,lat,long,dock_count,landmark,first_service_date,last_service_date,zip_code,days_in_service
0,2,San Jose Diridon Caltrain Station,37.329732,-121.901782,27,San Jose,2013-08-29,2016-08-31,95113,1098
1,3,San Jose Civic Center,37.330698,-121.888979,15,San Jose,2013-08-29,2016-08-31,95113,1098
2,4,Santa Clara at Almaden,37.333988,-121.894902,11,San Jose,2013-08-29,2016-08-31,95113,1098


# Merge Dataframes and Prune Columns

In [147]:
# Attach station attributes to each (date, terminal) count via an inner join on the
# terminal / station id, then order the rows by date and terminal with a fresh index.
mrg = (
    trip_dates
    .merge(stations, left_on='start_terminal', right_on='station_id')
    .sort_values(['start_date', 'start_terminal'])
    .reset_index(drop=True)
)

mrg.head(5)

Unnamed: 0,start_date,start_terminal,trip_count,station_id,name,lat,long,dock_count,landmark,first_service_date,last_service_date,zip_code,days_in_service
0,2013-08-29,2,5,2,San Jose Diridon Caltrain Station,37.329732,-121.901782,27,San Jose,2013-08-29,2016-08-31,95113,1098
1,2013-08-29,3,9,3,San Jose Civic Center,37.330698,-121.888979,15,San Jose,2013-08-29,2016-08-31,95113,1098
2,2013-08-29,4,3,4,Santa Clara at Almaden,37.333988,-121.894902,11,San Jose,2013-08-29,2016-08-31,95113,1098
3,2013-08-29,5,3,5,Adobe on Almaden,37.331415,-121.8932,19,San Jose,2013-08-29,2016-08-31,95113,1098
4,2013-08-29,6,4,6,San Pedro Square,37.336721,-121.894074,15,San Jose,2013-08-29,2016-08-31,95113,1098


In [148]:
# Station columns to remove from the merged frame.
drop_cols = ['station_id', 'name', 'dock_count',
             'first_service_date', 'last_service_date',
             'days_in_service']

# errors='ignore' skips columns that are already gone, so the cell can be re-run
# safely without a bare `except: pass` hiding unrelated failures.
mrg = mrg.drop(columns=drop_cols, errors='ignore')

In [149]:
# Preview the first five rows of the pruned frame.
mrg.head(n=5)

Unnamed: 0,start_date,start_terminal,trip_count,lat,long,landmark,zip_code
0,2013-08-29,2,5,37.329732,-121.901782,San Jose,95113
1,2013-08-29,3,9,37.330698,-121.888979,San Jose,95113
2,2013-08-29,4,3,37.333988,-121.894902,San Jose,95113
3,2013-08-29,5,3,37.331415,-121.8932,San Jose,95113
4,2013-08-29,6,4,37.336721,-121.894074,San Jose,95113


# Append API Request

<p>Example call URL: <code>https://api.darksky.net/forecast/[key]/[latitude],[longitude],[time]</code></p>

In [162]:
def make_url_req(row, api_key=None):
    """Build the DarkSky request URL for one row at midnight of its start date.

    Parameters
    ----------
    row : object with ``lat``, ``long`` and ``start_date`` attributes
        Typically one row of the merged trip/station frame passed by
        ``DataFrame.apply(..., axis=1)``.
    api_key : str, optional
        DarkSky API key. When omitted, the module-level ``DARKSKY_KEY`` is used.

    Returns
    -------
    str
        ``https://api.darksky.net/forecast/<key>/<lat>,<long>,<date>T00:00:00``
        with the coordinates rendered by ``%f`` (six decimal places).

    Raises
    ------
    ValueError
        If no API key is available; otherwise the literal text "None" would be
        embedded in every URL and each request would fail later.
    """
    _key = DARKSKY_KEY if api_key is None else api_key
    if not _key:
        raise ValueError('DarkSky API key is missing: set the DARKSKY_KEY environment variable')

    # str() of a Timestamp is 'YYYY-MM-DD HH:MM:SS'; keep only the date part.
    _date = str(row.start_date).split(' ')[0]

    return 'https://api.darksky.net/forecast/%s/%f,%f,%sT00:00:00' % (_key, row.lat, row.long, _date)
    
# req = 'https://api.darksky.net/forecast/%s/%f,%f,%sT%s:00:00' % (DARKSKY_KEY, lat, long, start_date, str(start_hour).zfill(2))


In [163]:
# One request URL per (date, terminal) row.
mrg['api_call'] = mrg.apply(make_url_req, axis=1)

In [168]:
# Series of request URLs, one per merged row.
req_list = mrg['api_call']

In [176]:
# Persist the request URLs (without the index) for later use.
# NOTE(review): make_url_req embeds DARKSKY_KEY in every URL, so this file holds the
# API key in plain text — keep it out of version control and shared folders.
req_list.to_csv('../clean_data/darksky_api_calls.csv', index=False)

# Sample API Call - Darksky

In [191]:
import urllib.request
import json
from math import ceil, floor

In [270]:
def get_stream_json(_url, verbose=True):
    """Fetch one DarkSky response and flatten its hourly block into a DataFrame.

    Parameters
    ----------
    _url : str
        Full request URL, opened with ``urllib.request.urlopen``.
    verbose : bool, default True
        When True, print the column index of the result (the original
        debugging output). Pass False when calling in a loop.

    Returns
    -------
    pandas.DataFrame or bool
        One row per entry of ``hourly['data']``. Block-level ``icon``/``summary``
        are exposed as ``daily_icon``/``daily_summary``, the per-hour ones as
        ``hourly_icon``/``hourly_summary``. ``latitude``, ``longitude`` and
        ``offset`` are copied from the response, and ``time_corrected`` is
        ``time`` shifted by ``offset`` hours. Returns ``False`` when the request
        fails, the body is not valid JSON, or there is no hourly data.

    Raises
    ------
    KeyError
        If the response lacks ``latitude``, ``longitude`` or ``offset``.
    """
    # make json request across network
    try:
        with urllib.request.urlopen(_url) as response:
            url_json_response = json.loads(response.read().decode())
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C / SystemExit still propagate.
        print('\tURL %s had no response, skipping...' % _url)
        return False

    # a response without hourly entries cannot be flattened; report it like a failed request
    hourly = url_json_response.get('hourly') if isinstance(url_json_response, dict) else None
    if not hourly or not hourly.get('data'):
        print('\tURL %s returned no hourly data, skipping...' % _url)
        return False

    # parse hourly data from json response (block-level scalars repeat on every row)
    df_hourly = pd.DataFrame(hourly)

    # split 'data' column of dictionaries into separate columns
    df_hourly_details = df_hourly['data'].apply(pd.Series)

    # drop column, no longer needed
    df_hourly = df_hourly.drop(columns='data')

    # merge details into results; overlapping names get the _x (block) / _y (hour) suffixes
    forecast = df_hourly.merge(df_hourly_details, left_index=True, right_index=True)

    # copy the request-level scalars straight from the response dictionary
    for field in ('latitude', 'longitude', 'offset'):
        forecast[field] = url_json_response[field]

    forecast = forecast.rename(columns={'icon_x': 'daily_icon', 'summary_x': 'daily_summary',
                                        'icon_y': 'hourly_icon', 'summary_y': 'hourly_summary'})

    # shift the epoch seconds by the offset (in hours), then convert to datetimes
    forecast['time_corrected'] = forecast['time'] + (3600 * forecast['offset'])
    forecast['time_corrected'] = pd.to_datetime(forecast['time_corrected'], unit='s')

    if verbose:
        print(forecast.columns)

    return forecast

In [271]:
# NOTE(review): `sample_call` is not assigned in any cell shown in this notebook, so this
# line raises NameError on a fresh kernel — define it (e.g. one URL from `req_list`) first.
# get_stream_json returns False when the request fails, so `df` may not be a DataFrame.
df = get_stream_json(sample_call)

Index(['daily_icon', 'daily_summary', 'apparentTemperature', 'cloudCover',
       'dewPoint', 'humidity', 'hourly_icon', 'precipIntensity',
       'precipProbability', 'pressure', 'hourly_summary', 'temperature',
       'time', 'visibility', 'windBearing', 'windSpeed', 'latitude',
       'longitude', 'offset', 'time_corrected'],
      dtype='object')


In [273]:
# Preview up to 30 rows from an independent copy of the sample forecast.
t = df.copy(deep=True)
t.head(n=30)

Unnamed: 0,daily_icon,daily_summary,apparentTemperature,cloudCover,dewPoint,humidity,hourly_icon,precipIntensity,precipProbability,pressure,hourly_summary,temperature,time,visibility,windBearing,windSpeed,latitude,longitude,offset,time_corrected
0,clear-day,Clear throughout the day.,62.49,,60.67,0.95,clear-night,0,0,1014.06,Clear,62.11,1377759600,7.39,267,4.92,37.776377,-122.39607,-7,2013-08-29 00:00:00
1,clear-day,Clear throughout the day.,61.99,,60.62,0.97,clear-night,0,0,1014.6,Clear,61.59,1377763200,7.39,255,6.02,37.776377,-122.39607,-7,2013-08-29 01:00:00
2,clear-day,Clear throughout the day.,61.84,,60.67,0.97,clear-night,0,0,1014.7,Clear,61.42,1377766800,7.6,249,4.84,37.776377,-122.39607,-7,2013-08-29 02:00:00
3,clear-day,Clear throughout the day.,61.32,,59.69,0.95,clear-night,0,0,1014.8,Clear,61.04,1377770400,7.75,252,4.31,37.776377,-122.39607,-7,2013-08-29 03:00:00
4,clear-day,Clear throughout the day.,61.58,,59.92,0.95,clear-night,0,0,1014.79,Clear,61.27,1377774000,7.9,240,3.37,37.776377,-122.39607,-7,2013-08-29 04:00:00
5,clear-day,Clear throughout the day.,61.67,0.75,59.67,0.94,partly-cloudy-night,0,0,1015.11,Mostly Cloudy,61.41,1377777600,10.0,261,4.46,37.776377,-122.39607,-7,2013-08-29 05:00:00
6,clear-day,Clear throughout the day.,61.58,,59.73,0.95,clear-night,0,0,1015.38,Clear,61.31,1377781200,10.0,266,3.6,37.776377,-122.39607,-7,2013-08-29 06:00:00
7,clear-day,Clear throughout the day.,62.05,,59.73,0.93,clear-day,0,0,1016.16,Clear,61.8,1377784800,10.0,256,2.42,37.776377,-122.39607,-7,2013-08-29 07:00:00
8,clear-day,Clear throughout the day.,62.69,,60.55,0.94,clear-day,0,0,1016.71,Clear,62.35,1377788400,10.0,201,3.66,37.776377,-122.39607,-7,2013-08-29 08:00:00
9,clear-day,Clear throughout the day.,62.97,0.0,60.2,0.92,clear-day,0,0,1017.15,Clear,62.7,1377792000,10.0,208,0.8,37.776377,-122.39607,-7,2013-08-29 09:00:00


In [None]:
# pd.read_csv cannot read a directory path, so collect the files inside it instead.
# Assumes the folder holds one or more CSV files with a shared layout — TODO confirm.
darksky_files = sorted(glob('../source_data/darksky/*.csv'))
if darksky_files:
    test = pd.concat((pd.read_csv(path) for path in darksky_files), ignore_index=True)
else:
    # nothing downloaded yet: keep `test` defined as an empty frame
    test = pd.DataFrame()