# Assemble Boston dataframe

In [2]:
import pandas as pd
%matplotlib inline
import glob
import numpy as np
import matplotlib.pyplot as plt


In [3]:
# Collect the path(s) of the Hubway trip CSV; the last line displays the matches.
trip_csv_pattern = '../Data/Boston/hubway_trips.csv'
datafiles = glob.glob(trip_csv_pattern)
datafiles

['../Data/Boston/hubway_trips.csv']

In [4]:
# Read every matched trip file into its own DataFrame, then stack them into a
# single table.  ignore_index=True gives the combined frame a fresh 0..n-1
# index; without it each file keeps its own row labels, so labels would repeat
# as soon as more than one file is loaded.  With a single file the result is
# the same as before.
dall = [pd.read_csv(datafile) for datafile in datafiles]
dconcat = pd.concat(dall, ignore_index=True)

In [5]:
# Add per-trip helper columns and rename the station key:
#   start_day - calendar date taken from 'start_date' (time of day dropped)
#   nrides    - constant 1.0 on every trip row
#   'strt_statn' is renamed to 'stationid'
temp = pd.DatetimeIndex(dconcat['start_date'])
dconcat = (
    dconcat
    .assign(start_day=temp.date)
    .assign(nrides=np.ones(len(dconcat)))
    .rename(columns={'strt_statn': 'stationid'})
)

###### Add features for number of rides per station.  Approximate the number of stations as the number of unique start stations.

In [6]:
# Group the trips by (station, calendar day).  as_index=False keeps the two
# grouping keys as ordinary columns in anything aggregated from this grouping.
day_keys = ["stationid", "start_day"]
databydaygroup = dconcat.groupby(day_keys, as_index=False)

In [28]:
# Display the groupby object; this shows only its repr, not the grouped rows.
databydaygroup

<pandas.core.groupby.DataFrameGroupBy object at 0x107ef7650>

In [6]:
# List the column labels currently on the trip table.
dconcat.columns

Index([u'seq_id', u'hubway_id', u'status', u'duration', u'start_date',
       u'stationid', u'end_date', u'end_statn', u'bike_nr', u'subsc_type',
       u'zip_code', u'birth_date', u'gender', u'start_day', u'nrides'],
      dtype='object')

In [7]:
# Write the augmented trip table to disk.  to_csv is called with its defaults,
# so the row index is written as an extra leading, unnamed column.
trips_out_path = '../Data/Boston/hubway_trips_datetime.csv'
dconcat.to_csv(trips_out_path)

In [29]:
# Collapse the trips of each (station, day) group into one row:
#   nrides   - sum of the per-trip 'nrides' values for that group
#   duration - median of the trip durations in that group
# 'sum' selects pandas' built-in group sum instead of a Python lambda invoked
# once per group; the values produced are the same.  The median stays an
# explicit np.median call so its handling of missing values is unchanged.
databyday = databydaygroup.agg({"nrides": "sum", "duration": lambda x: np.median(x)})

In [31]:
# Tag every (station, day) row with a constant 1.0 so that summing 'ndays'
# per station counts the number of rows (days) that station contributes.
databyday = databyday.assign(ndays=np.ones(len(databyday)))

In [37]:
# Roll the per-day rows up to one row per station:
#   nrides   - total of the daily ride counts
#   ndays    - total of the per-row 'ndays' values
#   duration - median of the daily median durations
# The two totals use pandas' built-in 'sum' aggregation rather than a Python
# lambda invoked once per group; the values produced are the same.  The median
# stays an explicit np.median call so its handling of missing values is
# unchanged.
databystation = databyday.groupby('stationid', as_index=False).agg({
    'nrides': 'sum',
    'ndays': 'sum',
    'duration': lambda x: np.median(x),
})

###### Add features for day of week, day of year, and years since start of program

In [9]:
# Calendar features derived from each row's start_day:
#   deltayear - calendar years elapsed since the earliest year in the data
#   dayofweek - day of the week as an integer (pandas convention: Monday=0)
#   dayofyear - ordinal day within the year
# The deltayear baseline is the minimum year rather than the year of row 0:
# the rows come from a (stationid, start_day) grouping, so the first row is not
# guaranteed to hold the earliest date.
# NOTE(review): the recorded output for this cell is "KeyError: 'start_day'",
# which suggests it was last run against a frame lacking that column -
# presumably stale kernel state; confirm with Restart Kernel -> Run All.
tempdate = pd.DatetimeIndex(databyday['start_day'])
databyday['deltayear'] = tempdate.year - tempdate.year.min()
databyday['dayofweek'] = tempdate.dayofweek
databyday['dayofyear'] = tempdate.dayofyear

KeyError: 'start_day'

###### Add climate features: daily high temperature [centi-Celsius], daily low temperature [centi-Celsius], daily precipitation [centi-mm]

In [82]:
def _dash_separated_date(raw):
    """Join characters 0-3, 4-5 and 6-7 of `raw` with dashes.

    Presumably turns a 'YYYYMMDD' string into 'YYYY-MM-DD' - confirm against
    the DATE format in the climate file.
    """
    return raw[0:4] + '-' + raw[4:6] + '-' + raw[6:8]


# Load the climate records, rewriting each DATE value while the file is parsed.
climatedata = pd.read_csv('../Data/climate/boston20112014.csv',
                          converters={'DATE': _dash_separated_date})

In [83]:
# Narrow the climate table to the date plus the PRCP, SNOW, TMAX and TMIN columns.
climate_columns = ['DATE', 'PRCP', 'SNOW', 'TMAX', 'TMIN']
climatedata1 = climatedata[climate_columns]

In [84]:
def _median_above_sentinel(values):
    """Median of the entries of `values` that are strictly greater than -9999."""
    return np.median(values[values > -9999])


# One row per DATE: the per-column median of that date's records, ignoring
# entries of -9999 or below (presumably the dataset's missing-value marker -
# confirm against the data documentation).
climatedaydata = climatedata1.groupby('DATE', as_index=False).agg(_median_above_sentinel)

In [85]:
# Replace the text DATE column with a 'start_day' column of calendar dates.
tempdate = pd.DatetimeIndex(climatedaydata['DATE'])
climatedaydata = climatedaydata.assign(start_day=tempdate.date).drop('DATE', axis=1)

In [86]:
# Attach the climate columns to each ride row by matching on start_day.
# This is an inner join (the merge default), so rows whose day has no climate
# record are dropped.
databyday = pd.merge(databyday, climatedaydata, on='start_day', how='inner')

###### Add feature for origin and destination density

In [10]:
# Load the per-station table from the station density CSV.
station_csv = '../Data/Boston/stationdensity.csv'
station = pd.read_csv(station_csv)

In [11]:
# Join the station table onto the ride table by stationid (inner join, the
# merge default), then discard four columns that are not kept in the output.
columns_to_discard = ['terminal', 'station', 'status', 'municipal']
databyday = pd.merge(databyday, station, on='stationid').drop(columns_to_discard, axis=1)

###### Save final dataframe

In [13]:
# Write the assembled feature table to disk.  to_csv is called with its
# defaults, so the row index is written as an extra leading, unnamed column.
features_out_path = '../Data/Boston/BostonFeaturesByStation.csv'
databyday.to_csv(features_out_path)

In [14]:
# Preview the first ten rows rather than rendering the entire frame.
databyday.head(10)

Unnamed: 0.1,stationid,duration,nrides,Unnamed: 0,lat,lng,popdensity,empdensity
0,3,720.0,9734,0,42.340021,-71.100812,20099.358399,16692.753933757544
1,4,573.0,18058,1,42.345392,-71.069616,18086.238605,21454.659662862308
2,5,600.0,10630,2,42.341814,-71.090179,19797.945738,21044.219877649099
3,6,660.0,23322,3,42.361285,-71.065140,18666.673244,13881.169877648354
4,7,720.0,9163,4,42.353412,-71.044624,16503.835555,9005.8250046214089
5,8,969.0,7570,5,42.353334,-71.137313,16280.786425,9477.040112003684
6,9,728.0,17180,6,42.351313,-71.116174,20584.230737,12054.449577254825
7,10,660.0,16159,7,42.350075,-71.105884,21684.013392,16397.786388501525
8,11,678.0,12393,8,42.338629,-71.106500,19313.555852,13586.902291264385
9,12,596.0,10456,9,42.335911,-71.088496,17855.964344,19837.757629573345
