In [147]:
import pandas as pd
import numpy as np
import feather
from sklearn import linear_model
from sklearn import cross_validation
from sklearn import preprocessing
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline

Importing trip data for March 2016 and weather data from Weather Underground.

The source data is read by a separate Python class, manipulated as a DataFrame, and finally exported to Feather format; this notebook imports the exported file. To learn more about Feather, see https://blog.rstudio.org/2016/03/29/feather/

The benefits of doing this are:
- The ETL part of the analysis is in a separate file (CitiBike_ETL.py)
- The Feather file can also be read in R
- During analysis, this notebook will be run multiple times and it is efficient to just import the DataFrame

In [148]:
# Load the trip DataFrame from the Feather file exported by the ETL step.
bikedata_path = '../../Data/CitiBike_Data/bikedata.feather'
bikedata = feather.read_dataframe(bikedata_path)

# Show the column index first, then the leading rows.
for preview in (bikedata.columns, bikedata.head()):
    print(preview)

Index([u'bikeid', u'birth year', u'date', u'dtstartdatehour',
       u'dtstopdatehour', u'end station id', u'end station latitude',
       u'end station longitude', u'end station name', u'female', u'male',
       u'start station id', u'start station latitude',
       u'start station longitude', u'start station name', u'starttime',
       u'stoptime', u'tripduration', u'usertype'],
      dtype='object')
   bikeid  birth year        date dtstartdatehour dtstopdatehour  \
0   22285        1958  2016-01-01      2016-01-01     2016-01-01   
1   17827        1969  2016-01-01      2016-01-01     2016-01-01   
2   21997        1982  2016-01-01      2016-01-01     2016-01-01   
3   22794        1961  2016-01-01      2016-01-01     2016-01-01   
4   14562        1952  2016-01-01      2016-01-01     2016-01-01   

   end station id  end station latitude  end station longitude  \
0            3002             40.711512             -74.015756   
1             498             40.748549             -

In [149]:
# Question: how many bikes leave a given station in a given hour?
# Scope is limited to a single station for now: E 40 St & 5 Ave.
is_focus_station = bikedata['start station name'] == "E 40 St & 5 Ave"
dfFocusStation = bikedata[is_focus_station]

# One group per distinct `dtstartdatehour` value.
dfGroupBy = dfFocusStation.groupby(by=['dtstartdatehour'])

# Rows with a non-null `bikeid` per group; the resulting count column keeps
# the name `bikeid`.
departures = dfGroupBy['bikeid'].count().reset_index()

# Largest `date` value found inside each group.
departures['date'] = dfGroupBy['date'].max().reset_index()['date']

# Despite the column name, this is male / (male + female), i.e. the male
# share of the group (presumably `male`/`female` are 0/1 flags -- TODO confirm).
# A group where both sums are zero produces NaN.
sum_male = dfGroupBy['male'].sum().reset_index()['male']
sum_female = dfGroupBy['female'].sum().reset_index()['female']
total_riders = sum_male + sum_female
departures['male_to_female_ratio'] = sum_male.astype(float) / total_riders

print(departures.head())

      dtstartdatehour  bikeid        date  male_to_female_ratio
0 2016-01-04 11:00:00       2  2016-01-04              1.000000
1 2016-01-04 12:00:00       3  2016-01-04              0.666667
2 2016-01-04 13:00:00       3  2016-01-04              1.000000
3 2016-01-04 14:00:00       8  2016-01-04              0.750000
4 2016-01-04 15:00:00       3  2016-01-04              1.000000


In [154]:
# Attach the weather observations (keyed by `date`) to the departures rows.
weather_file = './data/temperature/weather.csv'

# The CSV carries an 'Unnamed: 0' column (presumably a saved index), which
# is discarded right after loading.
weather = pd.read_csv(weather_file).drop('Unnamed: 0', axis=1)

# Left join: every departures row is kept, with or without a weather match.
final_df = departures.merge(weather, how='left', on='date')
final_df.head()

Unnamed: 0,dtstartdatehour,bikeid,date,male_to_female_ratio,fog,rain,snow,hail,thunder,tornado,meantempm,maxtempm,meanvisi,maxhumidity
0,2016-01-04 11:00:00,2,2016-01-04,1.0,0,0,0,0,0,0,-4,2,10,59
1,2016-01-04 12:00:00,3,2016-01-04,0.666667,0,0,0,0,0,0,-4,2,10,59
2,2016-01-04 13:00:00,3,2016-01-04,1.0,0,0,0,0,0,0,-4,2,10,59
3,2016-01-04 14:00:00,8,2016-01-04,0.75,0,0,0,0,0,0,-4,2,10,59
4,2016-01-04 15:00:00,3,2016-01-04,1.0,0,0,0,0,0,0,-4,2,10,59


In [155]:
# Summary statistics (count, mean, std, quartiles, min/max) for the merged frame.
final_df.describe()

Unnamed: 0,bikeid,male_to_female_ratio,fog,rain,snow,hail,thunder,tornado,meantempm,maxtempm,meanvisi,maxhumidity
count,1132.0,1132.0,1132.0,1132.0,1132.0,1132,1132,1132,1132.0,1132.0,1132.0,1132.0
mean,5.261484,0.834926,0.034452,0.245583,0.122792,0,0,0,5.616608,9.74735,8.986749,69.04947
std,5.08379,0.233313,0.182468,0.430623,0.328343,0,0,0,6.184434,7.078425,1.763367,15.220316
min,1.0,0.0,0.0,0.0,0.0,0,0,0,-14.0,-9.0,3.0,44.0
25%,2.0,0.75,0.0,0.0,0.0,0,0,0,1.0,4.0,9.0,55.0
50%,4.0,0.928571,0.0,0.0,0.0,0,0,0,6.0,10.0,10.0,70.0
75%,7.0,1.0,0.0,0.0,0.0,0,0,0,10.0,15.0,10.0,82.0
max,46.0,1.0,1.0,1.0,1.0,0,0,0,22.0,26.0,10.0,96.0


In [156]:
# Fit a linear regression that predicts the per-hour departure count
# (`bikeid`) from the remaining columns of `final_df`.

# Turn each `dtstartdatehour` value into an integer code and store it as a
# new `startdatehour` column on `final_df`.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for target
# labels; applied here it yields the rank of each timestamp among the sorted
# unique values -- confirm this is the intended feature.
le = preprocessing.LabelEncoder()
final_df['startdatehour'] = le.fit_transform(final_df.dtstartdatehour)

# Predictors: every column except the target and the raw time/date columns.
excluded = ['bikeid', 'dtstartdatehour', 'date']
cols = [col for col in final_df.columns if col not in excluded]
x = final_df[cols]
y = final_df.bikeid

# Hold-out split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    x, y, random_state=1)

regr = linear_model.LinearRegression().fit(X_train, y_train)
pred = regr.predict(X_test)

# Error metrics on the held-out rows.
rmse = np.sqrt(metrics.mean_squared_error(y_test, pred))
r2_score = metrics.r2_score(y_test, pred)

# Label, a single separating space, then the value.
print('RMSE: ' + ' ' + str(rmse))
print('R2 score: ' + ' ' + str(r2_score))

RMSE:  5.87642821047
R2 score:  0.0551908719648
