In [145]:
import pandas as pd
import numpy as np
import feather
from sklearn import linear_model
from sklearn import cross_validation
from sklearn import preprocessing
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline

Importing trip data for March 2016 and Weather data from Weather Underground. 

The source data is read in a separate Python class. It is manipulated as a DataFrame and finally exported into Feather format. The exported file is imported in this notebook. To learn more about Feather, see https://blog.rstudio.org/2016/03/29/feather/

The benefits of doing this are:
- The ETL part of the analysis is in a separate file (CitiBike_ETL.py)
- The Feather file can also be read in R
- During analysis, this notebook will be run multiple times and it is efficient to just import the DataFrame

In [87]:
# Load the trip DataFrame from the Feather file produced by the separate ETL step.
bikedata = feather.read_dataframe('../../Data/CitiBike_Data/bikedata.feather')

# Single-argument print(...) produces the same text on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(bikedata.columns)
print(bikedata.head())

Index([u'bikeid', u'birth year', u'date', u'dtstartdatehour',
       u'dtstopdatehour', u'end station id', u'end station latitude',
       u'end station longitude', u'end station name', u'female', u'male',
       u'start station id', u'start station latitude',
       u'start station longitude', u'start station name', u'starttime',
       u'stoptime', u'tripduration', u'usertype'],
      dtype='object')
   bikeid  birth year        date     dtstartdatehour      dtstopdatehour  \
0   23914        1982  2016-03-01 2016-03-01 06:00:00 2016-03-01 07:00:00   
1   23697        1978  2016-03-01 2016-03-01 07:00:00 2016-03-01 07:00:00   
2   21447        1960  2016-03-01 2016-03-01 07:00:00 2016-03-01 07:00:00   
3   22351        1986  2016-03-01 2016-03-01 07:00:00 2016-03-01 07:00:00   
4   20985        1978  2016-03-01 2016-03-01 07:00:00 2016-03-01 08:00:00   

   end station id  end station latitude  end station longitude  \
0             427             40.701907             -74.013942   

In [142]:
# How many bikes will leave a particular station at a particular time?
# First let's start with a single station. In our case, let's start with E 40 St & 5 Ave
focus_station = "E 40 St & 5 Ave"  # change this to analyse a different start station
dfFocusStation = bikedata[bikedata['start station name'] == focus_station]

dfGroupBy = dfFocusStation.groupby(by=['dtstartdatehour'])

# Compute every per-hour statistic in a single aggregation so that all columns
# come from the same frame instead of being stitched together from separate
# reductions that only line up by positional index.
hourly = dfGroupBy.agg({
    'bikeid': 'count',  # number of departures in the hour
    'date': 'max',      # date value carried along for the later join
    'male': 'sum',
    'female': 'sum',
}).reset_index()

sum_male = hourly['male']
sum_female = hourly['female']

# Explicit column list keeps a stable column order; .copy() makes `departures`
# an independent frame so adding a column does not touch `hourly`.
departures = hourly[['dtstartdatehour', 'bikeid', 'date']].copy()

# NOTE: despite its name, this column holds male / (male + female), i.e. the
# male share, not male / female. An hour in which both sums are 0 yields NaN.
departures['male_to_female_ratio'] = sum_male.astype(float) / (sum_male + sum_female)

# print(...) with one argument works on both Python 2 and Python 3.
print(departures.head())

      dtstartdatehour  bikeid        date  male_to_female_ratio
0 2016-03-01 01:00:00       1  2016-03-01                  1.00
1 2016-03-01 07:00:00       6  2016-03-01                  1.00
2 2016-03-01 08:00:00      20  2016-03-01                  0.85
3 2016-03-01 09:00:00       3  2016-03-01                  1.00
4 2016-03-01 10:00:00       4  2016-03-01                  1.00


In [143]:
# Attach the weather observations to each hourly departure row.
weather_file = './data/temperature/weather.csv'

# 'Unnamed: 0' is dropped right after loading (presumably a row index that was
# written out with the CSV -- TODO confirm against the ETL script).
weather = pd.read_csv(weather_file).drop('Unnamed: 0', axis=1)

# Left join on 'date': every departure row is kept, and the weather columns
# are filled in from the matching date.
final_df = departures.merge(weather, how='left', on='date')
final_df.head()

Unnamed: 0,dtstartdatehour,bikeid,date,male_to_female_ratio,fog,rain,snow,hail,thunder,tornado,meantempm,maxtempm,meanvisi,maxhumidity
0,2016-03-01 01:00:00,1,2016-03-01,1.0,0,0,0,0,0,0,8,11,10,80
1,2016-03-01 07:00:00,6,2016-03-01,1.0,0,0,0,0,0,0,8,11,10,80
2,2016-03-01 08:00:00,20,2016-03-01,0.85,0,0,0,0,0,0,8,11,10,80
3,2016-03-01 09:00:00,3,2016-03-01,1.0,0,0,0,0,0,0,8,11,10,80
4,2016-03-01 10:00:00,4,2016-03-01,1.0,0,0,0,0,0,0,8,11,10,80


In [144]:
# Baseline model: ordinary least-squares regression of hourly departures
# (the 'bikeid' count column) on the remaining columns of final_df.
regr = linear_model.LinearRegression()

# LabelEncoder for the dtstartdatehour column: each distinct start hour is
# mapped to an integer code so it can be used as a numeric feature.
le = preprocessing.LabelEncoder()
final_df['startdatehour'] = le.fit_transform(final_df.dtstartdatehour)
# LabelEncoder ends

# Everything except the target and the raw date/time columns is a feature.
excluded = ['bikeid', 'dtstartdatehour', 'date']
cols = [col for col in final_df.columns if col not in excluded]

x = final_df[cols]
y = final_df.bikeid

# Fixed random_state keeps the train/test split reproducible across runs.
# NOTE(review): sklearn.cross_validation was removed in later scikit-learn
# releases; train_test_split now lives in sklearn.model_selection. Switch the
# import and this call together when upgrading scikit-learn.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(x, y, random_state = 1)
regr.fit(X_train, y_train)
pred = regr.predict(X_test)

# Hold-out metrics: root-mean-squared error and coefficient of determination.
rmse = np.sqrt(metrics.mean_squared_error(y_test, pred))
r2_score = metrics.r2_score(y_test, pred)

# %-formatting reproduces the text the old two-argument print statements
# emitted (label, two spaces, value) while also running on Python 3.
print('RMSE:  %s' % rmse)
print('R2 score:  %s' % r2_score)

RMSE:  5.41174243009
R2 score:  0.0203012176324
