In [2]:
# bike2.ipynb
# prediction for the Kaggle bike sharing demand contest

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time as systime
import datetime as dtime

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cross_validation import train_test_split

def splitDatetime(data): 
    sub = pd.DataFrame(data.datetime.str.split(' ').tolist(), columns = "date time".split())
    date = pd.DataFrame(sub.date.str.split('-').tolist(), columns="year month day".split())
    time = pd.DataFrame(sub.time.str.split(':').tolist(), columns = "hour minute second".split())
    data['year'] = date['year'].astype(int)
    data['month'] = date['month'].astype(int)
    data['day'] = date['day'].astype(int)
    data['hour'] = time['hour'].astype(int)
    data['weekday']=0 #weekday=0: monday, weekday=6: sunday.
    for i in range(0,len(train.year)):
        data.weekday[i]=dtime.date(train.year[i], train.month[i], train.day[i]).weekday()   
    return data

def normalize(data): #feature normalization
    data = (data - data.mean()) / (data.max() - data.min())
    return data

def predict(est, train, test, features, target):

    est.fit(train[features], train[target])

    with open("pred.csv", 'wb') as f:
        f.write("datetime,count\n")

        for index, value in enumerate(list(est.predict(test[features]))):
            f.write("%s,%s\n" % (test['datetime'].loc[index], int(value)))
            
def crossval(est, train, features, target): #cross-validation
    start = systime.time()    
    
    train_train, train_test, target_train, target_test = train_test_split(train[features], train[target], test_size=0.33, random_state=42)

    est.fit(train_train, target_train)

    end = systime.time()
    print 'rmsle:', rmsle(target_test, est.predict(train_test))
    print 'elapsed time:', end-start
    
def sle(actual, predicted): #squared log error
    return (np.power(np.log(np.array(actual)+1) - 
            np.log(np.array(predicted)+1), 2))
    
def rmsle(targets, predictions): #root mean squared log error
    return np.sqrt((sle(targets, predictions)**2).mean())

In [3]:
#read in the data, generate weekday feature

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

train = splitDatetime(train)
test = splitDatetime(test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [4]:
# divide sample into workdays and non-workdays

train_wd = train[train.workingday==1]
train_nwd = train[train.workingday==0]
test_wd = test[test.workingday==1]
test_nwd = test[test.workingday==0]

In [5]:
# take a look at the raw data

train_wd.head(5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day,hour,weekday
47,2011-01-03 00:00:00,1,0,1,1,9.02,9.85,44,23.9994,0,5,5,2011,1,3,0,0
48,2011-01-03 01:00:00,1,0,1,1,8.2,8.335,44,27.9993,0,2,2,2011,1,3,1,0
49,2011-01-03 04:00:00,1,0,1,1,6.56,6.82,47,26.0027,0,1,1,2011,1,3,4,0
50,2011-01-03 05:00:00,1,0,1,1,6.56,6.82,47,19.0012,0,3,3,2011,1,3,5,0
51,2011-01-03 06:00:00,1,0,1,1,5.74,5.305,50,26.0027,0,30,30,2011,1,3,6,0


In [6]:
# determine the features to be used in the Machine Learning algorithm

target = 'count'

features = [
#                'season',
#                'holiday',
                'workingday',
#                'weather',
                'temp',
#                'atemp',
                'humidity',
#                'windspeed',
                'year',
                'month',
#                'day',
                'hour',
                'weekday'
                 ]

# features that are important for casual bike-riders:
features_cas = [
#                'season',
#                'holiday',
#                'workingday',
#                'weather',
#                'temp',
                'atemp',
                'humidity',
                'windspeed',
                'year',
                'month',
                'day',
                'hour',
                'weekday'
                 ]

# features that are imporant for regular bike-riders:
features_reg = [#'season',
#                'holiday',
#                'workingday',
#                'weather',
#                'temp',
                'atemp',
                'humidity',
#                'windspeed',
                'year',
                'month',
#                'day',
                'hour',
                'weekday'
                ]

In [7]:
#normalize the features

train[features_reg] = normalize(train[features_reg])
train[features_cas] = normalize(train[features_cas])

In [12]:
#define the ML algorithm

x=500
est = RandomForestRegressor(n_estimators=x) #one regressor for all


# crossvalidate the one-for-all regressor

crossval(est, train, features, 'casual')

rmsle: 0.633364404226
elapsed time: 7.22376894951


In [16]:
# based on the estimator, evaluate the feature importances

importances = est.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking all:")
for f in range(len(features)):
    print("%d. feature: %s (%f)" % (f + 1, features[indices[f]], importances[indices[f]]))

Feature ranking all:
1. feature: hour (0.383353)
2. feature: temp (0.261897)
3. feature: workingday (0.167886)
4. feature: humidity (0.081047)
5. feature: weekday (0.040105)
6. feature: year (0.034634)
7. feature: month (0.031077)


In [8]:
# make a prediction

predict(est, train, test, features, target)

In [136]:
#create output containing prediction for the test set

with open("pred.csv", 'wb') as f:
    f.write("datetime,count\n")
    for index, value in enumerate(list(test['count'])):
        f.write("%s,%s\n" % (test['datetime'].loc[index], int(value)))

In [None]:
# I also tried 4 separate regressors (for casual vsregulars vs workdays vs weekends):
# but for some reason it performes worse that the one-for-all regressor

est_reg_wd = RandomForestRegressor(n_estimators=x)
est_cas_wd = RandomForestRegressor(n_estimators=x)
est_reg_nwd = RandomForestRegressor(n_estimators=x)
est_cas_nwd = RandomForestRegressor(n_estimators=x)

start = systime.time()    
    
train_train, train_test, target_train, target_test = train_test_split(train_wd[features_cas], train_wd['casual'], test_size=0.33, random_state=42)
est_cas_wd.fit(train_train, target_train)
predcas = est_cas_wd.predict(train_test)
rmsle_cas_wd = rmsle(target_test, predcas)

train_train, train_test, target_train, target_test = train_test_split(train_wd[features_reg], train_wd['registered'], test_size=0.33, random_state=42)
est_reg_wd.fit(train_train, target_train)
predreg = est_reg_wd.predict(train_test)
rmsle_reg_wd = rmsle(target_test, predreg)
predcount = predcas + predreg

train_train, train_test, target_train, target_test = train_test_split(train_wd[features_reg], train_wd['count'], test_size=0.33, random_state=42)
end = systime.time()
print '---workingdays---'
print 'rmsle cas reg total:', rmsle_cas_wd, rmsle_reg_wd, rmsle(target_test, predcount)
print 'elapsed time:', end-start


start = systime.time()    
    
train_train, train_test, target_train, target_test = train_test_split(train_nwd[features_cas], train_nwd['casual'], test_size=0.33, random_state=42)
est_cas_nwd.fit(train_train, target_train)
predcas = est_cas_nwd.predict(train_test)
rmsle_cas_nwd = rmsle(target_test, predcas)

train_train, train_test, target_train, target_test = train_test_split(train_nwd[features_reg], train_nwd['registered'], test_size=0.33, random_state=42)
est_reg_nwd.fit(train_train, target_train)
predreg = est_reg_nwd.predict(train_test)
rmsle_reg_nwd = rmsle(target_test, predreg)
predcount = predcas + predreg

train_train, train_test, target_train, target_test = train_test_split(train_nwd[features_reg], train_nwd['count'], test_size=0.33, random_state=42)
end = systime.time()
print '---no workingdays---'
print 'rmsle cas reg total:', rmsle_cas_nwd, rmsle_reg_nwd, rmsle(target_test, predcount)
print 'elapsed time:', end-start

In [None]:
# feature ranking

importances = est_reg_wd.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking registered:")
for f in range(len(features_reg)):
    print("%d. feature: %s (%f)" % (f + 1, features_reg[indices[f]], importances[indices[f]]))
    
importances = est_cas_wd.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking casual:")
for f in range(len(features_cas)):
    print("%d. feature: %s (%f)" % (f + 1, features_cas[indices[f]], importances[indices[f]]))

In [9]:
est_reg_wd.fit(train_wd[features], train_wd['registered'])
test_wd['registered'] = est_reg_wd.predict(test_wd[features])
est_cas_wd.fit(train_wd[features], train_wd['casual'])
test_wd['casual'] = est_cas_wd.predict(test_wd[features])
test_wd['count'] = test_wd.registered + test_wd.casual
est_reg_nwd.fit(train_nwd[features], train_nwd['registered'])
test_nwd['registered'] = est_reg_nwd.predict(test_nwd[features])
est_cas_nwd.fit(train_nwd[features], train_nwd['casual'])
test_nwd['casual'] = est_cas_nwd.predict(test_nwd[features])
test_nwd['count'] = test_nwd['registered'] + test_nwd['casual']

test = test_wd.merge(test_nwd, how='outer')

---workingdays---
rmsle cas reg total: 0.637318711798 0.367398369022 0.353534781443
elapsed time: 12.3438110352
---no workingdays---
rmsle cas reg total: 0.703491689307 0.370432493012 0.381738656948
elapsed time: 6.03031682968
