In [1]:
%matplotlib inline

import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score
import geohash
from sklearn.metrics import mean_squared_error
import random
import math
import datetime


## 1. Load file

In [2]:
# Each line is of the format:

#pickupGeohash, dropOffGeohash,time_num,day_of_week, count

names = ["pickup_geohash","dropoff_geohash","time_num","day_of_week", "count"]
df=pd.read_csv("./tmplocaldata/final/singlefile/part-00000", header=None, names = names)
print df.shape

(15285988, 5)


## 2. Feature Extraction

In [3]:
# Get the longitude and latitude from the geohash
def decodegeo(geo, which):
    if len(geo) >= 6:
        geodecoded = geohash.decode(geo)
        return geodecoded[which]
    else:
        return 0
    
def further_data_prep(df):
  
    df['time_sin'] = (df['time_num'] * 2 * math.pi).apply(math.sin)
    df['time_cos'] = (df['time_num'] * 2 * math.pi).apply(math.cos)
    df['pickup_lat'] = df['pickup_geohash'].apply(lambda geo: decodegeo(geo, 0))
    df['pickup_long'] = df['pickup_geohash'].apply(lambda geo: decodegeo(geo, 1))
    df['dropoff_lat'] = df['dropoff_geohash'].apply(lambda geo: decodegeo(geo, 0))
    df['dropoff_long'] = df['dropoff_geohash'].apply(lambda geo: decodegeo(geo, 1))
    
    return df

In [4]:
df = further_data_prep(df)

## 3. Train-test split

In [5]:
features_df = df[['time_num', 'time_sin', 'time_cos','day_of_week', 'pickup_lat', 'pickup_long', 'count']]
target_df = df[['dropoff_lat', 'dropoff_long']]

In [6]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split( features_df, target_df, test_size=0.2)

In [7]:
pickup_count_train = X_train[['count']]
X_train.drop('count', axis=1, inplace=True)

pickup_count_test = X_test[['count']]
X_test.drop('count', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## Machine learning (without using sample weights)

In [9]:
reg = RandomForestRegressor(n_estimators=20, max_depth=20, n_jobs=-1, verbose=4)
reg.fit(X_train,y_train) #, pickup_count_train)

[Parallel(n_jobs=-1)]: Done   1 out of  20 | elapsed:  2.5min remaining: 47.8min
[Parallel(n_jobs=-1)]: Done   4 out of  20 | elapsed:  2.5min remaining: 10.1min
[Parallel(n_jobs=-1)]: Done  10 out of  20 | elapsed:  5.3min remaining:  5.3min
[Parallel(n_jobs=-1)]: Done  16 out of  20 | elapsed:  5.6min remaining:  1.4min
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  6.9min finished


building tree 1 of 20
building tree 2 of 20
building tree 3 of 20
building tree 4 of 20
building tree 5 of 20
building tree 6 of 20
building tree 7 of 20
building tree 8 of 20
building tree 9 of 20
building tree 10 of 20
building tree 11 of 20
building tree 12 of 20
building tree 13 of 20
building tree 14 of 20
building tree 15 of 20
building tree 16 of 20
building tree 17 of 20
building tree 18 of 20
building tree 19 of 20
building tree 20 of 20


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=20, n_jobs=-1, oob_score=False, random_state=None,
           verbose=4, warm_start=False)

In [10]:
training_accuracy = reg.score(X_train, y_train)
valid_accuracy = reg.score(X_test, y_test)
rmsetrain = np.sqrt(mean_squared_error(reg.predict(X_train),y_train))
rmsevalid = np.sqrt(mean_squared_error(reg.predict(X_test),y_test))
print " R^2 (train) = %0.3f, R^2 (valid) = %0.3f, RMSE (train) = %0.3f, RMSE (valid) = %0.3f" % (training_accuracy, valid_accuracy, rmsetrain, rmsevalid)

[Parallel(n_jobs=8)]: Done   1 out of  20 | elapsed:    6.2s remaining:  2.0min
[Parallel(n_jobs=8)]: Done   4 out of  20 | elapsed:    7.0s remaining:   28.3s
[Parallel(n_jobs=8)]: Done  10 out of  20 | elapsed:   11.1s remaining:   11.1s
[Parallel(n_jobs=8)]: Done  16 out of  20 | elapsed:   12.8s remaining:    3.1s
[Parallel(n_jobs=8)]: Done  20 out of  20 | elapsed:   16.5s finished
[Parallel(n_jobs=8)]: Done   4 out of  20 | elapsed:    1.2s remaining:    5.0s
[Parallel(n_jobs=8)]: Done   1 out of  20 | elapsed:    1.3s remaining:   26.7s
[Parallel(n_jobs=8)]: Done  10 out of  20 | elapsed:    2.6s remaining:    2.6s
[Parallel(n_jobs=8)]: Done  16 out of  20 | elapsed:    3.0s remaining:    0.7s
[Parallel(n_jobs=8)]: Done  20 out of  20 | elapsed:    3.9s finished
[Parallel(n_jobs=8)]: Done   1 out of  20 | elapsed:    5.7s remaining:  1.8min
[Parallel(n_jobs=8)]: Done   4 out of  20 | elapsed:    6.6s remaining:   26.7s
[Parallel(n_jobs=8)]: Done  10 out of  20 | elapsed:   11.0s

 R^2 (train) = 0.254, R^2 (valid) = 0.141, RMSE (train) = 0.105, RMSE (valid) = 0.115


## Machine learning (with using sample weights)

In [None]:
# Not working
reg1 = RandomForestRegressor(n_estimators=10, max_depth=10, n_jobs=-1)
reg1.fit(X_train,y_train , sample_weight = pickup_count_train.values)