In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from TaxiDataFormatter.dataformatter import formatter
from sklearn.decomposition import PCA
from sklearn.preprocessing import minmax_scale

fig=plt.figure(figsize=(16, 9), dpi= 80, facecolor='w', edgecolor='k')
%matplotlib inline

In [2]:
!ls

[34mTaxiDataFormatter[m[m     [31msample_submission.zip[m[m [31mtest.zip[m[m
Taxi_v1.ipynb         sub_1.csv             train.csv
Taxi_v2.ipynb         sub_2.csv             [31mtrain.zip[m[m
sample_submission.csv test.csv


# Load Data and Analyze

In [3]:
train = pd.read_csv('train.csv', delimiter=',')
test = pd.read_csv('test.csv', delimiter=',')

In [4]:
train.shape

(1458644, 11)

In [5]:
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [6]:
train = train.sort_values(by=['trip_duration'])

#remove top 500 entries
train = train.iloc[500:] 
train = train.sample(frac=1)

In [7]:
y = train[['trip_duration']].copy()
Y = np.log(y)
train = train.drop(['trip_duration','dropoff_datetime'],axis=1)

In [8]:
mod_df1 = formatter(train)
mod_df1 = mod_df1.drop(['id'],axis=1)

X = mod_df1.values
X.shape

(1458144, 8)

In [9]:
test.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N


In [10]:
mod_df2 = formatter(test)
mod_df2 = mod_df2.drop(['id'],axis=1)
X_test = mod_df2.values

In [11]:
#normalize the data

X_std = minmax_scale(X, axis=0)
Xt_std = minmax_scale(X_test, axis=0)

pca = PCA()
X_std_pca = pca.fit_transform(X_std_norm)
X_std = X_std_pca[:,:10]

Xt_std_pca = pca.fit_transform(Xt_std_norm)
Xt_std = Xt_std_pca[:,:10]



# Split the data

In [12]:
from sklearn.model_selection import train_test_split
xtr, xtv, ytr, ytv = train_test_split(X_std,Y,test_size=0.20, random_state=43)

In [13]:
from sklearn.metrics import mean_squared_error

# Build the model

In [14]:
#build your model xgb regressor
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

In [15]:
import time
t1 = time.time()
xgb = XGBRegressor(nthread=-1,reg_alpha=0.5,learning_rate=0.02,max_depth=3,
                   min_child_weight=3,n_estimators=200,gamma=1,seed=31,random_state=31) 
xgb.fit(xtr, ytr)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=1, learning_rate=0.02, max_delta_step=0,
       max_depth=3, min_child_weight=3, missing=None, n_estimators=200,
       n_jobs=1, nthread=-1, objective='reg:linear', random_state=31,
       reg_alpha=0.5, reg_lambda=1, scale_pos_weight=1, seed=31,
       silent=True, subsample=1)

In [16]:
y_train_hat = xgb.predict(xtr)
y_test_hat = xgb.predict(xtv)

train_rmse = mean_squared_error(ytr, y_train_hat)
test_rmse = mean_squared_error(ytv, y_test_hat)
print("train Error: ",train_rmse)
print("test Error: ",test_rmse)
t2 = time.time()
print(t2-t1)

train Error:  0.23726130499215609
test Error:  0.24572128380348865
135.21742796897888


# Load Test Data

In [17]:
submission = pd.read_csv('sample_submission.csv')
submission.head()

Unnamed: 0,id,trip_duration
0,id3004672,959
1,id3505355,959
2,id1217141,959
3,id2150126,959
4,id1598245,959


In [18]:
yt_hat = xgb.predict(Xt_std)

In [19]:
submission['trip_duration'] = np.exp(yt_hat)

In [20]:
submission.head()

Unnamed: 0,id,trip_duration
0,id3004672,777.337585
1,id3505355,777.337585
2,id1217141,457.01416
3,id2150126,1145.836792
4,id1598245,355.513184


In [21]:
submission.to_csv('sub_1.csv',index=False)

In [22]:
1000*100/1458644

0.06855682400914823

In [23]:
###


In [24]:
X_norm = minmax_scale(X, axis=0)

In [25]:
pca = PCA()
X_norm_pca = pca.fit_transform(X_norm)
X_norm_pca_10 = X_norm_pca[:,:10]