In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
pd.options.display.max_rows=10
pd.options.mode.chained_assignment = None

  return f(*args, **kwds)


In [2]:
def add_travel_vector_features(df):
    """Add absolute coordinate-displacement columns to ``df`` in place.

    Reads the ``pickup_*`` / ``dropoff_*`` longitude and latitude columns and
    writes three new columns:

    * ``abs_diff_longitude`` -- |dropoff_longitude - pickup_longitude|
    * ``abs_diff_latitude``  -- |dropoff_latitude - pickup_latitude|
    * ``abs_dist_travel``    -- sum of the two columns above

    Returns ``None``; the frame passed in is mutated.
    """
    lon_delta = df['dropoff_longitude'] - df['pickup_longitude']
    lat_delta = df['dropoff_latitude'] - df['pickup_latitude']

    df['abs_diff_longitude'] = lon_delta.abs()
    df['abs_diff_latitude'] = lat_delta.abs()
    # Sum of the absolute per-axis differences (values are in coordinate units).
    df['abs_dist_travel'] = df['abs_diff_longitude'] + df['abs_diff_latitude']

In [3]:
def add_loc_bias_feature(df, lat_mean=40.75, long_mean=-73.97):
    """Add distance-from-reference-point columns to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``pickup_latitude``, ``pickup_longitude``,
        ``dropoff_latitude`` and ``dropoff_longitude``.
    lat_mean, long_mean : float, optional
        Latitude / longitude of the reference point. The defaults are the
        values this notebook originally hard-coded, so existing calls that
        pass only ``df`` behave exactly as before.

    Writes ``up_diff_center`` (pickup) and ``off_diff_center`` (dropoff): the
    square root of the summed squared coordinate differences to the reference
    point, computed directly on the raw coordinate values (no geodesic
    correction). Returns ``None``.
    """
    df['up_diff_center'] = np.sqrt(
        (df.pickup_latitude - lat_mean) ** 2
        + (df.pickup_longitude - long_mean) ** 2
    )
    df['off_diff_center'] = np.sqrt(
        (df.dropoff_latitude - lat_mean) ** 2
        + (df.dropoff_longitude - long_mean) ** 2
    )

In [4]:
def add_time_features(df, datetime_format=None):
    """Add ``pickup_year`` and ``pickup_hour`` columns to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``pickup_datetime`` column parseable by
        ``pandas.to_datetime``.
    datetime_format : str or None, optional
        Explicit ``strftime``-style format forwarded to ``pandas.to_datetime``.
        ``None`` (the default) lets pandas work out the format itself.

    Returns ``None``; the frame passed in is mutated.
    """
    # ``infer_datetime_format=True`` was dropped here: it is deprecated since
    # pandas 2.0 and only triggers a warning there. An explicit format can be
    # supplied through ``datetime_format`` when parsing speed matters.
    timecol = pd.to_datetime(df['pickup_datetime'], format=datetime_format)
    df['pickup_year'] = timecol.dt.year
    df['pickup_hour'] = timecol.dt.hour

In [5]:
def get_input_matrix(df):
    """Build the design matrix and target column vector from a feature frame.

    Returns a tuple ``(X, y)`` where ``X`` has one row per row of ``df`` and
    eight columns (a leading column of ones followed by the seven engineered
    features, in the order listed below), and ``y`` is ``df.fare_amount``
    reshaped to ``(len(df), 1)``.
    """
    feature_names = [
        'abs_diff_longitude',
        'abs_diff_latitude',
        'abs_dist_travel',
        'up_diff_center',
        'off_diff_center',
        'pickup_year',
        'pickup_hour',
    ]
    n_rows = df.shape[0]

    # The leading column of ones provides the intercept term.
    columns = [np.ones(len(df))]
    columns.extend(df[name] for name in feature_names)
    design = np.column_stack(columns)

    target = df.fare_amount.values.reshape((n_rows, 1))
    return design, target

In [6]:
def train_weight(df_train):
    """Fit a least-squares fare model on one chunk and score it on its tail.

    Steps:

    1. Drop rows with any missing value, then keep only rows whose pickup and
       dropoff coordinates lie strictly inside latitude (39, 42) and longitude
       (-74.5, -72), and whose ``fare_amount`` and ``passenger_count`` are
       positive.
    2. Add the engineered feature columns.
    3. Fit on the first 90% of the remaining rows (in existing order) with
       ``np.linalg.lstsq`` and evaluate on the last 10%.

    Prints the validation RMSE and returns ``(w_lsr, RMSE_lsr)`` where
    ``w_lsr`` is the weight column vector. Predictions are rounded to two
    decimals before the RMSE is computed.
    """
    complete_rows = df_train.dropna(how='any', axis='rows')

    # Strict inequalities on purpose: rows exactly on a bound are rejected.
    pickup_ok = (
        (complete_rows.pickup_latitude > 39)
        & (complete_rows.pickup_latitude < 42)
        & (complete_rows.pickup_longitude > -74.5)
        & (complete_rows.pickup_longitude < -72)
    )
    dropoff_ok = (
        (complete_rows.dropoff_latitude > 39)
        & (complete_rows.dropoff_latitude < 42)
        & (complete_rows.dropoff_longitude > -74.5)
        & (complete_rows.dropoff_longitude < -72)
    )
    ride_ok = (complete_rows.fare_amount > 0) & (complete_rows.passenger_count > 0)
    cleaned = complete_rows[pickup_ok & dropoff_ok & ride_ok]

    # Each helper adds its columns to ``cleaned`` in place.
    for add_features in (add_travel_vector_features,
                         add_loc_bias_feature,
                         add_time_features):
        add_features(cleaned)

    # Positional 90/10 split: head for fitting, tail for validation.
    split_at = cleaned.shape[0] * 9 // 10
    train_X, train_y = get_input_matrix(cleaned.iloc[0:split_at, :])
    valid_X, valid_y = get_input_matrix(cleaned.iloc[split_at:, :])

    solution = np.linalg.lstsq(train_X, train_y, rcond=None)
    w_lsr = solution[0].reshape((-1, 1))

    valid_y_est = (valid_X @ w_lsr).round(decimals=2)
    residual = valid_y - valid_y_est
    RMSE_lsr = (residual ** 2).mean() ** .5
    print('RMSE = ' + str(RMSE_lsr))
    return w_lsr, RMSE_lsr

In [7]:
# Load the test set in full.
df_test=pd.read_csv('../input/test.csv')
# Read the training CSV lazily, 200000 rows at a time.
chunker = pd.read_csv('../input/train.csv',chunksize=200000)
# Accumulators that the training loop extends by one column per chunk:
# W collects the 8-row weight vectors, E the matching validation RMSE values.
W=np.zeros((8,0))
E=np.zeros((1,0))
# 1-based chunk counter used for progress printing in the training loop.
i=1

In [8]:
# Fit one least-squares model per training chunk and record its weights/error.
# NOTE: relies on `chunker`, `W`, `E` and `i` created in the previous cell;
# re-run that cell first, otherwise the reader is already exhausted and
# W/E keep their old columns.
for piece in chunker:
    print('Piece: '+ str(i))
    w,e = train_weight(piece)
    # Append this chunk's weight vector and RMSE as a new column.
    W = np.c_[W,w]
    E = np.c_[E,e]
    i += 1

Piece: 1
RMSE = 5.638389770647216
Piece: 2
RMSE = 6.03102568085073
Piece: 3
RMSE = 5.823117706232769
Piece: 4
RMSE = 5.4309840271538
Piece: 5
RMSE = 5.712058309511129
Piece: 6
RMSE = 5.437167054804181
Piece: 7
RMSE = 5.518291811337159
Piece: 8
RMSE = 5.427126622474414
Piece: 9
RMSE = 5.420534411539426
Piece: 10
RMSE = 5.180118018362579
Piece: 11
RMSE = 5.939887049133567
Piece: 12
RMSE = 6.364219220569886
Piece: 13
RMSE = 5.665275093120925
Piece: 14
RMSE = 5.5805350053477225
Piece: 15
RMSE = 4.974963722213174
Piece: 16
RMSE = 5.888458549358139
Piece: 17
RMSE = 5.704377631880081
Piece: 18
RMSE = 5.261232730418779
Piece: 19
RMSE = 5.304928519461461
Piece: 20
RMSE = 5.576208075696302
Piece: 21
RMSE = 5.626537245839204
Piece: 22
RMSE = 5.803616456341846
Piece: 23
RMSE = 5.279382313601138
Piece: 24
RMSE = 5.905086376184509
Piece: 25
RMSE = 5.737301159850406
Piece: 26
RMSE = 5.524442624190085
Piece: 27
RMSE = 5.991845102019688
Piece: 28
RMSE = 5.9913493103855275
Piece: 29
RMSE = 6.16953840880

Piece: 232
RMSE = 5.632539673254125
Piece: 233
RMSE = 5.977820403754878
Piece: 234
RMSE = 5.204915666926365
Piece: 235
RMSE = 5.156692081032676
Piece: 236
RMSE = 6.004480280172913
Piece: 237
RMSE = 5.691040309576808
Piece: 238
RMSE = 5.588351725595612
Piece: 239
RMSE = 5.390417341323276
Piece: 240
RMSE = 6.093416354452878
Piece: 241
RMSE = 5.364091765567647
Piece: 242
RMSE = 5.9253981448364215
Piece: 243
RMSE = 6.034701348424739
Piece: 244
RMSE = 5.291176012748869
Piece: 245
RMSE = 5.29861021205592
Piece: 246
RMSE = 6.213938457756349
Piece: 247
RMSE = 5.509603892642242
Piece: 248
RMSE = 5.895180917366362
Piece: 249
RMSE = 5.35081278482392
Piece: 250
RMSE = 5.496017184645833
Piece: 251
RMSE = 5.3607893467156
Piece: 252
RMSE = 5.461925465943391
Piece: 253
RMSE = 5.5029421065814335
Piece: 254
RMSE = 6.053966221581879
Piece: 255
RMSE = 5.132206211134021
Piece: 256
RMSE = 5.758875681352675
Piece: 257
RMSE = 5.869229224209488
Piece: 258
RMSE = 5.4983019120398025
Piece: 259
RMSE = 5.738406566

In [10]:
# Persist the per-chunk weight vectors (W) and validation errors (E) to disk.
Model={'W': W, 'E': E}
import pickle
with open('ModelParameter_2.p','wb') as fp:
    pickle.dump(Model,fp)

In [20]:
# Keep only the chunk models whose validation RMSE is below 5.5 and average
# their weight vectors into a single column vector.
selection = (E < 5.5).reshape(-1)
W_smallerr = W[:, selection]
W_final = W_smallerr.mean(axis=1).reshape((W_smallerr.shape[0], 1))

In [17]:
# Mean validation RMSE over the chunk models picked out by `selection`.
np.mean(E[:,selection])

5.255530208711872

In [18]:
# Shape check: (number of weights, number of selected chunk models).
W_smallerr.shape

(8, 120)

In [21]:
# Shape check: the averaged weights should form a single column vector.
W_final.shape

(8, 1)

In [22]:
# Persist the per-chunk weights/errors together with the averaged weights.
Model={'W': W, 'E': E, 'W_final': W_final}
import pickle
with open('ModelParameter_3.p','wb') as fp:
    pickle.dump(Model,fp)

In [7]:
# Load the test set that predictions will be generated for.
df_test=pd.read_csv('../input/test.csv')

In [8]:
# Display the raw test frame (truncated by pd.options.display.max_rows).
df_test

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24 UTC,-73.973320,40.763805,-73.981430,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44 UTC,-73.982524,40.751260,-73.979654,40.746139,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12 UTC,-73.981160,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12 UTC,-73.966046,40.789775,-73.988565,40.744427,1
...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,2015-05-10 12:37:51 UTC,-73.968124,40.796997,-73.955643,40.780388,6
9910,2015-01-12 17:05:51.0000001,2015-01-12 17:05:51 UTC,-73.945511,40.803600,-73.960213,40.776371,6
9911,2015-04-19 20:44:15.0000001,2015-04-19 20:44:15 UTC,-73.991600,40.726608,-73.789742,40.647011,6
9912,2015-01-31 01:05:19.0000005,2015-01-31 01:05:19 UTC,-73.985573,40.735432,-73.939178,40.801731,6


In [10]:
# Add the same engineered feature columns to the test frame (in place) that
# the training chunks received.
for add_features in (add_travel_vector_features,
                     add_loc_bias_feature,
                     add_time_features):
    add_features(df_test)

df = df_test

# Design matrix for prediction: a leading column of ones followed by the
# engineered features. The column order must match get_input_matrix, because
# the fitted weights are positional.
test_X = np.column_stack(
    [np.ones(len(df))]
    + [df[name] for name in ('abs_diff_longitude',
                             'abs_diff_latitude',
                             'abs_dist_travel',
                             'up_diff_center',
                             'off_diff_center',
                             'pickup_year',
                             'pickup_hour')]
)

In [12]:
import pickle
# Reload the saved model parameters.
# NOTE(security): pickle.load can execute arbitrary code from the file; only
# load parameter files that were produced by this notebook.
with open('ModelParameter_3.p', 'rb') as handle:
    data = pickle.load(handle)

In [14]:
# Pull the averaged weight column vector out of the loaded parameter dict.
W_final=data['W_final']

In [20]:
# Linear prediction for every test row, rounded to two decimals.
test_y_est = np.matmul(test_X, W_final).round(decimals = 2)

In [21]:
# Shape check: one prediction per test row, as a column vector.
test_y_est.shape

(9914, 1)

In [28]:
# Assemble the submission table: the test keys next to the predicted fares
# (the column vector is flattened to 1-D so it fits a DataFrame column).
submission = pd.DataFrame(
    {'key': df_test.key, 'fare_amount': test_y_est.reshape((len(test_y_est),))},
    columns = ['key', 'fare_amount'])
# Write without the index so the CSV holds only the two columns above.
submission.to_csv('submission_lsr.csv', index = False)

In [27]:
# Display the submission table (truncated by pd.options.display.max_rows).
submission

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,9.89
1,2015-01-27 13:08:24.0000003,10.91
2,2011-10-08 11:53:44.0000002,5.65
3,2012-12-01 21:12:12.0000002,8.47
4,2012-12-01 21:12:12.0000003,12.89
...,...,...
9909,2015-05-10 12:37:51.0000002,10.73
9910,2015-01-12 17:05:51.0000001,11.77
9911,2015-04-19 20:44:15.0000001,53.23
9912,2015-01-31 01:05:19.0000005,21.23
