In [18]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn
from sklearn import preprocessing
from sklearn.model_selection import KFold,train_test_split
import time
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt


import os

In [19]:
# Load the pre-processed training rows and the test rows from CSV files
# addressed by relative path (resolved against the working directory).
train = pd.read_csv("Train_Processed_4.23.csv")
test = pd.read_csv("test_new_feature.csv")

In [20]:
# Report (rows, columns) for the training and test frames on one line.
print(f"{train.shape} {test.shape}")

(9750645, 11) (9914, 9)


In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training columns.
print(train.describe())

         Unnamed: 0   fare_amount  pickup_longitude  pickup_latitude  \
count  9.750645e+06  9.750645e+06      9.750645e+06     9.750645e+06   
mean   4.999705e+06  1.132601e+01     -7.397515e+01     4.075110e+01   
std    2.886771e+06  9.681748e+00      3.840572e-02     2.950159e-02   
min    0.000000e+00  0.000000e+00     -7.449954e+01     4.050005e+01   
25%    2.499544e+06  6.000000e+00     -7.399228e+01     4.073656e+01   
50%    4.999803e+06  8.500000e+00     -7.398210e+01     4.075336e+01   
75%    7.499597e+06  1.250000e+01     -7.396832e+01     4.076755e+01   
max    9.999999e+06  9.520000e+02     -7.280843e+01     4.175844e+01   

       dropoff_longitude  dropoff_latitude  passenger_count  \
count       9.750645e+06      9.750645e+06     9.750645e+06   
mean       -7.397430e+01      4.075146e+01     1.690576e+00   
std         3.754662e-02      3.272592e-02     1.306262e+00   
min        -7.449991e+01      4.050003e+01     1.000000e+00   
25%        -7.399158e+01      4.0735

In [22]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_diff_longitude,abs_diff_latitude
0,0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1,0.002701,0.009041
1,1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1,0.03678,0.070701
2,2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2,0.008504,0.010708
3,3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1,0.004437,0.024949
4,4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1,0.01144,0.015754


In [23]:
# Count missing values per column (largest first) in the training data...
print(train.isnull().sum().sort_values(ascending=False))
# ...and in the test data.
print(test.isnull().sum().sort_values(ascending=False))
# Drop every training row that has at least one missing value.
# `dropna` replaces the former `train.isnull().any(1)` index round-trip:
# passing `axis` positionally to `DataFrame.any` is rejected by pandas 2.x.
train = train.dropna(axis=0, how='any')

abs_diff_latitude     0
abs_diff_longitude    0
passenger_count       0
dropoff_latitude      0
dropoff_longitude     0
pickup_latitude       0
pickup_longitude      0
pickup_datetime       0
fare_amount           0
key                   0
Unnamed: 0            0
dtype: int64
abs_diff_latitude     0
abs_diff_longitude    0
passenger_count       0
dropoff_latitude      0
dropoff_longitude     0
pickup_latitude       0
pickup_longitude      0
pickup_datetime       0
key                   0
dtype: int64


In [24]:
def add_distances_features(df):
    """Append two distance columns to ``df`` in place.

    Reads the ``abs_diff_longitude`` and ``abs_diff_latitude`` columns and
    writes:

    * ``euclidean`` -- square root of the sum of the squared differences
    * ``manhattan`` -- plain sum of the two differences

    Both are computed directly on the coordinate differences, so they are in
    the same unit as the input columns. Nothing is returned.
    """
    lon_delta, lat_delta = df['abs_diff_longitude'], df['abs_diff_latitude']

    sum_of_squares = lon_delta ** 2 + lat_delta ** 2
    df['euclidean'] = sum_of_squares ** 0.5
    df['manhattan'] = lon_delta + lat_delta

def add_time_features(df):
    """Derive calendar features from ``pickup_datetime``, modifying ``df`` in place.

    The ``pickup_datetime`` column is parsed from strings shaped like
    ``'2009-06-15 17:26:21 UTC'`` and overwritten with the parsed timestamps.
    Five integer columns are then added: ``year``, ``month``, ``day``,
    ``hour`` and ``weekday`` (Monday is 0).

    The parsed ``pickup_datetime`` column is left on the frame. The previous
    version tried to drop it with ``df = df.drop(...)``, but that only rebound
    the local name and never affected the caller's frame; callers that do not
    want the column must drop it themselves.

    Nothing is returned.
    """
    pickup = pd.to_datetime(df['pickup_datetime'], format='%Y-%m-%d %H:%M:%S %Z')
    df['pickup_datetime'] = pickup

    # The vectorised ``.dt`` accessor replaces one Python-level lambda call
    # per row and per feature, which matters on multi-million-row frames.
    parts = pickup.dt
    df['year'] = parts.year
    df['month'] = parts.month
    df['day'] = parts.day
    df['hour'] = parts.hour
    df['weekday'] = parts.weekday

# Enrich both frames in place: distance columns first, then calendar columns.
for _frame in (train, test):
    add_distances_features(_frame)
    add_time_features(_frame)

In [25]:
# Regression target as a NumPy array, and the remaining columns as features.
y_train = train['fare_amount'].values
X_train = train.drop(columns=['fare_amount'])
# X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.33, random_state=42)

In [26]:
# Preview the test frame after the feature-engineering calls above.
test.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_diff_longitude,abs_diff_latitude,euclidean,manhattan,year,month,day,hour,weekday
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24+00:00,-73.97332,40.763805,-73.98143,40.743835,1,0.00811,0.01997,0.021554,0.02808,2015,1,27,13,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1,0.012024,0.019817,0.02318,0.031841,2015,1,27,13,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44+00:00,-73.982524,40.75126,-73.979654,40.746139,1,0.00287,0.005121,0.00587,0.007991,2011,10,8,11,5
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12+00:00,-73.98116,40.767807,-73.990448,40.751635,1,0.009288,0.016172,0.018649,0.02546,2012,12,1,21,5
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12+00:00,-73.966046,40.789775,-73.988565,40.744427,1,0.022519,0.045348,0.050631,0.067867,2012,12,1,21,5


In [27]:
# Columns that are not model inputs: the target, the saved row index,
# the row identifier and the raw timestamp.
dropped_columns = ['fare_amount','Unnamed: 0', 'key', 'pickup_datetime']
# The test frame only carries the identifier and the timestamp among these.
dropped_columns1 = ['key', 'pickup_datetime']

# Keep the identifiers aside; they are needed again for the submission file.
test_key = test['key']

X_train = train.drop(columns=dropped_columns)
X_test = test.drop(columns=dropped_columns1)

In [28]:
# Preview the test feature matrix after the non-feature columns were dropped.
X_test.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_diff_longitude,abs_diff_latitude,euclidean,manhattan,year,month,day,hour,weekday
0,-73.97332,40.763805,-73.98143,40.743835,1,0.00811,0.01997,0.021554,0.02808,2015,1,27,13,1
1,-73.986862,40.719383,-73.998886,40.739201,1,0.012024,0.019817,0.02318,0.031841,2015,1,27,13,1
2,-73.982524,40.75126,-73.979654,40.746139,1,0.00287,0.005121,0.00587,0.007991,2011,10,8,11,5
3,-73.98116,40.767807,-73.990448,40.751635,1,0.009288,0.016172,0.018649,0.02546,2012,12,1,21,5
4,-73.966046,40.789775,-73.988565,40.744427,1,0.022519,0.045348,0.050631,0.067867,2012,12,1,21,5


In [29]:
# LightGBM hyper-parameters for the fare regressor.
# Two keys were removed from this dict:
#   * 'num_boost_round' -- it is an argument of `lgb.train`, not a booster
#     parameter, and is now passed there with the same value (1000).
#   * 'eval_freq'       -- not a LightGBM parameter; it only produced an
#     "unknown parameter" warning.
params = {
    'objective': "regression",
    'metric': "rmse",
    'learning_rate': 0.034,
    'num_leaves': 31,
    'max_depth': -1,
    # NOTE(review): LightGBM applies row subsampling only when a bagging
    # frequency is set as well; none is set here, so this value may be
    # inert -- confirm.
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'min_split_gain': 0.5,
    'min_child_weight': 1,
    'min_child_samples': 10,
    'scale_pos_weight': 1,
    'num_threads': 4,
    'boosting_type': "gbdt",
    # NOTE(review): this treats zeros as missing values, yet hour 0 and
    # weekday 0 are legitimate feature values -- confirm this is intended.
    'zero_as_missing': True,
    'seed': 0,
}

# `silent=True` is no longer accepted by `lgb.Dataset`; its documented
# replacement is the 'verbose' entry of the dataset's own `params`.
train_set = lgb.Dataset(X_train, label=y_train, params={'verbose': -1})
model = lgb.train(params, train_set=train_set, num_boost_round=1000)



In [None]:
# from sklearn.model_selection import GridSearchCV
# # Tune the tree-structure parameters (max_depth, num_leaves):
# model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=50,
#                               learning_rate=0.1, n_estimators=43, max_depth=6,
#                               metric='rmse', bagging_fraction = 0.8,feature_fraction = 0.8)

# params_test1={
#     'max_depth': range(3,8,2),
#     'num_leaves':range(20, 110, 30)
# }
# gsearch1 = GridSearchCV(estimator=model_lgb, param_grid=params_test1, scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=4)
# gsearch1.fit(X_train, y_train)
# gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


In [30]:
# Predict fares for the test feature matrix. The zero-filled array that used
# to be allocated first was overwritten immediately and has been removed.
# NOTE(review): training above used no validation set or early stopping, so
# `best_iteration` presumably holds no tuned value and all trees are used -- confirm.
pred_test_y = model.predict(X_test, num_iteration=model.best_iteration)

In [31]:
# Show the predicted fares (NumPy abbreviates long arrays with "...").
print(pred_test_y)

[10.37355639 10.20687703  4.61326623 ... 53.57751232 20.66583504
  7.07329248]


In [32]:
# Assemble the submission: one row per test key with its predicted fare.
# The frame used to be built with `pred_test_y.round(2)` and the column was
# then overwritten with the unrounded predictions, so the rounding never
# reached the file. The frame is now built once with the values that were
# actually written.
# NOTE(review): if two-decimal fares were intended, round here instead.
submission = pd.DataFrame({"key": test_key, "fare_amount": pred_test_y})
submission.to_csv('submission_LGB.csv', index=False)
submission.head(20)

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,10.373556
1,2015-01-27 13:08:24.0000003,10.206877
2,2011-10-08 11:53:44.0000002,4.613266
3,2012-12-01 21:12:12.0000002,8.340359
4,2012-12-01 21:12:12.0000003,15.034429
5,2012-12-01 21:12:12.0000005,10.468665
6,2011-10-06 12:10:20.0000001,5.483303
7,2011-10-06 12:10:20.0000003,49.412808
8,2011-10-06 12:10:20.0000002,11.344753
9,2014-02-18 15:22:20.0000002,6.772785
