In [95]:
import os, gc, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgb
import xgboost as xgb

%matplotlib inline

In [57]:
train = pd.read_csv("data/new_train.csv")
test = pd.read_csv("data/new_test.csv")
train.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup,dropoff,distance,duration
0,2012-08-11 10:06:00.00000063,6.1,2012-08-11 10:06:00,-73.96054,40.7757,-73.97509,40.782598,2,"40.7757,-73.96054000000001","40.782598,-73.97509000000001",1.6,5.0
1,2009-12-25 14:29:00.00000057,14.5,2009-12-25 14:29:00,-73.999317,40.743992,-73.97223,40.796308,1,"40.743992,-73.999317","40.796308,-73.97223000000001",7.6,19.0
2,2009-02-23 06:49:40.0000001,3.3,2009-02-23 06:49:40,-73.960274,40.773106,-73.965761,40.769447,1,"40.773106,-73.960274","40.769447,-73.965761",0.8,4.0
3,2010-06-01 21:36:04.0000001,5.7,2010-06-01 21:36:04,-73.981212,40.758478,-73.969858,40.765873,2,"40.758478000000004,-73.981212","40.765873,-73.969858",2.0,10.0
4,2012-04-10 12:11:00.0000003,7.3,2012-04-10 12:11:00,-73.968438,40.767667,-73.96114,40.798557,1,"40.767666999999996,-73.968438","40.798557,-73.96114",4.4,13.0


In [58]:
train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime'])
test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'])

In [59]:
train['hour'] = train.pickup_datetime.dt.hour
train['year'] = train.pickup_datetime.dt.year
train['month'] = train.pickup_datetime.dt.month
test['hour'] = test.pickup_datetime.dt.hour
test['year'] = test.pickup_datetime.dt.year
test['month'] = test.pickup_datetime.dt.month

In [116]:
train.dropna(inplace=True)
X = train[['distance','duration','passenger_count','hour','month','year']]
y = train['fare_amount']
X_test = test[['distance','duration','passenger_count','hour','month','year']]
print (X.shape, y.shape, X_test.shape)

(14678, 6) (14678,) (9914, 6)


In [117]:
np.isnan(y).any()

False

In [118]:
X['dist_dur_avg'] = (X['distance']*X['duration'])/2
X_test['dist_dur_avg'] = (X_test['distance']*X_test['duration'])/2

In [119]:
X['speed'] = X['distance']/X['duration']
X_test['speed'] = X_test['distance']/X_test['duration']

In [120]:
X_train, dev_X, y, dev_y = train_test_split(X, y, test_size=0.2, random_state=42)

In [149]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        "learning_rate" : 0.003,
        "bagging_fraction" : 0.8,
        "feature_fraction" : 1,
        "bagging_frequency" : 6,
        "bagging_seed" : 42,
        "verbosity" : -1,
        "seed": 42,
        "max_bin":500,
    }
    
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    model = lgb.train(params, lgtrain, 5000, 
                      valid_sets=[lgtrain, lgval], 
                      early_stopping_rounds=500, 
                      verbose_eval=100, 
                      evals_result=evals_result)
    
    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    return pred_test_y, model, evals_result

In [150]:
pred, model, lgb_res = run_lgb(X_train, y, dev_X, dev_y, X_test)

Training until validation scores don't improve for 500 rounds.
[100]	training's rmse: 7.55615	valid_1's rmse: 7.38939
[200]	training's rmse: 6.27238	valid_1's rmse: 6.14959
[300]	training's rmse: 5.42174	valid_1's rmse: 5.33388
[400]	training's rmse: 4.8706	valid_1's rmse: 4.81903
[500]	training's rmse: 4.51695	valid_1's rmse: 4.50597
[600]	training's rmse: 4.29016	valid_1's rmse: 4.31622
[700]	training's rmse: 4.14449	valid_1's rmse: 4.19954
[800]	training's rmse: 4.04699	valid_1's rmse: 4.12852
[900]	training's rmse: 3.97808	valid_1's rmse: 4.08995
[1000]	training's rmse: 3.92575	valid_1's rmse: 4.07027
[1100]	training's rmse: 3.87954	valid_1's rmse: 4.05393
[1200]	training's rmse: 3.83369	valid_1's rmse: 4.04446
[1300]	training's rmse: 3.79518	valid_1's rmse: 4.03938
[1400]	training's rmse: 3.75875	valid_1's rmse: 4.04092
[1500]	training's rmse: 3.7286	valid_1's rmse: 4.03834
[1600]	training's rmse: 3.69727	valid_1's rmse: 4.04541
[1700]	training's rmse: 3.66683	valid_1's rmse: 4.04

In [151]:
sub = pd.read_csv("data/sample_submission.csv")

In [152]:
sub['fare_amount'] = pd.Series(pred)

In [153]:
sub.to_csv("data/one_res.csv", index=False)