In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import geopy.distance as gp
import math
from datetime import datetime
import xgboost as xgb



In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
pd.set_option('display.width', 150)

In [3]:
train['new_user'].fillna('NO', inplace=True)
test['new_user'].fillna('NO', inplace=True)

train['tip_amount'].fillna(train['tip_amount'].median(), inplace=True)
test['tip_amount'].fillna(test['tip_amount'].median(), inplace=True)

train['pickup_longitude'].fillna(train['pickup_longitude'].mode()[0], inplace=True)
test['pickup_longitude'].fillna(test['pickup_longitude'].mode()[0], inplace=True)

train['pickup_latitude'].fillna(train['pickup_latitude'].mode()[0], inplace=True)
test['pickup_latitude'].fillna(test['pickup_latitude'].mode()[0], inplace=True)

train['dropoff_longitude'].fillna(train['dropoff_longitude'].mode()[0], inplace=True)
test['dropoff_longitude'].fillna(test['dropoff_longitude'].mode()[0], inplace=True)

train['dropoff_latitude'].fillna(train['dropoff_latitude'].mode()[0], inplace=True)
test['dropoff_latitude'].fillna(test['dropoff_latitude'].mode()[0], inplace=True)

In [4]:
train['surcharge'].fillna(train['surcharge'].median(), inplace=True)
test['surcharge'].fillna(test['surcharge'].median(), inplace=True)

In [5]:
drop_col = ['TID', 'store_and_fwd_flag']

In [6]:
set(train.columns) - set(drop_col)

{'dropoff_datetime',
 'dropoff_latitude',
 'dropoff_longitude',
 'fare_amount',
 'mta_tax',
 'new_user',
 'passenger_count',
 'payment_type',
 'pickup_datetime',
 'pickup_latitude',
 'pickup_longitude',
 'rate_code',
 'surcharge',
 'tip_amount',
 'tolls_amount',
 'vendor_id'}

In [7]:
cate_vars = ['vendor_id', 'payment_type']
one_hot_train = pd.get_dummies(train[cate_vars])
one_hot_test = pd.get_dummies(test[cate_vars])
     

In [8]:
cols = list(test.columns)

train_label = train['fare_amount']

train = train[list(set(cols) - set(cate_vars))]
test = test[list(set(cols) - set(cate_vars))]
print train.shape
print test.shape

(1658099, 15)
(509808, 15)


In [9]:
train = pd.concat([train, one_hot_train], axis=1)
test = pd.concat([test, one_hot_test], axis=1)

In [10]:
col1 = set(train.columns)
col2 = set(test.columns)
temp = list(col1.intersection(col2))
train = train[temp]
test = test[temp]

In [11]:
# Calculating time
train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime'])
train['dropoff_datetime'] = pd.to_datetime(train['dropoff_datetime'])

test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'])
test['dropoff_datetime'] = pd.to_datetime(test['dropoff_datetime'])



In [12]:
train['time_elapsed'] = (train['dropoff_datetime'] - train['pickup_datetime']).astype('timedelta64[m]')
test['time_elapsed'] = (test['dropoff_datetime'] - test['pickup_datetime']).astype('timedelta64[m]')

In [13]:
train['year'] = train['pickup_datetime'].dt.year
test['year'] = test['pickup_datetime'].dt.year

# train['month'] = train['pickup_datetime'].dt.month
# test['month'] = test['pickup_datetime'].dt.month

# train['day'] = train['pickup_datetime'].dt.day
# test['day'] = test['pickup_datetime'].dt.day

train['dayofweek'] = train['pickup_datetime'].dt.dayofweek
test['dayofweek'] = test['pickup_datetime'].dt.dayofweek

train['hour'] = train['pickup_datetime'].dt.hour
test['hour'] = test['pickup_datetime'].dt.hour


In [14]:
def haversine_np(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)

    All args must be of equal length.    

    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    return km

In [15]:
train['distance'] = haversine_np(train['pickup_longitude'], train['pickup_latitude'], train['dropoff_longitude'], train['dropoff_latitude'])
test['distance'] = haversine_np(test['pickup_longitude'], test['pickup_latitude'], test['dropoff_longitude'], test['dropoff_latitude'])


In [16]:
train.dtypes

payment_type_DIS                uint8
TID                            object
rate_code                       int64
surcharge                     float64
payment_type_UNK                uint8
pickup_longitude              float64
payment_type_CSH                uint8
vendor_id_DST000543             uint8
dropoff_latitude              float64
tolls_amount                  float64
pickup_datetime        datetime64[ns]
tip_amount                    float64
new_user                       object
mta_tax                       float64
dropoff_datetime       datetime64[ns]
vendor_id_DST000481             uint8
dropoff_longitude             float64
passenger_count                 int64
payment_type_NOC                uint8
payment_type_CRD                uint8
store_and_fwd_flag             object
pickup_latitude               float64
time_elapsed                  float64
year                            int64
dayofweek                       int64
hour                            int64
distance    

In [17]:
# print np.sum((train['new_user']=='YES') &  (train['year']==2016))
# print np.sum(train['new_user']=='YES')
# print np.sum(train['year']==2016)
# print np.sum(test['new_user']=='NO')


In [18]:
train['ne_user'] = train['new_user']=='YES'
test['ne_user'] = test['new_user']=='YES'

In [19]:
# drop_cols = ['year','TID','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude','pickup_datetime','dropoff_datetime','store_and_fwd_flag', 'new_user']                   
drop_cols = ['TID','pickup_datetime','dropoff_datetime','store_and_fwd_flag', 'new_user']                   
train = train.drop(labels=drop_cols, axis=1)
test = test.drop(labels=drop_cols, axis=1)

In [20]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor

In [23]:
# RandomForest CV
# model = RandomForestRegressor(n_estimators=40, max_features = 'sqrt',min_samples_split=10, n_jobs=-1)
# kfold = KFold(n_splits=5, random_state=7)
# scoring = 'neg_mean_absolute_error'
# results = cross_val_score(model, train, train_label, cv=kfold, scoring=scoring)
# results

In [23]:
# # AdaBoost CV
# model = AdaBoostRegressor(n_estimators=100, learning_rate=0.05, )
# kfold = KFold(n_splits=5, random_state=7)
# scoring = 'neg_mean_absolute_error'
# results = cross_val_score(model, train, train_label, cv=kfold, scoring=scoring)
# print results.mean()

-3.80203594266


In [24]:
# # AdaBoost CV
# model = AdaBoostRegressor(n_estimators=50, learning_rate=1, )
# kfold = KFold(n_splits=5, random_state=7)
# scoring = 'neg_mean_absolute_error'
# results = cross_val_score(model, train, train_label, cv=kfold, scoring=scoring)
# print results.mean()

-5.61689717767


In [None]:
model = RandomForestRegressor(n_estimators=80, n_jobs=-1)
model.fit(train, train_label)


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=80, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [None]:
prediction = model.predict(test)

In [None]:
subm_file = pd.read_csv('sample_submission.csv')

In [None]:
temp = pd.read_csv('test.csv')

In [None]:
subm_file['TID'] = temp['TID']
subm_file['fare_amount'] = prediction

In [None]:
subm_file.to_csv('subm_file8.csv', index=False)

In [None]:
# # Other Stuff
# f1 = pd.read_csv('subm_file8.csv')
# f1['fare_amount'][test['distance']==0] = f1['fare_amount'].median()
# f1.to_csv('dist_zero.csv', index=False)