## Predicting Tips on NYC TAXI dataset
<br>
1) Predicting if person will tip
<br>
2) Predicting the tip amount
<br>

Analysis of the tip data showed that cash payments never include a recorded tip (attributed to tax reasons). As a result, a tip/no-tip classifier trained on all trips was effectively predicting the payment method rather than tipping behaviour, so only card payments are used below.

In [None]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import datetime
import time
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_error

In [2]:
# Read the trip extract, then keep card payments only (PAYMENT_TYPE == 1).
# The filter column is constant after that, so it is dropped; this also
# shifts the positional column layout that later cells rely on.
# NOTE(review): absolute, user-specific path — consider a configurable data dir.
data = pd.read_csv('/home/asologuba/NYC_TAXI/TRIPS_ANALYTICS_10M.csv', low_memory=False)
data = data.loc[data.PAYMENT_TYPE == 1].drop(columns='PAYMENT_TYPE')

Removing NaN values

In [3]:
# Replace missing values in the columns that have a sensible default, then
# drop every row that still contains a NaN in any other column.
#
# A single DataFrame-level fillna is used instead of
# `data.<COL>.fillna(..., inplace=True)`: the latter operates on a temporary
# Series and, under pandas Copy-on-Write, never updates `data`, which would
# make the dropna below silently discard the rows that were meant to be filled.
fill_values = {
    'STORE_AND_FWD_FLAG': 2,
    'RATE_CODE_ID': 0,
    'PASSENGER_COUNT': 0,
    'MTA_TAX': 0,
    'IMPROVEMENT_SURCHARGE': 0,
    'TRIP_TYPE': 0,
}
data = data.fillna(value=fill_values)
data = data.dropna(axis=0)

Removing outliers

In [4]:
# Keep a trip only if it lasted at most 24 hours (TRAVELTIME <= 86400)
# and has a strictly positive fare, i.e. discard trips without payment.
data = data.loc[(data.TRAVELTIME <= 86400) & (data.FARE_AMOUNT > 0)]

New variables

In [5]:
# Derived features. Each ratio is forced to 0 where its denominator is 0, so
# the inf/NaN produced by the division never reaches the model.
# Column order matters: later cells select features by position.

# Distance per hour (TRAVELTIME is divided by 3600 to convert to hours).
data['SPEED'] = (data.TRIP_DISTANCE/(data.TRAVELTIME/3600)).mask(data.TRAVELTIME == 0, 0)

# Fare per unit of distance.
data['PRICE_DISTANCE'] = (data.FARE_AMOUNT/data.TRIP_DISTANCE).mask(data.TRIP_DISTANCE == 0, 0)

# Sum of the fare, extras, tax, tolls and surcharge columns.
data['ALL_COSTS'] = data.FARE_AMOUNT + data.EXTRA + data.MTA_TAX + data.TOLLS_AMOUNT + data.IMPROVEMENT_SURCHARGE

# Cost share per passenger.
data['PASSENGER_PRICE'] = (data.ALL_COSTS/data.PASSENGER_COUNT).mask(data.PASSENGER_COUNT == 0, 0)

In [6]:
# Time-based split: years before 2015 are used for training, 2015 for
# validation. Both frames get a fresh 0..n-1 index so that later
# column assignments into `results` align row by row.
train = data[data.YEAR < 2015].reset_index(drop=True)
valid = data[data.YEAR == 2015].reset_index(drop=True)

# Old classifier (XGBoost, logistic objective) - DOES_TIP

In [89]:
# Train an XGBoost model with a logistic objective on the DOES_TIP label,
# score the validation year, and report wall-clock time and accuracy.
start = time.time()
print('Fun started at: %s' % datetime.datetime.now())

# Features are every column from position 3 onward; the label is column 1.
# NOTE(review): positional selection depends on the CSV column order —
# confirm that no tip-derived column ends up among the features.
train_x, train_y = train.iloc[:,3:], train.iloc[:,1]
valid_x, valid_y = valid.iloc[:,3:], valid.iloc[:,1]
train_xgb = xgb.DMatrix(train_x, label = train_y)
valid_xgb = xgb.DMatrix(valid_x)

# NOTE(review): the 'gpu:'-prefixed objective name may not be accepted by
# newer XGBoost releases — confirm against the installed version.
param = {'objective' : 'gpu:reg:logistic', 'tree_method':'gpu_hist', 'seed' :12345}

model = xgb.train(params = param,
                  dtrain = train_xgb,
                  num_boost_round = 100)

# Raw scores from the model, then hard 0/1 labels with a 0.5 cut-off.
scores = model.predict(valid_xgb)
prediction = np.where(scores >= 0.5, 1, 0).astype(scores.dtype)

results = DataFrame()
results['DOES_TIP_ACT'] = valid_y
results['DOES_TIP_SCORE'] = scores
results['DOES_TIP'] = prediction

done = time.time()
elapsed = done - start
print(datetime.datetime.now())
print('Everything done in %s seconds' % elapsed)

print('Accuracy score %s' % (accuracy_score(valid_y, prediction)*100))

Fun started at: 2018-08-16 12:32:28.093199
2018-08-16 12:33:04.725711
Everything done in 36.63248872756958 seconds
Accuracy score 97.0507867756594


# Predicting the tip amount (XGBoost regression, `reg:linear` objective)

In [7]:
# Train an XGBoost regressor on the tip amount, predict the validation year,
# and report wall-clock time plus the mean absolute error (MAE).
start = time.time()
print('Fun started at: '+ str(datetime.datetime.now()))

# Features are every column from position 3 onward; the target is column 0
# (the tip amount).
# NOTE(review): positional selection depends on the CSV column order —
# confirm that no tip-derived column ends up among the features.
train_x = train.iloc[:,3:]
train_y = train.iloc[:,0]
valid_x = valid.iloc[:,3:]
valid_y = valid.iloc[:,0]
train_xgb = xgb.DMatrix(train_x, label = train_y)
valid_xgb = xgb.DMatrix(valid_x)

# NOTE(review): the 'gpu:'-prefixed objective name may not be accepted by
# newer XGBoost releases — confirm against the installed version.
param = {'objective' : 'gpu:reg:linear', 'tree_method':'gpu_hist', 'gpu_id':0, 'seed' :12345, 'eval_metric':'rmse'}

model = xgb.train(params = param,
                  dtrain = train_xgb,
                  num_boost_round = 100)


prediction = model.predict(valid_xgb)
# Predictions below 0.001 (including negative ones) are set to 0.
prediction[prediction < 0.001] = 0

results = DataFrame()
results['TIP_AMOUNT_ACT'] = valid_y
results['TIP_AMOUNT_FOR'] = prediction

done = time.time()
elapsed = done - start
print(datetime.datetime.now())
print('Everything done in '+str(elapsed)+' seconds')

# The reported number is the mean absolute error (lower is better), not an
# accuracy, so it is labelled as MAE.
print("MAE for model 1: %.2f" % (mean_absolute_error(valid_y, prediction)))

print(results.head(15))

Fun started at: 2018-08-16 13:02:45.860189
2018-08-16 13:02:59.508426
Everything done in 13.648211240768433 seconds
Accuracy for model 1: 0.85
    TIP_AMOUNT_ACT  TIP_AMOUNT_FOR
0             0.00        1.183753
1             2.55        1.999796
2             5.45        3.812779
3             8.20        5.511556
4             1.35        1.391471
5             1.00        1.595832
6             3.30        2.492273
7             4.35        3.975123
8             0.95        1.318972
9             2.05        2.044261
10            4.45        4.200309
11            0.01        2.997579
12            1.00        3.792670
13            2.65        2.459845
14            1.45        1.548150


In [22]:
# Mean absolute percentage error over trips whose actual tip is non-zero
# (a zero actual tip would make the relative error undefined).
# The value is a fraction, e.g. 0.60 means 60 %.
# Vectorised pandas reductions replace the built-in sum(), which would
# iterate the Series element by element in Python.
results_no0 = results.loc[results.TIP_AMOUNT_ACT != 0]
((results_no0.TIP_AMOUNT_ACT - results_no0.TIP_AMOUNT_FOR).abs() / results_no0.TIP_AMOUNT_ACT).mean()

0.599946361381495

# Testing Snap ML for logistic and ridge regression

In [91]:
import snap_ml

# Both Snap ML estimators are created with the same settings: GPU enabled,
# 100 iterations, dual formulation, 32 threads and device ids 0-3.
snap_settings = dict(use_gpu=True, max_iter=100, dual=True, num_threads=32)
lr = snap_ml.LogisticRegression(device_ids=[0,1,2,3], **snap_settings)
rr = snap_ml.RidgeRegression(device_ids=[0,1,2,3], **snap_settings)

In [92]:
# Same positional layout as the XGBoost classifier: features are the columns
# from position 3 onward, the label is column 1.
train_x, train_y = train.iloc[:,3:], train.iloc[:,1]
valid_x, valid_y = valid.iloc[:,3:], valid.iloc[:,1]

# The estimators are fed NumPy arrays rather than pandas objects.
train_x_np, train_y_np, valid_x_np = (
    np.array(part) for part in (train_x, train_y, valid_x)
)

In [93]:
# Logistic regression: fit on the training arrays, predict the validation
# features and store the predictions next to the existing results.
lr.fit(train_x_np, train_y_np)
lr_res = lr.predict(valid_x_np)
results['DOES_TIP_SNAP_LR'] = lr_res

# Ridge regression: same inputs and same target as the logistic model.
rr.fit(train_x_np, train_y_np)
rr_res = rr.predict(valid_x_np)
results['DOES_TIP_SNAP_RR'] = rr_res