In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from catboost import Pool, CatBoostRegressor
from sklearn import metrics
from sklearn.metrics import mean_absolute_error

# Silence warnings for the rest of the notebook session.
# Note: this filter has no category or module restriction, so it hides
# warnings from every library (deprecations included), not only pandas.
import warnings
warnings.filterwarnings('ignore')

from feature_selector import FeatureSelector

In [14]:
# Load the aggregated training data into a DataFrame.
train_csv_path = 'data/train_aggregated.csv'
df_train_set = pd.read_csv(train_csv_path, low_memory=False)

In [15]:
# Count the missing values in every column; the resulting Series is the
# cell's displayed output.
df_train_set.isna().sum()

ride_id              0
seat_number          0
travel_date          0
travel_time          0
travel_from          0
car_type             0
max_capacity         0
number_of_tickets    0
dtype: int64

# Feature Engineering Step (training set)

In [16]:
# Derive calendar and time-of-day features from the raw travel_date and
# travel_time columns of the training frame. All flags are computed
# column-wise (no per-row Python lambdas); the resulting values are the same.
#
# This cell is not idempotent: it overwrites travel_date and travel_time, so
# it must be run exactly once on a freshly loaded frame.

# NOTE(review): `infer_datetime_format` is deprecated since pandas 2.0 (where
# it has no effect). It is kept so older pandas versions parse the dates the
# same way as before. An explicit `format=` would be safer - confirm the raw
# date layout first.
df_train_set["travel_date"] = pd.to_datetime(df_train_set["travel_date"], infer_datetime_format=True)
travel_date = df_train_set["travel_date"]

df_train_set["travel_dow"] = travel_date.dt.dayofweek  # day of week, Monday=0 ... Sunday=6
df_train_set["travel_month"] = travel_date.dt.month
# Leading digits of the time string, i.e. the hour part of "H:MM".
df_train_set["hour_booked"] = pd.to_numeric(
    df_train_set["travel_time"].str.extract(r'(^\d*)', expand=False)
)
# `dt.weekday` is an alias of `dt.dayofweek`, so this duplicates travel_dow.
# The column is kept because it is part of the feature set the model is fitted on.
df_train_set["Weekday"] = travel_date.dt.weekday
df_train_set["ToM"] = travel_date.dt.day  # day of the month

# 1 for Saturday/Sunday (travel_dow 5 or 6), else 0.
df_train_set["isEndWeek"] = (df_train_set["travel_dow"] >= 5).astype("int64")
# NOTE(review): despite the name, this is 1 for every hour up to and including
# 21, not only morning hours - confirm the intended threshold.
df_train_set["isMorning"] = (df_train_set["hour_booked"] <= 21).astype("int64")
# NOTE(review): despite the name, this is 1 for days 5..27 (the middle of the
# month) and 0 at the start/end of the month - confirm the intended meaning.
day_of_month = df_train_set["ToM"]
df_train_set["isEndMonth"] = ((day_of_month >= 5) & (day_of_month < 28)).astype("int64")

# Express travel time in minutes (hours * 60 + minutes). This has to come
# after hour_booked, because it replaces the string column it was parsed from.
hh_mm = df_train_set["travel_time"].str.split(':', expand=True)
df_train_set["travel_time"] = hh_mm[0].astype("int64") * 60 + hh_mm[1].astype("int64")

In [17]:
# Preview the first five rows, now including the engineered feature columns.
df_train_set.head(n=5)

Unnamed: 0,ride_id,seat_number,travel_date,travel_time,travel_from,car_type,max_capacity,number_of_tickets,travel_dow,travel_month,hour_booked,Weekday,ToM,isEndWeek,isMorning,isEndMonth
0,1442,15A,2017-10-17,435,Migori,Bus,49,1.0,1,10,7,1,17,0,1,1
1,5437,14A,2017-11-19,432,Migori,Bus,49,1.0,6,11,7,6,19,1,1,1
2,5710,8B,2017-11-26,425,Keroka,Bus,49,1.0,6,11,7,6,26,1,1,1
3,5777,19A,2017-11-27,430,Homa Bay,Bus,49,5.0,0,11,7,0,27,0,1,1
4,5778,11A,2017-11-27,432,Migori,Bus,49,31.0,0,11,7,0,27,0,1,1


In [18]:
# Replace the two string columns with integer category codes. The category
# lists are kept in car_type_categories / travel_from_categories so that other
# frames can be encoded with the same mapping.
car_type_cat = pd.Categorical(df_train_set["car_type"])
car_type_categories = car_type_cat.categories
df_train_set["car_type"] = car_type_cat.codes

travel_from_cat = pd.Categorical(df_train_set["travel_from"])
travel_from_categories = travel_from_cat.categories
df_train_set["travel_from"] = travel_from_cat.codes

In [19]:
# Preview the first five rows after the categorical columns were encoded.
df_train_set.head(n=5)

Unnamed: 0,ride_id,seat_number,travel_date,travel_time,travel_from,car_type,max_capacity,number_of_tickets,travel_dow,travel_month,hour_booked,Weekday,ToM,isEndWeek,isMorning,isEndMonth
0,1442,15A,2017-10-17,435,9,0,49,1.0,1,10,7,1,17,0,1,1
1,5437,14A,2017-11-19,432,9,0,49,1.0,6,11,7,6,19,1,1,1
2,5710,8B,2017-11-26,425,4,0,49,1.0,6,11,7,6,26,1,1,1
3,5777,19A,2017-11-27,430,1,0,49,5.0,0,11,7,0,27,0,1,1
4,5778,11A,2017-11-27,432,9,0,49,31.0,0,11,7,0,27,0,1,1


In [20]:
# Discard the ride id, the seat number and the raw date; the features derived
# from the date stay in the frame.
unused_columns = ['ride_id', 'seat_number', 'travel_date']
df_train_set = df_train_set.drop(columns=unused_columns)

In [21]:
# Preview the first five rows of the frame that is used for training.
df_train_set.head(n=5)

Unnamed: 0,travel_time,travel_from,car_type,max_capacity,number_of_tickets,travel_dow,travel_month,hour_booked,Weekday,ToM,isEndWeek,isMorning,isEndMonth
0,435,9,0,49,1.0,1,10,7,1,17,0,1,1
1,432,9,0,49,1.0,6,11,7,6,19,1,1,1
2,425,4,0,49,1.0,6,11,7,6,26,1,1,1
3,430,1,0,49,5.0,0,11,7,0,27,0,1,1
4,432,9,0,49,31.0,0,11,7,0,27,0,1,1


In [23]:
# Separate the regression target from the feature matrix.
target_column = "number_of_tickets"
y = df_train_set[target_column]
X = df_train_set.drop(columns=[target_column])

In [25]:
# Fit a CatBoost regressor (MAE loss) on the full training frame and report
# its mean absolute error.
#
# 'l2_leaf_reg' appears exactly once. A dict literal that repeats a key
# silently keeps only the last value, so 10 is the value that was in force.
params = {
    'depth': 12,
    'iterations': 700,
    'learning_rate': 0.97,
    'l2_leaf_reg': 10,
    'random_seed': 1111,
    'logging_level': 'Silent',
    'loss_function': 'MAE',
}

regressor = CatBoostRegressor(**params)
regressor.fit(X, y)
y_predict = regressor.predict(X)

print("----------------------------------------------\n")
# NOTE: the predictions are made on the same rows the model was fitted on, so
# this is an in-sample (training) error, not a validation score.
# Arguments follow scikit-learn's (y_true, y_pred) convention.
print('Mean Absolute Error - ', metrics.mean_absolute_error(y, y_predict))

----------------------------------------------

Mean Absolute Error -  0.9328629353625796


In [75]:
# Load the test questions, drop the 'travel_to' column straight away and
# preview the first five rows.
test_csv_path = 'data/test_questions.csv'
df_test_set = pd.read_csv(test_csv_path, low_memory=False).drop(columns=['travel_to'])
df_test_set.head(n=5)

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,car_type,max_capacity
0,247,2018-05-07,07:06,Kisii,Bus,49
1,256,2018-05-06,11:08,Kisii,shuttle,11
2,275,2018-05-04,05:00,Kisii,shuttle,11
3,285,2018-05-04,09:10,Kisii,shuttle,11
4,286,2018-05-04,09:20,Kisii,shuttle,11


# Feature Engineering Step (test set)

In [77]:
# Derive the same calendar and time-of-day features for the test frame as for
# the training frame. All flags are computed column-wise (no per-row Python
# lambdas); the resulting values are the same.
#
# This cell is not idempotent: it overwrites travel_date and travel_time, so
# it must be run exactly once on a freshly loaded frame.

# NOTE(review): `infer_datetime_format` is deprecated since pandas 2.0 (where
# it has no effect). It is kept so older pandas versions parse the dates the
# same way as before. An explicit `format=` would be safer - confirm the raw
# date layout first.
df_test_set["travel_date"] = pd.to_datetime(df_test_set["travel_date"], infer_datetime_format=True)
test_travel_date = df_test_set["travel_date"]

df_test_set["travel_dow"] = test_travel_date.dt.dayofweek  # day of week, Monday=0 ... Sunday=6
df_test_set["travel_month"] = test_travel_date.dt.month
# Leading digits of the time string, i.e. the hour part of "HH:MM".
df_test_set["hour_booked"] = pd.to_numeric(
    df_test_set["travel_time"].str.extract(r'(^\d*)', expand=False)
)
# `dt.weekday` is an alias of `dt.dayofweek`, so this duplicates travel_dow.
# The column is kept because the fitted model expects it as a feature.
df_test_set["Weekday"] = test_travel_date.dt.weekday
df_test_set["ToM"] = test_travel_date.dt.day  # day of the month

# 1 for Saturday/Sunday (travel_dow 5 or 6), else 0.
df_test_set["isEndWeek"] = (df_test_set["travel_dow"] >= 5).astype("int64")
# NOTE(review): despite the name, this is 1 for every hour up to and including
# 21, not only morning hours - confirm the intended threshold.
df_test_set["isMorning"] = (df_test_set["hour_booked"] <= 21).astype("int64")
# NOTE(review): despite the name, this is 1 for days 5..27 (the middle of the
# month) and 0 at the start/end of the month - confirm the intended meaning.
test_day_of_month = df_test_set["ToM"]
df_test_set["isEndMonth"] = ((test_day_of_month >= 5) & (test_day_of_month < 28)).astype("int64")

# Express travel time in minutes (hours * 60 + minutes). This has to come
# after hour_booked, because it replaces the string column it was parsed from.
test_hh_mm = df_test_set["travel_time"].str.split(':', expand=True)
df_test_set["travel_time"] = test_hh_mm[0].astype("int64") * 60 + test_hh_mm[1].astype("int64")

In [78]:
# Encode the test categoricals with the category lists captured from the
# training frame, so equal labels map to equal integer codes in both frames.
# A label that is absent from the training categories gets the code -1.
test_car_type_cat = pd.Categorical(df_test_set["car_type"], categories=car_type_categories)
df_test_set["car_type"] = test_car_type_cat.codes

test_travel_from_cat = pd.Categorical(df_test_set["travel_from"], categories=travel_from_categories)
df_test_set["travel_from"] = test_travel_from_cat.codes


In [80]:
# Discard the raw date; the features derived from it stay in the frame.
df_test_set = df_test_set.drop(columns=['travel_date'])

In [81]:
# Preview the first five rows of the prepared test frame.
df_test_set.head(n=5)

Unnamed: 0,ride_id,travel_time,travel_from,car_type,max_capacity,travel_dow,travel_month,hour_booked,Weekday,ToM,isEndWeek,isMorning,isEndMonth
0,247,426,7,0,49,0,5,7,0,7,0,1,1
1,256,668,7,1,11,6,5,11,6,6,1,1,1
2,275,300,7,1,11,4,5,5,4,4,0,1,0
3,285,550,7,1,11,4,5,9,4,4,0,1,0
4,286,560,7,1,11,4,5,9,4,4,0,1,0


In [82]:
# Build the test feature matrix from exactly the columns the model was fitted
# on, in the same order as the training matrix X. Selecting by X.columns (and
# not by "everything except ride_id") means a missing feature raises a KeyError
# here, and an extra or reordered test column cannot silently shift the inputs.
X_test = df_test_set[X.columns]
test_set_predictions = regressor.predict(X_test)

In [83]:
# Pair every ride id with its predicted ticket count, with ride_id as the
# first column.
prediction_columns = ['ride_id', 'number_of_ticket']
df_predictions = pd.DataFrame(
    data={
        'ride_id': df_test_set["ride_id"],
        'number_of_ticket': test_set_predictions,
    },
    columns=prediction_columns,
)

In [84]:
# Preview the first five predictions.
df_predictions.head(n=5)

Unnamed: 0,ride_id,number_of_ticket
0,247,11.278033
1,256,6.354302
2,275,7.065551
3,285,4.249143
4,286,4.698046


In [26]:
# Save the predictions to a CSV file, without the DataFrame index.
predictions_csv_path = 'preds_test_set_26th.csv'
df_predictions.to_csv(predictions_csv_path, index=False)