In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score

In [2]:
# List the files in ./out (saved encoders, cleaned training data, previous outputs)
# and ./data (raw competition CSVs) so the paths used below can be checked by eye.
!ls ./out && ls ./data

bus_id_encoder.pickle	   cleaned_train.csv  output.csv
busstop_id_encoder.pickle  output1.csv	      route_id_encoder.pickle
routes.csv  sample_submission.csv  stops.csv  test.csv	training.csv


In [3]:
# Restore the three ID encoders that were pickled into ./out.
# Each file is opened in a `with` block so its handle is closed deterministically
# instead of being left to the garbage collector.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load encoder files that were produced by a trusted run of this project.
with open("./out/bus_id_encoder.pickle", "rb") as encoder_file:
    bus_id_encoder = pickle.load(encoder_file)

with open("./out/route_id_encoder.pickle", "rb") as encoder_file:
    route_id_encoder = pickle.load(encoder_file)

with open("./out/busstop_id_encoder.pickle", "rb") as encoder_file:
    busstop_id_encoder = pickle.load(encoder_file)

In [4]:
# Sanity check: push a couple of known raw IDs through each encoder and print
# the raw IDs followed by the categorical codes they map to.
dummy_buses    = [325819008, 451509115]
dummy_routes   = [11100010, 11100012]
dummy_busstops = [388, 112]

sanity_checks = (
    ("bus ids",     "bus categorical ids",     bus_id_encoder,     dummy_buses),
    ("route ids",   "route categorical ids",   route_id_encoder,   dummy_routes),
    ("busstop ids", "busstop categorical ids", busstop_id_encoder, dummy_busstops),
)

for raw_caption, encoded_caption, encoder, raw_ids in sanity_checks:
    print(raw_caption, raw_ids)
    print(encoded_caption, encoder.transform(raw_ids))


bus ids [325819008, 451509115]
bus categorical ids [578 664]
route ids [11100010, 11100012]
route categorical ids [3 4]
busstop ids [388, 112]
busstop categorical ids [387 111]


In [5]:
# Read the pre-cleaned training table (already-encoded IDs plus TIMESTAMP_DIFF target).
dataset_df = pd.read_csv(filepath_or_buffer="./out/cleaned_train.csv")

In [6]:
# Preview the training frame; the notebook display elides the middle rows.
dataset_df

Unnamed: 0,DAY_OF_WEEK,HOUR_OF_DAY,MINUTE_OF_HOUR,SECOND_OF_MINUTE,TIME,BUS_ID,BUSROUTE_ID,BUSSTOP_ID,TIMESTAMP_DIFF
0,0,7,28,25,5305,0,3,17,117.0
1,0,7,30,22,5422,0,3,10,123.0
2,0,7,32,25,5545,0,3,8,121.0
3,0,7,34,26,5666,0,3,5,136.0
4,0,7,36,42,5802,0,3,3,174.0
...,...,...,...,...,...,...,...,...,...
3113890,4,21,47,30,56850,1076,8,109,74.0
3113891,4,21,48,44,56924,1076,8,117,123.0
3113892,4,21,50,47,57047,1076,8,119,91.0
3113893,4,21,52,18,57138,1076,8,121,82.0


In [7]:
# Split the training frame into model inputs and the regression target.
# The column order here is the feature order the model is trained on.
feature_columns = [
    'DAY_OF_WEEK',
    'HOUR_OF_DAY',
    'MINUTE_OF_HOUR',
    'SECOND_OF_MINUTE',
    'BUS_ID',
    'BUSROUTE_ID',
    'BUSSTOP_ID',
]
target_columns = ['TIMESTAMP_DIFF']

dataset_input = dataset_df[feature_columns]
dataset_label = dataset_df[target_columns]

In [8]:
# Hyperparameters for the gradient-boosted regressor, gathered in one mapping
# so they can be read (and tuned) in a single place.
xgb_hyperparameters = {
    "n_estimators":     100,
    "learning_rate":    0.08,
    "gamma":            0,
    "subsample":        0.75,
    "colsample_bytree": 1,
    "max_depth":        7,
}

xgb_model = xgb.XGBRegressor(**xgb_hyperparameters)

In [9]:
# Fit the regressor on the full training frame (no hold-out split is made here)
# and let %time report the CPU and wall-clock cost of this single statement.
# NOTE(review): no eval_set is passed, so verbose=10 presumably has nothing to
# report during training — confirm against the xgboost version in use.
%time xgb_model.fit(dataset_input, dataset_label, verbose=10)

CPU times: user 12min 2s, sys: 1.66 s, total: 12min 4s
Wall time: 3min 43s


XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.08, max_delta_step=0, max_depth=7,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.75, tree_method=None,
             validate_parameters=False, verbosity=None)

In [10]:
# Read the raw test table; its TIMESTAMP column has gaps that are filled in further down.
test_df = pd.read_csv(filepath_or_buffer="./data/test.csv")

In [11]:
# Preview the raw test frame; the notebook display elides the middle rows.
test_df

Unnamed: 0,index,BUSROUTE_ID,BUS_ID,BUSSTOP_ID,BUSSTOP_SEQ,RECORD_DATE,TIMESTAMP
0,3220254,11100010,2054,11,3,2020-01-22 09:23:36,1.579685e+09
1,3220255,11100010,2054,9,4,,
2,3220256,11100010,2054,6,5,2020-01-22 09:27:26,1.579685e+09
3,3220257,11100010,2054,4,6,,
4,3220258,11100010,2054,147,7,,
...,...,...,...,...,...,...,...
1557459,4777713,12100232,740518021,657,9,2020-01-24 16:39:27,1.579884e+09
1557460,4777714,12100232,740518021,219,2,2020-01-24 17:28:00,1.579887e+09
1557461,4777715,12100232,740518021,221,3,,
1557462,4777716,12100232,740518021,242,4,,


In [12]:
# Derive the calendar features the model was trained on from RECORD_DATE.
# The column is parsed once and reused for all four features rather than being
# re-parsed per feature. Rows whose RECORD_DATE is missing get NaN in every
# derived column.
record_datetime = pd.to_datetime(test_df['RECORD_DATE'])

test_df['DAY_OF_WEEK'     ] = record_datetime.dt.dayofweek
test_df['HOUR_OF_DAY'     ] = record_datetime.dt.hour
test_df['MINUTE_OF_HOUR'  ] = record_datetime.dt.minute
test_df['SECOND_OF_MINUTE'] = record_datetime.dt.second

In [13]:
# Preview the test frame again, now including the derived calendar columns.
test_df

Unnamed: 0,index,BUSROUTE_ID,BUS_ID,BUSSTOP_ID,BUSSTOP_SEQ,RECORD_DATE,TIMESTAMP,DAY_OF_WEEK,HOUR_OF_DAY,MINUTE_OF_HOUR,SECOND_OF_MINUTE
0,3220254,11100010,2054,11,3,2020-01-22 09:23:36,1.579685e+09,2.0,9.0,23.0,36.0
1,3220255,11100010,2054,9,4,,,,,,
2,3220256,11100010,2054,6,5,2020-01-22 09:27:26,1.579685e+09,2.0,9.0,27.0,26.0
3,3220257,11100010,2054,4,6,,,,,,
4,3220258,11100010,2054,147,7,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
1557459,4777713,12100232,740518021,657,9,2020-01-24 16:39:27,1.579884e+09,4.0,16.0,39.0,27.0
1557460,4777714,12100232,740518021,219,2,2020-01-24 17:28:00,1.579887e+09,4.0,17.0,28.0,0.0
1557461,4777715,12100232,740518021,221,3,,,,,,
1557462,4777716,12100232,740518021,242,4,,,,,,


In [14]:
import random

# Distinct BUS_ID values present in the cleaned training data; the imputation
# loop further down draws from this pool when a test bus cannot be encoded.
cleaned_buses = pd.unique(dataset_df['BUS_ID'])

In [15]:
# Show the first training feature row as a plain list, to check the feature order.
dataset_input.iloc[0].to_numpy().tolist()

[0, 7, 28, 25, 0, 3, 17]

In [16]:
# Fill in missing TIMESTAMP values in test_df, walking the rows in order.
#
# For each row without a TIMESTAMP, the model is fed the features of the row
# directly above it and the integer-truncated prediction is added to that
# row's TIMESTAMP. Because rows are processed top to bottom, a run of
# consecutive missing rows is filled as a chain, each from the row before it.
#
# Changes relative to the naive loop:
#   * Row 0 is never imputed: it has no predecessor, and iloc[-1] would
#     silently wrap around to the last row of the frame.
#   * After imputing a row, its calendar features are advanced by the predicted
#     number of seconds, so the next row in a chain is predicted from real
#     values instead of the NaNs left by the missing RECORD_DATE.
#   * The unknown-bus fallback uses a seeded generator, so reruns produce the
#     same output file.
#   * The fallback catches Exception rather than everything, so interrupting
#     the kernel is no longer swallowed.
#
# NOTE(review): the row above is assumed to belong to the same bus trip; this
# loop does not check BUS_ID / BUSROUTE_ID continuity — confirm against the
# layout of test.csv.
#
# Model feature order:
# 'DAY_OF_WEEK', 'HOUR_OF_DAY', 'MINUTE_OF_HOUR', 'SECOND_OF_MINUTE', 'BUS_ID', 'BUSROUTE_ID', 'BUSSTOP_ID'
model_feature_columns = ['DAY_OF_WEEK', 'HOUR_OF_DAY', 'MINUTE_OF_HOUR', 'SECOND_OF_MINUTE', 'BUS_ID', 'BUSROUTE_ID', 'BUSSTOP_ID']

SECONDS_PER_MINUTE = 60
SECONDS_PER_HOUR   = 60 * SECONDS_PER_MINUTE
SECONDS_PER_DAY    = 24 * SECONDS_PER_HOUR
SECONDS_PER_WEEK   = 7 * SECONDS_PER_DAY

# Dedicated, seeded generator: keeps the fallback reproducible without
# touching the global `random` state.
fallback_rng = random.Random(0)

for idx, row in test_df.iterrows():
    if pd.isna(row['TIMESTAMP']) and idx > 0:
        # Read the live frame (not the iterrows snapshot) so values imputed on
        # the previous iteration are visible here.
        prev_row  = test_df.iloc[idx-1]
        timestamp = prev_row['TIMESTAMP']
        if (not pd.isna(timestamp)):
            day_of_week      = prev_row['DAY_OF_WEEK']
            hour_of_day      = prev_row['HOUR_OF_DAY']
            minute_of_hour   = prev_row['MINUTE_OF_HOUR']
            second_of_minute = prev_row['SECOND_OF_MINUTE']

            bus_id = int(prev_row['BUS_ID'])
            try:
                bus_position = bus_id_encoder.transform([bus_id])[0]
            except Exception:
                # Bus not known to the encoder (presumably unseen in training):
                # best-effort substitute an encoded bus id from the training data.
                bus_position = fallback_rng.choice(cleaned_buses)

            route_id       = int(prev_row['BUSROUTE_ID'])
            route_position = route_id_encoder.transform([route_id])[0]

            busstop_id       = int(prev_row['BUSSTOP_ID'])
            busstop_position = busstop_id_encoder.transform([busstop_id])[0]

            model_input = pd.DataFrame(np.array([[day_of_week, hour_of_day, minute_of_hour, second_of_minute, bus_position, route_position, busstop_position]]),
                                       columns=model_feature_columns)
            prediction  = int(xgb_model.predict(model_input)[0])

            test_df.at[idx, 'TIMESTAMP'] = timestamp + prediction

            # Advance the previous row's calendar features by the predicted
            # seconds and store them on this row, wrapping at the end of the
            # week. Skipped when the previous row has no usable time features.
            prev_time_parts = (day_of_week, hour_of_day, minute_of_hour, second_of_minute)
            if not any(pd.isna(part) for part in prev_time_parts):
                week_seconds = int(
                    day_of_week * SECONDS_PER_DAY
                    + hour_of_day * SECONDS_PER_HOUR
                    + minute_of_hour * SECONDS_PER_MINUTE
                    + second_of_minute
                ) + prediction
                week_seconds %= SECONDS_PER_WEEK

                new_day, seconds_into_day     = divmod(week_seconds, SECONDS_PER_DAY)
                new_hour, seconds_into_hour   = divmod(seconds_into_day, SECONDS_PER_HOUR)
                new_minute, new_second        = divmod(seconds_into_hour, SECONDS_PER_MINUTE)

                test_df.at[idx, 'DAY_OF_WEEK']      = new_day
                test_df.at[idx, 'HOUR_OF_DAY']      = new_hour
                test_df.at[idx, 'MINUTE_OF_HOUR']   = new_minute
                test_df.at[idx, 'SECOND_OF_MINUTE'] = new_second

    if idx%5000==0:
        print("completed at", idx)

completed at 0
completed at 5000
completed at 10000
completed at 15000
completed at 20000
completed at 25000
completed at 30000
completed at 35000
completed at 40000
completed at 45000
completed at 50000
completed at 55000
completed at 60000
completed at 65000
completed at 70000
completed at 75000
completed at 80000
completed at 85000
completed at 90000
completed at 95000
completed at 100000
completed at 105000
completed at 110000
completed at 115000
completed at 120000
completed at 125000
completed at 130000
completed at 135000
completed at 140000
completed at 145000
completed at 150000
completed at 155000
completed at 160000
completed at 165000
completed at 170000
completed at 175000
completed at 180000
completed at 185000
completed at 190000
completed at 195000
completed at 200000
completed at 205000
completed at 210000
completed at 215000
completed at 220000
completed at 225000
completed at 230000
completed at 235000
completed at 240000
completed at 245000
completed at 250000
compl

In [17]:
# Write the submission file: one row per test record with its (possibly imputed) TIMESTAMP.
submission_columns = ['index', 'TIMESTAMP']
submission_df = test_df[submission_columns]
submission_df.to_csv("./out/output.csv", index=False)