In [1]:
import tensorflow as tf
tf.__version__

'2.1.0'

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

In [3]:
def _load_encoder(path):
    """Unpickle one fitted id encoder from `path`, closing the file afterwards.

    NOTE(review): pickle.load executes arbitrary code embedded in the file.
    Only load pickles produced by this project's own preprocessing step.
    """
    with open(path, "rb") as encoder_file:
        return pickle.load(encoder_file)


# Encoders that map raw ids to the contiguous categorical ids shown in the next cell.
bus_id_encoder     = _load_encoder("./out/bus_id_encoder.pickle")
route_id_encoder   = _load_encoder("./out/route_id_encoder.pickle")
busstop_id_encoder = _load_encoder("./out/busstop_id_encoder.pickle")

In [4]:
# Sanity check: push a couple of known raw ids through each encoder and show
# the categorical id each one maps to.
dummy_buses    = [325819008, 451509115]
dummy_routes   = [11100010, 11100012]
dummy_busstops = [388, 112]

for raw_label, encoded_label, encoder, raw_ids in (
    ("bus ids"    , "bus categorical ids"    , bus_id_encoder    , dummy_buses   ),
    ("route ids"  , "route categorical ids"  , route_id_encoder  , dummy_routes  ),
    ("busstop ids", "busstop categorical ids", busstop_id_encoder, dummy_busstops),
):
    print(raw_label, raw_ids)
    print(encoded_label, encoder.transform(raw_ids))


bus ids [325819008, 451509115]
bus categorical ids [578 664]
route ids [11100010, 11100012]
route categorical ids [3 4]
busstop ids [388, 112]
busstop categorical ids [387 111]


In [5]:
def get_angles(pos, i, d_model):
    """Return the raw (pre-sin/cos) positional-encoding angles.

    Args:
        pos: position index or array of position indices.
        i: embedding-dimension index or array of such indices.
        d_model: total width of the encoding.

    Returns:
        `pos * 1 / 10000 ** (2 * (i // 2) / d_model)`, broadcast over `pos` and `i`.
    """
    # Dimensions 2k and 2k+1 share one frequency, hence the integer halving.
    pair_index = i // 2
    exponent = (2 * pair_index) / np.float32(d_model)
    angle_rates = 1 / np.power(10000, exponent)
    return pos * angle_rates

def positional_encoding(position, d_model):
    """Build a sinusoidal positional-encoding table.

    Args:
        position: number of positions (rows) to encode.
        d_model: width of each encoding vector.

    Returns:
        A float32 tensor of shape (1, position, d_model); even columns hold
        sines and odd columns hold cosines of the angles from `get_angles`.
    """
    row_positions = np.arange(position)[:, np.newaxis]
    column_dims = np.arange(d_model)[np.newaxis, :]
    angle_rads = get_angles(row_positions, column_dims, d_model)

    # The slices are views, so writing through them updates angle_rads in place.
    even_columns = angle_rads[:, 0::2]  # indices 2i   -> sin
    odd_columns = angle_rads[:, 1::2]   # indices 2i+1 -> cos
    even_columns[...] = np.sin(even_columns)
    odd_columns[...] = np.cos(odd_columns)

    # Add a leading axis of size 1 in front of the table.
    return tf.cast(angle_rads[np.newaxis, ...], dtype=tf.float32)

In [6]:
class BusNet(tf.keras.Model):
    """Feed-forward regressor over embedded temporal and categorical bus features.

    Each input row has 10 columns, in this order:
        'DAY_OF_WEEK', 'HOUR_OF_DAY', 'MINUTE_OF_HOUR', 'SECOND_OF_MINUTE', 'TIME',
        'BUS_ID', 'BUSROUTE_ID', 'BUSSTOP_ID', 'ROUTE_TIME', 'SEQ_NUM'

    The first eight columns are integer indices (passed as floats and cast to
    int32) that are looked up in embedding tables; the last two are fed to the
    network as plain numeric features. Every index must lie inside its
    embedding table (e.g. TIME in [0, time_count)); this class does not clamp.

    The model emits one scalar per row.
    NOTE(review): the inference loop adds the output to a unix timestamp, so it
    is presumably a duration in seconds -- confirm against the training code.
    """

    def __init__(self, **kwargs):
        super(BusNet, self).__init__(**kwargs)

        self.week          = [i for i in range(0, 7) ]
        self.hours         = [i for i in range(0, 24)]
        self.minutes       = [i for i in range(0, 60)]
        self.seconds       = [i for i in range(0, 60)]

        # Width of the concatenated one-hot temporal vector:
        # 7 days + 24 hours + 60 minutes + 60 seconds = 151.
        # Derived rather than hard-coded so it cannot drift from the tables above.
        self.time_dim      = (len(self.week) + len(self.hours)
                              + len(self.minutes) + len(self.seconds))
        # Number of distinct time positions: one per second over an 18 hour window.
        self.time_count    = 18*3600

        self.bus_dim       = 32
        self.bus_count     = 1077

        self.route_dim     = 32
        self.route_count   = 353

        self.busstop_dim   = 32
        self.busstop_count = 1357

        self.dropout_rate  = 0.1

        # Identity matrices used as frozen embedding weights, i.e. one-hot lookups.
        self.days_of_week_matrix      = tf.one_hot(self.week   , len(self.week)   )
        self.hours_of_day_matrix      = tf.one_hot(self.hours  , len(self.hours)  )
        self.minutes_of_hour_matrix   = tf.one_hot(self.minutes, len(self.minutes))
        self.seconds_of_minute_matrix = tf.one_hot(self.seconds, len(self.seconds))
        # Sinusoidal table of shape (1, time_count, time_dim).
        self.time_position_matrix     = positional_encoding(self.time_count, self.time_dim)

        # Attribute names are kept as-is: object-based checkpoints restore by attribute path.
        self.day_embedding     = tf.keras.layers.Embedding(len(self.week)    , len(self.week)   , weights=[self.days_of_week_matrix      ], trainable=False)
        self.hour_embedding    = tf.keras.layers.Embedding(len(self.hours)   , len(self.hours)  , weights=[self.hours_of_day_matrix      ], trainable=False)
        self.minute_embedding  = tf.keras.layers.Embedding(len(self.minutes) , len(self.minutes), weights=[self.minutes_of_hour_matrix   ], trainable=False)
        self.second_embedding  = tf.keras.layers.Embedding(len(self.seconds) , len(self.seconds), weights=[self.seconds_of_minute_matrix ], trainable=False)
        # Initialised from the sinusoidal table but left trainable.
        self.time_embedding    = tf.keras.layers.Embedding(self.time_count   , self.time_dim    , weights=[self.time_position_matrix[0]  ], trainable=True )
        self.bus_embedding     = tf.keras.layers.Embedding(self.bus_count    , self.bus_dim     , embeddings_initializer='uniform')
        self.route_embedding   = tf.keras.layers.Embedding(self.route_count  , self.route_dim   , embeddings_initializer='uniform')
        self.busstop_embedding = tf.keras.layers.Embedding(self.busstop_count, self.busstop_dim , embeddings_initializer='uniform')

        # No dropout is applied after layer_1.
        self.layer_1      = tf.keras.layers.Dense(100, activation='relu')
        self.layer_2      = tf.keras.layers.Dense(64 , activation='relu')
        self.dropout_2    = tf.keras.layers.Dropout(self.dropout_rate)
        self.layer_3      = tf.keras.layers.Dense(32 , activation='relu')
        self.dropout_3    = tf.keras.layers.Dropout(self.dropout_rate)
        self.output_layer = tf.keras.layers.Dense(1)

    def call(self, inputs, training=True):
        """Run the forward pass.

        Args:
            inputs: 2-D tensor/array with 10 columns (see the class docstring).
            training: forwarded to the dropout layers. The default stays True
                for backward compatibility, so pass training=False explicitly
                at inference time.

        Returns:
            Tensor with one column and one row per input row.
        """
        # 'DAY_OF_WEEK', 'HOUR_OF_DAY', 'MINUTE_OF_HOUR', 'SECOND_OF_MINUTE', 'TIME',
        # 'BUS_ID', 'BUSROUTE_ID', 'BUSSTOP_ID', 'ROUTE_TIME', 'SEQ_NUM'
        days_of_week      = inputs[:, 0]
        hours_of_day      = inputs[:, 1]
        minutes_of_hour   = inputs[:, 2]
        seconds_of_minute = inputs[:, 3]
        time_positions    = inputs[:, 4]
        bus_ids           = inputs[:, 5]
        route_ids         = inputs[:, 6]
        busstop_ids       = inputs[:, 7]
        route_times       = inputs[:, 8]
        seq_nums          = inputs[:, 9]

        day_vectors     = self.day_embedding    (tf.cast(days_of_week     , dtype=tf.int32))
        hour_vectors    = self.hour_embedding   (tf.cast(hours_of_day     , dtype=tf.int32))
        minute_vectors  = self.minute_embedding (tf.cast(minutes_of_hour  , dtype=tf.int32))
        second_vectors  = self.second_embedding (tf.cast(seconds_of_minute, dtype=tf.int32))
        time_vectors    = self.time_embedding   (tf.cast(time_positions   , dtype=tf.int32))

        bus_vectors     = self.bus_embedding    (tf.cast(bus_ids    , dtype=tf.int32))
        route_vectors   = self.route_embedding  (tf.cast(route_ids  , dtype=tf.int32))
        busstop_vectors = self.busstop_embedding(tf.cast(busstop_ids, dtype=tf.int32))

        # One-hot calendar features (width time_dim) plus the positional time vector.
        temporal_features =  tf.math.add(
            tf.concat([day_vectors, hour_vectors, minute_vectors, second_vectors], 1),
            time_vectors
        )

        concatted_input = tf.concat([
            temporal_features,
            bus_vectors      ,
            route_vectors    ,
            busstop_vectors
        ], 1)

        x = self.layer_1(concatted_input)
        # [-1, 1] lets TensorFlow infer the batch size at run time. The previous
        # [tensor.shape[0], 1] broke whenever the static batch dimension was
        # None (tf.function, fit, predict).
        x = tf.concat([
            x,
            tf.reshape(route_times, [-1, 1]),
            tf.reshape(seq_nums   , [-1, 1])
        ], 1) # scaling factor
        x = self.layer_2(x)
        x = self.dropout_2(x, training=training)
        x = self.layer_3(x)
        x = self.dropout_3(x, training=training)

        return self.output_layer(x)

In [7]:
# Load the cleaned test set; rows with an empty TIMESTAMP are filled in by the
# prediction loop further down.
test_df = pd.read_csv(filepath_or_buffer='./out/cleaned_test.csv')
test_df

Unnamed: 0.1,Unnamed: 0,index,BUSROUTE_ID,BUS_ID,BUSSTOP_ID,BUSSTOP_SEQ,RECORD_DATE,TIMESTAMP,ROUTE_IDX,ROUTE_TIME,SEQ_NUM,DAY_OF_WEEK,HOUR_OF_DAY,MINUTE_OF_HOUR,SECOND_OF_MINUTE,TIME_POSITION,ROUTE_POSITION,BUSSTOP_POSITION,BUS_POSITION
0,0,3220254,11100010,2054,11,3,2020-01-22 09:23:36,1.579685e+09,,,,2.0,9.0,23.0,36.0,12216.0,3,10,983.0
1,1,3220255,11100010,2054,9,4,,,0.0,1.488333,38.0,,,,,,3,8,3.0
2,2,3220256,11100010,2054,6,5,2020-01-22 09:27:26,1.579685e+09,0.0,1.488333,38.0,2.0,9.0,27.0,26.0,12446.0,3,5,601.0
3,3,3220257,11100010,2054,4,6,,,0.0,1.488333,38.0,,,,,,3,3,345.0
4,4,3220258,11100010,2054,147,7,,,0.0,1.488333,38.0,,,,,,3,146,615.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1557459,1557459,4777713,12100232,740518021,657,9,2020-01-24 16:39:27,1.579884e+09,43975.0,3.960833,8.0,4.0,16.0,39.0,27.0,38367.0,352,648,269.0
1557460,1557460,4777714,12100232,740518021,219,2,2020-01-24 17:28:00,1.579887e+09,43976.0,0.317778,4.0,4.0,17.0,28.0,0.0,41280.0,352,218,441.0
1557461,1557461,4777715,12100232,740518021,221,3,,,43976.0,0.317778,4.0,,,,,,352,220,551.0
1557462,1557462,4777716,12100232,740518021,242,4,,,43976.0,0.317778,4.0,,,,,,352,241,241.0


In [8]:
# Fresh network with newly initialised weights; a later cell restores the
# trained weights from the latest checkpoint.
model = BusNet(
)

In [9]:
# Smoke test: run one row through the freshly built model. This call also
# creates the model's variables before the checkpoint is restored.
feature_columns = [
    'DAY_OF_WEEK', 'HOUR_OF_DAY', 'MINUTE_OF_HOUR', 'SECOND_OF_MINUTE', 'TIME_POSITION',
    'BUS_POSITION', 'ROUTE_POSITION', 'BUSSTOP_POSITION', 'ROUTE_TIME', 'SEQ_NUM',
]
# Select the columns BEFORE taking the row so the row keeps a numeric dtype.
feature_frame = test_df[feature_columns]
test_input = feature_frame.iloc[2].values
model(np.array([test_input]))



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-2.1934247]], dtype=float32)>

In [10]:
# `learning_rate` is the supported argument name; `lr` is only a deprecated
# alias that sets the same hyperparameter.
# NOTE(review): these settings presumably mirror the training notebook so the
# optimizer state in the checkpoint lines up -- confirm.
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, decay=0.01, nesterov=True)
# The keyword names (step / optimizer / net) are part of the checkpoint layout
# and must match the names used when the checkpoint was written.
ckpt      = tf.train.Checkpoint(step=tf.Variable(1), optimizer=optimizer, net=model)
manager   = tf.train.CheckpointManager(ckpt, './busnet_checkpoints', max_to_keep=3)

In [11]:
# Restore the newest checkpoint if there is one; otherwise the model keeps its
# freshly initialised weights.
latest_checkpoint = manager.latest_checkpoint
ckpt.restore(latest_checkpoint)
status_message = (
    "Restored from {}".format(latest_checkpoint)
    if latest_checkpoint
    else "Initializing from scratch."
)
print(status_message)

Restored from ./busnet_checkpoints\ckpt-233


In [12]:
# Same sample row as before, now with the restored weights and dropout off.
model(test_input[np.newaxis, :], training=False)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[96.93216]], dtype=float32)>

In [13]:
def timestamp_to_dayofweek(time):
    """Map a unix timestamp in seconds to a day-of-week index in [0, 7).

    Timestamp 0 maps to 3, and the index advances by one every 86400 seconds.
    """
    whole_hours = time // 3600
    whole_days = whole_hours // 24
    # The +3 offset makes day 0 (timestamp 0) come out as index 3.
    return (whole_days + 3) % 7

def timestamp_to_timeposition(time, starting_time = 21600):
    """Convert a time in seconds to its offset from `starting_time`, wrapped to one day.

    Args:
        time: seconds, either a second-of-day value or a full unix timestamp.
        starting_time: offset in seconds treated as position 0 (default 21600, i.e. 6 hours).

    Returns:
        0 when `time` is earlier than `starting_time`, otherwise
        `(time - starting_time) % 86400`.
    """
    seconds_per_day = 86400
    return 0 if time < starting_time else (time - starting_time) % seconds_per_day

In [14]:
import random
# Distinct bus positions present in the test set, as integers.
bus_position_column = test_df['BUS_POSITION']
buses = np.unique(bus_position_column).astype(int)
# Unseeded draw: the chosen bus changes from run to run.
random.choice(buses)

1072

In [15]:
# Spot check: second-of-day 64874 lies 43274 seconds after the default start.
sample_seconds = 64874
int(timestamp_to_timeposition(sample_seconds))

43274

In [16]:
# Fill every missing TIMESTAMP by rolling the model forward from the row above.
#
# Model input column order:
#   'DAY_OF_WEEK', 'HOUR_OF_DAY', 'MINUTE_OF_HOUR', 'SECOND_OF_MINUTE', 'TIME_POSITION',
#   'BUS_POSITION', 'ROUTE_POSITION', 'BUSSTOP_POSITION', 'ROUTE_TIME', 'SEQ_NUM'
#
# Rows are handled in order and each filled row is written back to test_df, so
# a run of consecutive missing rows chains off the previous prediction.
# NOTE(review): idx is used both as a label (.at) and as a position (.iloc),
# which assumes test_df has the default 0..n-1 index -- confirm.

# Largest index the model's time embedding accepts.
max_time_position = model.time_count - 1

for idx, row in test_df.iterrows():
    # idx > 0: the first row has no predecessor, and iloc[-1] would silently
    # read the LAST row of the frame instead.
    if idx > 0 and pd.isna(row['TIMESTAMP']):
        prev_row  = test_df.iloc[idx-1]
        timestamp = prev_row['TIMESTAMP']
        if (not pd.isna(timestamp)):
            day_of_week      = int(prev_row['DAY_OF_WEEK'     ])
            hour_of_day      = int(prev_row['HOUR_OF_DAY'     ])
            minute_of_hour   = int(prev_row['MINUTE_OF_HOUR'  ])
            second_of_minute = int(prev_row['SECOND_OF_MINUTE'])
            time_position    = int(prev_row['TIME_POSITION'   ])

            bus_position     = int(prev_row['BUS_POSITION'    ])
            route_position   = int(prev_row['ROUTE_POSITION'  ])
            busstop_position = int(prev_row['BUSSTOP_POSITION'])
            route_time       = prev_row['ROUTE_TIME']
            seq_num          = prev_row['SEQ_NUM'   ]

            # Clamp so the time-embedding lookup stays inside its table.
            if time_position > max_time_position:
                time_position = max_time_position
                print("OMG exceeded time pos...")

            # Built outside the try block so it is always defined when reporting a failure.
            model_input = np.array([[
                day_of_week, hour_of_day, minute_of_hour, second_of_minute, time_position,
                bus_position, route_position, busstop_position, route_time, seq_num
            ]])

            try:
                prediction = int(model(model_input, training=False))
            except Exception as err:
                # Report the offending input and the actual error, then stop.
                # Rows after this point keep their missing TIMESTAMP.
                print(model_input)
                print("crash...", repr(err))
                break

            predicted_timestamp = timestamp + prediction
            # Convert once and reuse for all calendar fields.
            predicted_datetime  = pd.to_datetime(predicted_timestamp, unit='s')

            test_df.at[idx, 'TIMESTAMP'       ] = predicted_timestamp
            test_df.at[idx, 'TIME_POSITION'   ] = timestamp_to_timeposition(int(predicted_timestamp))
            test_df.at[idx, 'DAY_OF_WEEK'     ] = predicted_datetime.dayofweek
            test_df.at[idx, 'HOUR_OF_DAY'     ] = predicted_datetime.hour
            test_df.at[idx, 'MINUTE_OF_HOUR'  ] = predicted_datetime.minute
            test_df.at[idx, 'SECOND_OF_MINUTE'] = predicted_datetime.second

    if idx%5000==0:
        print("completed at", idx)

completed at 0
completed at 5000
completed at 10000
completed at 15000
completed at 20000
completed at 25000
completed at 30000
completed at 35000
OMG exceeded time pos...
completed at 40000
completed at 45000
OMG exceeded time pos...
OMG exceeded time pos...
completed at 50000
completed at 55000
completed at 60000
OMG exceeded time pos...
OMG exceeded time pos...
completed at 65000
completed at 70000
completed at 75000
completed at 80000
completed at 85000
completed at 90000
completed at 95000
completed at 100000
OMG exceeded time pos...
completed at 105000
completed at 110000
completed at 115000
completed at 120000
completed at 125000
completed at 130000
completed at 135000
completed at 140000
completed at 145000
completed at 150000
completed at 155000
completed at 160000
completed at 165000
completed at 170000
completed at 175000
completed at 180000
completed at 185000
completed at 190000
completed at 195000
completed at 200000
completed at 205000
completed at 210000
completed at 21

In [17]:
# Write the row identifier and the (partly predicted) timestamp for every row.
output_columns = ['index', 'TIMESTAMP']
test_df[output_columns].to_csv(path_or_buf="./out/output3.csv", index=False)