In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import pickle

In [2]:
# Load the raw training CSV; every later cleaning step starts from this frame.
train_df_raw = pd.read_csv("./data/training.csv")

In [3]:
# Load the routes table and collect its distinct BUSROUTE_ID values
# (the route label encoder is fitted on this list further down).
routes_df = pd.read_csv("./data/routes.csv")
route_list = routes_df['BUSROUTE_ID'].unique()

In [4]:
# Load the stops table and collect its distinct BUSSTOP_ID values
# (the stop label encoder is fitted on this list further down).
stop_df = pd.read_csv("./data/stops.csv")
stop_list = stop_df['BUSSTOP_ID'].unique()

In [5]:
# Distinct bus ids, taken from the training data itself rather than a lookup file.
bus_list = train_df_raw['BUS_ID'].unique()

In [6]:
# Working name for the raw frame. This binds the same object (no copy is made);
# later cells rebind train_df to derived frames.
train_df = train_df_raw

In [7]:
# Preview the first rows to check the columns that were loaded.
train_df.head()

Unnamed: 0,index,RECORD_DATE,BUS_ID,BUSROUTE_ID,BUSSTOP_ID,BUSSTOP_SEQ,TIMESTAMP
0,0,2020-01-06 07:00:00,325819008,11100360,388,21,1578294000
1,1,2020-01-06 07:00:00,451509115,11100480,112,9,1578294000
2,2,2020-01-06 07:00:00,235501055,11100500,374,9,1578294000
3,3,2020-01-06 07:00:00,145010001,11100330,331,30,1578294000
4,4,2020-01-06 07:00:01,421733004,11100620,51,29,1578294001


In [8]:
# Order records by bus and then by time, so the row following each record is
# the same bus's next record (or the first record of the next bus).
train_df = train_df.sort_values(by=['BUS_ID', 'TIMESTAMP'])

In [9]:
# Seconds from this record to the next row of the sorted frame
# (NaN on the last row, which has no successor).
train_df['TIMESTAMP_DIFF'] = train_df['TIMESTAMP'].shift(-1) - train_df['TIMESTAMP']

In [10]:
# Change in stop sequence number from this row to the next row.
train_df['SEQ_DIFF'] = train_df['BUSSTOP_SEQ'].shift(-1) - train_df['BUSSTOP_SEQ']

In [11]:
# Difference in route id to the next row; 0 means the next row is on the same route.
train_df['ROUTE_DIFF'] = train_df['BUSROUTE_ID'].shift(-1) - train_df['BUSROUTE_ID']

In [12]:
# Difference in bus id to the next row; 0 means the next row belongs to the same bus.
train_df['BUS_ID_DIFF'] = train_df['BUS_ID'].shift(-1) - train_df['BUS_ID']

In [13]:
# train_df = train_df[(train_df['SEQ_DIFF']==1.0) & (train_df['ROUTE_DIFF']==0.0) & (train_df['BUS_ID_DIFF']==0.0)]

In [14]:
# Keep only rows whose next record is the same bus, on the same route, at the
# immediately following stop (sequence number +1). For those rows
# TIMESTAMP_DIFF is the time between two consecutive stop records.
is_consecutive_stop = (
    (train_df['SEQ_DIFF'] == 1)
    & (train_df['ROUTE_DIFF'] == 0)
    & (train_df['BUS_ID_DIFF'] == 0)
)
# .copy() yields an independent frame, so the column assignments in later
# cells do not write into a filtered slice (SettingWithCopyWarning).
train_df = train_df[is_consecutive_stop].copy()

In [15]:
# Sanity check: list any remaining rows whose gap to the next record is under
# one second. The recorded output below is empty.
train_df[train_df['TIMESTAMP_DIFF']<1]

Unnamed: 0,index,RECORD_DATE,BUS_ID,BUSROUTE_ID,BUSSTOP_ID,BUSSTOP_SEQ,TIMESTAMP,TIMESTAMP_DIFF,SEQ_DIFF,ROUTE_DIFF,BUS_ID_DIFF


In [16]:
# bus id encoding
bus_id_encoder = preprocessing.LabelEncoder()
bus_id_encoder.fit(bus_list)
train_df.BUS_ID = bus_id_encoder.transform(train_df.BUS_ID)

# bus_id_encoder.transform([145010001, 235501055])
# bus_id_encoder.inverse_transform([280, 466])

In [17]:
# route id encoding
route_id_encoder = preprocessing.LabelEncoder()
route_id_encoder.fit(route_list)
train_df.BUSROUTE_ID = route_id_encoder.transform(train_df.BUSROUTE_ID)

In [18]:
# stop id encoding
stop_id_encoder = preprocessing.LabelEncoder()
stop_id_encoder.fit(stop_list)
train_df.BUSSTOP_ID = stop_id_encoder.transform(train_df.BUSSTOP_ID)

In [19]:
# Persist the three fitted encoders so the same id -> label mapping can be
# loaded again outside this notebook.
# Each file is opened in a `with` block so its handle is flushed and closed
# as soon as the dump finishes, instead of being left open.
with open("./out/bus_id_encoder.pickle", "wb") as encoder_file:
    pickle.dump(bus_id_encoder, encoder_file)

with open("./out/route_id_encoder.pickle", "wb") as encoder_file:
    pickle.dump(route_id_encoder, encoder_file)

with open("./out/busstop_id_encoder.pickle", "wb") as encoder_file:
    pickle.dump(stop_id_encoder, encoder_file)

In [20]:
# Display the frame to inspect the label-encoded id columns.
train_df

Unnamed: 0,index,RECORD_DATE,BUS_ID,BUSROUTE_ID,BUSSTOP_ID,BUSSTOP_SEQ,TIMESTAMP,TIMESTAMP_DIFF,SEQ_DIFF,ROUTE_DIFF,BUS_ID_DIFF
1624542,1624542,2020-01-13 07:28:25,0,3,17,2,1578900505,117.0,1.0,0.0,0.0
1625151,1625151,2020-01-13 07:30:22,0,3,10,3,1578900622,123.0,1.0,0.0,0.0
1625791,1625791,2020-01-13 07:32:25,0,3,8,4,1578900745,121.0,1.0,0.0,0.0
1626425,1626425,2020-01-13 07:34:26,0,3,5,5,1578900866,136.0,1.0,0.0,0.0
1627161,1627161,2020-01-13 07:36:42,0,3,3,6,1578901002,174.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2893774,2893774,2020-01-17 21:47:30,1076,8,109,31,1579297650,74.0,1.0,0.0,0.0
2894017,2894017,2020-01-17 21:48:44,1076,8,117,32,1579297724,123.0,1.0,0.0,0.0
2894432,2894432,2020-01-17 21:50:47,1076,8,119,33,1579297847,91.0,1.0,0.0,0.0
2894723,2894723,2020-01-17 21:52:18,1076,8,121,34,1579297938,82.0,1.0,0.0,0.0


In [25]:
# Calendar and clock features derived from RECORD_DATE.
# The column is parsed once and reused for all four features. .assign returns
# a new frame, so nothing is written into a filtered slice.
record_dt = pd.to_datetime(train_df['RECORD_DATE'])
train_df = train_df.assign(
    DAY_OF_WEEK=record_dt.dt.dayofweek,  # Monday=0 ... Sunday=6
    HOUR_OF_DAY=record_dt.dt.hour,
    MINUTE_OF_HOUR=record_dt.dt.minute,
    SECOND_OF_MINUTE=record_dt.dt.second,
)

In [26]:
# Display the frame to inspect the newly added date/time feature columns.
train_df

Unnamed: 0,index,RECORD_DATE,BUS_ID,BUSROUTE_ID,BUSSTOP_ID,BUSSTOP_SEQ,TIMESTAMP,TIMESTAMP_DIFF,SEQ_DIFF,ROUTE_DIFF,BUS_ID_DIFF,DAY_OF_WEEK,HOUR_OF_DAY,MINUTE_OF_HOUR,SECOND_OF_MINUTE
1624542,1624542,2020-01-13 07:28:25,0,3,17,2,1578900505,117.0,1.0,0.0,0.0,0,7,28,25
1625151,1625151,2020-01-13 07:30:22,0,3,10,3,1578900622,123.0,1.0,0.0,0.0,0,7,30,22
1625791,1625791,2020-01-13 07:32:25,0,3,8,4,1578900745,121.0,1.0,0.0,0.0,0,7,32,25
1626425,1626425,2020-01-13 07:34:26,0,3,5,5,1578900866,136.0,1.0,0.0,0.0,0,7,34,26
1627161,1627161,2020-01-13 07:36:42,0,3,3,6,1578901002,174.0,1.0,0.0,0.0,0,7,36,42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2893774,2893774,2020-01-17 21:47:30,1076,8,109,31,1579297650,74.0,1.0,0.0,0.0,4,21,47,30
2894017,2894017,2020-01-17 21:48:44,1076,8,117,32,1579297724,123.0,1.0,0.0,0.0,4,21,48,44
2894432,2894432,2020-01-17 21:50:47,1076,8,119,33,1579297847,91.0,1.0,0.0,0.0,4,21,50,47
2894723,2894723,2020-01-17 21:52:18,1076,8,121,34,1579297938,82.0,1.0,0.0,0.0,4,21,52,18


In [27]:
def timestamp_to_timeposition(time, starting_time=21600):
    """Return the position of ``time`` within a day, counted from ``starting_time``.

    Args:
        time: Timestamp in seconds.
        starting_time: Offset in seconds that maps to position 0
            (default 21600, i.e. 06:00:00).

    Returns:
        ``(time - starting_time) % 86400``, or 0 when ``time`` is smaller than
        ``starting_time``.
    """
    # NOTE(review): the guard compares the full timestamp with starting_time,
    # so for real epoch values it never triggers. Possibly the time of day
    # (time % 86400) was meant here -- confirm.
    offset = time - starting_time
    if offset < 0:
        return 0
    return offset % 86400

In [28]:
def timestamp_to_dayofweek(time):
    """Return the day-of-week index (0=Monday ... 6=Sunday) of an epoch timestamp.

    ``time`` is in seconds. Day boundaries fall on multiples of 86400 and no
    timezone offset is applied. The +3 shift accounts for epoch day 0
    (1970-01-01) being a Thursday.
    """
    whole_hours = time // 3600
    whole_days = whole_hours // 24
    return (whole_days + 3) % 7

In [29]:
# Spot-check the helper with one sample timestamp (recorded result: 58157).
timestamp_to_timeposition(1578607757)

58157

In [30]:
# Time validation: keep only records whose time of day is 06:00:00 ~ 23:59:59.
starting_time = 21600  # 06:00:00 expressed as seconds after midnight
# .assign returns a new frame, so TIME_validation is not written into a
# filtered slice. The trailing .copy() makes the filtered result independent,
# so later column assignments do not raise SettingWithCopyWarning.
train_df = train_df.assign(TIME_validation=train_df['TIMESTAMP'] % 86400)
train_df = train_df[train_df['TIME_validation'] >= starting_time].copy()

In [31]:
# Encoding time: seconds elapsed since starting_time within the day
# (wraps around at 86400 s).
# .assign builds a new frame instead of setting a column on a filtered slice;
# the in-place assignment is what raised SettingWithCopyWarning in this cell.
train_df = train_df.assign(TIME=(train_df['TIMESTAMP'] - starting_time) % 86400)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [32]:
# Inspect the spread of TIMESTAMP_DIFF by sorting on it (smallest gaps first,
# largest last).
train_df.sort_values(by=['TIMESTAMP_DIFF'])

Unnamed: 0,index,RECORD_DATE,BUS_ID,BUSROUTE_ID,BUSSTOP_ID,BUSSTOP_SEQ,TIMESTAMP,TIMESTAMP_DIFF,SEQ_DIFF,ROUTE_DIFF,BUS_ID_DIFF,DAY_OF_WEEK,HOUR_OF_DAY,MINUTE_OF_HOUR,SECOND_OF_MINUTE,TIME_validation,TIME
2495182,2495182,2020-01-16 13:20:24,610,206,378,16,1579180824,3.0,1.0,0.0,0.0,3,13,20,24,48024,26424
257759,257759,2020-01-07 07:23:00,296,260,1170,47,1578381780,4.0,1.0,0.0,0.0,1,7,23,0,26580,4980
80314,80314,2020-01-06 11:18:34,725,251,379,27,1578309514,4.0,1.0,0.0,0.0,0,11,18,34,40714,19114
1170937,1170937,2020-01-10 14:41:21,427,41,452,40,1578667281,5.0,1.0,0.0,0.0,4,14,41,21,52881,31281
2543779,2543779,2020-01-16 16:05:54,200,251,379,27,1579190754,5.0,1.0,0.0,0.0,3,16,5,54,57954,36354
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1274219,1274219,2020-01-10 21:28:38,710,135,163,8,1578691718,207631.0,1.0,0.0,0.0,4,21,28,38,77318,55718
1031576,1031576,2020-01-09 22:09:17,468,200,1158,21,1578607757,208388.0,1.0,0.0,0.0,3,22,9,17,79757,58157
1238330,1238330,2020-01-10 19:06:14,219,265,148,13,1578683174,214548.0,1.0,0.0,0.0,4,19,6,14,68774,47174
1162737,1162737,2020-01-10 14:12:16,800,223,1194,25,1578665536,232925.0,1.0,0.0,0.0,4,14,12,16,51136,29536


In [33]:
# Gaps of 2000 seconds or more are assumed to mean the bus broke down or made
# a temporary stop, so those rows are removed (only gaps under 2000 s are kept).
train_df = train_df.loc[train_df['TIMESTAMP_DIFF'].lt(2000)]

In [34]:
# Display the frame after the long-gap rows have been removed.
train_df

Unnamed: 0,index,RECORD_DATE,BUS_ID,BUSROUTE_ID,BUSSTOP_ID,BUSSTOP_SEQ,TIMESTAMP,TIMESTAMP_DIFF,SEQ_DIFF,ROUTE_DIFF,BUS_ID_DIFF,DAY_OF_WEEK,HOUR_OF_DAY,MINUTE_OF_HOUR,SECOND_OF_MINUTE,TIME_validation,TIME
1624542,1624542,2020-01-13 07:28:25,0,3,17,2,1578900505,117.0,1.0,0.0,0.0,0,7,28,25,26905,5305
1625151,1625151,2020-01-13 07:30:22,0,3,10,3,1578900622,123.0,1.0,0.0,0.0,0,7,30,22,27022,5422
1625791,1625791,2020-01-13 07:32:25,0,3,8,4,1578900745,121.0,1.0,0.0,0.0,0,7,32,25,27145,5545
1626425,1626425,2020-01-13 07:34:26,0,3,5,5,1578900866,136.0,1.0,0.0,0.0,0,7,34,26,27266,5666
1627161,1627161,2020-01-13 07:36:42,0,3,3,6,1578901002,174.0,1.0,0.0,0.0,0,7,36,42,27402,5802
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2893774,2893774,2020-01-17 21:47:30,1076,8,109,31,1579297650,74.0,1.0,0.0,0.0,4,21,47,30,78450,56850
2894017,2894017,2020-01-17 21:48:44,1076,8,117,32,1579297724,123.0,1.0,0.0,0.0,4,21,48,44,78524,56924
2894432,2894432,2020-01-17 21:50:47,1076,8,119,33,1579297847,91.0,1.0,0.0,0.0,4,21,50,47,78647,57047
2894723,2894723,2020-01-17 21:52:18,1076,8,121,34,1579297938,82.0,1.0,0.0,0.0,4,21,52,18,78738,57138


In [35]:
# Select the columns that go into the cleaned training file: the date/time
# features, the encoded ids, and TIMESTAMP_DIFF.
# NOTE(review): TIMESTAMP_DIFF is presumably the prediction target -- confirm.
output_df = train_df.loc[:, [
    "DAY_OF_WEEK",
    "HOUR_OF_DAY",
    "MINUTE_OF_HOUR",
    "SECOND_OF_MINUTE",
    "TIME",
    "BUS_ID",
    "BUSROUTE_ID",
    "BUSSTOP_ID",
    "TIMESTAMP_DIFF",
]]

In [36]:
# Write the selected columns to CSV; index=False leaves the DataFrame index
# out of the file.
output_df.to_csv('./out/cleaned_train.csv', index = False)