In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import pickle

In [2]:
# Load the raw datasets: training records, route table and stop table.

train_df, routes_df, stop_df = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/training.csv", "./data/routes.csv", "./data/stops.csv")
)

In [3]:
# Prepare the lists of distinct ids (routes, stops and buses).

route_list = pd.unique(routes_df['BUSROUTE_ID'])
stop_list  = pd.unique(stop_df['BUSSTOP_ID'])
bus_list   = pd.unique(train_df['BUS_ID'])

In [4]:
# Order the rows by bus and then by timestamp, i.e. in the sequence each bus travelled.

train_df = train_df.sort_values(['BUS_ID', 'TIMESTAMP'], ascending=True)

In [5]:
# TIMESTAMP_DIFF holds, for each row, the number of seconds between this row's
# TIMESTAMP and the TIMESTAMP of the following row (next row minus current row).
# This column is the value the model is trained to predict.

train_df['TIMESTAMP_DIFF'] = (
    train_df['TIMESTAMP']
    .shift(periods=-1)
    .sub(train_df['TIMESTAMP'])
)

In [6]:
# Clean out skipped stops and route/bus boundaries.
# For every row, compare it with the following row; a row is kept only when the
# following row is the very next stop (BUSSTOP_SEQ + 1) on the same route and
# the same bus. Rows with no following row (NaN differences) are dropped too.

train_df = train_df.assign(
    SEQ_DIFF=lambda frame: frame['BUSSTOP_SEQ'].shift(-1) - frame['BUSSTOP_SEQ'],
    ROUTE_DIFF=lambda frame: frame['BUSROUTE_ID'].shift(-1) - frame['BUSROUTE_ID'],
    BUS_ID_DIFF=lambda frame: frame['BUS_ID'].shift(-1) - frame['BUS_ID'],
)

train_df = train_df[
    train_df['SEQ_DIFF'].eq(1)
    & train_df['ROUTE_DIFF'].eq(0)
    & train_df['BUS_ID_DIFF'].eq(0)
]

In [7]:
# Re-number the BUS_ID, BUSROUTE_ID and BUSSTOP_ID columns as integer codes
# starting from 0. Each encoder is fitted on the distinct-id list prepared
# earlier and then applied to the matching column of train_df.

# Bus ids.
# Usage note: bus_id_encoder.transform([...]) maps raw ids to codes, and
# bus_id_encoder.inverse_transform([...]) maps codes back to raw ids.
bus_id_encoder = preprocessing.LabelEncoder().fit(bus_list)
train_df['BUS_ID'] = bus_id_encoder.transform(train_df['BUS_ID'])

# Route ids.
route_id_encoder = preprocessing.LabelEncoder().fit(route_list)
train_df['BUSROUTE_ID'] = route_id_encoder.transform(train_df['BUSROUTE_ID'])

# Stop ids.
stop_id_encoder = preprocessing.LabelEncoder().fit(stop_list)
train_df['BUSSTOP_ID'] = stop_id_encoder.transform(train_df['BUSSTOP_ID'])

In [8]:
# Persist the fitted encoders so the same id mapping can be reused later on the
# test data. Each file is opened in a `with` block so the handle is flushed and
# closed deterministically instead of being left open until garbage collection.

with open("./out/bus_id_encoder.pickle", "wb") as encoder_file:
    pickle.dump(bus_id_encoder, encoder_file)

with open("./out/route_id_encoder.pickle", "wb") as encoder_file:
    pickle.dump(route_id_encoder, encoder_file)

with open("./out/busstop_id_encoder.pickle", "wb") as encoder_file:
    pickle.dump(stop_id_encoder, encoder_file)

In [9]:
# DAY_OF_WEEK records which day of the week each record falls on,
# derived from RECORD_DATE.

record_dates = pd.to_datetime(train_df['RECORD_DATE'])
train_df['DAY_OF_WEEK'] = record_dates.dt.weekday

In [10]:
# Time validation: only 06:00:00 ~ 23:59:59 counts as valid.
# Records between midnight and 06:00 are dropped, on the assumption that buses
# were not running to schedule during those hours.

starting_time = 6 * 60 * 60  # 06:00:00 expressed in seconds (21600)
train_df['TIME_validation'] = train_df['TIMESTAMP'].mod(86400)  # 86400 s per day
train_df = train_df.loc[train_df['TIME_validation'].ge(starting_time)]

In [11]:
# Encode the time of day.
# TIME is the second-of-day of each record, where the day is considered to
# start at 06:00 (starting_time) rather than at midnight.

train_df['TIME'] = train_df['TIMESTAMP'].sub(starting_time).mod(86400)

In [12]:
# Keep only rows whose travel time to the next stop is under 2000 seconds;
# longer gaps are treated as a breakdown or a temporary stop and removed.

train_df = train_df[train_df['TIMESTAMP_DIFF'].lt(2000)]

In [13]:
# Preview the first five rows of the cleaned frame.
train_df.head(n=5)

Unnamed: 0,index,RECORD_DATE,BUS_ID,BUSROUTE_ID,BUSSTOP_ID,BUSSTOP_SEQ,TIMESTAMP,TIMESTAMP_DIFF,SEQ_DIFF,ROUTE_DIFF,BUS_ID_DIFF,DAY_OF_WEEK,TIME_validation,TIME
1624542,1624542,2020-01-13 07:28:25,0,3,17,2,1578900505,117.0,1.0,0.0,0.0,0,26905,5305
1625151,1625151,2020-01-13 07:30:22,0,3,10,3,1578900622,123.0,1.0,0.0,0.0,0,27022,5422
1625791,1625791,2020-01-13 07:32:25,0,3,8,4,1578900745,121.0,1.0,0.0,0.0,0,27145,5545
1626425,1626425,2020-01-13 07:34:26,0,3,5,5,1578900866,136.0,1.0,0.0,0.0,0,27266,5666
1627161,1627161,2020-01-13 07:36:42,0,3,3,6,1578901002,174.0,1.0,0.0,0.0,0,27402,5802


In [14]:
# Save the cleaned data: only the model features and the target column
# (TIMESTAMP_DIFF) are written, without the DataFrame index.

output_df = train_df[
    [
        "DAY_OF_WEEK",
        "TIME",
        "BUS_ID",
        "BUSROUTE_ID",
        "BUSSTOP_ID",
        "TIMESTAMP_DIFF",
    ]
]
output_df.to_csv('./out/cleaned_train.csv', index=False)