In [128]:
import json
import time
import datetime
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost as xgb
import matplotlib.pyplot as plt
import requests
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

In [129]:
orders_df = pd.read_csv("orders.csv")
nodes_df = pd.read_csv("nodes.csv")

In [130]:
orders_df.running_time = pd.to_datetime(orders_df.running_time, format='%Y-%m-%d %H:%M:%S')
orders_df.completed_time = pd.to_datetime(orders_df.completed_time, format='%Y-%m-%d %H:%M:%S')

In [131]:
nodes_df.dropna(inplace=True)
mean_speed_list = []

for i in orders_df["Id"]:
    mean_speed_list.append(nodes_df[nodes_df["Id"] == i].iloc[0, 4].mean())

orders_df["mean_nodes_speed"] = mean_speed_list

In [132]:
def day_period(hours):
    if 3 <= hours < 7:
        return 0
    elif 7 <= hours < 12:
        return 1
    elif 12 <= hours < 18:
        return 2
    else:
        return 3


day_period_list = []
for i in orders_df["Id"]:
    day_period_list.append(day_period(orders_df[orders_df["Id"] == i].iloc[0, 1].hour))

orders_df["day_period"] = day_period_list

In [133]:
orders_df = orders_df.drop(np.where(orders_df.route_distance_km == 0)[0])

In [134]:
odesa_df = pd.read_csv("nodes/odesa.csv")

In [135]:
nodes_df["node_start_lat"] = nodes_df["node_start"].map(odesa_df.set_index("id")["lat"]).dropna()
nodes_df["node_start_lon"] = nodes_df["node_start"].map(odesa_df.set_index("id")["lon"]).dropna()

In [136]:
nodes_df.dropna(inplace=True)

mean_lat = nodes_df.groupby(by="Id").apply(lambda df: df["node_start_lat"].mean())
mean_lon = nodes_df.groupby(by="Id").apply(lambda df: df["node_start_lon"].mean())

orders_df["node_start_lat"] = orders_df["Id"].map(mean_lat)
orders_df["node_start_lon"] = orders_df["Id"].map(mean_lon)


In [137]:
orders_df.dropna(inplace=True)

In [138]:
orders_df["mean_speed"] = orders_df["route_distance_km"] / (orders_df["delta_time"] / 3600)

In [139]:
X, Y = orders_df[orders_df.columns].drop(
    ['delta_time', 'completed_time', 'Id', 'mean_speed'], axis=1), orders_df["delta_time"]

In [140]:
X

Unnamed: 0,running_time,route_distance_km,mean_nodes_speed,day_period,node_start_lat,node_start_lon
0,2022-01-24 18:30:21,3.740,30.0,3,46.470668,30.736663
1,2022-01-24 06:53:53,3.526,32.0,0,46.474103,30.727825
2,2022-01-24 10:00:59,5.071,26.0,1,46.475841,30.741471
3,2022-01-24 14:28:05,2.867,24.0,2,46.571949,30.793730
4,2022-01-24 11:57:29,3.751,28.0,1,46.446009,30.691674
...,...,...,...,...,...,...
4995,2022-01-24 21:10:38,7.397,47.0,3,46.438255,30.715946
4996,2022-01-24 15:10:27,1.948,19.5,2,46.428127,30.756566
4997,2022-01-24 13:57:04,2.547,49.0,2,46.477205,30.703157
4998,2022-01-24 08:46:13,3.013,27.0,1,46.463377,30.741518


Створимо словник для зберігання оцінок кросс-валідації для різних моделей.

In [141]:
rsme_scores = {}

In [142]:
SEED = 42

In [143]:
from sklearn.metrics import make_scorer, mean_squared_error

mse = make_scorer(mean_squared_error, greater_is_better=False)

In [144]:
X

Unnamed: 0,running_time,route_distance_km,mean_nodes_speed,day_period,node_start_lat,node_start_lon
0,2022-01-24 18:30:21,3.740,30.0,3,46.470668,30.736663
1,2022-01-24 06:53:53,3.526,32.0,0,46.474103,30.727825
2,2022-01-24 10:00:59,5.071,26.0,1,46.475841,30.741471
3,2022-01-24 14:28:05,2.867,24.0,2,46.571949,30.793730
4,2022-01-24 11:57:29,3.751,28.0,1,46.446009,30.691674
...,...,...,...,...,...,...
4995,2022-01-24 21:10:38,7.397,47.0,3,46.438255,30.715946
4996,2022-01-24 15:10:27,1.948,19.5,2,46.428127,30.756566
4997,2022-01-24 13:57:04,2.547,49.0,2,46.477205,30.703157
4998,2022-01-24 08:46:13,3.013,27.0,1,46.463377,30.741518


Наступні моделі повині показати кращі результати так як вони є більш орієнтованими на нелінійні дані та робастність.

**Random Forest Reggressor**

In [205]:
from sklearn.ensemble import RandomForestRegressor

model_random_forest = RandomForestRegressor(
    random_state=SEED,
    max_depth=7,
    n_estimators=25
)

model_random_forest.fit(X.drop("running_time",axis=1), Y)

print(f'Best parameters {model_random_forest.best_params_}')
print(
    f'MSE score of the best_estimator: ' + f'{model_random_forest.best_score_:.3f}'
)
rsme_scores['random_forest'] = (model_random_forest.best_score_ * (-1)) ** 0.5

Best parameters {'max_depth': 7, 'n_estimators': 25}
MSE score of the best_estimator: -18982.923


**XGBoost**

In [146]:
from xgboost import XGBRegressor

model_xgb = XGBRegressor(
    random_state=SEED,
    learning_rate=0.1,
    max_depth=5,
    n_estimators=100,
)


model = model_xgb.fit(X.drop("running_time",axis=1), Y)


Mean cross-validation accuracy score of the best_estimator: 


In [None]:
test_train_list = []

from numpy import random

indices = X.index.values
rand_chosen = random.default_rng().choice(indices, round(len(indices)/10), replace=False)

rand_chosen

In [193]:
x_test = X.loc[rand_chosen]
y_test = Y.loc[rand_chosen]

x_train = X.loc[~X.index.isin(rand_chosen)]
y_train = Y.loc[~Y.index.isin(rand_chosen)]

In [194]:
x_train

Unnamed: 0,running_time,route_distance_km,mean_nodes_speed,day_period,node_start_lat,node_start_lon
0,2022-01-24 18:30:21,3.740,30.0,3,46.470668,30.736663
1,2022-01-24 06:53:53,3.526,32.0,0,46.474103,30.727825
2,2022-01-24 10:00:59,5.071,26.0,1,46.475841,30.741471
3,2022-01-24 14:28:05,2.867,24.0,2,46.571949,30.793730
4,2022-01-24 11:57:29,3.751,28.0,1,46.446009,30.691674
...,...,...,...,...,...,...
4993,2022-01-24 20:08:56,3.504,32.0,3,46.440358,30.746310
4995,2022-01-24 21:10:38,7.397,47.0,3,46.438255,30.715946
4997,2022-01-24 13:57:04,2.547,49.0,2,46.477205,30.703157
4998,2022-01-24 08:46:13,3.013,27.0,1,46.463377,30.741518


In [198]:
train_pairs = []

In [199]:
for i in x_test.index:
    yt = y_test.loc[i]
    xt = x_test.loc[i]

    x_tr = x_train[x_train["running_time"] < xt["running_time"]].drop(["running_time"],axis=1)
    y_tr = y_train[y_train.index.isin(x_tr.index)]

    # x_tr.drop(labels=["running_time"],axis=1, inplace=True)
    # xt.drop(labels=["running_time"], inplace=True)

    train_pairs.append((x_tr,y_tr,xt,yt))

len(train_pairs)

500

In [200]:
preds = []
for pair in train_pairs:
    x_tr,y_tr,xt,yt = pair
    model = model_xgb.fit(x_tr, y_tr)
    # xt_ =
    # print(xt_)
    prediction = model.predict(xt.to_frame().T.drop(["running_time"],axis=1).astype("float"))
    print(prediction, yt)
    preds.append(prediction)

[492.24365] 418.0
[507.62436] 399.0
[483.93448] 496.00000000000006
[498.02798] 543.0
[420.70285] 406.0
[807.246] 984.0
[806.66144] 791.0
[654.70636] 642.0
[759.79004] 836.0
[337.24072] 251.00000000000003
[383.87183] 632.0
[399.8181] 341.0
[502.0443] 479.00000000000006
[585.2406] 612.0
[485.30573] 548.0
[631.0248] 798.0
[555.1172] 564.0
[470.75583] 138.0
[773.9248] 810.0
[573.80414] 504.00000000000006
[725.173] 971.0
[661.4343] 681.0
[769.1079] 709.0
[532.73] 553.0
[600.6169] 848.0
[896.28217] 997.0
[432.40994] 434.0
[782.4143] 576.0
[740.4293] 535.0
[868.8377] 938.0
[894.08276] 976.0
[841.6085] 916.0
[527.7037] 718.0
[423.64636] 461.00000000000006
[709.74866] 561.0
[745.58997] 950.0
[507.76843] 608.0
[416.1051] 709.0
[525.00745] 488.00000000000006
[601.29346] 819.0
[592.90247] 765.0
[783.91455] 972.0
[735.55817] 737.0
[671.68744] 528.0
[744.4156] 554.0
[281.36444] 691.0
[446.4119] 413.0
[275.2144] 488.00000000000006
[845.3271] 747.0
[527.2212] 891.0
[730.5629] 702.0
[867.19727] 931.0
[

In [203]:
import math

preds_np = np.array(preds).flatten()
math.sqrt(((preds_np-y_test)**2).sum()/len(y_test))

147.60580389973114

In [204]:
preds_np

array([492.24365, 507.62436, 483.93448, 498.02798, 420.70285, 807.246  ,
       806.66144, 654.70636, 759.79004, 337.24072, 383.87183, 399.8181 ,
       502.0443 , 585.2406 , 485.30573, 631.0248 , 555.1172 , 470.75583,
       773.9248 , 573.80414, 725.173  , 661.4343 , 769.1079 , 532.73   ,
       600.6169 , 896.28217, 432.40994, 782.4143 , 740.4293 , 868.8377 ,
       894.08276, 841.6085 , 527.7037 , 423.64636, 709.74866, 745.58997,
       507.76843, 416.1051 , 525.00745, 601.29346, 592.90247, 783.91455,
       735.55817, 671.68744, 744.4156 , 281.36444, 446.4119 , 275.2144 ,
       845.3271 , 527.2212 , 730.5629 , 867.19727, 528.7327 , 802.6925 ,
       438.63626, 847.38464, 634.2353 , 777.3104 , 749.98395, 541.70636,
       546.10754, 399.2643 , 671.19867, 504.24423, 497.86465, 334.2537 ,
       745.18   , 367.57047, 547.4277 , 841.2137 , 574.0349 , 565.47296,
       790.8262 , 642.97296, 562.0309 , 715.7405 , 698.59454, 327.21292,
       276.5967 , 632.48425, 595.76385, 617.796  , 

In [206]:
from sklearn.ensemble import RandomForestRegressor

model_random_forest = RandomForestRegressor(
    random_state=SEED,
    max_depth=7,
    n_estimators=25
)

preds = []
for pair in train_pairs:
    x_tr,y_tr,xt,yt = pair
    model = model_random_forest.fit(x_tr, y_tr)
    prediction = model.predict(xt.to_frame().T.drop(["running_time"],axis=1).astype("float"))
    print(prediction, yt)
    preds.append(prediction)

[495.57503411] 418.0
[426.05252753] 399.0
[519.93739329] 496.00000000000006
[474.36866612] 543.0
[375.] 406.0
[786.72234716] 984.0
[832.39632291] 791.0
[651.76409437] 642.0
[664.44877344] 836.0
[375.41869088] 251.00000000000003
[364.84932717] 632.0
[404.70170146] 341.0
[486.17477278] 479.00000000000006
[573.78973915] 612.0
[505.0464699] 548.0
[683.55675999] 798.0
[583.61141171] 564.0
[482.96694817] 138.0
[747.23616247] 810.0
[611.96641298] 504.00000000000006
[756.41464538] 971.0
[666.17752614] 681.0
[839.90960792] 709.0
[543.82940391] 553.0
[614.61666667] 848.0
[842.96730737] 997.0
[576.08762373] 434.0
[774.394947] 576.0
[735.05216273] 535.0
[846.95863289] 938.0
[871.18088159] 976.0
[776.30942242] 916.0
[593.06529281] 718.0
[436.34180998] 461.00000000000006
[697.71610058] 561.0
[722.48867562] 950.0
[483.73645547] 608.0
[400.67326491] 709.0
[536.73851249] 488.00000000000006
[623.09909069] 819.0
[562.73986418] 765.0
[850.82666667] 972.0
[771.9856894] 737.0
[743.18660314] 528.0
[692.39393

In [207]:
preds_np = np.array(preds).flatten()
math.sqrt(((preds_np-y_test)**2).sum()/len(y_test))

147.9294301044813