In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
path_train = r"./train_1/train/"
path_test = r"./train_2/test/"
ind = "wagnum"

# Импортируем датасеты, оставляем необходимые признаки и удаляем дубликаты

In [3]:
wag_prob = pd.read_parquet(path_train + '/wagons_probeg_ownersip.parquet').convert_dtypes()
dislok_wag = pd.read_parquet(path_train + '/dislok_wagons.parquet').convert_dtypes()
wag_param = pd.read_parquet(path_train + '/wag_params.parquet').convert_dtypes()
pr_rems = pd.read_parquet(path_train + '/pr_rems.parquet').convert_dtypes()
tr_rems = pd.read_parquet(path_train + '/tr_rems.parquet').convert_dtypes()

In [4]:
wag_prob = wag_prob[[ind, "ost_prob"]]
dislok_wag = dislok_wag[[ind, "date_kap", "date_dep", "date_pl_rem"]]
wag_param = wag_param[[ind, "model", "gruz", "tara", "date_build", "srok_sl", "cnsi_probeg_dr", "cnsi_probeg_kr", "norma_km"]]
pr_rems = pr_rems[[ind, "model"]]
tr_rems = tr_rems[[ind, "rem_month"]]

In [5]:
wag_prob.drop_duplicates(subset=ind, inplace=True)
dislok_wag.drop_duplicates(subset=ind, inplace=True)
wag_param.drop_duplicates(subset=ind, inplace=True)
pr_rems.drop_duplicates(subset=ind, inplace=True)
tr_rems.drop_duplicates(subset=ind, inplace=True)

In [6]:
train_df = wag_prob.merge(dislok_wag, on=ind, how="left")\
    .merge(wag_param, on=ind, how="left")\
    .merge(pr_rems, on=ind, how="left")\
    .merge(tr_rems, on=ind, how="left")

In [7]:
wag_prob = pd.read_parquet(path_test + '/wagons_probeg_ownersip.parquet').convert_dtypes()
dislok_wag = pd.read_parquet(path_test + '/dislok_wagons.parquet').convert_dtypes()
wag_param = pd.read_parquet(path_test + '/wag_params.parquet').convert_dtypes()
pr_rems = pd.read_parquet(path_test + '/pr_rems.parquet').convert_dtypes()
tr_rems = pd.read_parquet(path_test + '/tr_rems.parquet').convert_dtypes()

In [8]:
wag_prob = wag_prob[[ind, "ost_prob"]]
dislok_wag = dislok_wag[[ind, "date_kap", "date_dep", "date_pl_rem"]]
wag_param = wag_param[[ind, "model", "gruz", "tara", "date_build", "srok_sl", "cnsi_probeg_dr", "cnsi_probeg_kr", "norma_km"]]
pr_rems = pr_rems[[ind, "model"]]
tr_rems = tr_rems[[ind, "rem_month"]]

In [9]:
wag_prob.drop_duplicates(subset=ind, inplace=True)
dislok_wag.drop_duplicates(subset=ind, inplace=True)
wag_param.drop_duplicates(subset=ind, inplace=True)
pr_rems.drop_duplicates(subset=ind, inplace=True)
tr_rems.drop_duplicates(subset=ind, inplace=True)

In [10]:
test_df = wag_prob.merge(dislok_wag, on=ind, how="left")\
    .merge(wag_param, on=ind, how="left")\
    .merge(pr_rems, on=ind, how="left")\
    .merge(tr_rems, on=ind, how="left")

# Почистим пропущенные значения

In [11]:
train_df.isna().sum()

wagnum                0
ost_prob           1679
date_kap          14979
date_dep           1913
date_pl_rem           0
model_x               0
gruz                  0
tara                  0
date_build            0
srok_sl               0
cnsi_probeg_dr        0
cnsi_probeg_kr        0
norma_km              0
model_y           23584
rem_month         16128
dtype: int64

In [12]:
test_df.isna().sum()

wagnum                0
ost_prob           1166
date_kap          14985
date_dep           1920
date_pl_rem           7
model_x               0
gruz                  0
tara                  0
date_build            0
srok_sl               0
cnsi_probeg_dr        0
cnsi_probeg_kr        0
norma_km              0
model_y           32119
rem_month         29786
dtype: int64

Фича ```model_y``` имеет слишком много пропущенных значений, поэтому удалим её

In [13]:
train_df.drop(columns=["model_x", "model_y"], inplace=True)
test_df.drop(columns=["model_x", "model_y"], inplace=True)

Фичи ```rem_month``` и ```date_kap``` также имеют слишком много пропусков

In [14]:
train_df.drop(columns=["rem_month", "date_kap"], inplace=True)
test_df.drop(columns=["rem_month", "date_kap"], inplace=True)

# Переведём данные с датами к одному формату, а далее закодируем используя timestap

In [16]:
date_features = ["date_dep", "date_pl_rem", "date_build", "srok_sl"]

train_df[date_features] = train_df[date_features].astype("datetime64[ms]")
test_df[date_features] = test_df[date_features].astype("datetime64[ms]")

In [30]:
for column in date_features:
    train_df[column] = pd.to_datetime(train_df[column], errors="ignore").astype(int)
    train_median_date = train_df[column].median()
    train_df[column].fillna(train_median_date, inplace=True)
    test_df[column] = pd.to_datetime(test_df[column], errors="ignore").astype(int)
    test_median_date = test_df[column].median()
    test_df[column].fillna(test_median_date, inplace=True)

In [31]:
test_df.isna().sum()

wagnum               0
ost_prob          1166
date_dep             0
date_pl_rem          0
gruz                 0
tara                 0
date_build           0
srok_sl              0
cnsi_probeg_dr       0
cnsi_probeg_kr       0
norma_km             0
dtype: int64

Заполним последний столбец медианными значениями

In [32]:
train_df["ost_prob"].fillna(int(train_df["ost_prob"].median()), inplace=True)
test_df["ost_prob"].fillna(int(test_df["ost_prob"].median()), inplace=True)

In [33]:
test_df.isna().sum()

wagnum            0
ost_prob          0
date_dep          0
date_pl_rem       0
gruz              0
tara              0
date_build        0
srok_sl           0
cnsi_probeg_dr    0
cnsi_probeg_kr    0
norma_km          0
dtype: int64

# Импортируем таргеты

In [36]:
train_target = pd.read_csv(path_train + "target/y_train.csv")
train_target.head()

Unnamed: 0,wagnum,month,target_month,target_day
0,33361,2023-01-01,0,0
1,33364,2023-01-01,0,0
2,33366,2023-01-01,0,0
3,33358,2023-01-01,0,0
4,33349,2023-01-01,0,0


In [37]:
train_dataset = train_df.merge(train_target, on=ind, how="left")

In [46]:
X_train = train_dataset.drop(columns=[ind, "month", "target_day", "target_month"])
y_day_train, y_month_train = train_dataset["target_day"], train_dataset["target_month"]

In [52]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from catboost import CatBoostClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

In [51]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
catboost_model = CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, loss_function='Logloss', random_state=42)
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)

stacked_model_day = StackingClassifier(
    classifiers=[rf_model, catboost_model, gb_model, mlp_model],
    meta_classifier=RandomForestClassifier(n_estimators=50, random_state=42)
)

stacked_model_month = StackingClassifier(
    classifiers=[rf_model, catboost_model, gb_model, mlp_model],
    meta_classifier=RandomForestClassifier(n_estimators=50, random_state=42)
)

In [54]:
day_classifier_pipeline = Pipeline([
    ('scaler', MinMaxScaler()), 
    ('classifier', stacked_model_day)
])

month_classifier_pipeline = Pipeline([
    ('scaler', MinMaxScaler()), 
    ('classifier', stacked_model_month)
])

In [55]:
day_classifier_pipeline.fit(X_train, y_day_train)
month_classifier_pipeline.fit(X_train, y_month_train)

0:	learn: 0.6041101	total: 10.2ms	remaining: 1.01s
1:	learn: 0.5314021	total: 18.9ms	remaining: 926ms
2:	learn: 0.4708702	total: 28.6ms	remaining: 924ms
3:	learn: 0.4197250	total: 36.4ms	remaining: 874ms
4:	learn: 0.3761771	total: 45.4ms	remaining: 862ms
5:	learn: 0.3389129	total: 53.8ms	remaining: 843ms
6:	learn: 0.3067274	total: 58.9ms	remaining: 782ms
7:	learn: 0.2788259	total: 66ms	remaining: 759ms
8:	learn: 0.2544725	total: 73.2ms	remaining: 741ms
9:	learn: 0.2332237	total: 82.8ms	remaining: 746ms
10:	learn: 0.2144682	total: 91.3ms	remaining: 739ms
11:	learn: 0.1979867	total: 100ms	remaining: 737ms
12:	learn: 0.1836059	total: 109ms	remaining: 730ms
13:	learn: 0.1707528	total: 116ms	remaining: 714ms
14:	learn: 0.1594095	total: 124ms	remaining: 700ms
15:	learn: 0.1493724	total: 132ms	remaining: 693ms
16:	learn: 0.1404920	total: 140ms	remaining: 686ms
17:	learn: 0.1325739	total: 148ms	remaining: 676ms
18:	learn: 0.1257271	total: 156ms	remaining: 666ms
19:	learn: 0.1194445	total: 164m

In [60]:
test_day = day_classifier_pipeline.predict(test_df.drop(columns=[ind]))
test_month = month_classifier_pipeline.predict(test_df.drop(columns=[ind]))

In [79]:
pd.Series(test_day).value_counts()

0    33708
Name: count, dtype: int64

In [80]:
pd.Series(test_month).value_counts()

0    33708
Name: count, dtype: int64