In [1]:
import pandas as pd
from metrics_f1 import calc_f1_score

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

import numpy as np

# from scipy import stats
# from typing import List
# from scipy.stats import shapiro

from tqdm import tqdm

In [2]:
y_train_csv = pd.read_csv('./target/y_train.csv')
 # параметры вагона
wag_param = pd.read_parquet('wag_params.parquet').convert_dtypes()
# данные по дислокации
dislok = pd.read_parquet('dislok_wagons.parquet').convert_dtypes()
# список вагонов с остаточным пробегом на момент прогноза
wag_prob = pd.read_parquet('wagons_probeg_ownersip.parquet').convert_dtypes()

In [3]:
#для ускорения
data = y_train_csv.copy()
data

Unnamed: 0,wagnum,month,target_month,target_day
0,33361,2023-01-01,0,0
1,33364,2023-01-01,0,0
2,33366,2023-01-01,0,0
3,33358,2023-01-01,0,0
4,33349,2023-01-01,0,0
...,...,...,...,...
203848,25045,2022-12-01,0,0
203849,27156,2022-12-01,0,0
203850,21361,2022-12-01,0,0
203851,8061,2022-12-01,0,0


In [4]:
def get_dataset(data, columns_to_drop):
    #создаем копию тренировочных данных
    dataset = data.copy()
    dataset['month'] = dataset['month'].astype("datetime64[ns]")
    #добавляем дату постройки вагона и срок службы
    wag_data_build = wag_param[['wagnum', 'date_build', 'srok_sl']].drop_duplicates()
    dataset = dataset.merge(wag_data_build, how='left')
    #добавляем дату планового ремонта
    data_pl_rem = dislok[['wagnum', 'date_pl_rem']].drop_duplicates('wagnum')
    dataset = dataset.merge(data_pl_rem, how='left', on='wagnum')
    
    months = pd.DataFrame({
        "month_name": ["July", "August", "September", "October", "November", "December", "January", "February"],
        "date" : ["2022-07-01", "2022-08-01", "2022-09-01", "2022-10-01","2022-11-01", "2022-12-01","2023-01-01", "2023-01-31"],
    })
    # изменение типа данных 
    months['date'] = months['date'].astype("datetime64[ns]")
    
    #добавление параметров, зависящих от месяца
    for j in range(len(months['date'])):
        date = months.loc[j, 'date']
        month_name = months.loc[j, 'month_name']
        for i in tqdm(range(len(dataset['month']))):
            if date < dataset.loc[i,'month']:
                wagnum = dataset.loc[i, 'wagnum']
                #добавление остаточного пробега
                ost_prob_series = wag_prob.loc[((wag_prob.repdate == date) & (wag_prob.wagnum == wagnum)), 'ost_prob']
                if not ost_prob_series.empty:
                    ost_prob = ost_prob_series.iloc[0]
                else :
                    ost_prob = 0
                dataset.loc[i, month_name+'_ost_prob'] = ost_prob
                #добавление срока службы
                dataset.loc[i, month_name+'_ost_srok_sl'] = (dataset.loc[i,'srok_sl'] - date).days
            else:
                dataset.loc[i, month_name+'_ost_prob'] = 0
                dataset.loc[i, month_name+'_ost_srok_sl'] = 0
    
    # Удаление пустот
    dataset = dataset.dropna()
    dataset_y = dataset[columns_to_drop]
    dataset = dataset.drop(columns_to_drop, axis=1)

    #Подготовка датасета
    cat_col_names = []
    time_col_names = ['month', 'date_build', 'srok_sl', 'date_pl_rem']
    
    num_col_names = dataset.columns.values.tolist()
    
    df = dataset.copy()
    
    # Преобразуем признаки из формата Дата в формат float64
    for col in time_col_names:
        df[col] = df[col].values.astype("float64")
            
    #Кодирование категориальных признаков с помощью OHE
    encoder = OneHotEncoder(drop='first', sparse=False)
    X_ohe = encoder.fit_transform(df[cat_col_names])
    encoder_col_names = encoder.get_feature_names_out()
    
    # Применим масштабирование к вещественным признакам  
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df[num_col_names])
    
    X_ohe = pd.DataFrame(X_ohe, columns=encoder_col_names)
    X_scaled = pd.DataFrame(X_scaled, columns=num_col_names)
    
    X = pd.concat([X_ohe, X_scaled], axis=1)

    return X, dataset_y

In [5]:
columns_to_drop = ['target_month', 'target_day']
X, dataset_y = get_dataset(data, columns_to_drop)

100%|█████████████████████████████████| 203853/203853 [1:26:37<00:00, 39.22it/s]
100%|█████████████████████████████████| 203853/203853 [1:13:52<00:00, 45.99it/s]
100%|███████████████████████████████████| 203853/203853 [59:15<00:00, 57.34it/s]
100%|███████████████████████████████████| 203853/203853 [45:15<00:00, 75.07it/s]
100%|██████████████████████████████████| 203853/203853 [30:14<00:00, 112.32it/s]
100%|██████████████████████████████████| 203853/203853 [15:32<00:00, 218.64it/s]
100%|█████████████████████████████████| 203853/203853 [00:27<00:00, 7347.09it/s]
100%|█████████████████████████████████| 203853/203853 [00:27<00:00, 7352.49it/s]


In [75]:
X

Unnamed: 0,wagnum,month,date_build,srok_sl,date_pl_rem,July_ost_prob,July_ost_srok_sl,August_ost_prob,August_ost_srok_sl,September_ost_prob,...,October_ost_prob,October_ost_srok_sl,November_ost_prob,November_ost_srok_sl,December_ost_prob,December_ost_srok_sl,January_ost_prob,January_ost_srok_sl,February_ost_prob,February_ost_srok_sl
0,2.872574,-256.0,1.043306,0.141934,-0.894937,0.0,0.141934,-1.434491,0.141934,-1.465378,...,-1.460943,0.141934,-1.463062,0.141934,1.680256,0.141934,0.0,0.0,0.0,0.0
1,2.872930,-256.0,0.954632,-0.092334,0.055149,0.0,-0.092334,-0.721538,-0.092334,-0.853233,...,-0.940453,-0.092334,-1.148351,-0.092334,-1.391033,-0.092334,0.0,0.0,0.0,0.0
2,2.873167,-256.0,0.973698,0.004229,-0.707420,0.0,0.004229,-1.369351,0.004229,-1.348498,...,-1.387362,0.004229,-1.417295,0.004229,-1.445514,0.004229,0.0,0.0,0.0,0.0
3,2.872218,-256.0,1.223983,0.110993,0.651036,0.0,0.110993,-0.607609,0.110993,-0.679149,...,-0.746734,0.110993,-0.855904,0.110993,-0.878882,0.110993,0.0,0.0,0.0,0.0
4,2.871151,-256.0,1.129559,0.236457,-0.315718,0.0,0.236457,-1.478360,0.236457,-1.492789,...,-1.490988,0.236457,1.668946,0.236457,1.560343,0.236457,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
754,0.310048,-256.0,0.213766,-0.703674,1.171917,0.0,-0.703674,0.923014,-0.703674,0.674714,...,0.512381,-0.703674,0.430145,-0.703674,0.287868,-0.703674,0.0,0.0,0.0,0.0
755,0.310759,-256.0,0.185015,-0.712854,2.434531,0.0,-0.712854,1.958101,-0.712854,1.671546,...,1.426890,-0.712854,1.191209,-0.712854,1.086453,-0.712854,0.0,0.0,0.0,0.0
756,0.311352,-256.0,0.198331,-0.797857,0.930228,0.0,-0.797857,0.112965,-0.797857,-0.081878,...,-0.318221,-0.797857,-0.484483,-0.797857,-0.648350,-0.797857,0.0,0.0,0.0,0.0
757,0.311589,-256.0,0.415627,-0.518368,0.196828,0.0,-0.518368,-0.247611,-0.518368,-0.463781,...,-0.601641,-0.518368,-0.687876,-0.518368,-0.756127,-0.518368,0.0,0.0,0.0,0.0


In [6]:
def train_models(df, dataset_y, columns_to_drop, y_name, random_st):    
        
    # Разделим выборку на обучающую и тестовую    
    y = dataset_y[y_name]
    X = df.drop(columns_to_drop, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_st) 

    # Инициализация объекта модели
    model_lr = LogisticRegression()
    # Обучение
    model_lr.fit(X_train, y_train)
    # Предсказание
    y_pred = model_lr.predict(X_test)
                                  
    # Расчет метрики ROC-AUC
    roc = roc_auc_score(y_test, y_pred)
    print("ROC=", roc)
    residuals =  y_test - y_pred

    return model_lr, y_test, y_pred, residuals

In [40]:
# dataset.to_csv('dataset.csv')

In [29]:
# dataset = pd.read_csv('dataset.csv')
# dataset['month'] = dataset['month'].astype("datetime64[ns]")
# dataset['date_build'] = dataset['date_build'].astype("datetime64[ns]")
# dataset['srok_sl'] = dataset['srok_sl'].astype("datetime64[ns]")
# dataset['date_pl_rem'] = dataset['date_pl_rem'].astype("datetime64[ns]")
# dataset=dataset.drop(['Unnamed: 0'], axis=1)
# dataset

In [7]:
#Обучение модели по прогнозированию месяца
predict_column_name = 'target_month'
model_lr1, y_test1, y_pred1, residuals1 = train_models(X,  dataset_y, ['wagnum'], predict_column_name, 42)

ROC= 0.508330520743011


In [8]:
#Обучение модели по прогнозированию дня
predict_column_name = 'target_day'
model_lr2, y_test2, y_pred2, residuals2 = train_models(X, dataset_y, ['wagnum'], predict_column_name, 42)

ROC= 0.49993752863271


In [9]:
#Прогнозирование месяца и дня
# month_to_predict = pd.to_datetime('2022-12-01')
# data.month = pd.to_datetime(data.month)
# target_data = data.loc[data.month == month_to_predict, :]
# target_data = target_data[['wagnum', 'month']]
target_data = pd.read_csv('./target/y_predict.csv')
target_data

Unnamed: 0,wagnum,month
0,33361,2023-03-01
1,33364,2023-03-01
2,33366,2023-03-01
3,33358,2023-03-01
4,33349,2023-03-01
...,...,...
33702,17621,2023-03-01
33703,25045,2023-03-01
33704,27156,2023-03-01
33705,21361,2023-03-01


In [None]:
target_X, target_dataset_y = get_dataset(target_data, [])

100%|█████████████████████████████████████| 33707/33707 [14:22<00:00, 39.06it/s]
100%|█████████████████████████████████████| 33707/33707 [14:42<00:00, 38.18it/s]
100%|█████████████████████████████████████| 33707/33707 [14:41<00:00, 38.22it/s]
 82%|██████████████████████████████▎      | 27628/33707 [12:05<02:39, 38.04it/s]

In [106]:
# predict_month_df = pd.DataFrame(predict_month, columns = ['predict_month'])
target_X = target_X.drop('wagnum', axis=1)
predict_month = model_lr1.predict(target_X)
predict_day = model_lr2.predict(target_X)

In [None]:
for i in range(len(predict_day)):
    if predict_day[i] == 1:
        predict_month[i] = 1

In [140]:
predict = pd.DataFrame()
predict['wagnum'] = target_data['wagnum']#[:761]
predict['target_month'] = predict_month
predict['target_day'] = predict_day

ValueError: Length of values (761) does not match length of index (33977)

In [138]:
target_path = './prediction/target_predicton.csv'
predict.to_csv(target_path, index=False)

In [139]:
true_target_path = './prediction/target_predicton_true.csv'
calc_f1_score( true_target_path, target_path,)

ValueError: Found input variables with inconsistent numbers of samples: [33977, 761]

In [14]:
r_squared= r2_score(y_test, predictions)
print("R-squared:", r_squared)

R-squared: 1.0


In [36]:
predictions

array([-1.38777878e-16, -2.08166817e-17,  4.85722573e-17, ...,
       -2.70616862e-16,  8.32667268e-17,  2.77555756e-17])