In [1]:
import pandas as pd
from metrics_f1 import calc_f1_score

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix

import numpy as np
# import matplotlib.pyplot as plt  
from scipy import stats
# import seaborn as sns
from typing import List
from scipy.stats import shapiro
import warnings

In [2]:
y_train_csv = pd.read_csv('./target/y_train.csv')
 # параметры вагона
wag_param = pd.read_parquet('wag_params.parquet').convert_dtypes()
# данные по дислокации
dislok = pd.read_parquet('dislok_wagons.parquet').convert_dtypes()
# список вагонов с остаточным пробегом на момент прогноза
wag_prob = pd.read_parquet('wagons_probeg_ownersip.parquet').convert_dtypes()

In [15]:
#оставляем номер вагон и дату, на которую будем делать прогноз
dataset = y_train_csv[['wagnum', 'month']]
dataset['month'] = dataset['month'].astype("datetime64[ns]")
#добавляем дату постройки вагона и срок службы
wag_data_build = wag_param[['wagnum', 'date_build', 'srok_sl']].drop_duplicates()
dataset = dataset.merge(wag_data_build)
#добавляем дату планового ремонта
data_pl_rem = dislok[['wagnum', 'date_pl_rem']].drop_duplicates()
dataset = dataset.merge(data_pl_rem)
dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['month'] = dataset['month'].astype("datetime64[ns]")


Unnamed: 0,wagnum,month,date_build,srok_sl,date_pl_rem
0,33361,2023-01-01,2011-02-08,2033-03-01,2023-02-17
1,33361,2022-08-01,2011-02-08,2033-03-01,2023-02-17
2,33361,2022-09-01,2011-02-08,2033-03-01,2023-02-17
3,33361,2022-10-01,2011-02-08,2033-03-01,2023-02-17
4,33361,2022-11-01,2011-02-08,2033-03-01,2023-02-17
...,...,...,...,...,...
214318,26318,2022-09-01,1992-12-25,2022-04-27,2023-01-01
214319,26318,2022-10-01,1992-12-25,2022-04-27,2023-01-01
214320,26318,2022-11-01,1992-12-25,2022-04-27,2023-01-01
214321,26318,2022-12-01,1992-12-25,2022-04-27,2023-01-01


In [16]:
months = pd.DataFrame({
    "month_name": ["July", "August", "September", "October", "November", "December", "January", "February"],
    "date" : ["2022-07-01", "2022-08-01", "2022-09-01", "2022-10-01","2022-11-01", "2022-12-01","2023-01-01", "2023-01-31"],
})
# изменение типа данных 
months['date'] = months['date'].astype("datetime64[ns]")

In [17]:
#добавление параметров, зависящих от месяца
for j in range(len(months['date'])):
    date = months.loc[j, 'date']
    month_name = months.loc[j, 'month_name']
    for i in range(5): #range(len(dataset['month'])):
        if date < dataset.loc[i,'month']:
            wagnum = dataset.loc[i, 'wagnum']
            #добавление остаточного пробега
            ost_prob_series = wag_prob.loc[((wag_prob.repdate == date) & (wag_prob.wagnum == wagnum)), 'ost_prob']
            if not ost_prob_series.empty:
                ost_prob = ost_prob_series.iloc[0]
            else :
                ost_prob = 0
            dataset.loc[i, month_name+'_ost_prob'] = ost_prob
            #добавление срока службы
            dataset.loc[i, month_name+'_ost_srok_sl'] = (dataset.loc[i,'srok_sl'] - date).days
dataset

Unnamed: 0,wagnum,month,date_build,srok_sl,date_pl_rem,July_ost_prob,July_ost_srok_sl,August_ost_prob,August_ost_srok_sl,September_ost_prob,September_ost_srok_sl,October_ost_prob,October_ost_srok_sl,November_ost_prob,November_ost_srok_sl,December_ost_prob,December_ost_srok_sl
0,33361,2023-01-01,2011-02-08,2033-03-01,2023-02-17,0.0,3896.0,7541.0,3865.0,4105.0,3834.0,2185.0,3804.0,1236.0,3773.0,159916.0,3743.0
1,33361,2022-08-01,2011-02-08,2033-03-01,2023-02-17,0.0,3896.0,,,,,,,,,,
2,33361,2022-09-01,2011-02-08,2033-03-01,2023-02-17,0.0,3896.0,7541.0,3865.0,,,,,,,,
3,33361,2022-10-01,2011-02-08,2033-03-01,2023-02-17,0.0,3896.0,7541.0,3865.0,4105.0,3834.0,,,,,,
4,33361,2022-11-01,2011-02-08,2033-03-01,2023-02-17,0.0,3896.0,7541.0,3865.0,4105.0,3834.0,2185.0,3804.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214318,26318,2022-09-01,1992-12-25,2022-04-27,2023-01-01,,,,,,,,,,,,
214319,26318,2022-10-01,1992-12-25,2022-04-27,2023-01-01,,,,,,,,,,,,
214320,26318,2022-11-01,1992-12-25,2022-04-27,2023-01-01,,,,,,,,,,,,
214321,26318,2022-12-01,1992-12-25,2022-04-27,2023-01-01,,,,,,,,,,,,


In [18]:
dataset = dataset.merge(y_train_csv[['wagnum', 'target_month']])
dataset

Unnamed: 0,wagnum,month,date_build,srok_sl,date_pl_rem,July_ost_prob,July_ost_srok_sl,August_ost_prob,August_ost_srok_sl,September_ost_prob,September_ost_srok_sl,October_ost_prob,October_ost_srok_sl,November_ost_prob,November_ost_srok_sl,December_ost_prob,December_ost_srok_sl,target_month
0,33361,2023-01-01,2011-02-08,2033-03-01,2023-02-17,0.0,3896.0,7541.0,3865.0,4105.0,3834.0,2185.0,3804.0,1236.0,3773.0,159916.0,3743.0,0
1,33361,2023-01-01,2011-02-08,2033-03-01,2023-02-17,0.0,3896.0,7541.0,3865.0,4105.0,3834.0,2185.0,3804.0,1236.0,3773.0,159916.0,3743.0,0
2,33361,2023-01-01,2011-02-08,2033-03-01,2023-02-17,0.0,3896.0,7541.0,3865.0,4105.0,3834.0,2185.0,3804.0,1236.0,3773.0,159916.0,3743.0,0
3,33361,2023-01-01,2011-02-08,2033-03-01,2023-02-17,0.0,3896.0,7541.0,3865.0,4105.0,3834.0,2185.0,3804.0,1236.0,3773.0,159916.0,3743.0,0
4,33361,2023-01-01,2011-02-08,2033-03-01,2023-02-17,0.0,3896.0,7541.0,3865.0,4105.0,3834.0,2185.0,3804.0,1236.0,3773.0,159916.0,3743.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1285908,26318,2022-12-01,1992-12-25,2022-04-27,2023-01-01,,,,,,,,,,,,,0
1285909,26318,2022-12-01,1992-12-25,2022-04-27,2023-01-01,,,,,,,,,,,,,0
1285910,26318,2022-12-01,1992-12-25,2022-04-27,2023-01-01,,,,,,,,,,,,,0
1285911,26318,2022-12-01,1992-12-25,2022-04-27,2023-01-01,,,,,,,,,,,,,0


In [53]:
def train_models(df, columns_to_drop, y_name, cat_col_names, num_col_names, time_col_names, random_st):
    RANDOM_STATE = random_st
    X = df.drop(columns_to_drop, axis=1)
    y = df[y_name]
    X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RANDOM_STATE) 

    if cat_col_names:
        encoder = OneHotEncoder(drop='first', sparse=False)
        X_train_ohe = encoder.fit_transform(X_train[cat_col_names])
        X_test_ohe = encoder.transform(X_test[cat_col_names])
        print(X_train_ohe)
        encoder_col_names = encoder.get_feature_names()
        print(encoder_col_names)

    for col in time_col_names:
        df[col] = df[col].values.astype("float64")

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train[num_col_names])
    X_test_scaled = scaler.transform(X_test[num_col_names])

    X_train_ohe = pd.DataFrame(X_train_ohe, columns=encoder_col_names)
    X_test_ohe = pd.DataFrame(X_test_ohe, columns=encoder_col_names)

    X_train_scaled = pd.DataFrame(X_train_scaled, columns=num_col_names)
    X_test_scaled = pd.DataFrame(X_test_scaled, columns=num_col_names)

    X_train = pd.concat([X_train_ohe, X_train_scaled], axis=1)
    X_test = pd.concat([X_test_ohe, X_test_scaled], axis=1)

    model_lr  = LinearRegression()
    model_lr.fit(X_train, y_train)
    predictions = model_lr.predict(X_test)
    residuals =  y_test - predictions

    return model_lr, encoder, scaler, y_test, predictions, residuals

In [None]:
columns_to_drop = []
cat_col_names = []
num_col_names = dataset.columns.values.tolist()
predict_column_name = 'target_month'
time_col_names = ['month', 'date_build', 'srok_sl', 'date_pl_rem']

model_lr, encoder, scaler, y_test, predictions, residuals = train_models(dataset, columns_to_drop, predict_column_name, 
                                   cat_col_names, num_col_names, time_col_names, 42)

In [None]:
calc_f1_score(y_test, predictions)