In [79]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from math import sqrt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Identifier / contract-level columns that are carried into every frame.
_BASE_COLUMNS = [
    'contract_id', 'report_date', 'specialization_id', 'contract_init_sum',
    'contract_date', 'project_id', 'building_id', 'contractor_id',
    'contract_current_sum',
]

# ALL_TIME contract aggregates that are additionally prepended to every
# per-gap frame (but not repeated in the 'ALL_TIME' frame, which already
# selects every ALL_TIME column).
_CONTRACT_ALL_TIME_COLUMNS = [
    'agg_all_contracts__g_contract__bit_da_guid__isMain__count__ALL_TIME',
    'agg_all_contracts__g_contract__abs_change_price_last_ds__isMain__last__ALL_TIME',
    'agg_all_contracts__g_contract__abs_change_price_last_ds__isMain__mean__ALL_TIME',
    'agg_all_contracts__g_contract__rel_change_price_last_ds__isMain__last__ALL_TIME',
    'agg_all_contracts__g_contract__rel_change_price_last_ds__isMain__mean__ALL_TIME',
]

# Time-gap labels used as dictionary keys and as column-name tokens.
_TIME_GAPS = [
    '1W', '2W', '3W', '4W', '5W', '6W', '7W', '8W', '12W', '26W', '52W',
    '1M', '2M', '3M', '4M', '5M', '6M', '7M', '8M', '12M',
    '12_24M', '12_36M', '12_48M',
]


def _split_by_time_gaps(frame):
    """Split ``frame`` into one sub-frame per time gap plus an 'ALL_TIME' frame.

    A column belongs to a gap when the gap label is one of the ``__``-separated
    tokens of its name. Plain substring matching is deliberately avoided: it
    would put '12W' and '52W' columns into the '2W' frame, '12_24M' columns
    into the '4M' frame, and so on.

    NOTE(review): assumes the gap label is always delimited by ``__`` in the
    column name (as in ``...__count__1W``) - confirm against the full schema.

    Args:
        frame: DataFrame holding at least the base and contract ALL_TIME
            columns listed above.

    Returns:
        dict mapping each gap label (and 'ALL_TIME') to a DataFrame.

    Raises:
        KeyError: if one of the required base columns is missing.
    """
    columns = frame.columns
    shared_columns = _BASE_COLUMNS + _CONTRACT_ALL_TIME_COLUMNS

    frames = {}
    for gap in _TIME_GAPS:
        gap_columns = [name for name in columns if gap in name.split('__')]
        frames[gap] = frame[shared_columns + gap_columns]

    # 'ALL_TIME' is unambiguous as a substring, so the original test is kept.
    all_time_columns = [name for name in columns if "ALL_TIME" in name]
    frames['ALL_TIME'] = frame[_BASE_COLUMNS + all_time_columns]
    return frames


def preprocessing_n_reading_train(path='train_X.csv'):
    """Read the training features and split them into per-time-gap frames.

    Args:
        path: CSV file to read; defaults to 'train_X.csv'.

    Returns:
        dict mapping each time-gap label (and 'ALL_TIME') to a DataFrame with
        the base columns followed by that gap's feature columns.
    """
    train_X = pd.read_csv(path)
    return _split_by_time_gaps(train_X)


def preprocessing_n_reading_test(path='test1_X.csv'):
    """Read the test features and split them into per-time-gap frames.

    Args:
        path: CSV file to read; defaults to 'test1_X.csv'.

    Returns:
        dict mapping each time-gap label (and 'ALL_TIME') to a DataFrame with
        the base columns followed by that gap's feature columns.
    """
    test_X = pd.read_csv(path)
    return _split_by_time_gaps(test_X)

def nans_checker(df):
    """Print a report of the columns in ``df`` that contain missing values.

    For every column with at least one NaN, prints its positional index, its
    NaN count and its name. A final summary line prints the number of affected
    columns, the largest per-column NaN count and the number of rows.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        None. The report is written to stdout.
    """
    nan_counts = df.isna().sum().tolist()
    affected = 0
    for idx, count in enumerate(nan_counts):
        if count != 0:
            affected += 1
            print(f"Column index: {idx}\nQuantity of nans: {count}\nColumn name: {df.columns[idx]}\n")

    # default=0 keeps the summary working for a frame with no columns, where
    # max() on the empty list would otherwise raise ValueError.
    print(affected, max(nan_counts, default=0), df.shape[0])

In [84]:
# Build the dictionaries of per-time-gap frames (keyed by gap label plus
# 'ALL_TIME') for the training and the test CSV files.
dataframes_train = preprocessing_n_reading_train()
dataframes_test = preprocessing_n_reading_test()

Unnamed: 0,contract_id,report_date,specialization_id,contract_init_sum,contract_date,project_id,building_id,contractor_id,contract_current_sum,agg_all_contracts__g_contract__bit_da_guid__isMain__count__ALL_TIME,...,agg_cec_requests__g_contract__request_id__all__count__1W,agg_cec_requests__g_contract__total_sum_accepted__all__sum__1W,agg_payments__g_contract__sum__all__countDistinct__1W,agg_payments__g_contract__sum__all__sum__1W,agg_ks2__g_contract__id__all__count__1W,agg_ks2__g_contract__total_sum__all__sum__1W,agg_spass_applications__g_contract__appl_count_week__mean__1W,agg_workers__g_contract__fact_workers__all__mean__1W,agg_materials__g_contract__order_id__countDistinct__1W,agg_tender_proposal__g_contractor__id__ALL__countDistinct__1W
0,5433,2023-01-01,18,1.115267,2022-05-10 21:00:00,43,701,438,0.793952,-0.539030,...,-0.255766,-0.168947,-0.291946,-0.160384,-0.305877,-0.163229,1.077650,-0.189367,-0.288428,-0.136894
1,6875,2023-01-01,18,1.608002,2022-08-21 21:00:00,31,268,438,2.681675,0.743673,...,0.808046,-0.168947,2.758596,8.587055,-0.305877,-0.163229,0.695774,-0.189367,4.229133,-0.136894
2,1476,2023-01-01,18,-0.360764,2022-10-17 21:00:00,31,268,438,-0.416432,-0.539030,...,-0.255766,-0.168947,-0.291946,-0.160384,-0.305877,-0.163229,0.695774,-0.189367,-0.288428,-0.136894
3,4469,2023-01-01,12,-0.089303,2022-04-20 21:00:00,43,697,484,-0.193827,-0.539030,...,-0.255766,-0.168947,-0.291946,-0.160384,-0.305877,-0.163229,0.695774,-0.189367,-0.288428,-0.136894
4,1330,2023-01-01,12,-0.515778,2022-03-02 21:00:00,49,224,484,0.133446,0.743673,...,-0.255766,-0.168947,-0.291946,-0.160384,-0.305877,-0.163229,-0.449854,-0.189367,-0.288428,-0.136894
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28826,5078,2023-07-23,21,-0.484227,2023-04-13 21:00:00,18,915,683,-0.504509,0.743673,...,-0.255766,-0.168947,-0.291946,-0.160384,-0.305877,-0.163229,-0.449854,-0.189367,-0.288428,-0.136894
28827,3854,2023-07-23,21,-0.500003,2023-04-13 21:00:00,18,915,683,-0.524174,0.743673,...,-0.255766,-0.168947,-0.291946,-0.160384,-0.305877,-0.163229,-0.449854,-0.189367,-0.288428,-0.136894
28828,5351,2023-07-23,21,-0.481102,2023-07-16 21:00:00,50,298,683,-0.515112,-0.539030,...,-0.255766,-0.168947,-0.291946,-0.160384,-0.305877,-0.163229,-0.449854,-0.189367,-0.288428,-0.136894
28829,57,2023-07-23,21,-0.468450,2023-03-16 21:00:00,45,915,683,-0.504737,-0.539030,...,-0.255766,-0.168947,-0.291946,-0.160384,-0.305877,-0.163229,-0.449854,-0.189367,-0.288428,-0.136894


In [None]:
# `train_X` only exists as a local inside preprocessing_n_reading_train(), so
# load it here explicitly; otherwise this cell raises NameError after
# Restart & Run All.
train_X = pd.read_csv('train_X.csv')

# Per-column NaN counts, in column order.
nans = train_X.isna().sum().tolist()
s = 0  # number of columns that contain at least one NaN
for i, nan_count in enumerate(nans):
    if nan_count != 0:
        s += 1
        print(f"Column index: {i}\nQuantity of nans: {nan_count}\nColumn name: {train_X.columns[i]}\n")

# Summary: affected columns, largest per-column NaN count, total rows.
# default=0 avoids ValueError from max() when the frame has no columns.
print(s, max(nans, default=0), train_X.shape[0])

In [52]:
# Read only the header row (nrows=0) to get the training column names, so this
# cell does not depend on a `train_X` global left over from another cell.
columns = pd.read_csv('train_X.csv', nrows=0).columns

Unnamed: 0,contract_id,report_date,specialization_id,contract_init_sum,contract_date,project_id,building_id,contractor_id,contract_current_sum,agg_all_contracts__g_contract__bit_da_guid__isMain__count__ALL_TIME,...,agg_Finance__g_contractor__Value__NetProfit_y__last__ALL_TIME,agg_Finance__g_contractor__Value__CostPrice_y__last__ALL_TIME,agg_FinanceAndTaxesFTS__g_contractor__Expenses__last__ALL_TIME,agg_FinanceAndTaxesFTS__g_contractor__Income__last__ALL_TIME,agg_FinanceAndTaxesFTS__g_contractor__TaxArrearsSum__last__ALL_TIME,agg_FinanceAndTaxesFTS__g_contractor__TaxPenaltiesSum__last__ALL_TIME,agg_FinanceAndTaxesFTS__g_contractor__TaxesSum__last__ALL_TIME,agg_ArbitrationCases__g_contractor__DefendantSum__sum__ALL_TIME,agg_ArbitrationCases__g_contractor__PlaintiffSum__sum__ALL_TIME,agg_tender_proposal__g_contractor__id__ALL__countDistinct__ALL_TIME
0,5433,2023-01-01,18,1.115267,2022-05-10 21:00:00,43,701,438,0.793952,-0.539030,...,-0.126966,-0.147499,-0.195024,-0.170525,,,-0.003374,-0.148490,-0.158473,-0.258454
1,6875,2023-01-01,18,1.608002,2022-08-21 21:00:00,31,268,438,2.681675,0.743673,...,-0.126966,-0.147499,-0.195024,-0.170525,,,-0.003374,-0.148490,-0.158473,-0.258454
2,1476,2023-01-01,18,-0.360764,2022-10-17 21:00:00,31,268,438,-0.416432,-0.539030,...,-0.126966,-0.147499,-0.195024,-0.170525,,,-0.003374,-0.148490,-0.158473,-0.258454
3,4469,2023-01-01,12,-0.089303,2022-04-20 21:00:00,43,697,484,-0.193827,-0.539030,...,-0.130052,-0.144810,1.998265,1.984354,,,0.180730,-0.117481,-0.158471,-0.258454
4,1330,2023-01-01,12,-0.515778,2022-03-02 21:00:00,49,224,484,0.133446,0.743673,...,-0.130052,-0.144810,1.998265,1.984354,,,0.180730,-0.117481,-0.158471,-0.258454
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28826,5078,2023-07-23,21,-0.484227,2023-04-13 21:00:00,18,915,683,-0.504509,0.743673,...,-0.181148,-0.156905,-0.411792,-0.556706,,,-0.285360,,,0.148360
28827,3854,2023-07-23,21,-0.500003,2023-04-13 21:00:00,18,915,683,-0.524174,0.743673,...,-0.181148,-0.156905,-0.411792,-0.556706,,,-0.285360,,,0.148360
28828,5351,2023-07-23,21,-0.481102,2023-07-16 21:00:00,50,298,683,-0.515112,-0.539030,...,-0.181148,-0.156905,-0.411792,-0.556706,,,-0.285360,,,0.148360
28829,57,2023-07-23,21,-0.468450,2023-03-16 21:00:00,45,915,683,-0.504737,-0.539030,...,-0.181148,-0.156905,-0.411792,-0.556706,,,-0.285360,,,0.148360
