In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns


pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 50)

In [2]:
KEYS = [
    'entity_id',
    'financial_report_id',
    'financial_statement_id',
    'entity_ico',
    'entity_name']

DIMENSIONS = [
    'year',
    # 'month',
    # 'financial_report_order_number',
    'period_length',
    # 'consolidated',
    # 'approved_date',
    'type',
    'financial_report_template_id',
    # 'establishment_date',
    # 'cancellation_date',
    # 'sknace_code',
    # 'entity_state',
    'legal_form_code',
    'org_size_code',
    'ownership_category_code',
    'entity_consolidated',
    # 'sknace_division_name',
    'sknace_division',
    # 'sknace_subcategory',
    'sal_class',
    'is_outlier']

RAW_VALUES = [ 
    'total_assets',
    'non_current_assets',
    'non_current_intangible_assets',
    'non_current_tangible_assets',
    'non_current_financial_assets',
    'current_assets',
    'inventories',
    'non_current_receivables',
    'current_receivables',
    'financial_assets',
    'current_financial_assets',
    'cash_and_bank_balances',
    'total_equity_and_liabilities',
    'equity',
    'share_capital',
    'share_premium',
    'net_profit_of_previous_years',
    'net_profit_after_tax',
    'liabilities',
    'non_current_liabilities',
    'long_term_provisions',
    'long_term_bank_loans',
    'current_liabilities',
    'short_term_provisions',
    'current_bank_loans',
    'short_term_financial_assistance',
    'operating_revenues',
    'sales_from_the_merchandise',
    'sales_from_the_own_products_and_services',
    'sales_from_the_other',
    'other_revenues_from_operating_activities',
    'operating_costs',
    'costs_of_merchandise_sold',
    'consumed_materials_energy_non_inventory_supplies',
    'services',
    'personnel_costs',
    'taxes_and_fees',
    'depreciation',
    'remaining_cost_of_sold_long_term_assets_and_materials',
    'adjustments_to_receivables',
    'other_costs_of_operating_activities',
    'operating_result',
    'value_added',
    'revenues_from_financial_activities',
    'cost_of_financial_activities',
    'result_from_fincancial_activities',
    'result_before_tax',
    'income_tax',
    'result_after_tax']

PRIMARY_VALUES = [
    'CA',
    'CASH',
    'CL',
    'EQ',
    'NCL',
    'TA',
    'TL',
    'CC',
    'INV',
    'CF_NETTO',
    'CF_SELFFIN',
    'EAT',
    'EBIT',
    'SAL',
    'COST',
    'INT',
    'YIE',
    'LABOR']

RATIOS = [
    'L3',
    'L2',
    'L1',
    'CF_CL',
    'CASH_TA',
    'SAL_TA',
    'TL_SAL',
    'INV_COST',
    'INV_SAL',
    'CC_SAL',
    'TA_SAL',
    'TL_TA',
    'CF_TL',
    'CL_TA',
    'NCL_TA',
    'EQ_TL',
    'EQ_TA',
    'CL_CC',
    'ROE',
    'EAT_TA',
    'ROA_BRUTTO',
    'CF_TA',
    'CF_SAL',
    'ROS',
    'EAT_YIE',
    'ROI',
    'ROA_NETTO',
    'LABOR_PRODUCTIVITY']

DISTRESS_SIGNALS = [
    'DPHZ_vat_registration_cancelled',
    'DPHZ_vat_registration_cancelled_1y_off',
    'DPHZ_vat_registration_cancelled_2y_off',
    'DPHZ_vat_registration_cancelled_3y_off',
    'RU_event',
    'RU_event_1y_off',
    'RU_event_2y_off',
    'RU_event_3y_off',
    'cancellation_year',
    'RUZ_cancelled',
    'RUZ_cancelled_1y_off',
    'RUZ_cancelled_2y_off',
    'RUZ_cancelled_3y_off']

In [3]:
data_raw = pd.read_csv('../../DATA/FINAL/training_sample.csv')
test_data_raw = pd.read_csv('../../DATA/FINAL/testing_sample.csv')
data_raw

Unnamed: 0,entity_id,year,month,financial_report_id,financial_report_template_id,CA,CASH,CL,EQ,NCL,TA,TL,CC,INV,CF_NETTO,CF_SELFFIN,EAT,EBIT,SAL,COST,INT,YIE,LABOR,L3,L2,L1,CF_CL,CASH_TA,SAL_TA,TL_SAL,INV_COST,INV_SAL,CC_SAL,TA_SAL,TL_TA,CF_TL,CL_TA,NCL_TA,EQ_TL,EQ_TA,EBIT_INT,CL_CC,ROE,EAT_TA,ROA_BRUTTO,CF_TA,CF_SAL,ROS,EAT_YIE,ROI,ROA_NETTO,LABOR_PRODUCTIVITY,financial_statement_id,financial_report_order_number,entity_ico,period_length,consolidated,approved_date,type,entity_name,establishment_date,cancellation_date,sknace_code,entity_state,legal_form_code,org_size_code,ownership_category_code,entity_consolidated,sknace_division_name,sknace_division,sknace_subcategory,DPHZ_vat_registration_cancelled,DPHZ_vat_registration_cancelled_1y_off,DPHZ_vat_registration_cancelled_2y_off,DPHZ_vat_registration_cancelled_3y_off,RU_event,RU_event_1y_off,RU_event_2y_off,RU_event_3y_off,cancellation_year,RUZ_cancelled,RUZ_cancelled_1y_off,RUZ_cancelled_2y_off,RUZ_cancelled_3y_off,sal_class,is_outlier,RUZ_indirect_criteria,RUZ_indirect_criteria_1y_off,RUZ_indirect_criteria_2y_off,RUZ_indirect_criteria_3y_off,combined_issue_criteria_1y_off,combined_issue_criteria_2y_off,combined_issue_criteria_3y_off,plausible_sample
0,27971.0,2017.0,1.0,6322368,699.0,992069.0,244824.0,747985.0,14724.0,25063.0,1014018.0,999294.0,732533.0,11351.0,-69288.0,-72168.0,-76727.0,-73847.0,3818727.0,3859116.0,19980.0,3818774.0,337135.0,1.326322,1.306653,0.327311,-0.092633,0.241440,3.765936,0.261682,1.058885,1.070084,0.191826,95.593762,0.985480,-0.069337,0.737645,0.024717,0.014734,0.014520,-2.696046,1.021094,-5.211016,-0.075666,-0.072826,-0.068330,-0.018144,-0.020092,-0.020092,-0.055963,-0.075666,0.088285,3722506.0,1.0,652423.0,12.0,,2018-06-27,Riadna,KUHO - mix. spol. s r.o.,1991-06-25,,49410.0,,112.0,2.0,2.0,False,Pozemná doprava a doprava potrubím,H,49.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,,0,0,0,0,2. Malý,False,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
1,72912.0,2017.0,1.0,6351388,699.0,1481047.0,1630.0,1326396.0,-718007.0,549510.0,1521883.0,2239890.0,1417869.0,61548.0,47298.0,44418.0,10312.0,13192.0,4512342.0,4460274.0,32121.0,4512573.0,692228.0,1.116595,1.070192,0.001229,0.035659,0.001071,2.964973,0.496392,4.967695,4.910372,0.314220,121.417632,1.471789,0.021116,0.871549,0.361072,-0.320555,-0.471789,1.410697,0.935486,-0.014362,0.006776,0.008668,0.031079,0.010482,0.002285,0.002285,0.027882,0.006776,0.153408,3748560.0,1.0,35904798.0,12.0,,2018-06-21,Riadna,"WETRON, s.r.o.",2004-10-23,,33200.0,,112.0,0.0,8.0,False,Oprava a inštalácia strojov a prístrojov,C,33.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,,0,0,0,0,2. Malý,False,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
2,74296.0,2016.0,1.0,5844975,699.0,514344.0,10817.0,477398.0,-100409.0,0.0,1660755.0,1761164.0,410644.0,92883.0,66009.0,-141558.0,-344270.0,-341390.0,4362628.0,4667711.0,34421.0,4362628.0,501993.0,1.077390,0.882829,0.022658,0.138268,0.006513,2.626894,0.403693,7.163657,7.664619,0.094128,137.043956,1.060460,0.037480,0.287458,0.000000,-0.057013,-0.060460,-8.918073,1.162559,3.428677,-0.207297,-0.205563,0.039746,0.015131,-0.078913,-0.078913,-0.186571,-0.207297,0.115067,3338450.0,2.0,36561380.0,12.0,,2017-07-28,Riadna,Hoeckle s.r.o.,2004-06-25,,25620.0,,112.0,0.0,7.0,False,Výroba kovových konštrukcií okrem strojov a za...,C,25.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,,0,0,0,0,2. Malý,True,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,81349.0,2017.0,1.0,6370373,699.0,514945.0,13348.0,82348.0,597059.0,278564.0,1595777.0,998718.0,501597.0,0.0,339656.0,326252.0,46443.0,59847.0,2017657.0,1883181.0,34235.0,2017735.0,250683.0,6.253279,6.253279,0.162093,4.124642,0.008365,1.264373,0.494989,0.000000,0.000000,0.248604,284.726155,0.625851,0.340092,0.051604,0.174563,0.597825,0.374149,2.748123,0.164172,0.077786,0.029104,0.037503,0.212847,0.168342,0.023018,0.023017,0.050557,0.029104,0.124245,3765328.0,1.0,36266388.0,12.0,,,Riadna,LINE - Molnár s.r.o.,2004-12-01,,49410.0,,112.0,0.0,2.0,False,Pozemná doprava a doprava potrubím,H,49.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,0,0,0,0,2. Malý,False,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
4,87510.0,2015.0,7.0,5445087,699.0,9172473.0,44403.0,8906402.0,4432599.0,428023.0,22657631.0,14844555.0,1750141.0,7329272.0,-1271601.0,-1078264.0,-3471890.0,-3469010.0,41304834.0,44478524.0,215601.0,41315293.0,6894645.0,1.029874,0.201489,0.004986,-0.142774,0.001960,1.822999,0.359390,59.321616,63.879640,0.042371,197.476817,0.655168,-0.085661,0.393086,0.018891,0.298601,0.195634,-15.089953,5.088963,-0.783263,-0.153233,-0.153106,-0.056122,-0.030786,-0.084055,-0.084034,-0.143717,-0.153233,0.166921,3014087.0,3.0,36269727.0,12.0,,2017-04-30,Riadna,Bodet & Horst mattress ticking Verwaltungs s.r.o.,2005-03-18,,13910.0,,112.0,3.0,7.0,False,Výroba textilu,C,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,0,0,0,0,3. Stredný,False,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7237,1480184.0,2015.0,1.0,5316898,699.0,641383.0,39399.0,623606.0,158385.0,49123.0,958608.0,800223.0,590797.0,11187.0,184274.0,166468.0,82007.0,99813.0,4957318.0,4848950.0,4844.0,4958205.0,205019.0,1.028507,1.010568,0.063179,0.295497,0.041100,5.171371,0.161423,0.830555,0.812399,0.119177,69.614029,0.834776,0.230278,0.650533,0.051244,0.197926,0.165224,21.605491,1.055533,0.517770,0.085548,0.104123,0.192231,0.037172,0.016543,0.016540,0.090601,0.085548,0.041357,2917154.0,1.0,46959599.0,12.0,,2016-04-22,Riadna,Leras s.r.o.,2013-01-01,,10110.0,,112.0,0.0,7.0,False,Výroba potravín,C,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,2. Malý,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7238,1490180.0,2018.0,1.0,6573202,699.0,936228.0,11695.0,889625.0,73183.0,94.0,1021804.0,948621.0,924533.0,0.0,8448.0,2143.0,313.0,6618.0,2660003.0,2649476.0,2048.0,2660003.0,8135.0,1.052385,1.052385,0.013146,0.009496,0.011445,2.603242,0.356624,0.000000,0.000000,0.347568,138.289107,0.928379,0.008906,0.870642,0.000092,0.077147,0.071621,4.231445,0.962243,0.004277,0.000306,0.006477,0.008268,0.003176,0.000118,0.000118,0.002311,0.000306,0.003058,3924270.0,2.0,46807454.0,12.0,,2019-09-24,Riadna,"Anton Putiš, s.r.o.",2013-01-01,,52100.0,,112.0,0.0,2.0,False,Skladové a pomocné činnosti v doprave,H,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,2. Malý,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7239,1501557.0,2018.0,1.0,6645297,699.0,627336.0,215394.0,294697.0,336724.0,41072.0,684838.0,342686.0,318813.0,37679.0,428880.0,340885.0,318119.0,406114.0,4044062.0,3636352.0,4473.0,4046254.0,75015.0,2.128749,1.812733,0.730900,1.455325,0.314518,5.905137,0.084738,3.730233,3.354162,0.078835,60.963872,0.500390,1.251525,0.430316,0.059973,0.982602,0.491684,91.792309,0.924357,0.944747,0.464517,0.593007,0.626250,0.106052,0.078663,0.078621,0.471049,0.464517,0.018549,3986615.0,1.0,46946853.0,12.0,,,Riadna,ARTINIT s. r. o.,2013-01-01,,69200.0,,112.0,5.0,2.0,False,Právne a účtovnícke činnosti,M,69.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,2. Malý,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7240,1505752.0,2018.0,1.0,6645006,699.0,572868.0,703.0,403370.0,145092.0,1293.0,577015.0,431923.0,572165.0,0.0,71286.0,51922.0,50349.0,69713.0,2473724.0,2403670.0,0.0,2473724.0,1890340.0,1.420205,1.420205,0.001743,0.176726,0.001218,4.287105,0.174604,0.000000,0.000000,0.231297,83.972747,0.748547,0.165043,0.699063,0.002241,0.335921,0.251453,18053.666667,0.704989,0.347014,0.087258,0.120817,0.123543,0.028817,0.020354,0.020354,0.087258,0.087258,0.764168,3986334.0,1.0,46946098.0,12.0,,2017-06-16,Riadna,Job Impulse s.r.o.,2012-12-21,,78200.0,,112.0,6.0,8.0,False,Sprostredkovanie práce,N,78.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,2. Malý,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [7]:
SAL_CLASSES = data_raw['sal_class'].unique()
SKNACE_GROUPS = data_raw['sknace_division'].unique()

In [5]:
modelling_dataset = data_raw.query('is_outlier == False')

In [8]:
from sklearn.impute import SimpleImputer


def fill_missing_values_mean(data):
    # Instantiate SimpleImputer with mean strategy
    imputer = SimpleImputer(strategy='mean')

    # Fit the imputer on the data and transform the data
    imputed_data = imputer.fit_transform(data)

    return imputed_data

training_data_filled = pd.DataFrame()

for sal_class in SAL_CLASSES:
    for sknace in SKNACE_GROUPS:
        # print(f'sales_class: {sal_class}; sknace: {sknace}')
        subset_training = modelling_dataset.query('sal_class == @sal_class and sknace_division == @sknace')
        for col in RATIOS:
            col_mean = subset_training[col].mean()
            subset_training[col] = subset_training[col].fillna(col_mean)       

        training_data_filled = pd.concat([training_data_filled, subset_training], axis = 0)

display(training_data_filled.shape[0])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_training[col] = subset_training[col].fillna(col_mean)


6230

In [31]:
# Importing necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, balanced_accuracy_score
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline


def generate_train_test_sample(data, features_selection, criterion):


    train_sample = pd.concat([data.query(f'{criterion} == 0').sample(500, replace = True), data.query(f'{criterion} == 1').sample(500, replace = True)], axis=0)

    X =  train_sample[features_selection].values
    y =  train_sample[criterion].values
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test





def generate_logit_model(data1, features_selection, criterion):

    X_train, X_test, y_train, y_test = generate_train_test_sample(data1, features_selection, criterion)
    
    # Creating SMOTE and RandomUnderSampler objects
    smote = SMOTE(sampling_strategy='auto', random_state=42)
    rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
    
    logistic_model = LogisticRegression(max_iter=2000)

    # Creating a pipeline to combine sampling and modeling
    pipeline = Pipeline([('SMOTE', smote), ('RUS', rus), ('Logistic Regression', logistic_model)])
        
    # Fitting the pipeline on the training data
    pipeline.fit(X_train, y_train)

    # Define scoring metrics
    scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'balanced_accuracy']

    # Perform cross-validation
    cv_results = cross_validate(pipeline, X_test, y_test, cv=5, scoring=scoring)

    
    return (
        np.mean(cv_results['test_accuracy']),
        np.mean(cv_results['test_precision']),
        np.mean(cv_results['test_recall']),
        np.mean(cv_results['test_f1']),
        np.mean(cv_results['test_roc_auc']),
        np.mean(cv_results['test_balanced_accuracy']))

In [41]:
possible_features = RATIOS[::]
selected_features = ['TL_TA', 'CL_TA', 'CC_SAL']

for feature in selected_features:
    possible_features.remove(feature)

summary = []

for candidate_feature in possible_features:

    summary_row = []
    summary_row.append(selected_features)
    summary_row.append(candidate_feature)

    summary_row += generate_logit_model(training_data_filled, [candidate_feature] + selected_features, 'combined_issue_criteria_3y_off')
    summary.append(summary_row)


logit_summary_df = pd.DataFrame(summary, columns = ['selected_features', 'candidate_feature', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC Score', 'Balanced Accuracy'])

display(logit_summary_df.shape[0])
display(logit_summary_df.sort_values('F1 Score', ascending = False).head(20))
# display(logit_summary_df.sort_values('F1 Score', ascending = True).head(20))



25

Unnamed: 0,selected_features,candidate_feature,Accuracy,Precision,Recall,F1 Score,ROC AUC Score,Balanced Accuracy
3,"[TL_TA, CL_TA, CC_SAL]",CF_CL,0.75,0.745797,0.789048,0.764525,0.823144,0.748997
11,"[TL_TA, CL_TA, CC_SAL]",NCL_TA,0.74,0.773656,0.740952,0.749569,0.78812,0.741792
7,"[TL_TA, CL_TA, CC_SAL]",INV_COST,0.735,0.743333,0.749524,0.746001,0.812649,0.734499
24,"[TL_TA, CL_TA, CC_SAL]",LABOR_PRODUCTIVITY,0.75,0.781849,0.720952,0.744274,0.806123,0.750739
10,"[TL_TA, CL_TA, CC_SAL]",CF_TL,0.73,0.735859,0.738571,0.735914,0.769585,0.729023
14,"[TL_TA, CL_TA, CC_SAL]",CL_CC,0.715,0.720744,0.750952,0.731953,0.796591,0.714687
9,"[TL_TA, CL_TA, CC_SAL]",TA_SAL,0.715,0.729597,0.731429,0.726903,0.78908,0.715188
8,"[TL_TA, CL_TA, CC_SAL]",INV_SAL,0.72,0.740551,0.730476,0.726237,0.777078,0.719975
21,"[TL_TA, CL_TA, CC_SAL]",EAT_YIE,0.71,0.724585,0.720952,0.721471,0.726466,0.709424
15,"[TL_TA, CL_TA, CC_SAL]",ROE,0.715,0.738118,0.700952,0.718534,0.808629,0.715213
