In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns


pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)


In [2]:
data_raw = pd.read_csv('../../DATA/FINAL/transformed_set_enriched.csv')

In [3]:
KEYS = [
    'entity_id',
    'financial_report_id',
    'financial_statement_id',
    'entity_ico',
    'entity_name']

DIMENSIONS = [
    'year',
    # 'month',
    # 'financial_report_order_number',
    'period_length',
    # 'consolidated',
    # 'approved_date',
    'type',
    'financial_report_template_id',
    # 'establishment_date',
    # 'cancellation_date',
    # 'sknace_code',
    # 'entity_state',
    'legal_form_code',
    'org_size_code',
    'ownership_category_code',
    'entity_consolidated',
    # 'sknace_division_name',
    'sknace_division',
    'sknace_division_normalized',
    # 'sknace_subcategory',
    'sal_class']

RAW_VALUES = [ 
    'total_assets',
    'non_current_assets',
    'non_current_intangible_assets',
    'non_current_tangible_assets',
    'non_current_financial_assets',
    'current_assets',
    'inventories',
    'non_current_receivables',
    'current_receivables',
    'financial_assets',
    'current_financial_assets',
    'cash_and_bank_balances',
    'total_equity_and_liabilities',
    'equity',
    'share_capital',
    'share_premium',
    'net_profit_of_previous_years',
    'net_profit_after_tax',
    'liabilities',
    'non_current_liabilities',
    'long_term_provisions',
    'long_term_bank_loans',
    'current_liabilities',
    'short_term_provisions',
    'current_bank_loans',
    'short_term_financial_assistance',
    'operating_revenues',
    'sales_from_the_merchandise',
    'sales_from_the_own_products_and_services',
    'sales_from_the_other',
    'other_revenues_from_operating_activities',
    'operating_costs',
    'costs_of_merchandise_sold',
    'consumed_materials_energy_non_inventory_supplies',
    'services',
    'personnel_costs',
    'taxes_and_fees',
    'depreciation',
    'remaining_cost_of_sold_long_term_assets_and_materials',
    'adjustments_to_receivables',
    'other_costs_of_operating_activities',
    'operating_result',
    'value_added',
    'revenues_from_financial_activities',
    'cost_of_financial_activities',
    'result_from_fincancial_activities',
    'result_before_tax',
    'income_tax',
    'result_after_tax']


RATIOS = ['L1',
       'L2', 'L3', 'NCL_CF', 'CASH_TA', 'INV_COGS', 'CC_SAL', 'TL_COST',
       'CL_COST', 'SAL_TA', 'EQ_TA', 'EQ_TL', 'CL_CC', 'LTC_NCA', 'TZ',
       'CF_TL', 'CL_TA', 'NCL_TA', 'EBT_REV', 'ROS', 'EAT_REV', 'ROA', 'ROE',
       'CF_TA', 'CF_SAL', 'LABOR_EAT', 'LABOR_REV']

PRIMARY_VALUES = [
    'CA', 'CASH',
       'CL', 'EQ', 'NCL', 'TA', 'TL', 'CC', 'INV', 'CF_NETTO', 'CF_SELFFIN',
       'REV', 'VA', 'OE', 'EAT', 'EBIT', 'SAL', 'COST', 'INT'
]



DISTRESS_SIGNALS = [
    'DPHZ_vat_registration_cancelled',
    'DPHZ_vat_registration_cancelled_1y_off',
    'DPHZ_vat_registration_cancelled_2y_off',
    'DPHZ_vat_registration_cancelled_3y_off',
    'RU_event',
    'RU_event_1y_off',
    'RU_event_2y_off',
    'RU_event_3y_off',
    'cancellation_year',
    'RUZ_cancelled',
    'RUZ_cancelled_1y_off',
    'RUZ_cancelled_2y_off',
    'RUZ_cancelled_3y_off']

In [4]:

entity_details_table = pd.read_csv('../../DATA/TRANSFORM/entity_details/financial_statements_register_entity_details.csv')
entity_details_table = entity_details_table[['id', 'entity_name']].rename(columns={'id': 'entity_id'})
entity_details_table

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,entity_id,entity_name
0,4.0,"NESS Slovensko, a.s."
1,63.0,"EKOCHEM, a.s. v likvidácií"
2,74.0,"ZTS Sabinov, a. s."
3,184.0,HPK engineering a.s.
4,185.0,"CHEMINVEST, a.s."
...,...,...
410569,2096976.0,"MERIN SERVICES, s. r. o."
410570,2096979.0,ELDA-BAU s. r. o.
410571,2096986.0,Derma Medical s.r.o.
410572,2096989.0,Cmetal s. r. o.


In [5]:
NOT_RELEVANT = ['T', 'U', 'O']
RELEVANT = ['H', 'C', 'M', 'F', 'G']

def sknace_grouping(x):
    if x in ['T', 'U', 'O']:
        return 'non-relevant'
    if x not in ['H', 'C', 'M', 'F', 'G']:
        return 'other'
    return x

data_raw['sknace_division_normalized'] = data_raw['sknace_division'].apply(sknace_grouping)

In [6]:
data = data_raw.drop(columns=RAW_VALUES)
display(data.shape[0])
data = data.drop(columns = 'entity_name').merge(entity_details_table.drop_duplicates('entity_id'), on ='entity_id', how = 'left')
data_raw = None
display(data.shape[0])

955333

955333

In [7]:
def parse_name(x):
    if 'v likvidacii' in x or 'v likvidácii' in x:
        return 'likvidacia'
    
    if 'v konkurze' in x:
        return 'konkurz'
    
    if 'v restrukturalizacii' in x or 'v reštrukturalizácii' in x:
        return 'restrukturalizacia'

    return 'ok'

data['parsed_state'] = data['entity_name'].apply(parse_name)

In [8]:
# pomer VI a zavazkov je mensi ako 8, t.j EQ_TL < 0.08
# 2016 : 0.04, 2017 : 0.06, 2018 : 0.08, 
# EQ < 0
# L3 < 1


def process_issues(row):
    EQ_TL = row.EQ_TL
    EQ = row.EQ
    L3 = row.L3
    EAT = row.EAT
    y = row.year

    if y <= 2016: threat_criterion = 0.04
    elif y == 2017: threat_criterion = 0.06
    else: threat_criterion = 0.08

    if EQ_TL < threat_criterion and EQ < 0 and L3 < 1 and EAT < 0:
        return 1
    return 0
    

In [9]:
ruz_indirect_list = []

for _, row in data.iterrows():
    criteria_value = process_issues(row)
    ruz_indirect_list.append([row['entity_id'], row['year'], row['period_length'], criteria_value])

ruz_indirect_df = pd.DataFrame(ruz_indirect_list, columns=['entity_id', 'year', 'period_length', 'RUZ_indirect_criteria'])


In [10]:
ruz_indirect_df['period_length_check'] = ruz_indirect_df['period_length'].apply(lambda x: 1 if x == 12 else 0)
ruz_indirect_df

ruz_indirect_df = (ruz_indirect_df
                   .sort_values(['period_length_check', 'RUZ_indirect_criteria'], ascending=[False, False])
                   .groupby(['entity_id', 'year'])
                   .first()
                   .reset_index())[['entity_id', 'year', 'RUZ_indirect_criteria']]

In [11]:
def process_entity_issues_2y_off(row, issue_type):
        return max(row[issue_type+'_1y_off'], row[issue_type+'_2y_off'])

def process_entity_issues_3y_off(row, issue_type):
        return max(row[issue_type+'_1y_off'], row[issue_type+'_2y_off'], row[issue_type+'_3y_off'])


entity_issues_indicators_1y_off = ruz_indirect_df.copy()
entity_issues_indicators_2y_off = ruz_indirect_df.copy()
entity_issues_indicators_3y_off = ruz_indirect_df.copy()


entity_issues_indicators_1y_off['year'] = entity_issues_indicators_1y_off['year'] - 1
entity_issues_indicators_1y_off.rename(columns={'RUZ_indirect_criteria':'RUZ_indirect_criteria_1y_off'}, inplace=True)

entity_issues_indicators_2y_off['year'] = entity_issues_indicators_2y_off['year'] - 2
entity_issues_indicators_2y_off.rename(columns={'RUZ_indirect_criteria':'RUZ_indirect_criteria_2y_off'}, inplace=True)

entity_issues_indicators_3y_off['year'] = entity_issues_indicators_3y_off['year'] - 3
entity_issues_indicators_3y_off.rename(columns={'RUZ_indirect_criteria':'RUZ_indirect_criteria_3y_off'}, inplace=True)


entity_issues_indicators = (ruz_indirect_df
                            .merge(
                                entity_issues_indicators_1y_off,
                                on = ['entity_id', 'year'],
                                how = 'outer'
                            )                           
                            .merge(
                                entity_issues_indicators_2y_off,
                                on = ['entity_id', 'year'],
                                how = 'outer'
                            )                                                    
                            .merge(
                                entity_issues_indicators_3y_off,
                                on = ['entity_id', 'year'],
                                how = 'outer'
                            ))

for col in ['RUZ_indirect_criteria',
       'RUZ_indirect_criteria_1y_off', 'RUZ_indirect_criteria_2y_off',
       'RUZ_indirect_criteria_3y_off']:
       entity_issues_indicators[col] = entity_issues_indicators[col].fillna(0)

entity_issues_indicators['RUZ_indirect_criteria_2y_off'] = entity_issues_indicators.apply(lambda x: process_entity_issues_2y_off(x, 'RUZ_indirect_criteria'), axis = 1)
entity_issues_indicators['RUZ_indirect_criteria_3y_off'] = entity_issues_indicators.apply(lambda x: process_entity_issues_3y_off(x, 'RUZ_indirect_criteria'), axis = 1)

entity_issues_indicators


Unnamed: 0,entity_id,year,RUZ_indirect_criteria,RUZ_indirect_criteria_1y_off,RUZ_indirect_criteria_2y_off,RUZ_indirect_criteria_3y_off
0,4.0,2014.0,0.0,0.0,0.0,0.0
1,4.0,2015.0,0.0,0.0,0.0,0.0
2,4.0,2016.0,0.0,0.0,0.0,0.0
3,4.0,2017.0,0.0,0.0,0.0,1.0
4,4.0,2018.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...
1420963,1611042.0,2011.0,0.0,0.0,0.0,0.0
1420964,1611565.0,2011.0,0.0,0.0,0.0,0.0
1420965,1634614.0,2012.0,0.0,0.0,0.0,0.0
1420966,1635085.0,2012.0,0.0,0.0,0.0,1.0


In [12]:
data = data.merge(entity_issues_indicators, on = ['entity_id', 'year'], how = 'left')

In [13]:
DISTRESS_SIGNALS = [
    'DPHZ_vat_registration_cancelled',
    'DPHZ_vat_registration_cancelled_1y_off',
    'DPHZ_vat_registration_cancelled_2y_off',
    'DPHZ_vat_registration_cancelled_3y_off',
    'RU_event',
    'RU_event_1y_off',
    'RU_event_2y_off',
    'RU_event_3y_off',
    'cancellation_year',
    'RUZ_cancelled',
    'RUZ_cancelled_1y_off',
    'RUZ_cancelled_2y_off',
    'RUZ_cancelled_3y_off',
    'RUZ_indirect_criteria',
    'RUZ_indirect_criteria_1y_off',
    'RUZ_indirect_criteria_2y_off',
    'RUZ_indirect_criteria_3y_off']


In [14]:
data.query('sal_class in ["2. Malý", "3. Stredný"]')['cancellation_year'].value_counts()

2017.0    488
2021.0    442
2020.0    422
2022.0    345
2016.0    324
2018.0    303
2019.0    247
2015.0     89
2014.0      1
Name: cancellation_year, dtype: int64

In [15]:
def get_sample_sizes(data, row_label, column_label):
    samples_df = data.groupby([row_label, column_label])['financial_statement_id'].nunique().to_frame('count').reset_index()
    wide_df = pd.pivot_table(samples_df, index = row_label, columns=column_label, fill_value=0)
    return wide_df

In [16]:
def combined_criteria(row, years_offset):
    return max(row[f'RU_event_{int(years_offset)}y_off'], row[f'RUZ_cancelled_{int(years_offset)}y_off'], row[f'RUZ_indirect_criteria_{int(years_offset)}y_off'])

data['combined_issue_criteria_1y_off'] = data.apply(lambda x: combined_criteria(x, 1), axis = 1)
data['combined_issue_criteria_2y_off'] = data.apply(lambda x: combined_criteria(x, 2), axis = 1)
data['combined_issue_criteria_3y_off'] = data.apply(lambda x: combined_criteria(x, 3), axis = 1)

data

Unnamed: 0,entity_id,year,month,financial_report_id,financial_report_template_id,CA,CASH,CL,EQ,NCL,TA,TL,CC,INV,CF_NETTO,CF_SELFFIN,REV,VA,OE,EAT,EBIT,SAL,COST,INT,LABOR,L1,L2,L3,NCL_CF,CASH_TA,INV_COGS,CC_SAL,TL_COST,CL_COST,SAL_TA,EQ_TA,EQ_TL,CL_CC,LTC_NCA,TZ,CF_TL,CL_TA,NCL_TA,EBT_REV,ROS,EAT_REV,ROA,ROE,CF_TA,CF_SAL,LABOR_EAT,LABOR_REV,financial_statement_id,financial_report_order_number,entity_ico,period_length,consolidated,approved_date,type,establishment_date,cancellation_date,sknace_code,entity_state,legal_form_code,org_size_code,ownership_category_code,entity_consolidated,sknace_division_name,sknace_division,sknace_subcategory,DPHZ_vat_registration_cancelled,DPHZ_vat_registration_cancelled_1y_off,DPHZ_vat_registration_cancelled_2y_off,DPHZ_vat_registration_cancelled_3y_off,RU_event,RU_event_1y_off,RU_event_2y_off,RU_event_3y_off,cancellation_year,RUZ_cancelled,RUZ_cancelled_1y_off,RUZ_cancelled_2y_off,RUZ_cancelled_3y_off,sal_class,establishment_year,sknace_division_normalized,entity_name,parsed_state,RUZ_indirect_criteria,RUZ_indirect_criteria_1y_off,RUZ_indirect_criteria_2y_off,RUZ_indirect_criteria_3y_off,combined_issue_criteria_1y_off,combined_issue_criteria_2y_off,combined_issue_criteria_3y_off
0,4.0,2014.0,1.0,4644773,699.0,8787618.0,5238983.0,4658457.0,2112122.0,3232.0,11200988.0,8657266.0,2578606.0,16429.0,1171078.0,1063283.0,19361305.0,3910128.0,408169.0,1007897.0,1115692.0,18580340.0,18172171.0,4446.0,3384185.0,1.124618,1.678150,1.681677,0.002760,0.467725,,0.138781,0.476402,0.256351,1.658813,0.188566,0.196123,1.806580,2.532115,0.702983,0.135271,0.415897,0.000289,0.057625,0.054245,0.052057,0.089983,0.477196,0.104551,0.063028,3.357669,0.174791,2382229.0,1.0,603783.0,12.0,,2015-07-07,Riadna,1991-01-17,,62090.0,,121.0,11.0,7.0,True,"Počítačové programovanie, poradenstvo a súvisi...",J,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,3. Stredný,1991,other,"NESS Slovensko, a.s.",ok,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,2015.0,1.0,5382540,699.0,20276962.0,5687157.0,6153737.0,13229589.0,2890.0,20862929.0,7184264.0,14037315.0,6792.0,3940396.0,3245262.0,29573853.0,4276968.0,6057021.0,6536244.0,7231378.0,17608945.0,11551924.0,2910.0,3581817.0,0.924179,3.205284,3.206387,0.000733,0.272596,,0.797170,0.621911,0.532702,0.844030,0.634119,0.648069,0.438384,24.336039,-3.184198,0.548476,0.294960,0.000139,0.244519,0.371189,0.221014,0.313295,0.494063,0.188871,0.223772,0.547993,0.121114,2968194.0,1.0,603783.0,12.0,,2016-09-28,Riadna,1991-01-17,,62090.0,,121.0,11.0,7.0,True,"Počítačové programovanie, poradenstvo a súvisi...",J,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,3. Stredný,1991,other,"NESS Slovensko, a.s.",ok,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,2016.0,1.0,5665728,699.0,5991927.0,1828391.0,4859422.0,1178778.0,3568.0,6518310.0,5120402.0,3840942.0,5894.0,247140.0,11702.0,12914606.0,3565411.0,313595.0,-44282.0,191156.0,12878215.0,12564620.0,38229.0,3476051.0,0.376257,1.166668,1.167881,0.014437,0.280501,,0.298251,0.407525,0.386754,1.975698,0.180841,0.187132,1.265164,2.735191,-2.244983,0.048266,0.745503,0.000547,0.014802,-0.003439,-0.003429,-0.006793,-0.037566,0.037915,0.019191,-78.498058,0.269157,3194585.0,3.0,603783.0,12.0,,2017-06-12,Riadna,1991-01-17,,62090.0,,121.0,11.0,7.0,True,"Počítačové programovanie, poradenstvo a súvisi...",J,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,3. Stredný,1991,other,"NESS Slovensko, a.s.",ok,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,2017.0,1.0,6245237,699.0,3016170.0,1224056.0,2649399.0,268147.0,3224.0,3442598.0,2902134.0,1355249.0,1954.0,-1327325.0,-1211994.0,7725983.0,1952011.0,-1442061.0,-1270731.0,-1386062.0,7651564.0,9093625.0,9037.0,3304021.0,0.462013,0.973543,0.974281,-0.002429,0.355562,,0.177121,0.319139,0.291347,2.222613,0.077891,0.084581,1.954917,1.221500,-0.241746,-0.457362,0.769593,0.000937,-0.179403,-0.166075,-0.164475,-0.369120,-4.738934,-0.385559,-0.173471,-2.600095,0.427651,3662172.0,2.0,603783.0,12.0,,2018-06-26,Riadna,1991-01-17,,62090.0,,121.0,11.0,7.0,True,"Počítačové programovanie, poradenstvo a súvisi...",J,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,2. Malý,1991,other,"NESS Slovensko, a.s.",ok,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,4.0,2018.0,1.0,6757564,699.0,2392774.0,763867.0,1984699.0,146903.0,2689.0,3396723.0,2635535.0,1208801.0,1954.0,-58437.0,-79956.0,7119951.0,1945400.0,-65289.0,-121244.0,-99725.0,7119894.0,7185183.0,13197.0,2241363.0,0.384878,0.993938,0.994923,-0.046015,0.224884,,0.169778,0.366801,0.276221,2.096107,0.043248,0.052797,1.641874,0.794601,-11.309838,-0.022173,0.584298,0.000792,-0.014006,-0.017029,-0.017029,-0.035694,-0.825334,-0.017204,-0.008208,-18.486383,0.314800,4072051.0,3.0,603783.0,12.0,,2019-06-26,Riadna,1991-01-17,,62090.0,,121.0,11.0,7.0,True,"Počítačové programovanie, poradenstvo a súvisi...",J,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,2. Malý,1991,other,"NESS Slovensko, a.s.",ok,0.0,0.0,1.0,1.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
955328,1634614.0,2019.0,1.0,7813015,687.0,6639.0,6639.0,1440.0,5199.0,0.0,6639.0,1440.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.610417,4.610417,4.610417,,1.000000,,,,,0.000000,0.783100,0.783100,,,,0.000000,0.216900,0.000000,,,,0.000000,0.000000,0.000000,,,,4917665.0,1.0,35913746.0,12.0,,2021-11-08,Riadna,2005-01-01,,66220.0,,112.0,0.0,7.0,False,Pomocné činnosti finančných služieb a poistenia,K,66.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,"0. <50,000",2005,other,H.Richter s.r.o.,ok,0.0,0.0,0.0,0.0,0.0,0.0,0.0
955329,1634614.0,2020.0,1.0,7812822,687.0,6639.0,6639.0,1780.0,4859.0,0.0,6639.0,1780.0,0.0,0.0,-340.0,-340.0,0.0,0.0,-340.0,-340.0,-340.0,0.0,340.0,0.0,0.0,3.729775,3.729775,3.729775,-0.000000,1.000000,,,5.235294,5.235294,0.000000,0.731887,0.731887,,,14.291176,-0.191011,0.268113,0.000000,,,,-0.051213,-0.069973,-0.051213,,-0.000000,,4917576.0,1.0,35913746.0,12.0,,2021-11-08,Riadna,2005-01-01,,66220.0,,112.0,0.0,7.0,False,Pomocné činnosti finančných služieb a poistenia,K,66.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,"0. <50,000",2005,other,H.Richter s.r.o.,ok,0.0,0.0,0.0,0.0,0.0,0.0,0.0
955330,1634614.0,2021.0,1.0,7942973,687.0,6639.0,6639.0,1780.0,4859.0,0.0,6639.0,1780.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.729775,3.729775,3.729775,,1.000000,,,,,0.000000,0.731887,0.731887,,,,0.000000,0.268113,0.000000,,,,0.000000,0.000000,0.000000,,,,5023110.0,1.0,35913746.0,12.0,,2022-03-15,Riadna,2005-01-01,,66220.0,,112.0,0.0,7.0,False,Pomocné činnosti finančných služieb a poistenia,K,66.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,"0. <50,000",2005,other,H.Richter s.r.o.,ok,0.0,0.0,0.0,0.0,0.0,0.0,0.0
955331,1635085.0,2015.0,1.0,5405717,687.0,52611.0,796.0,133071.0,-80460.0,0.0,52611.0,133071.0,51815.0,0.0,0.0,-480.0,0.0,0.0,0.0,-480.0,0.0,0.0,0.0,0.0,0.0,0.005982,0.395360,0.395360,,0.015130,,,,,0.000000,-1.529338,-1.529338,2.568195,,,0.000000,2.529338,0.000000,,,,-0.009124,0.005966,0.000000,,-0.000000,,2986874.0,1.0,31104193.0,12.0,,,Riadna,1991-12-16,2006-12-20,47510.0,,112.0,0.0,2.0,False,Maloobchod okrem motorových vozidiel a motocyklov,G,47.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2006.0,1,1,1,1,"0. <50,000",1991,G,"DA-LAS TRADING, spol. s r.o.",ok,1.0,0.0,0.0,0.0,1.0,1.0,1.0


In [17]:
temp = data.query('period_length == 12 and REV > 0 and TA > 0')[['entity_id', 'year', 'REV', 'cancellation_year', 'RU_event_3y_off', 'RUZ_cancelled_3y_off', 'sknace_division', 'financial_statement_id']]

In [18]:
def get_weighted_sample_sizes(data, row_label, column_label, weight):
    samples_df = data.groupby([row_label, column_label])[weight].sum().to_frame('count').reset_index()
    wide_df = pd.pivot_table(samples_df, index = row_label, columns=column_label, fill_value=0)
    return wide_df

In [19]:
get_weighted_sample_sizes(temp, 'sknace_division', 'year', 'REV')

Unnamed: 0_level_0,count,count,count,count,count,count,count,count
year,2014.0,2015.0,2016.0,2017.0,2018.0,2019.0,2020.0,2021.0
sknace_division,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
A,1845145000.0,1861335000.0,1866057000.0,1909732000.0,2005903025,2008761000.0,2196973734,2188476780
B,263989800.0,300915100.0,250771200.0,310206200.0,329235052,329496500.0,214863565,319995485
C,33830350000.0,36353820000.0,35699860000.0,35559300000.0,37824178506,36730660000.0,33987814370,36757186392
D,2313414000.0,2559443000.0,2595885000.0,2505948000.0,2858521973,3006665000.0,2584085285,2511019345
E,629157100.0,651500500.0,656773600.0,700319800.0,745254502,771046900.0,743173363,948339567
F,6057562000.0,7260728000.0,6204776000.0,6576768000.0,7043552849,6987609000.0,6201035235,6329303480
G,39658550000.0,41469500000.0,39824110000.0,41029450000.0,42663808566,42617470000.0,40045424643,42327017290
H,6325830000.0,6675447000.0,6683663000.0,6961080000.0,7222108697,7186020000.0,6451083634,6959046218
I,1194480000.0,1294759000.0,1430401000.0,1329613000.0,1398932857,1528410000.0,1058426946,1052802032
J,4709275000.0,5263808000.0,4438437000.0,4265324000.0,4596605120,4933766000.0,4367479005,4526487892


In [20]:
def get_liquidation_3y_off(row):
    if row.RUZ_cancelled_3y_off == 1 and row.parsed_state != 'ok':
        return 1
    return 0

def get_liquidation_2y_off(row):
    if row.RUZ_cancelled_2y_off == 1 and row.parsed_state != 'ok':
        return 1
    return 0

def get_liquidation_1y_off(row):
    if row.RUZ_cancelled_1y_off == 1 and row.parsed_state != 'ok':
        return 1
    return 0


data['RUZ_liquidation_3y_off'] = data.apply(get_liquidation_3y_off, axis = 1)
data['RUZ_liquidation_2y_off'] = data.apply(get_liquidation_2y_off, axis = 1)
data['RUZ_liquidation_1y_off'] = data.apply(get_liquidation_1y_off, axis = 1)



data.query('RUZ_cancelled_3y_off == 1 and parsed_state != "ok"')['RUZ_liquidation_3y_off'].value_counts()

1    15098
Name: RUZ_liquidation_3y_off, dtype: int64

In [21]:
plausible_entity_ids = pd.DataFrame()

for current_year in 2015, 2016, 2017, 2018:
    previous_year = current_year - 1
    two_consequent_years = set(temp.query('year == @current_year')['entity_id']).intersection(temp.query('year == @previous_year')['entity_id'])

    one_off = current_year + 1
    two_off = current_year + 2
    three_off = current_year + 3

    all_three = (set(temp.query('year == @one_off')['entity_id'])
                 .intersection(temp.query('year == @two_off')['entity_id'])
                 .intersection(temp.query('year == @three_off')['entity_id']))
    
    two_and_cancelled = (set(temp.query('year == @one_off')['entity_id'])
                         .intersection(temp.query('year == @two_off')['entity_id'])
                         .intersection(temp.query('cancellation_year == @three_off')['entity_id']))
    
    one_and_cancelled = (set(temp.query('year == @one_off')['entity_id'])
                         .intersection(temp.query('cancellation_year == @two_off')['entity_id']))
    
    next_cancelled = set(temp.query('cancellation_year == @one_off')['entity_id'])

    ru_event_pass = set(temp.query('year == @current_year and RU_event_3y_off == 1 or RUZ_cancelled_3y_off == 1')['entity_id'])

    future_set = (all_three
                  .union(two_and_cancelled)
                  .union(one_and_cancelled)
                  .union(next_cancelled)
                  .union(ru_event_pass))
    
    total = two_consequent_years.intersection(future_set)

    yearly_plausible_entity_ids = pd.DataFrame(total, columns = ['entity_id'])
    yearly_plausible_entity_ids['year'] = current_year
    
    plausible_entity_ids = pd.concat([plausible_entity_ids, yearly_plausible_entity_ids], axis = 0)

plausible_entity_ids['plausible_sample'] = 1

subset_data = data.merge(plausible_entity_ids, on = ['entity_id', 'year'], how = 'left')
subset_data['plausible_sample'] = subset_data['plausible_sample'].fillna(0)

subset_data

Unnamed: 0,entity_id,year,month,financial_report_id,financial_report_template_id,CA,CASH,CL,EQ,NCL,TA,TL,CC,INV,CF_NETTO,CF_SELFFIN,REV,VA,OE,EAT,EBIT,SAL,COST,INT,LABOR,L1,L2,L3,NCL_CF,CASH_TA,INV_COGS,CC_SAL,TL_COST,CL_COST,SAL_TA,EQ_TA,EQ_TL,CL_CC,LTC_NCA,TZ,CF_TL,CL_TA,NCL_TA,EBT_REV,ROS,EAT_REV,ROA,ROE,CF_TA,CF_SAL,LABOR_EAT,LABOR_REV,financial_statement_id,financial_report_order_number,entity_ico,period_length,consolidated,approved_date,type,establishment_date,cancellation_date,sknace_code,entity_state,legal_form_code,org_size_code,ownership_category_code,entity_consolidated,sknace_division_name,sknace_division,sknace_subcategory,DPHZ_vat_registration_cancelled,DPHZ_vat_registration_cancelled_1y_off,DPHZ_vat_registration_cancelled_2y_off,DPHZ_vat_registration_cancelled_3y_off,RU_event,RU_event_1y_off,RU_event_2y_off,RU_event_3y_off,cancellation_year,RUZ_cancelled,RUZ_cancelled_1y_off,RUZ_cancelled_2y_off,RUZ_cancelled_3y_off,sal_class,establishment_year,sknace_division_normalized,entity_name,parsed_state,RUZ_indirect_criteria,RUZ_indirect_criteria_1y_off,RUZ_indirect_criteria_2y_off,RUZ_indirect_criteria_3y_off,combined_issue_criteria_1y_off,combined_issue_criteria_2y_off,combined_issue_criteria_3y_off,RUZ_liquidation_3y_off,RUZ_liquidation_2y_off,RUZ_liquidation_1y_off,plausible_sample
0,4.0,2014.0,1.0,4644773,699.0,8787618.0,5238983.0,4658457.0,2112122.0,3232.0,11200988.0,8657266.0,2578606.0,16429.0,1171078.0,1063283.0,19361305.0,3910128.0,408169.0,1007897.0,1115692.0,18580340.0,18172171.0,4446.0,3384185.0,1.124618,1.678150,1.681677,0.002760,0.467725,,0.138781,0.476402,0.256351,1.658813,0.188566,0.196123,1.806580,2.532115,0.702983,0.135271,0.415897,0.000289,0.057625,0.054245,0.052057,0.089983,0.477196,0.104551,0.063028,3.357669,0.174791,2382229.0,1.0,603783.0,12.0,,2015-07-07,Riadna,1991-01-17,,62090.0,,121.0,11.0,7.0,True,"Počítačové programovanie, poradenstvo a súvisi...",J,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,3. Stredný,1991,other,"NESS Slovensko, a.s.",ok,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0
1,4.0,2015.0,1.0,5382540,699.0,20276962.0,5687157.0,6153737.0,13229589.0,2890.0,20862929.0,7184264.0,14037315.0,6792.0,3940396.0,3245262.0,29573853.0,4276968.0,6057021.0,6536244.0,7231378.0,17608945.0,11551924.0,2910.0,3581817.0,0.924179,3.205284,3.206387,0.000733,0.272596,,0.797170,0.621911,0.532702,0.844030,0.634119,0.648069,0.438384,24.336039,-3.184198,0.548476,0.294960,0.000139,0.244519,0.371189,0.221014,0.313295,0.494063,0.188871,0.223772,0.547993,0.121114,2968194.0,1.0,603783.0,12.0,,2016-09-28,Riadna,1991-01-17,,62090.0,,121.0,11.0,7.0,True,"Počítačové programovanie, poradenstvo a súvisi...",J,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,3. Stredný,1991,other,"NESS Slovensko, a.s.",ok,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1.0
2,4.0,2016.0,1.0,5665728,699.0,5991927.0,1828391.0,4859422.0,1178778.0,3568.0,6518310.0,5120402.0,3840942.0,5894.0,247140.0,11702.0,12914606.0,3565411.0,313595.0,-44282.0,191156.0,12878215.0,12564620.0,38229.0,3476051.0,0.376257,1.166668,1.167881,0.014437,0.280501,,0.298251,0.407525,0.386754,1.975698,0.180841,0.187132,1.265164,2.735191,-2.244983,0.048266,0.745503,0.000547,0.014802,-0.003439,-0.003429,-0.006793,-0.037566,0.037915,0.019191,-78.498058,0.269157,3194585.0,3.0,603783.0,12.0,,2017-06-12,Riadna,1991-01-17,,62090.0,,121.0,11.0,7.0,True,"Počítačové programovanie, poradenstvo a súvisi...",J,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,3. Stredný,1991,other,"NESS Slovensko, a.s.",ok,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1.0
3,4.0,2017.0,1.0,6245237,699.0,3016170.0,1224056.0,2649399.0,268147.0,3224.0,3442598.0,2902134.0,1355249.0,1954.0,-1327325.0,-1211994.0,7725983.0,1952011.0,-1442061.0,-1270731.0,-1386062.0,7651564.0,9093625.0,9037.0,3304021.0,0.462013,0.973543,0.974281,-0.002429,0.355562,,0.177121,0.319139,0.291347,2.222613,0.077891,0.084581,1.954917,1.221500,-0.241746,-0.457362,0.769593,0.000937,-0.179403,-0.166075,-0.164475,-0.369120,-4.738934,-0.385559,-0.173471,-2.600095,0.427651,3662172.0,2.0,603783.0,12.0,,2018-06-26,Riadna,1991-01-17,,62090.0,,121.0,11.0,7.0,True,"Počítačové programovanie, poradenstvo a súvisi...",J,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,2. Malý,1991,other,"NESS Slovensko, a.s.",ok,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0,0,1.0
4,4.0,2018.0,1.0,6757564,699.0,2392774.0,763867.0,1984699.0,146903.0,2689.0,3396723.0,2635535.0,1208801.0,1954.0,-58437.0,-79956.0,7119951.0,1945400.0,-65289.0,-121244.0,-99725.0,7119894.0,7185183.0,13197.0,2241363.0,0.384878,0.993938,0.994923,-0.046015,0.224884,,0.169778,0.366801,0.276221,2.096107,0.043248,0.052797,1.641874,0.794601,-11.309838,-0.022173,0.584298,0.000792,-0.014006,-0.017029,-0.017029,-0.035694,-0.825334,-0.017204,-0.008208,-18.486383,0.314800,4072051.0,3.0,603783.0,12.0,,2019-06-26,Riadna,1991-01-17,,62090.0,,121.0,11.0,7.0,True,"Počítačové programovanie, poradenstvo a súvisi...",J,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,2. Malý,1991,other,"NESS Slovensko, a.s.",ok,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0,0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
955328,1634614.0,2019.0,1.0,7813015,687.0,6639.0,6639.0,1440.0,5199.0,0.0,6639.0,1440.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.610417,4.610417,4.610417,,1.000000,,,,,0.000000,0.783100,0.783100,,,,0.000000,0.216900,0.000000,,,,0.000000,0.000000,0.000000,,,,4917665.0,1.0,35913746.0,12.0,,2021-11-08,Riadna,2005-01-01,,66220.0,,112.0,0.0,7.0,False,Pomocné činnosti finančných služieb a poistenia,K,66.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,"0. <50,000",2005,other,H.Richter s.r.o.,ok,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0
955329,1634614.0,2020.0,1.0,7812822,687.0,6639.0,6639.0,1780.0,4859.0,0.0,6639.0,1780.0,0.0,0.0,-340.0,-340.0,0.0,0.0,-340.0,-340.0,-340.0,0.0,340.0,0.0,0.0,3.729775,3.729775,3.729775,-0.000000,1.000000,,,5.235294,5.235294,0.000000,0.731887,0.731887,,,14.291176,-0.191011,0.268113,0.000000,,,,-0.051213,-0.069973,-0.051213,,-0.000000,,4917576.0,1.0,35913746.0,12.0,,2021-11-08,Riadna,2005-01-01,,66220.0,,112.0,0.0,7.0,False,Pomocné činnosti finančných služieb a poistenia,K,66.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,"0. <50,000",2005,other,H.Richter s.r.o.,ok,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0
955330,1634614.0,2021.0,1.0,7942973,687.0,6639.0,6639.0,1780.0,4859.0,0.0,6639.0,1780.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.729775,3.729775,3.729775,,1.000000,,,,,0.000000,0.731887,0.731887,,,,0.000000,0.268113,0.000000,,,,0.000000,0.000000,0.000000,,,,5023110.0,1.0,35913746.0,12.0,,2022-03-15,Riadna,2005-01-01,,66220.0,,112.0,0.0,7.0,False,Pomocné činnosti finančných služieb a poistenia,K,66.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,"0. <50,000",2005,other,H.Richter s.r.o.,ok,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0
955331,1635085.0,2015.0,1.0,5405717,687.0,52611.0,796.0,133071.0,-80460.0,0.0,52611.0,133071.0,51815.0,0.0,0.0,-480.0,0.0,0.0,0.0,-480.0,0.0,0.0,0.0,0.0,0.0,0.005982,0.395360,0.395360,,0.015130,,,,,0.000000,-1.529338,-1.529338,2.568195,,,0.000000,2.529338,0.000000,,,,-0.009124,0.005966,0.000000,,-0.000000,,2986874.0,1.0,31104193.0,12.0,,,Riadna,1991-12-16,2006-12-20,47510.0,,112.0,0.0,2.0,False,Maloobchod okrem motorových vozidiel a motocyklov,G,47.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2006.0,1,1,1,1,"0. <50,000",1991,G,"DA-LAS TRADING, spol. s r.o.",ok,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0,0,0,0.0


In [22]:
subset_data = subset_data.query('plausible_sample == 1 and sal_class in ["2. Malý", "3. Stredný"] and period_length == 12 and RUZ_indirect_criteria == 0 and RUZ_cancelled == 0 and RU_event == 0 and (RUZ_cancelled_3y_off == 0 or parsed_state != "ok")')

In [23]:
master_sample = subset_data.query('RU_event_3y_off == 1').sample(frac=1, random_state=7).groupby('entity_id').first().reset_index()
master_sample_set = set(master_sample['entity_id'])

ruz_cancelled_entity = subset_data.query('RUZ_cancelled_3y_off == 1 and entity_id not in @master_sample_set').sample(frac=1, random_state=7).groupby('entity_id').first().reset_index()

master_sample = pd.concat([master_sample, ruz_cancelled_entity], axis = 0)
master_sample_set = set(master_sample['entity_id'])

ruz_indirect_criteria = subset_data.query('RUZ_indirect_criteria_3y_off == 1 and entity_id not in @master_sample_set').sample(frac=1, random_state=7).groupby('entity_id').first().reset_index()

master_sample = pd.concat([master_sample, ruz_indirect_criteria], axis = 0)
master_sample_set = set(master_sample['entity_id'])

rest = subset_data.query('entity_id not in @master_sample_set').sample(frac=1, random_state=7).groupby('entity_id').first().reset_index()

master_sample = pd.concat([master_sample, rest], axis = 0)

display(master_sample['entity_id'].value_counts())

27971.0      1
486300.0     1
485687.0     1
485759.0     1
485802.0     1
            ..
180469.0     1
180453.0     1
180437.0     1
180423.0     1
1611042.0    1
Name: entity_id, Length: 8988, dtype: int64

In [24]:
for dim in ['RU_event_3y_off', 'RUZ_cancelled_3y_off', 'RUZ_indirect_criteria_3y_off', 'combined_issue_criteria_3y_off']:
    print(dim)
    display(master_sample[dim].value_counts(normalize=True))

RU_event_3y_off


0.0    0.97441
1.0    0.02559
Name: RU_event_3y_off, dtype: float64

RUZ_cancelled_3y_off


0    0.988763
1    0.011237
Name: RUZ_cancelled_3y_off, dtype: float64

RUZ_indirect_criteria_3y_off


0.0    0.926235
1.0    0.073765
Name: RUZ_indirect_criteria_3y_off, dtype: float64

combined_issue_criteria_3y_off


0.0    0.902982
1.0    0.097018
Name: combined_issue_criteria_3y_off, dtype: float64

In [25]:
from sklearn.ensemble import IsolationForest

from sklearn.preprocessing import PowerTransformer
import sys

def detect_outliers_isolation_forest(df):
    filled_data = fill_missing_values_mean(df)

    isolation_forest = IsolationForest(contamination=0.02, random_state=42) 
    outliers = isolation_forest.fit_predict(filled_data)
    
    outlier_mask = pd.Series(outliers == -1, index=filled_data.index)

    return outlier_mask


def yeo_johnson_transform(data):
    pt = PowerTransformer(method='yeo-johnson')
    columns = data.columns
    index = data.index
    data = pd.DataFrame(pt.fit_transform(data), columns=columns, index=index)
    return data


def fill_missing_values_mean(data):
    data_filled = data.fillna(data.mean())
    data_filled_fallback = data_filled.fillna(0)
    return data_filled_fallback


def mahalanobis_distance(data):
    filled_data = fill_missing_values_mean(data)

    transformed_data = yeo_johnson_transform(filled_data)
    normalized_data = (transformed_data - transformed_data.mean()) / transformed_data.std()
    
    mean_vector = normalized_data.mean()
    covariance_matrix = normalized_data.cov()

    if np.linalg.cond(covariance_matrix) > 1 / sys.float_info.epsilon:
        regularization = 1e-6
        covariance_matrix += np.eye(covariance_matrix.shape[0]) * regularization

    try:
        inv_covariance_matrix = np.linalg.inv(covariance_matrix)

        distances = []
        for index, row in normalized_data.iterrows():
            diff = row - mean_vector
            distance = np.sqrt(np.dot(np.dot(diff.T, inv_covariance_matrix), diff))
            distances.append(distance)

        return pd.Series(distances, index=data.index)

    except np.linalg.LinAlgError:
        print("Error inverting covariance matrix. Data might be singular.")
        return pd.Series([], index=data.index)

def flag_outliers_mahalanobis(data, threshold=3):
    distances = mahalanobis_distance(data)
    chi2_threshold = threshold ** 2

    try:
        cutoff = np.percentile(distances, 100 * (1 - np.exp(-chi2_threshold / 2)))
        outliers = distances > cutoff
        return outliers

    except Exception as e:
        print(f"Error calculating cutoff: {e}")
        return pd.Series([], index=data.index)



In [26]:

outlier_boolean_1 = pd.DataFrame()
outlier_boolean_2 = pd.DataFrame()

for sal_class in master_sample['sal_class'].unique():
    subset = master_sample.query('sal_class == @sal_class').set_index(['financial_statement_id'])[RATIOS]

    outlier_boolean_1 = pd.concat([
        outlier_boolean_1,
        flag_outliers_mahalanobis(subset[RATIOS]).to_frame(name = 'is_mahalanobis_outlier').reset_index()
    ], axis=0)
            
    outlier_boolean_2 = pd.concat([
        outlier_boolean_2,
        detect_outliers_isolation_forest(subset[RATIOS]).to_frame(name = 'is_isolation_forest_outlier').reset_index()
    ], axis=0)



display(outlier_boolean_1['is_mahalanobis_outlier'].value_counts())
display(outlier_boolean_2['is_isolation_forest_outlier'].value_counts())



False    8887
True      101
Name: is_mahalanobis_outlier, dtype: int64

False    8807
True      181
Name: is_isolation_forest_outlier, dtype: int64

In [27]:

display(outlier_boolean_1['is_mahalanobis_outlier'].value_counts(normalize = True))
display(outlier_boolean_2['is_isolation_forest_outlier'].value_counts(normalize = True))

False    0.988763
True     0.011237
Name: is_mahalanobis_outlier, dtype: float64

False    0.979862
True     0.020138
Name: is_isolation_forest_outlier, dtype: float64

In [28]:
for col in ['is_mahalanobis_outlier', 'is_isolation_forest_outlier']:
    # if col in master_sample.columns: print('drop')
    if col in master_sample.columns: master_sample.drop(columns=[col], inplace=True)

In [29]:
display(master_sample.shape)

master_sample = master_sample.merge(outlier_boolean_1, on = ['financial_statement_id'], how = 'left')
master_sample = master_sample.merge(outlier_boolean_2, on = ['financial_statement_id'], how = 'left')

display(master_sample.shape)
display(master_sample)

(8988, 99)

(8988, 101)

Unnamed: 0,entity_id,year,month,financial_report_id,financial_report_template_id,CA,CASH,CL,EQ,NCL,TA,TL,CC,INV,CF_NETTO,CF_SELFFIN,REV,VA,OE,EAT,EBIT,SAL,COST,INT,LABOR,L1,L2,L3,NCL_CF,CASH_TA,INV_COGS,CC_SAL,TL_COST,CL_COST,SAL_TA,EQ_TA,EQ_TL,CL_CC,LTC_NCA,TZ,CF_TL,CL_TA,NCL_TA,EBT_REV,ROS,EAT_REV,ROA,ROE,CF_TA,CF_SAL,...,LABOR_REV,financial_statement_id,financial_report_order_number,entity_ico,period_length,consolidated,approved_date,type,establishment_date,cancellation_date,sknace_code,entity_state,legal_form_code,org_size_code,ownership_category_code,entity_consolidated,sknace_division_name,sknace_division,sknace_subcategory,DPHZ_vat_registration_cancelled,DPHZ_vat_registration_cancelled_1y_off,DPHZ_vat_registration_cancelled_2y_off,DPHZ_vat_registration_cancelled_3y_off,RU_event,RU_event_1y_off,RU_event_2y_off,RU_event_3y_off,cancellation_year,RUZ_cancelled,RUZ_cancelled_1y_off,RUZ_cancelled_2y_off,RUZ_cancelled_3y_off,sal_class,establishment_year,sknace_division_normalized,entity_name,parsed_state,RUZ_indirect_criteria,RUZ_indirect_criteria_1y_off,RUZ_indirect_criteria_2y_off,RUZ_indirect_criteria_3y_off,combined_issue_criteria_1y_off,combined_issue_criteria_2y_off,combined_issue_criteria_3y_off,RUZ_liquidation_3y_off,RUZ_liquidation_2y_off,RUZ_liquidation_1y_off,plausible_sample,is_mahalanobis_outlier,is_isolation_forest_outlier
0,27971.0,2018.0,1.0,6791492,699.0,824851.0,137120.0,564418.0,17376.0,46374.0,875393.0,846466.0,673298.0,11072.0,13087.0,9028.0,3031970.0,260410.0,24567.0,2651.0,6710.0,3031963.0,3007396.0,8218.0,370870.0,0.242941,1.435847,1.455464,3.543516,0.156638,,0.222067,0.281461,0.187677,3.463545,0.019849,0.020115,0.838289,5.924261,1.908459,0.015461,0.644760,0.052975,0.002213,0.000874,0.000874,0.003028,0.152567,0.014950,0.004316,...,0.122320,4101516.0,1.0,652423.0,12.0,,2019-06-28,Riadna,1991-06-25,,49410.0,,112.0,2.0,2.0,False,Pozemná doprava a doprava potrubím,H,49.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,,0,0,0,0,2. Malý,1991,H,KUHO - mix. spol. s r.o.,ok,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0,0,0,1.0,False,False
1,60314.0,2016.0,1.0,5545966,699.0,58815.0,15639.0,56837.0,127982.0,76345.0,2248496.0,1745181.0,23576.0,18552.0,2741491.0,2731211.0,3322652.0,199899.0,-472509.0,2073411.0,2079171.0,692195.0,1164704.0,1958.0,224570.0,0.275155,0.689955,1.016363,0.027848,0.006955,2.734670,0.034060,1.498390,0.048800,0.307848,0.056919,0.068324,2.410799,0.829493,0.615510,1.570892,0.025278,0.033954,0.625756,2.995415,0.624023,0.922132,16.200802,1.219255,3.960576,...,0.067588,3090140.0,1.0,36589993.0,12.0,,,Riadna,2005-02-22,2021-06-24,11050.0,,112.0,0.0,8.0,False,Výroba nápojov,C,11.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2021.0,0,0,0,0,2. Malý,2005,C,Pivovar Kaltenecker s.r.o. v konkurze,konkurz,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0,0,0,1.0,False,False
2,71752.0,2015.0,1.0,5058691,699.0,912862.0,2213.0,48840.0,2399.0,18406.0,948368.0,945969.0,910649.0,0.0,-35846.0,-38726.0,2179262.0,-27686.0,-68089.0,-57122.0,-54242.0,2134583.0,2202672.0,67110.0,12114.0,0.045311,18.690868,18.690868,-0.513474,0.002333,0.000000,0.426617,0.429464,0.022173,2.250796,0.002530,0.002530,0.053632,25.334535,-0.923590,-0.037893,0.051499,0.019408,-0.024890,-0.026760,-0.026212,-0.060232,-23.810754,-0.037798,-0.016793,...,0.005559,2703097.0,1.0,36584649.0,12.0,,2016-03-10,Riadna,2004-10-08,,46720.0,,112.0,0.0,8.0,False,"Veľkoobchod, okrem motorových vozidiel a motoc...",G,46.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,,0,0,0,0,2. Malý,2004,G,FeCom s.r.o. v konkurze,konkurz,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0,0,0,1.0,False,False
3,72912.0,2017.0,1.0,6351388,699.0,1481047.0,1630.0,1326396.0,-718007.0,549510.0,1521883.0,2239890.0,1417869.0,61548.0,47298.0,44418.0,4512573.0,802511.0,52068.0,10312.0,13192.0,4512342.0,4460274.0,32121.0,692228.0,0.001229,1.070192,1.116595,11.618039,0.001071,,0.314220,0.502187,0.297380,2.964973,-0.471789,-0.471789,0.935486,4.787124,16.043871,0.021116,0.871549,0.361072,0.002923,0.002285,0.002285,0.006776,-0.014362,0.031079,0.010482,...,0.153400,3748560.0,1.0,35904798.0,12.0,,2018-06-21,Riadna,2004-10-23,,33200.0,,112.0,0.0,8.0,False,Oprava a inštalácia strojov a prístrojov,C,33.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,,0,0,0,0,2. Malý,2004,C,"WETRON, s.r.o.",ok,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0,0,0,1.0,False,False
4,74296.0,2016.0,1.0,5844975,699.0,514344.0,10817.0,477398.0,-100409.0,0.0,1660755.0,1761164.0,410644.0,92883.0,66009.0,-141558.0,4362628.0,608076.0,-305083.0,-344270.0,-341390.0,4362628.0,4667711.0,34421.0,501993.0,0.022658,0.882829,1.077390,0.000000,0.006513,19.444119,0.094128,0.377308,0.102277,2.626894,-0.060460,-0.060460,1.162559,1.032228,18.888636,0.037480,0.287458,0.000000,-0.078253,-0.078913,-0.078913,-0.207297,3.428677,0.039746,0.015131,...,0.115067,3338450.0,2.0,36561380.0,12.0,,2017-07-28,Riadna,2004-06-25,,25620.0,,112.0,0.0,7.0,False,Výroba kovových konštrukcií okrem strojov a za...,C,25.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,,0,0,0,0,2. Malý,2004,C,Hoeckle s.r.o.,ok,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0,0,0,1.0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8983,1490180.0,2018.0,1.0,6573202,699.0,936228.0,11695.0,889625.0,73183.0,94.0,1021804.0,948621.0,924533.0,0.0,8448.0,2143.0,2660003.0,36466.0,10527.0,313.0,6618.0,2660003.0,2649476.0,2048.0,8135.0,0.013146,1.052385,1.052385,0.011127,0.011445,,0.347568,0.358041,0.335774,2.603242,0.071621,0.071621,0.962243,1.544580,1.466974,0.008906,0.870642,0.000092,0.002488,0.000118,0.000118,0.000306,0.004277,0.008268,0.003176,...,0.003058,3924270.0,2.0,46807454.0,12.0,,2019-09-24,Riadna,2013-01-01,,52100.0,,112.0,0.0,2.0,False,Skladové a pomocné činnosti v doprave,H,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,2. Malý,2013,H,"Anton Putiš, s.r.o.",ok,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1.0,False,False
8984,1497752.0,2018.0,1.0,6621827,699.0,2859695.0,4051.0,3612944.0,67547.0,108.0,3682749.0,3615202.0,10040.0,2845604.0,206590.0,784716.0,6460950.0,243379.0,35008.0,27138.0,34923.0,6460950.0,6425942.0,0.0,36674.0,0.001121,0.003900,0.791514,0.000523,0.001100,,0.001554,0.562595,0.562243,1.754382,0.018341,0.018341,359.854980,0.084812,3.657036,0.057145,0.981045,0.000029,0.005405,0.004200,0.004200,0.007369,0.401765,0.056097,0.031975,...,0.005676,3965013.0,1.0,46943455.0,12.0,,2019-03-29,Riadna,2012-12-12,,25990.0,,112.0,2.0,2.0,False,Výroba kovových konštrukcií okrem strojov a za...,C,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,2. Malý,2012,C,"ELTE, s.r.o.",ok,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1.0,False,False
8985,1501557.0,2018.0,1.0,6645297,699.0,627336.0,215394.0,294697.0,336724.0,41072.0,684838.0,342686.0,318813.0,37679.0,428880.0,340885.0,4046254.0,520738.0,407710.0,318119.0,406114.0,4044062.0,3636352.0,4473.0,75015.0,0.730900,1.812733,1.940590,0.095766,0.314518,0.011857,0.078835,0.094239,0.081042,5.905137,0.491684,0.495612,0.924357,6.690428,-0.534415,1.251525,0.430316,0.059973,0.100368,0.078663,0.078621,0.464517,0.944747,0.626250,0.106052,...,0.018539,3986615.0,1.0,46946853.0,12.0,,,Riadna,2013-01-01,,69200.0,,112.0,5.0,2.0,False,Právne a účtovnícke činnosti,M,69.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,2. Malý,2013,M,ARTINIT s. r. o.,ok,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1.0,False,False
8986,1505752.0,2016.0,1.0,5658799,699.0,251528.0,17576.0,119335.0,107134.0,15304.0,255183.0,148049.0,233952.0,0.0,58519.0,46105.0,2071626.0,1815900.0,58850.0,41744.0,54158.0,2071619.0,2012769.0,3.0,1749578.0,0.147283,2.107747,2.107747,0.261522,0.068876,,0.112932,0.073555,0.059289,8.118170,0.419832,0.419832,0.510083,37.167715,-1.768297,0.395268,0.467645,0.059973,0.026143,0.020150,0.020150,0.163585,0.389643,0.229322,0.028248,...,0.844543,3188105.0,2.0,46946098.0,12.0,,2017-06-16,Riadna,2012-12-21,,78200.0,,112.0,6.0,8.0,False,Sprostredkovanie práce,N,78.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,2. Malý,2012,other,Job Impulse s.r.o.,ok,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1.0,False,False


In [30]:
display(get_sample_sizes(master_sample, 'is_mahalanobis_outlier', 'combined_issue_criteria_3y_off'))
display(get_sample_sizes(master_sample, 'is_isolation_forest_outlier', 'combined_issue_criteria_3y_off'))

Unnamed: 0_level_0,count,count
combined_issue_criteria_3y_off,0.0,1.0
is_mahalanobis_outlier,Unnamed: 1_level_2,Unnamed: 2_level_2
False,8048,839
True,68,33


Unnamed: 0_level_0,count,count
combined_issue_criteria_3y_off,0.0,1.0
is_isolation_forest_outlier,Unnamed: 1_level_2,Unnamed: 2_level_2
False,7973,834
True,143,38


In [31]:
master_sample = master_sample.query('sknace_division_normalized != "non-relevant"')
testing_sample = master_sample.sample(frac=0.2, random_state=17)
testing_sample_ids = set(testing_sample['entity_id'])

In [32]:
training_sample = master_sample.query('entity_id not in @testing_sample_ids')

In [33]:
for dim in ['RU_event_3y_off', 'RUZ_cancelled_3y_off', 'RUZ_indirect_criteria_3y_off', 'combined_issue_criteria_3y_off']:
    print(dim)
    display(
        pd.concat([
            master_sample[dim].value_counts(normalize=True).to_frame('dataset'),
            training_sample[dim].value_counts(normalize=True).to_frame('training_sample'),
            testing_sample[dim].value_counts(normalize=True).to_frame('testing_sample')
        ], axis = 1)
    )

RU_event_3y_off


Unnamed: 0,dataset,training_sample,testing_sample
0.0,0.974402,0.973428,0.978297
1.0,0.025598,0.026572,0.021703


RUZ_cancelled_3y_off


Unnamed: 0,dataset,training_sample,testing_sample
0,0.988759,0.98887,0.988314
1,0.011241,0.01113,0.011686


RUZ_indirect_criteria_3y_off


Unnamed: 0,dataset,training_sample,testing_sample
0.0,0.92621,0.926544,0.924875
1.0,0.07379,0.073456,0.075125


combined_issue_criteria_3y_off


Unnamed: 0,dataset,training_sample,testing_sample
0.0,0.902949,0.902615,0.904285
1.0,0.097051,0.097385,0.095715


In [34]:
master_sample.to_csv('../../DATA/FINAL/super_sample.csv', index=False)
training_sample.to_csv('../../DATA/FINAL/training_sample.csv', index=False)
testing_sample.to_csv('../../DATA/FINAL/testing_sample.csv', index=False)

In [35]:
for dim in DIMENSIONS:
    print(dim)
    display(
        pd.concat([
            master_sample[dim].value_counts(normalize=True).to_frame('dataset'),
            training_sample[dim].value_counts(normalize=True).to_frame('training_sample'),
            testing_sample[dim].value_counts(normalize=True).to_frame('testing_sample')
        ], axis = 1)
    )

year


Unnamed: 0,dataset,training_sample,testing_sample
2015.0,0.256984,0.2532,0.27212
2016.0,0.233278,0.231775,0.239288
2017.0,0.235726,0.236505,0.23261
2018.0,0.274012,0.27852,0.255982


period_length


Unnamed: 0,dataset,training_sample,testing_sample
12.0,1.0,1.0,1.0


type


Unnamed: 0,dataset,training_sample,testing_sample
Riadna,0.997885,0.998052,0.997218
Mimoriadna,0.002115,0.001948,0.002782


financial_report_template_id


Unnamed: 0,dataset,training_sample,testing_sample
699.0,0.961825,0.961881,0.961603
687.0,0.038175,0.038119,0.038397


legal_form_code


Unnamed: 0,dataset,training_sample,testing_sample
112.0,0.848525,0.851141,0.838063
121.0,0.151475,0.148859,0.161937


org_size_code


Unnamed: 0,dataset,training_sample,testing_sample
0.0,0.129661,0.130495,0.126322
1.0,0.022482,0.02212,0.023929
2.0,0.049416,0.04911,0.05064
3.0,0.036171,0.037841,0.029494
4.0,0.052198,0.051196,0.056205
5.0,0.114858,0.11227,0.125209
6.0,0.150362,0.153172,0.139121
7.0,0.058097,0.059961,0.05064
11.0,0.164608,0.166667,0.156372
12.0,0.114302,0.11227,0.122426


ownership_category_code


Unnamed: 0,dataset,training_sample,testing_sample
2.0,0.647858,0.648303,0.646077
7.0,0.238954,0.236923,0.247078
8.0,0.113189,0.114775,0.106845


entity_consolidated


Unnamed: 0,dataset,training_sample,testing_sample
False,0.983194,0.983445,0.982193
True,0.016806,0.016555,0.017807


sknace_division


Unnamed: 0,dataset,training_sample,testing_sample
G,0.322204,0.32276,0.319978
C,0.20512,0.206873,0.198108
F,0.092042,0.090568,0.097941
M,0.079354,0.079855,0.077351
H,0.067112,0.068865,0.0601
N,0.050751,0.052309,0.044519
L,0.049193,0.048831,0.05064
J,0.034613,0.03325,0.040067
A,0.030161,0.029772,0.03172
I,0.013578,0.013216,0.015025


sknace_division_normalized


Unnamed: 0,dataset,training_sample,testing_sample
G,0.322204,0.32276,0.319978
other,0.234168,0.23108,0.246522
C,0.20512,0.206873,0.198108
F,0.092042,0.090568,0.097941
M,0.079354,0.079855,0.077351
H,0.067112,0.068865,0.0601


sal_class


Unnamed: 0,dataset,training_sample,testing_sample
2. Malý,0.815915,0.818587,0.805231
3. Stredný,0.184085,0.181413,0.194769


In [36]:
for dim in DIMENSIONS:
    print(dim)
    display(
        pd.concat([
            master_sample[dim].value_counts(normalize=True).to_frame('dataset'),
            training_sample[dim].value_counts(normalize=True).to_frame('training_sample'),
            testing_sample[dim].value_counts(normalize=True).to_frame('testing_sample')
        ], axis = 1)
    )

year


Unnamed: 0,dataset,training_sample,testing_sample
2015.0,0.256984,0.2532,0.27212
2016.0,0.233278,0.231775,0.239288
2017.0,0.235726,0.236505,0.23261
2018.0,0.274012,0.27852,0.255982


period_length


Unnamed: 0,dataset,training_sample,testing_sample
12.0,1.0,1.0,1.0


type


Unnamed: 0,dataset,training_sample,testing_sample
Riadna,0.997885,0.998052,0.997218
Mimoriadna,0.002115,0.001948,0.002782


financial_report_template_id


Unnamed: 0,dataset,training_sample,testing_sample
699.0,0.961825,0.961881,0.961603
687.0,0.038175,0.038119,0.038397


legal_form_code


Unnamed: 0,dataset,training_sample,testing_sample
112.0,0.848525,0.851141,0.838063
121.0,0.151475,0.148859,0.161937


org_size_code


Unnamed: 0,dataset,training_sample,testing_sample
0.0,0.129661,0.130495,0.126322
1.0,0.022482,0.02212,0.023929
2.0,0.049416,0.04911,0.05064
3.0,0.036171,0.037841,0.029494
4.0,0.052198,0.051196,0.056205
5.0,0.114858,0.11227,0.125209
6.0,0.150362,0.153172,0.139121
7.0,0.058097,0.059961,0.05064
11.0,0.164608,0.166667,0.156372
12.0,0.114302,0.11227,0.122426


ownership_category_code


Unnamed: 0,dataset,training_sample,testing_sample
2.0,0.647858,0.648303,0.646077
7.0,0.238954,0.236923,0.247078
8.0,0.113189,0.114775,0.106845


entity_consolidated


Unnamed: 0,dataset,training_sample,testing_sample
False,0.983194,0.983445,0.982193
True,0.016806,0.016555,0.017807


sknace_division


Unnamed: 0,dataset,training_sample,testing_sample
G,0.322204,0.32276,0.319978
C,0.20512,0.206873,0.198108
F,0.092042,0.090568,0.097941
M,0.079354,0.079855,0.077351
H,0.067112,0.068865,0.0601
N,0.050751,0.052309,0.044519
L,0.049193,0.048831,0.05064
J,0.034613,0.03325,0.040067
A,0.030161,0.029772,0.03172
I,0.013578,0.013216,0.015025


sknace_division_normalized


Unnamed: 0,dataset,training_sample,testing_sample
G,0.322204,0.32276,0.319978
other,0.234168,0.23108,0.246522
C,0.20512,0.206873,0.198108
F,0.092042,0.090568,0.097941
M,0.079354,0.079855,0.077351
H,0.067112,0.068865,0.0601


sal_class


Unnamed: 0,dataset,training_sample,testing_sample
2. Malý,0.815915,0.818587,0.805231
3. Stredný,0.184085,0.181413,0.194769
