In [33]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns


pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)


In [34]:
data_raw = pd.read_csv('../../DATA/FINAL/transformed_set_enriched.csv')

In [35]:
KEYS = [
    'entity_id',
    'financial_report_id',
    'financial_statement_id',
    'entity_ico',
    'entity_name']

DIMENSIONS = [
    'year',
    # 'month',
    # 'financial_report_order_number',
    'period_length',
    # 'consolidated',
    # 'approved_date',
    'type',
    'financial_report_template_id',
    # 'establishment_date',
    # 'cancellation_date',
    # 'sknace_code',
    # 'entity_state',
    'legal_form_code',
    'org_size_code',
    'ownership_category_code',
    'entity_consolidated',
    # 'sknace_division_name',
    'sknace_division',
    # 'sknace_subcategory',
    'sal_class',
    'is_outlier']

RAW_VALUES = [ 
    'total_assets',
    'non_current_assets',
    'non_current_intangible_assets',
    'non_current_tangible_assets',
    'non_current_financial_assets',
    'current_assets',
    'inventories',
    'non_current_receivables',
    'current_receivables',
    'financial_assets',
    'current_financial_assets',
    'cash_and_bank_balances',
    'total_equity_and_liabilities',
    'equity',
    'share_capital',
    'share_premium',
    'net_profit_of_previous_years',
    'net_profit_after_tax',
    'liabilities',
    'non_current_liabilities',
    'long_term_provisions',
    'long_term_bank_loans',
    'current_liabilities',
    'short_term_provisions',
    'current_bank_loans',
    'short_term_financial_assistance',
    'operating_revenues',
    'sales_from_the_merchandise',
    'sales_from_the_own_products_and_services',
    'sales_from_the_other',
    'other_revenues_from_operating_activities',
    'operating_costs',
    'costs_of_merchandise_sold',
    'consumed_materials_energy_non_inventory_supplies',
    'services',
    'personnel_costs',
    'taxes_and_fees',
    'depreciation',
    'remaining_cost_of_sold_long_term_assets_and_materials',
    'adjustments_to_receivables',
    'other_costs_of_operating_activities',
    'operating_result',
    'value_added',
    'revenues_from_financial_activities',
    'cost_of_financial_activities',
    'result_from_fincancial_activities',
    'result_before_tax',
    'income_tax',
    'result_after_tax']

PRIMARY_VALUES = [
    'CA',
    'CASH',
    'CL',
    'EQ',
    'NCL',
    'TA',
    'TL',
    'CC',
    'INV',
    'CF_NETTO',
    'CF_SELFFIN',
    'EAT',
    'EBIT',
    'SAL',
    'COST',
    'INT',
    'YIE',
    'LABOR']

RATIOS = [
    'L3',
    'L2',
    'L1',
    'CF_CL',
    'CASH_TA',
    'SAL_TA',
    'TL_SAL',
    'INV_COST',
    'INV_SAL',
    'CC_SAL',
    'TA_SAL',
    'TL_TA',
    'CF_TL',
    'CL_TA',
    'NCL_TA',
    'EQ_TL',
    'EQ_TA',
    'EBIT_INT',
    'CL_CC',
    'ROE',
    'EAT_TA',
    'ROA_BRUTTO',
    'CF_TA',
    'CF_SAL',
    'ROS',
    'EAT_YIE',
    'ROI',
    'ROA_NETTO',
    'LABOR_PRODUCTIVITY']

DISTRESS_SIGNALS = [
    'DPHZ_vat_registration_cancelled',
    'DPHZ_vat_registration_cancelled_1y_off',
    'DPHZ_vat_registration_cancelled_2y_off',
    'DPHZ_vat_registration_cancelled_3y_off',
    'RU_event',
    'RU_event_1y_off',
    'RU_event_2y_off',
    'RU_event_3y_off',
    'cancellation_year',
    'RUZ_cancelled',
    'RUZ_cancelled_1y_off',
    'RUZ_cancelled_2y_off',
    'RUZ_cancelled_3y_off']

In [36]:
data = data_raw.drop(columns=RAW_VALUES)
data_raw = None
display(data.shape)

(921211, 87)

In [37]:
# pomer VI a zavazkov je mensi ako 8, t.j EQ_TL < 0.08
# 2016 : 0.04, 2017 : 0.06, 2018 : 0.08, 
# EQ < 0
# L3 < 1


def process_issues(row):
    EQ_TL = row.EQ_TL
    EQ = row.EQ
    L3 = row.L3
    EAT = row.EAT
    y = row.year

    if y <= 2016: threat_criterion = 0.04
    elif y == 2017: threat_criterion = 0.06
    else: threat_criterion = 0.08

    if EQ_TL < threat_criterion and EQ < 0 and L3 < 1 and EAT < 0:
        return 1
    return 0
    

In [38]:
ruz_indirect_list = []

for _, row in data.iterrows():
    criteria_value = process_issues(row)
    ruz_indirect_list.append([row['entity_id'], row['year'], row['period_length'], criteria_value])

ruz_indirect_df = pd.DataFrame(ruz_indirect_list, columns=['entity_id', 'year', 'period_length', 'RUZ_indirect_criteria'])


In [39]:
ruz_indirect_df['period_length_check'] = ruz_indirect_df['period_length'].apply(lambda x: 1 if x == 12 else 0)
ruz_indirect_df

ruz_indirect_df = (ruz_indirect_df
                   .sort_values(['period_length_check', 'RUZ_indirect_criteria'], ascending=[False, False])
                   .groupby(['entity_id', 'year'])
                   .first()
                   .reset_index())[['entity_id', 'year', 'RUZ_indirect_criteria']]

In [40]:
def process_entity_issues_2y_off(row, issue_type):
        return max(row[issue_type+'_1y_off'], row[issue_type+'_2y_off'])

def process_entity_issues_3y_off(row, issue_type):
        return max(row[issue_type+'_1y_off'], row[issue_type+'_2y_off'], row[issue_type+'_3y_off'])


entity_issues_indicators_1y_off = ruz_indirect_df.copy()
entity_issues_indicators_2y_off = ruz_indirect_df.copy()
entity_issues_indicators_3y_off = ruz_indirect_df.copy()


entity_issues_indicators_1y_off['year'] = entity_issues_indicators_1y_off['year'] - 1
entity_issues_indicators_1y_off.rename(columns={'RUZ_indirect_criteria':'RUZ_indirect_criteria_1y_off'}, inplace=True)

entity_issues_indicators_2y_off['year'] = entity_issues_indicators_2y_off['year'] - 2
entity_issues_indicators_2y_off.rename(columns={'RUZ_indirect_criteria':'RUZ_indirect_criteria_2y_off'}, inplace=True)

entity_issues_indicators_3y_off['year'] = entity_issues_indicators_3y_off['year'] - 3
entity_issues_indicators_3y_off.rename(columns={'RUZ_indirect_criteria':'RUZ_indirect_criteria_3y_off'}, inplace=True)


entity_issues_indicators = (ruz_indirect_df
                            .merge(
                                entity_issues_indicators_1y_off,
                                on = ['entity_id', 'year'],
                                how = 'outer'
                            )                           
                            .merge(
                                entity_issues_indicators_2y_off,
                                on = ['entity_id', 'year'],
                                how = 'outer'
                            )                                                    
                            .merge(
                                entity_issues_indicators_3y_off,
                                on = ['entity_id', 'year'],
                                how = 'outer'
                            ))

for col in ['RUZ_indirect_criteria',
       'RUZ_indirect_criteria_1y_off', 'RUZ_indirect_criteria_2y_off',
       'RUZ_indirect_criteria_3y_off']:
       entity_issues_indicators[col] = entity_issues_indicators[col].fillna(0)

entity_issues_indicators['RUZ_indirect_criteria_2y_off'] = entity_issues_indicators.apply(lambda x: process_entity_issues_2y_off(x, 'RUZ_indirect_criteria'), axis = 1)
entity_issues_indicators['RUZ_indirect_criteria_3y_off'] = entity_issues_indicators.apply(lambda x: process_entity_issues_3y_off(x, 'RUZ_indirect_criteria'), axis = 1)

entity_issues_indicators


Unnamed: 0,entity_id,year,RUZ_indirect_criteria,RUZ_indirect_criteria_1y_off,RUZ_indirect_criteria_2y_off,RUZ_indirect_criteria_3y_off
0,4.0,2014.0,0.0,0.0,0.0,0.0
1,4.0,2015.0,0.0,0.0,0.0,0.0
2,4.0,2016.0,0.0,0.0,0.0,0.0
3,4.0,2017.0,0.0,0.0,0.0,0.0
4,4.0,2018.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
1359507,1598000.0,2012.0,0.0,0.0,0.0,0.0
1359508,1611042.0,2011.0,0.0,0.0,0.0,0.0
1359509,1611565.0,2011.0,0.0,0.0,0.0,0.0
1359510,1635085.0,2012.0,0.0,0.0,0.0,1.0


In [41]:
data = data.merge(entity_issues_indicators, on = ['entity_id', 'year'], how = 'left')

In [42]:
DISTRESS_SIGNALS = [
    'DPHZ_vat_registration_cancelled',
    'DPHZ_vat_registration_cancelled_1y_off',
    'DPHZ_vat_registration_cancelled_2y_off',
    'DPHZ_vat_registration_cancelled_3y_off',
    'RU_event',
    'RU_event_1y_off',
    'RU_event_2y_off',
    'RU_event_3y_off',
    'cancellation_year',
    'RUZ_cancelled',
    'RUZ_cancelled_1y_off',
    'RUZ_cancelled_2y_off',
    'RUZ_cancelled_3y_off',
    'RUZ_indirect_criteria',
    'RUZ_indirect_criteria_1y_off',
    'RUZ_indirect_criteria_2y_off',
    'RUZ_indirect_criteria_3y_off']


In [43]:
data.query('sal_class in ["2. Malý", "3. Stredný"]')['cancellation_year'].value_counts()

2017.0    469
2021.0    409
2020.0    379
2022.0    330
2016.0    305
2018.0    286
2019.0    219
2015.0     82
2014.0      1
Name: cancellation_year, dtype: int64

In [44]:
data.query('cancellation_year > 0')[['entity_id', 'entity_ico', 'entity_name']]

Unnamed: 0,entity_id,entity_ico,entity_name
16,65.0,633861.0,TRADEF spol. s r.o.
17,65.0,633861.0,TRADEF spol. s r.o. v likvidácii
18,65.0,633861.0,TRADEF spol. s r.o. v likvidácii
91,623.0,695904.0,"POPEY CONNEXION, spoločnosť s"
92,623.0,695904.0,"POPEY CONNEXION, spoločnosť s ručením obmedzen..."
...,...,...,...
921206,1611565.0,30227011.0,"SAMKO, spol. s r.o. "" v likvidácii """
921207,1611565.0,30227011.0,"SAMKO, spol. s r.o. "" v likvidácii """
921208,1611565.0,30227011.0,"SAMKO, spol. s r.o. "" v likvidácii """
921209,1635085.0,31104193.0,DA-LAS TRADING s.r.o.


In [45]:
def get_sample_sizes(data, row_label, column_label):
    samples_df = data.groupby([row_label, column_label])['financial_statement_id'].nunique().to_frame('count').reset_index()
    wide_df = pd.pivot_table(samples_df, index = row_label, columns=column_label, fill_value=0)
    return wide_df

In [46]:
def combined_criteria(row, years_offset):
    return max(row[f'RU_event_{int(years_offset)}y_off'], row[f'RUZ_cancelled_{int(years_offset)}y_off'], row[f'RUZ_indirect_criteria_{int(years_offset)}y_off'])

data['combined_issue_criteria_1y_off'] = data.apply(lambda x: combined_criteria(x, 1), axis = 1)
data['combined_issue_criteria_2y_off'] = data.apply(lambda x: combined_criteria(x, 2), axis = 1)
data['combined_issue_criteria_3y_off'] = data.apply(lambda x: combined_criteria(x, 3), axis = 1)

data

Unnamed: 0,entity_id,year,month,financial_report_id,financial_report_template_id,CA,CASH,CL,EQ,NCL,TA,TL,CC,INV,CF_NETTO,CF_SELFFIN,EAT,EBIT,SAL,COST,INT,YIE,LABOR,L3,L2,L1,CF_CL,CASH_TA,SAL_TA,TL_SAL,INV_COST,INV_SAL,CC_SAL,TA_SAL,TL_TA,CF_TL,CL_TA,NCL_TA,EQ_TL,EQ_TA,EBIT_INT,CL_CC,ROE,EAT_TA,ROA_BRUTTO,CF_TA,CF_SAL,ROS,EAT_YIE,ROI,ROA_NETTO,LABOR_PRODUCTIVITY,financial_statement_id,financial_report_order_number,entity_ico,period_length,consolidated,approved_date,type,entity_name,establishment_date,cancellation_date,sknace_code,entity_state,legal_form_code,org_size_code,ownership_category_code,entity_consolidated,sknace_division_name,sknace_division,sknace_subcategory,DPHZ_vat_registration_cancelled,DPHZ_vat_registration_cancelled_1y_off,DPHZ_vat_registration_cancelled_2y_off,DPHZ_vat_registration_cancelled_3y_off,RU_event,RU_event_1y_off,RU_event_2y_off,RU_event_3y_off,cancellation_year,RUZ_cancelled,RUZ_cancelled_1y_off,RUZ_cancelled_2y_off,RUZ_cancelled_3y_off,sal_class,establishment_year,is_outlier,RUZ_indirect_criteria,RUZ_indirect_criteria_1y_off,RUZ_indirect_criteria_2y_off,RUZ_indirect_criteria_3y_off,combined_issue_criteria_1y_off,combined_issue_criteria_2y_off,combined_issue_criteria_3y_off
0,4.0,2014.0,1.0,4644773,699.0,8787618.0,5238983.0,4658457.0,2112122.0,3232.0,11200988.0,8657266.0,2578606.0,16429.0,1171078.0,1063283.0,1007897.0,1115692.0,18580340.0,18172171.0,4446.0,19361305.0,3384185.0,1.886380,1.678150,1.124618,0.251388,0.467725,1.658813,0.465937,0.325467,0.318317,0.138781,217.022707,0.772902,0.135271,0.415897,0.000289,0.243971,0.188566,251.942870,1.806580,0.477196,0.089983,0.099607,0.104551,0.063028,0.054245,0.052057,0.090380,0.089983,0.182138,2382229.0,1.0,603783.0,12.0,,2015-07-07,Riadna,"NESS Slovensko, a.s.",1991-01-17,,62090.0,,121.0,11.0,7.0,True,"Počítačové programovanie, poradenstvo a súvisi...",J,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,3. Stredný,1991,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,2015.0,1.0,5382540,699.0,20276962.0,5687157.0,6153737.0,13229589.0,2890.0,20862929.0,7184264.0,14037315.0,6792.0,3940396.0,3245262.0,6536244.0,7231378.0,17608945.0,11551924.0,2910.0,29573853.0,3581817.0,3.295065,3.205284,0.924179,0.640326,0.272596,0.844030,0.407989,0.211663,0.138857,0.797170,426.524953,0.344355,0.548476,0.294960,0.000139,1.841468,0.634119,2486.009622,0.438384,0.494063,0.313295,0.346614,0.188871,0.223772,0.371189,0.221014,0.313434,0.313295,0.203409,2968194.0,1.0,603783.0,12.0,,2016-09-28,Riadna,"NESS Slovensko, a.s.",1991-01-17,,62090.0,,121.0,11.0,7.0,True,"Počítačové programovanie, poradenstvo a súvisi...",J,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,3. Stredný,1991,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,2016.0,1.0,5665728,699.0,5991927.0,1828391.0,4859422.0,1178778.0,3568.0,6518310.0,5120402.0,3840942.0,5894.0,247140.0,11702.0,-44282.0,191156.0,12878215.0,12564620.0,38229.0,12914606.0,3476051.0,1.233053,1.166668,0.376257,0.050858,0.280501,1.975698,0.397602,0.168874,0.164762,0.298251,182.214041,0.785541,0.048266,0.745503,0.000547,0.230212,0.180841,6.000288,1.265164,-0.037566,-0.006793,0.029326,0.037915,0.019191,-0.003439,-0.003429,-0.000929,-0.006793,0.269917,3194585.0,3.0,603783.0,12.0,,2017-06-12,Riadna,"NESS Slovensko, a.s.",1991-01-17,,62090.0,,121.0,11.0,7.0,True,"Počítačové programovanie, poradenstvo a súvisi...",J,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,3. Stredný,1991,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,2017.0,1.0,6245237,699.0,3016170.0,1224056.0,2649399.0,268147.0,3224.0,3442598.0,2902134.0,1355249.0,1954.0,-1327325.0,-1211994.0,-1270731.0,-1386062.0,7651564.0,9093625.0,9037.0,7725983.0,3304021.0,1.138436,0.973543,0.462013,-0.500991,0.355562,2.222613,0.379286,0.077355,0.091934,0.177121,161.971498,0.843007,-0.457362,0.769593,0.000937,0.092396,0.077891,-152.376342,1.954917,-4.738934,-0.369120,-0.402621,-0.385559,-0.173471,-0.166075,-0.164475,-0.366495,-0.369120,0.431810,3662172.0,2.0,603783.0,12.0,,2018-06-26,Riadna,"NESS Slovensko, a.s.",1991-01-17,,62090.0,,121.0,11.0,7.0,True,"Počítačové programovanie, poradenstvo a súvisi...",J,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,2. Malý,1991,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,2018.0,1.0,6757564,699.0,2392774.0,763867.0,1984699.0,146903.0,2689.0,3396723.0,2635535.0,1208801.0,1954.0,-58437.0,-79956.0,-121244.0,-99725.0,7119894.0,7185183.0,13197.0,7119951.0,2241363.0,1.205611,0.993938,0.384878,-0.029444,0.224884,2.096107,0.370165,0.097901,0.098799,0.169778,171.746978,0.775905,-0.022173,0.584298,0.000792,0.055739,0.043248,-6.556642,1.641874,-0.825334,-0.035694,-0.029359,-0.017204,-0.008208,-0.017029,-0.017029,-0.031809,-0.035694,0.314803,4072051.0,3.0,603783.0,12.0,,2019-06-26,Riadna,"NESS Slovensko, a.s.",1991-01-17,,62090.0,,121.0,11.0,7.0,True,"Počítačové programovanie, poradenstvo a súvisi...",J,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,2. Malý,1991,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
921206,1611565.0,2018.0,1.0,6559224,687.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3912465.0,1.0,30227011.0,12.0,,2019-03-24,Riadna,"SAMKO, spol. s r.o. "" v likvidácii """,1991-12-27,2021-11-23,41209.0,,112.0,0.0,2.0,False,Výstavba budov,F,41.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021.0,0,0,0,1,"0. <50,000",1991,False,0.0,0.0,0.0,0.0,0.0,0.0,1.0
921207,1611565.0,2019.0,1.0,7300905,687.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4506648.0,1.0,30227011.0,12.0,,2020-03-31,Riadna,"SAMKO, spol. s r.o. "" v likvidácii """,1991-12-27,2021-11-23,41209.0,,112.0,0.0,2.0,False,Výstavba budov,F,41.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021.0,0,0,1,1,"0. <50,000",1991,False,0.0,0.0,0.0,0.0,0.0,1.0,1.0
921208,1611565.0,2020.0,1.0,7547445,687.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4708083.0,1.0,30227011.0,12.0,,2021-03-28,Riadna,"SAMKO, spol. s r.o. "" v likvidácii """,1991-12-27,2021-11-23,41209.0,,112.0,0.0,2.0,False,Výstavba budov,F,41.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021.0,0,1,1,1,"0. <50,000",1991,False,0.0,0.0,0.0,0.0,1.0,1.0,1.0
921209,1635085.0,2015.0,1.0,5405717,687.0,52611.0,796.0,133071.0,-80460.0,0.0,52611.0,133071.0,51815.0,0.0,0.0,-480.0,-480.0,0.0,0.0,0.0,0.0,0.0,0.0,0.395360,0.395360,0.005982,0.000000,0.015130,0.000000,,,,,,2.529338,0.000000,2.529338,0.000000,-0.604640,-1.529338,,2.568195,0.005966,-0.009124,0.000000,0.000000,,,,-0.009124,-0.009124,,2986874.0,1.0,31104193.0,12.0,,,Riadna,DA-LAS TRADING s.r.o.,1991-12-16,2006-12-20,47510.0,,112.0,0.0,2.0,False,Maloobchod okrem motorových vozidiel a motocyklov,G,47.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2006.0,1,1,1,1,"0. <50,000",1991,False,1.0,0.0,0.0,0.0,1.0,1.0,1.0


In [47]:
temp = data.query('period_length == 12')[['entity_id', 'year', 'cancellation_year', 'RU_event_3y_off', 'RUZ_cancelled_3y_off']]

In [48]:
plausible_entity_ids = pd.DataFrame()

for current_year in 2015, 2016, 2017, 2018:
    previous_year = current_year - 1
    two_consequent_years = set(temp.query('year == @current_year')['entity_id']).intersection(temp.query('year == @previous_year')['entity_id'])

    one_off = current_year + 1
    two_off = current_year + 2
    three_off = current_year + 3

    all_three = (set(temp.query('year == @one_off')['entity_id'])
                 .intersection(temp.query('year == @two_off')['entity_id'])
                 .intersection(temp.query('year == @three_off')['entity_id']))
    
    two_and_cancelled = (set(temp.query('year == @one_off')['entity_id'])
                         .intersection(temp.query('year == @two_off')['entity_id'])
                         .intersection(temp.query('cancellation_year == @three_off')['entity_id']))
    
    one_and_cancelled = (set(temp.query('year == @one_off')['entity_id'])
                         .intersection(temp.query('cancellation_year == @two_off')['entity_id']))
    
    next_cancelled = set(temp.query('cancellation_year == @one_off')['entity_id'])

    ru_event_pass = set(temp.query('year == @current_year and RU_event_3y_off == 1 or RUZ_cancelled_3y_off == 1')['entity_id'])

    future_set = (all_three
                  .union(two_and_cancelled)
                  .union(one_and_cancelled)
                  .union(next_cancelled)
                  .union(ru_event_pass))
    
    total = two_consequent_years.intersection(future_set)

    yearly_plausible_entity_ids = pd.DataFrame(total, columns = ['entity_id'])
    yearly_plausible_entity_ids['year'] = current_year
    
    plausible_entity_ids = pd.concat([plausible_entity_ids, yearly_plausible_entity_ids], axis = 0)

plausible_entity_ids['plausible_sample'] = 1

subset_data = data.merge(plausible_entity_ids, on = ['entity_id', 'year'], how = 'left')
subset_data['plausible_sample'] = subset_data['plausible_sample'].fillna(0)

subset_data

Unnamed: 0,entity_id,year,month,financial_report_id,financial_report_template_id,CA,CASH,CL,EQ,NCL,TA,TL,CC,INV,CF_NETTO,CF_SELFFIN,EAT,EBIT,SAL,COST,INT,YIE,LABOR,L3,L2,L1,CF_CL,CASH_TA,SAL_TA,TL_SAL,INV_COST,INV_SAL,CC_SAL,TA_SAL,TL_TA,CF_TL,CL_TA,NCL_TA,EQ_TL,EQ_TA,EBIT_INT,CL_CC,ROE,EAT_TA,ROA_BRUTTO,CF_TA,CF_SAL,ROS,EAT_YIE,ROI,ROA_NETTO,LABOR_PRODUCTIVITY,financial_statement_id,financial_report_order_number,entity_ico,period_length,consolidated,approved_date,type,entity_name,establishment_date,cancellation_date,sknace_code,entity_state,legal_form_code,org_size_code,ownership_category_code,entity_consolidated,sknace_division_name,sknace_division,sknace_subcategory,DPHZ_vat_registration_cancelled,DPHZ_vat_registration_cancelled_1y_off,DPHZ_vat_registration_cancelled_2y_off,DPHZ_vat_registration_cancelled_3y_off,RU_event,RU_event_1y_off,RU_event_2y_off,RU_event_3y_off,cancellation_year,RUZ_cancelled,RUZ_cancelled_1y_off,RUZ_cancelled_2y_off,RUZ_cancelled_3y_off,sal_class,establishment_year,is_outlier,RUZ_indirect_criteria,RUZ_indirect_criteria_1y_off,RUZ_indirect_criteria_2y_off,RUZ_indirect_criteria_3y_off,combined_issue_criteria_1y_off,combined_issue_criteria_2y_off,combined_issue_criteria_3y_off,plausible_sample
0,4.0,2014.0,1.0,4644773,699.0,8787618.0,5238983.0,4658457.0,2112122.0,3232.0,11200988.0,8657266.0,2578606.0,16429.0,1171078.0,1063283.0,1007897.0,1115692.0,18580340.0,18172171.0,4446.0,19361305.0,3384185.0,1.886380,1.678150,1.124618,0.251388,0.467725,1.658813,0.465937,0.325467,0.318317,0.138781,217.022707,0.772902,0.135271,0.415897,0.000289,0.243971,0.188566,251.942870,1.806580,0.477196,0.089983,0.099607,0.104551,0.063028,0.054245,0.052057,0.090380,0.089983,0.182138,2382229.0,1.0,603783.0,12.0,,2015-07-07,Riadna,"NESS Slovensko, a.s.",1991-01-17,,62090.0,,121.0,11.0,7.0,True,"Počítačové programovanie, poradenstvo a súvisi...",J,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,3. Stredný,1991,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,2015.0,1.0,5382540,699.0,20276962.0,5687157.0,6153737.0,13229589.0,2890.0,20862929.0,7184264.0,14037315.0,6792.0,3940396.0,3245262.0,6536244.0,7231378.0,17608945.0,11551924.0,2910.0,29573853.0,3581817.0,3.295065,3.205284,0.924179,0.640326,0.272596,0.844030,0.407989,0.211663,0.138857,0.797170,426.524953,0.344355,0.548476,0.294960,0.000139,1.841468,0.634119,2486.009622,0.438384,0.494063,0.313295,0.346614,0.188871,0.223772,0.371189,0.221014,0.313434,0.313295,0.203409,2968194.0,1.0,603783.0,12.0,,2016-09-28,Riadna,"NESS Slovensko, a.s.",1991-01-17,,62090.0,,121.0,11.0,7.0,True,"Počítačové programovanie, poradenstvo a súvisi...",J,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,3. Stredný,1991,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,4.0,2016.0,1.0,5665728,699.0,5991927.0,1828391.0,4859422.0,1178778.0,3568.0,6518310.0,5120402.0,3840942.0,5894.0,247140.0,11702.0,-44282.0,191156.0,12878215.0,12564620.0,38229.0,12914606.0,3476051.0,1.233053,1.166668,0.376257,0.050858,0.280501,1.975698,0.397602,0.168874,0.164762,0.298251,182.214041,0.785541,0.048266,0.745503,0.000547,0.230212,0.180841,6.000288,1.265164,-0.037566,-0.006793,0.029326,0.037915,0.019191,-0.003439,-0.003429,-0.000929,-0.006793,0.269917,3194585.0,3.0,603783.0,12.0,,2017-06-12,Riadna,"NESS Slovensko, a.s.",1991-01-17,,62090.0,,121.0,11.0,7.0,True,"Počítačové programovanie, poradenstvo a súvisi...",J,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,3. Stredný,1991,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4.0,2017.0,1.0,6245237,699.0,3016170.0,1224056.0,2649399.0,268147.0,3224.0,3442598.0,2902134.0,1355249.0,1954.0,-1327325.0,-1211994.0,-1270731.0,-1386062.0,7651564.0,9093625.0,9037.0,7725983.0,3304021.0,1.138436,0.973543,0.462013,-0.500991,0.355562,2.222613,0.379286,0.077355,0.091934,0.177121,161.971498,0.843007,-0.457362,0.769593,0.000937,0.092396,0.077891,-152.376342,1.954917,-4.738934,-0.369120,-0.402621,-0.385559,-0.173471,-0.166075,-0.164475,-0.366495,-0.369120,0.431810,3662172.0,2.0,603783.0,12.0,,2018-06-26,Riadna,"NESS Slovensko, a.s.",1991-01-17,,62090.0,,121.0,11.0,7.0,True,"Počítačové programovanie, poradenstvo a súvisi...",J,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,2. Malý,1991,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4.0,2018.0,1.0,6757564,699.0,2392774.0,763867.0,1984699.0,146903.0,2689.0,3396723.0,2635535.0,1208801.0,1954.0,-58437.0,-79956.0,-121244.0,-99725.0,7119894.0,7185183.0,13197.0,7119951.0,2241363.0,1.205611,0.993938,0.384878,-0.029444,0.224884,2.096107,0.370165,0.097901,0.098799,0.169778,171.746978,0.775905,-0.022173,0.584298,0.000792,0.055739,0.043248,-6.556642,1.641874,-0.825334,-0.035694,-0.029359,-0.017204,-0.008208,-0.017029,-0.017029,-0.031809,-0.035694,0.314803,4072051.0,3.0,603783.0,12.0,,2019-06-26,Riadna,"NESS Slovensko, a.s.",1991-01-17,,62090.0,,121.0,11.0,7.0,True,"Počítačové programovanie, poradenstvo a súvisi...",J,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0,0,0,2. Malý,1991,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
921206,1611565.0,2018.0,1.0,6559224,687.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3912465.0,1.0,30227011.0,12.0,,2019-03-24,Riadna,"SAMKO, spol. s r.o. "" v likvidácii """,1991-12-27,2021-11-23,41209.0,,112.0,0.0,2.0,False,Výstavba budov,F,41.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021.0,0,0,0,1,"0. <50,000",1991,False,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
921207,1611565.0,2019.0,1.0,7300905,687.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4506648.0,1.0,30227011.0,12.0,,2020-03-31,Riadna,"SAMKO, spol. s r.o. "" v likvidácii """,1991-12-27,2021-11-23,41209.0,,112.0,0.0,2.0,False,Výstavba budov,F,41.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021.0,0,0,1,1,"0. <50,000",1991,False,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
921208,1611565.0,2020.0,1.0,7547445,687.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4708083.0,1.0,30227011.0,12.0,,2021-03-28,Riadna,"SAMKO, spol. s r.o. "" v likvidácii """,1991-12-27,2021-11-23,41209.0,,112.0,0.0,2.0,False,Výstavba budov,F,41.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021.0,0,1,1,1,"0. <50,000",1991,False,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
921209,1635085.0,2015.0,1.0,5405717,687.0,52611.0,796.0,133071.0,-80460.0,0.0,52611.0,133071.0,51815.0,0.0,0.0,-480.0,-480.0,0.0,0.0,0.0,0.0,0.0,0.0,0.395360,0.395360,0.005982,0.000000,0.015130,0.000000,,,,,,2.529338,0.000000,2.529338,0.000000,-0.604640,-1.529338,,2.568195,0.005966,-0.009124,0.000000,0.000000,,,,-0.009124,-0.009124,,2986874.0,1.0,31104193.0,12.0,,,Riadna,DA-LAS TRADING s.r.o.,1991-12-16,2006-12-20,47510.0,,112.0,0.0,2.0,False,Maloobchod okrem motorových vozidiel a motocyklov,G,47.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2006.0,1,1,1,1,"0. <50,000",1991,False,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0


In [49]:
subset_data = subset_data.query('plausible_sample == 1 and sal_class in ["2. Malý", "3. Stredný"] and period_length == 12 and RUZ_indirect_criteria == 0 and RUZ_cancelled == 0 and RU_event == 0')


In [50]:
master_sample = subset_data.query('RU_event_3y_off == 1').sample(frac=1).groupby('entity_id').first().reset_index()
master_sample_set = set(master_sample['entity_id'])

ruz_cancelled_entity = subset_data.query('RUZ_cancelled_3y_off == 1 and entity_id not in @master_sample_set').sample(frac=1).groupby('entity_id').first().reset_index()

master_sample = pd.concat([master_sample, ruz_cancelled_entity], axis = 0)
master_sample_set = set(master_sample['entity_id'])

ruz_indirect_criteria = subset_data.query('RUZ_indirect_criteria_3y_off == 1 and entity_id not in @master_sample_set').sample(frac=1).groupby('entity_id').first().reset_index()

master_sample = pd.concat([master_sample, ruz_indirect_criteria], axis = 0)
master_sample_set = set(master_sample['entity_id'])

rest = subset_data.query('entity_id not in @master_sample_set').sample(frac=1).groupby('entity_id').first().reset_index()

master_sample = pd.concat([master_sample, rest], axis = 0)

display(master_sample['entity_id'].value_counts())

27971.0      1
476371.0     1
475895.0     1
475925.0     1
476235.0     1
            ..
171224.0     1
171208.0     1
171180.0     1
171176.0     1
1611042.0    1
Name: entity_id, Length: 9051, dtype: int64

In [51]:
for dim in ['RU_event_3y_off', 'RUZ_cancelled_3y_off', 'RUZ_indirect_criteria_3y_off', 'combined_issue_criteria_3y_off']:
    print(dim)
    display(master_sample[dim].value_counts(normalize=True))

RU_event_3y_off


0.0    0.974588
1.0    0.025412
Name: RU_event_3y_off, dtype: float64

RUZ_cancelled_3y_off


0    0.940559
1    0.059441
Name: RUZ_cancelled_3y_off, dtype: float64

RUZ_indirect_criteria_3y_off


0.0    0.926306
1.0    0.073694
Name: RUZ_indirect_criteria_3y_off, dtype: float64

combined_issue_criteria_3y_off


0.0    0.857364
1.0    0.142636
Name: combined_issue_criteria_3y_off, dtype: float64

In [52]:
testing_sample = master_sample.sample(frac=0.2)
testing_sample_ids = set(testing_sample['entity_id'])

In [53]:
training_sample = master_sample.query('entity_id not in @testing_sample_ids')

In [54]:
for dim in ['RU_event_3y_off', 'RUZ_cancelled_3y_off', 'RUZ_indirect_criteria_3y_off', 'combined_issue_criteria_3y_off']:
    print(dim)
    display(
        pd.concat([
            master_sample[dim].value_counts(normalize=True).to_frame('dataset'),
            training_sample[dim].value_counts(normalize=True).to_frame('training_sample'),
            testing_sample[dim].value_counts(normalize=True).to_frame('testing_sample')
        ], axis = 1)
    )

RU_event_3y_off


Unnamed: 0,dataset,training_sample,testing_sample
0.0,0.974588,0.974451,0.975138
1.0,0.025412,0.025549,0.024862


RUZ_cancelled_3y_off


Unnamed: 0,dataset,training_sample,testing_sample
0,0.940559,0.941583,0.936464
1,0.059441,0.058417,0.063536


RUZ_indirect_criteria_3y_off


Unnamed: 0,dataset,training_sample,testing_sample
0.0,0.926306,0.928049,0.919337
1.0,0.073694,0.071951,0.080663


combined_issue_criteria_3y_off


Unnamed: 0,dataset,training_sample,testing_sample
0.0,0.857364,0.86024,0.845856
1.0,0.142636,0.13976,0.154144


In [55]:
master_sample.to_csv('../../DATA/FINAL/super_sample.csv', index=False)
training_sample.to_csv('../../DATA/FINAL/training_sample.csv', index=False)
testing_sample.to_csv('../../DATA/FINAL/testing_sample.csv', index=False)

In [56]:
for dim in DIMENSIONS:
    print(dim)
    display(
        pd.concat([
            master_sample[dim].value_counts(normalize=True).to_frame('dataset'),
            training_sample[dim].value_counts(normalize=True).to_frame('training_sample'),
            testing_sample[dim].value_counts(normalize=True).to_frame('testing_sample')
        ], axis = 1)
    )

year


Unnamed: 0,dataset,training_sample,testing_sample
2015.0,0.272567,0.272614,0.272376
2016.0,0.228483,0.226212,0.237569
2017.0,0.225831,0.222069,0.240884
2018.0,0.273119,0.279105,0.249171


period_length


Unnamed: 0,dataset,training_sample,testing_sample
12.0,1.0,1.0,1.0


type


Unnamed: 0,dataset,training_sample,testing_sample
Riadna,0.997238,0.997514,0.996133
Mimoriadna,0.002762,0.002486,0.003867


financial_report_template_id


Unnamed: 0,dataset,training_sample,testing_sample
699.0,0.956469,0.957188,0.953591
687.0,0.043531,0.042812,0.046409


legal_form_code


Unnamed: 0,dataset,training_sample,testing_sample
112.0,0.862557,0.863555,0.858564
121.0,0.137443,0.136445,0.141436


org_size_code


Unnamed: 0,dataset,training_sample,testing_sample
0.0,0.170699,0.169866,0.174033
1.0,0.020329,0.019334,0.024309
2.0,0.047288,0.047783,0.045304
3.0,0.034361,0.033697,0.037017
4.0,0.049608,0.050131,0.047514
5.0,0.109159,0.106063,0.121547
6.0,0.14341,0.145836,0.133702
7.0,0.054801,0.056622,0.047514
11.0,0.159319,0.163099,0.144199
12.0,0.107612,0.104958,0.118232


ownership_category_code


Unnamed: 0,dataset,training_sample,testing_sample
2.0,0.652525,0.65212,0.654144
7.0,0.240084,0.240713,0.237569
8.0,0.107391,0.107168,0.108287


entity_consolidated


Unnamed: 0,dataset,training_sample,testing_sample
False,0.984864,0.985361,0.982873
True,0.015136,0.014639,0.017127


sknace_division


Unnamed: 0,dataset,training_sample,testing_sample
G,0.338305,0.339594,0.333149
C,0.209701,0.20812,0.216022
F,0.097006,0.097224,0.096133
M,0.082422,0.080376,0.090608
H,0.070489,0.07278,0.061326
N,0.055243,0.05607,0.051934
L,0.051376,0.051098,0.052486
J,0.036902,0.038116,0.032044
A,0.031046,0.030244,0.034254
I,0.014474,0.01381,0.017127


sal_class


Unnamed: 0,dataset,training_sample,testing_sample
2. Malý,0.819026,0.8195,0.817127
3. Stredný,0.180974,0.1805,0.182873


is_outlier


Unnamed: 0,dataset,training_sample,testing_sample
False,0.857364,0.856235,0.861878
True,0.142636,0.143765,0.138122
