In [1]:
import pandas as pd
import numpy as np


# Let pandas render up to 100 columns and 100 rows before abbreviating a frame.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

## Selection

In [2]:
# Load the financial-ratio table that the rest of the notebook filters and flags.
RATIOS_CSV_PATH = '../../DATA/MODEL/financial_ratios_df.csv'
financial_ratios_table = pd.read_csv(RATIOS_CSV_PATH)

In [3]:
# Column whose per-year availability is tabulated for every entity.
column = 'SAL'

# One row per entity, one column per year (years up to 2019). A cell is NaN
# when the entity has no value of `column` in that year; several rows for the
# same entity and year are averaged (pivot_table's default aggregation).
pivot = pd.pivot_table(
    financial_ratios_table.query('year <= 2019'),
    index='entity_ico',
    columns='year',
    values=column,
)

# True where a value is present (replaces the `pivot == pivot` NaN trick).
bool_wide = pivot.notna()

# Derive the short labels ('y14', 'y15', ...) from the years actually present
# in the pivot, so the rename below cannot fail or mislabel a column when the
# set of years in the data changes.
year_labels = [f'y{int(year) % 100:02d}' for year in bool_wide.columns]

# Number of years with a value, per entity (vectorised row sum).
bool_wide['count'] = bool_wide.sum(axis=1)

# This used to be a bare mid-cell expression, so its result was never rendered.
display(bool_wide['count'].value_counts())

bool_wide = bool_wide.reset_index()
bool_wide.columns = ['entity_ico'] + year_labels + ['count']
bool_wide

Unnamed: 0,entity_ico,y14,y15,y16,y17,y18,y19,count
0,671.0,True,True,True,True,False,False,4
1,698.0,True,True,True,True,True,True,6
2,710.0,True,True,True,True,True,True,6
3,2313.0,True,True,True,True,True,True,6
4,5819.0,True,True,True,True,True,True,6
...,...,...,...,...,...,...,...,...
227369,52831604.0,False,False,False,False,False,True,1
227370,52831728.0,False,False,False,False,False,True,1
227371,52834476.0,False,False,False,False,False,True,1
227372,60748940.0,True,True,False,False,False,False,2


In [4]:
# Entities flagged in all three of the y15, y16 and y17 availability columns.
has_core_years = bool_wide['y15'] & bool_wide['y16'] & bool_wide['y17']
entity_ico_list = list(bool_wide.loc[has_core_years, 'entity_ico'].unique())

# Restrict the ratios table, in place, to those entities.
financial_ratios_table.query('entity_ico in @entity_ico_list', inplace=True)

## Outliers

In [5]:

# Ratio columns that are screened for outliers further down.
# 'EBIT_INT' is kept commented out, i.e. it is excluded from the screening.
FINANCIAL_RATIOS = [
    'L3', 'L2', 'L1',
    'CF_CL', 'CASH_TA', 'SAL_TA', 'TL_SAL',
    'INV_COST', 'INV_SAL', 'CC_SAL', 'TA_SAL',
    'TL_TA', 'CF_TL', 'CL_TA', 'NCL_TA',
    'EQ_TL', 'EQ_TA',
    # 'EBIT_INT',
    'CL_CC',
    'ROE', 'EAT_TA', 'ROA_BRUTTO',
    'CF_TA', 'CF_SAL', 'ROS',
    'EAT_YIE', 'ROI', 'ROA_NETTO',
]

# Distinct values of the three grouping columns, as returned by Series.unique().
SKNACE_DIVISIONS, YEARS, TEMPLATES = (
    list(financial_ratios_table[key_column].unique())
    for key_column in ('sknace_division', 'year', 'financial_report_template_id')
)

In [6]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

def get_outliers(l, lower_q=0.1, upper_q=0.9, whisker=1.5):
    """Flag the values of a numeric Series that lie outside quantile-based fences.

    The fences are ``Q_lower - whisker * spread`` and ``Q_upper + whisker * spread``
    with ``spread = Q_upper - Q_lower``. With the defaults the reference points
    are the 10th and 90th percentiles, not the quartiles of the classic IQR rule.

    Parameters
    ----------
    l : pandas.Series
        Numeric values to screen. NaN entries are skipped when the quantiles
        are computed and are never flagged.
    lower_q, upper_q : float, default 0.1 and 0.9
        Quantiles used as the lower and upper reference points.
    whisker : float, default 1.5
        Multiple of the spread allowed beyond each reference point.

    Returns
    -------
    list of bool
        One flag per element of ``l``, in the same order; True marks an outlier.
    """
    lower_ref = l.quantile(lower_q)
    upper_ref = l.quantile(upper_q)
    spread = upper_ref - lower_ref
    fence_low = lower_ref - whisker * spread
    fence_high = upper_ref + whisker * spread

    # Vectorised comparison. Comparisons involving NaN are False, so missing
    # values are not flagged, and an empty or all-NaN input (NaN fences)
    # produces no flags at all.
    return ((l < fence_low) | (l > fence_high)).tolist()


# def get_outliers(l, z_threshold=0.5):
#     mean = np.mean(l)
#     std_dev = np.std(l)
#     z_scores = [(i - mean) / std_dev for i in l]
#     return [abs(z) > z_threshold for z in z_scores]


def _histogram_bin_count(values, max_bins):
    """Return a Freedman-Diaconis bin count for ``values``, clamped to [1, max_bins]."""
    if values.empty:
        return 1

    iqr = values.quantile(0.75) - values.quantile(0.25)
    bin_width = 2 * iqr / (len(values) ** (1 / 3))
    # A zero IQR (constant or heavily tied data) gives a zero width: fall back to 1.
    if not bin_width > 0:
        bin_width = 1

    value_range = values.max() - values.min()
    # At least one bin (constant data has a zero range) and at most max_bins
    # (a few extreme values would otherwise request an enormous number of bins).
    return int(min(max(np.ceil(value_range / bin_width), 1), max_bins))


def plot_histograms(df, columns, max_bins=1000):
    """Draw one histogram per column of ``df`` on a three-column grid of subplots.

    The bin width follows the Freedman-Diaconis rule
    (``2 * IQR / n ** (1/3)``), with a width of 1 used when the IQR is zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data to plot.
    columns : sequence of str
        Names of the columns of ``df`` to plot, one subplot each.
    max_bins : int, default 1000
        Upper limit on the number of bins of any single histogram.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    columns = list(columns)
    grid_cols = 3
    # Ceiling division, with at least one row so plt.subplots always gets a valid grid.
    grid_rows = max(1, -(-len(columns) // grid_cols))

    fig, axes = plt.subplots(nrows=grid_rows, ncols=grid_cols, figsize=(20, 30))
    axes = axes.flatten()

    for ax, column in zip(axes, columns):
        # Missing and infinite values cannot be binned and would break the
        # bin-count computation, so they are left out of the histogram.
        data = df[column].replace([np.inf, -np.inf], np.nan).dropna()

        # A column without any finite value keeps an empty, labelled axis.
        if not data.empty:
            ax.hist(
                data,
                bins=_histogram_bin_count(data, max_bins),
                color='skyblue',
                edgecolor='black',
            )

        ax.set_title(column)
        ax.set_xlabel('Value')
        ax.set_ylabel('Frequency')

    # Hide the grid cells that have no column assigned to them.
    for unused_ax in axes[len(columns):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()


In [None]:
# Render the current ratios table; pandas abbreviates it according to the
# display.max_rows / display.max_columns options.
display(financial_ratios_table)

In [41]:
# Flag every financial statement that has an outlying value in at least one
# of the FINANCIAL_RATIOS columns. Outliers are judged within peer groups:
# rows sharing the same year, sknace_division and financial_report_template_id.
#
# groupby only yields combinations that actually occur in the table, and it
# leaves out rows whose grouping key is NaN; those rows get no flag, exactly
# as with an equality filter on each key.
flag_parts = []

# sort=False keeps the groups in order of first appearance in the table.
for year, year_rows in financial_ratios_table.groupby('year', sort=False):
    print(year)  # progress indicator, one line per year

    peer_groups = year_rows.groupby(
        ['sknace_division', 'financial_report_template_id'], sort=False
    )
    for _, peers in peer_groups:
        ratios = peers.set_index('financial_statement_id')[FINANCIAL_RATIOS]

        # Per-ratio outlier flags, collapsed to "any ratio is an outlier" per statement.
        any_outlier = ratios.apply(get_outliers).any(axis=1)
        flag_parts.append(any_outlier.astype(int).rename('is_outlier'))

# Concatenate once at the end instead of growing a frame inside the loop.
if flag_parts:
    outlier_boolean = pd.concat(flag_parts).to_frame()
else:
    outlier_boolean = pd.DataFrame(
        columns=['is_outlier'],
        index=pd.Index([], name='financial_statement_id'),
    )

   

2014
2015
2016
2017
2018
2019
2020
2021


In [45]:
display(financial_ratios_table.shape)

# Attach the outlier flag to every statement.
financial_ratios_table = (
    financial_ratios_table
    # Drop a flag left over from an earlier run of this cell; otherwise the
    # merge would produce 'is_outlier_x' / 'is_outlier_y' instead of 'is_outlier'.
    .drop(columns='is_outlier', errors='ignore')
    .merge(
        outlier_boolean.reset_index(),
        on='financial_statement_id',
        how='left',
        # Fail loudly if a statement id has more than one flag row, which
        # would otherwise silently duplicate rows of the table.
        validate='many_to_one',
    )
)

display(financial_ratios_table.shape)
display(financial_ratios_table)

(934262, 82)

(934262, 83)

Unnamed: 0,financial_report_id,financial_statement_id,financial_report_order_number,entity_id,approved_date,entity_ico,year,month,financial_report_template_id,entity_name,establishment_date,sknace_code,entity_state,CA,CASH,CL,EQ,NCL,TA,TL,CC,INV,CF_NETTO,CF_SELFFIN,EAT,EBIT,SAL,COST,INT,YIE,L3,L2,L1,CF_CL,CASH_TA,SAL_TA,TL_SAL,INV_COST,INV_SAL,CC_SAL,TA_SAL,TL_TA,CF_TL,CL_TA,NCL_TA,EQ_TL,EQ_TA,EBIT_INT,CL_CC,ROE,EAT_TA,ROA_BRUTTO,CF_TA,CF_SAL,ROS,EAT_YIE,ROI,ROA_NETTO,DPHZ_vat_registration_cancelled,DPHZ_vat_registration_cancelled_1y_off,DPHZ_vat_registration_cancelled_2y_off,RO_cancelled,RO_cancelled_1y_off,RO_cancelled_2y_off,RUZ_cancelled,RUZ_cancelled_1y_off,RUZ_cancelled_2y_off,RUZ_established,RUZ_established_1y_off,RUZ_established_2y_off,RU_konkurz,RU_konkurz_1y_off,RU_konkurz_2y_off,RU_ostatne_konania,RU_ostatne_konania_1y_off,RU_ostatne_konania_2y_off,RUZ_indirect_criterion,RUZ_indirect_criterion_1y_off,RUZ_indirect_criterion_2y_off,sknace_division_name,sknace_division,sknace_subcategory,is_outlier
0,4890214,2575566,2,25527,2015-07-31,671.0,2014,1,699.0,"Kerametal, akciová spoločnosť,",1970-02-02,46180.0,,5174843.0,1479.0,1366130.0,3807863.0,1255.0,5175248.0,1367385.0,5173364.0,0.0,-547786.0,-548266.0,-550029.0,-549549.0,6350.0,560563.0,0.0,11014.0,3.787958,3.787958,0.001083,-0.400976,0.000286,0.001227,215.336220,0.000000,0.000000,814.702992,293399.886614,0.264216,-0.400608,0.263974,0.000243,2.784778,0.735784,,0.264070,-0.144446,-0.106281,-0.106188,-0.105847,-86.265512,-86.618740,-49.939078,-0.106281,-0.106281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,1.0,"Veľkoobchod, okrem motorových vozidiel a motoc...",G,46.0,1
1,5392251,2976863,2,25527,2016-12-31,671.0,2015,1,699.0,"Kerametal, akciová spoločnosť, Bratislava",1970-02-02,46180.0,,5174245.0,1868.0,1693023.0,3480567.0,1255.0,5174845.0,1694278.0,5172377.0,0.0,-326822.0,-327302.0,-327302.0,-326822.0,3633.0,336975.0,0.0,10153.0,3.056217,3.056217,0.001103,-0.193040,0.000361,0.000702,466.357831,0.000000,0.000000,1423.720617,512783.980182,0.327407,-0.192898,0.327164,0.000243,2.054307,0.672593,,0.327320,-0.094037,-0.063249,-0.063156,-0.063156,-89.959262,-90.091385,-32.236974,-0.063249,-0.063249,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1.0,"Veľkoobchod, okrem motorových vozidiel a motoc...",G,46.0,1
2,5887438,3375702,2,25527,2017-12-27,671.0,2016,1,699.0,"Kerametal, akciová spoločnosť, Bratislava",1970-02-02,46180.0,,14679.0,1034.0,1697982.0,-1683081.0,0.0,14901.0,1697982.0,13645.0,0.0,-5163168.0,-5163648.0,-5163648.0,-5163168.0,1994.0,5166660.0,0.0,3492.0,0.008645,0.008645,0.000609,-3.040767,0.069391,0.133817,851.545637,0.000000,0.000000,6.843029,2690.250752,113.950876,-3.040767,113.950876,0.000000,-0.991224,-112.950876,,124.439868,,-346.530300,-346.498087,-346.498087,-2589.352056,-2589.592778,-1478.707904,-346.530300,-346.530300,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,1.0,1.0,"Veľkoobchod, okrem motorových vozidiel a motoc...",G,46.0,1
3,6361460,3757740,1,25527,2018-12-17,671.0,2017,1,699.0,"Kerametal, akciová spoločnosť, Bratislava",1970-02-02,46180.0,,14501.0,942.0,1729112.0,-1714376.0,0.0,14736.0,1729112.0,13559.0,0.0,-30827.0,-31307.0,-31307.0,-30827.0,1993.0,33542.0,11844.0,2715.0,0.008386,0.008386,0.000545,-0.017828,0.063925,0.135247,867.592574,0.000000,0.000000,6.803312,2661.796287,117.339305,-0.017828,117.339305,0.000000,-0.991478,-116.339305,-1.602752,127.525039,,-2.124525,-2.091952,-2.091952,-15.467637,-15.708480,-11.531123,-1.320779,-2.124525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,1.0,1.0,"Veľkoobchod, okrem motorových vozidiel a motoc...",G,46.0,1
4,4593745,2340588,1,16410,2015-06-11,698.0,2014,1,699.0,"TECHNOPOL, a.s.",1970-02-18,70220.0,,1543495.0,315617.0,3796505.0,17186564.0,102542.0,21085611.0,3899047.0,1227526.0,352.0,-245840.0,-246827.0,-275229.0,-274242.0,282838.0,611268.0,51413.0,337026.0,0.406557,0.406464,0.083134,-0.064754,0.014968,0.013414,13.785443,0.207307,0.448030,4.340032,26838.048494,0.184915,-0.063051,0.180052,0.004863,4.407888,0.815085,-4.334098,3.092810,-0.016014,-0.013053,-0.013006,-0.011659,-0.869190,-0.973098,-0.816640,-0.010615,-0.013053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1.0,1.0,Vedenie firiem; poradenstvo v oblasti riadenia,M,70.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
934257,6182020,3612999,2,1609283,2018-12-18,50109936.0,2017,1,699.0,"Tehelné, a. s.",2016-01-01,70220.0,,932523.0,505452.0,4676784.0,1148608.0,5524591.0,11349983.0,10201375.0,427071.0,0.0,69235.0,66355.0,-14939.0,-12059.0,542412.0,574804.0,17000.0,562745.0,0.199394,0.199394,0.108077,0.014804,0.044533,0.047790,18.807429,0.000000,0.000000,0.787355,7533.007898,0.898801,0.006787,0.412052,0.486749,0.112593,0.101199,0.290647,10.950835,-0.013006,-0.001316,-0.001062,0.006100,0.127643,-0.027542,-0.026547,0.000182,-0.001316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1.0,1.0,Vedenie firiem; poradenstvo v oblasti riadenia,M,70.0,1
934258,6624876,3967890,3,1609283,2019-12-20,50109936.0,2018,1,699.0,"Tehelné, a. s.",2016-01-01,70220.0,,1151005.0,725844.0,21800561.0,1278106.0,5055.0,23083722.0,21805616.0,425161.0,0.0,188672.0,164643.0,83349.0,107378.0,272116.0,1716299.0,31581.0,1823677.0,0.052797,0.052797,0.033295,0.008654,0.031444,0.011788,80.133531,0.000000,0.000000,1.562426,30538.961031,0.944632,0.008652,0.944413,0.000219,0.058614,0.055368,4.400082,51.276013,0.065213,0.003611,0.004652,0.008173,0.693351,0.306300,0.045704,0.004979,0.003611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1.0,1.0,Vedenie firiem; poradenstvo v oblasti riadenia,M,70.0,1
934259,7320707,4525289,1,1609283,2020-10-28,50109936.0,2019,1,699.0,"Tehelné, a. s.",2016-01-01,70220.0,,14646771.0,444527.0,57916339.0,1817851.0,3555.0,59737745.0,57919894.0,14202244.0,0.0,759174.0,621039.0,539745.0,677880.0,3029764.0,2542874.0,187144.0,3220754.0,0.252895,0.252895,0.007675,0.013108,0.007441,0.050718,19.116966,0.000000,0.000000,4.687574,7098.106717,0.969569,0.013107,0.969510,0.000060,0.031386,0.030431,4.622237,4.077971,0.296914,0.009035,0.011348,0.012708,0.250572,0.178148,0.167583,0.012168,0.009035,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1.0,1.0,Vedenie firiem; poradenstvo v oblasti riadenia,M,70.0,1
934260,7714346,4845864,1,1609283,2021-06-30,50109936.0,2020,1,699.0,"Tehelné, a. s.",2016-01-01,70220.0,,7188649.0,610316.0,38265287.0,4415143.0,1621620.0,46145599.0,39886907.0,4057585.0,2520748.0,3568935.0,-18773066.0,2597293.0,3424589.0,42086233.0,20903023.0,532654.0,24327612.0,0.187863,0.121988,0.015950,0.093268,0.013226,0.912031,0.947742,43.413303,21.562141,0.096411,394.723273,0.864371,0.089476,0.829229,0.035141,0.110692,0.095679,7.429294,9.430557,0.588269,0.056285,0.074213,0.077341,0.084801,0.061714,0.106763,0.067828,0.056285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1.0,1.0,Vedenie firiem; poradenstvo v oblasti riadenia,M,70.0,0


In [46]:
# Write the flagged table to disk; index=False keeps the row index out of the CSV.
FINAL_RATIOS_CSV_PATH = '../../DATA/MODEL/FINAL_financial_ratios_df.csv'
financial_ratios_table.to_csv(FINAL_RATIOS_CSV_PATH, index=False)