In [1]:
import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 500)

In [2]:
public_df = pd.read_csv('../data/inn_info_public.csv', dtype={'okved2': 'str', 'region': 'str'})
private_df = pd.read_csv('../data/inn_info_private.csv', dtype={'okved2': 'str'})
private_df['okved2'] = '1000'
pays_df = pd.read_csv('../data/pays.csv')

In [3]:
public_df.head(2)

Unnamed: 0,hash_inn,okved2,region,is_public
0,61058,34,86,True
1,8311,18,86,True


In [4]:
pays_df.head(2)

Unnamed: 0,hash_inn_kt,hash_inn_dt,week,count,sum
0,0,0,1,4,38399.6
1,0,0,5,2,399.6


In [5]:
unknown_targets = public_df[public_df['is_public']==False][['hash_inn', 'region']]

In [6]:
unknown_targets = pd.merge(unknown_targets, 
                           private_df, 
                           on = 'hash_inn', 
                           how = 'left')

In [7]:
categotical_df = pd.concat([public_df[public_df['is_public']==True][['hash_inn', 'okved2', 'region']], 
                           unknown_targets[['hash_inn', 'okved2', 'region']]
                           ], axis = 0)

In [8]:
def create_pivot(pays_df, categotical_df, 
                 direction: str,
                 groupby_col: str,
                 agg_col: str,
                 cnt_not_null_cols = 1):
    """
    :direction - направление платежа: kt - входящий, dt - исходящий
    :groupby_col - категорий, по которой группируем (okved2 или region)
    :agg_col - колонка для группировки 
    :cnt_not_null_cols = 0 - количество колонок с ненулевым результатом после пивота
    """
            
    if direction == 'to':
        inn = 'hash_inn_dt'
        postfix = '_kt'
    elif direction == 'from':
        inn = 'hash_inn_kt'
        postfix = '_dt'
        
    pays_df = pd.merge(pays_df, categotical_df,
                       left_on = inn,
                       right_on = 'hash_inn',
                       how = 'left')

    pays_df = pd.pivot_table(pays_df, 
                             values=agg_col, 
                             index='hash_inn'+postfix,
                             columns=[groupby_col],
                             aggfunc='sum')
    
    cols_list = [col for col in pays_df.columns if col not in ['total', 'cnt_not_null']]
    
    pays_df['cnt_not_null'] = pays_df[cols_list].count(axis=1)
    pays_df = pays_df[pays_df['cnt_not_null']>cnt_not_null_cols]
    print('len of data: {}'.format(pays_df.shape[0]))
    pays_df['total'] = pays_df[cols_list].sum(axis=1)

    for col in pays_df[cols_list]:
        pays_df['share_' + col + postfix] = pays_df[col] / pays_df['total']
        pays_df.rename(columns={col: col+'_'+agg_col+postfix}, inplace=True)
        
    for col in pays_df.columns:
        if '_kt' not in col: 
            pays_df.rename(columns={col: col+postfix}, inplace=True)

    pays_df = pays_df \
                .reset_index(drop=False) \
                .rename(columns={'hash_inn'+postfix: 'hash_inn'})
    return pays_df

In [9]:
%%time
df_to = create_pivot(pays_df, categotical_df, 
                 direction='to',
                 agg_col='count',
                 groupby_col = 'okved2',
                 cnt_not_null_cols=0)

len of data: 152160
CPU times: user 14.1 s, sys: 11 s, total: 25.1 s
Wall time: 25.2 s


In [10]:
df_to.head(5)

okved2,hash_inn,0_count_kt,1_count_kt,10_count_kt,1000_count_kt,11_count_kt,12_count_kt,13_count_kt,14_count_kt,15_count_kt,16_count_kt,17_count_kt,18_count_kt,19_count_kt,2_count_kt,20_count_kt,21_count_kt,22_count_kt,23_count_kt,24_count_kt,25_count_kt,26_count_kt,27_count_kt,28_count_kt,29_count_kt,3_count_kt,30_count_kt,31_count_kt,32_count_kt,33_count_kt,34_count_kt,35_count_kt,36_count_kt,37_count_kt,38_count_kt,39_count_kt,4_count_kt,40_count_kt,41_count_kt,42_count_kt,43_count_kt,44_count_kt,45_count_kt,46_count_kt,47_count_kt,48_count_kt,49_count_kt,5_count_kt,50_count_kt,51_count_kt,52_count_kt,53_count_kt,54_count_kt,55_count_kt,56_count_kt,57_count_kt,58_count_kt,59_count_kt,6_count_kt,60_count_kt,61_count_kt,62_count_kt,63_count_kt,64_count_kt,65_count_kt,66_count_kt,67_count_kt,68_count_kt,69_count_kt,7_count_kt,70_count_kt,71_count_kt,72_count_kt,73_count_kt,74_count_kt,75_count_kt,76_count_kt,77_count_kt,78_count_kt,79_count_kt,8_count_kt,9_count_kt,cnt_not_null_kt,total_kt,share_0_kt,share_1_kt,share_10_kt,share_1000_kt,share_11_kt,share_12_kt,share_13_kt,share_14_kt,share_15_kt,share_16_kt,share_17_kt,share_18_kt,share_19_kt,share_2_kt,share_20_kt,share_21_kt,share_22_kt,share_23_kt,share_24_kt,share_25_kt,share_26_kt,share_27_kt,share_28_kt,share_29_kt,share_3_kt,share_30_kt,share_31_kt,share_32_kt,share_33_kt,share_34_kt,share_35_kt,share_36_kt,share_37_kt,share_38_kt,share_39_kt,share_4_kt,share_40_kt,share_41_kt,share_42_kt,share_43_kt,share_44_kt,share_45_kt,share_46_kt,share_47_kt,share_48_kt,share_49_kt,share_5_kt,share_50_kt,share_51_kt,share_52_kt,share_53_kt,share_54_kt,share_55_kt,share_56_kt,share_57_kt,share_58_kt,share_59_kt,share_6_kt,share_60_kt,share_61_kt,share_62_kt,share_63_kt,share_64_kt,share_65_kt,share_66_kt,share_67_kt,share_68_kt,share_69_kt,share_7_kt,share_70_kt,share_71_kt,share_72_kt,share_73_kt,share_74_kt,share_75_kt,share_76_kt,share_77_kt,share_78_kt,share_79_kt,share_8_kt,share_9_kt
0,0,,,,61.0,,7.0,,8.0,,1.0,,,,,,,1.0,,,,,,,,,,,,,15.0,,18.0,,,,,,,,,,,,,,,,,,61.0,2.0,,10.0,,,,2.0,,,1.0,5.0,,,,,,,,,,,,,,,,,,,2.0,,14,194.0,,,,0.314433,,0.036082,,0.041237,,0.005155,,,,,,,0.005155,,,,,,,,,,,,,0.07732,,0.092784,,,,,,,,,,,,,,,,,,0.314433,0.010309,,0.051546,,,,0.010309,,,0.005155,0.025773,,,,,,,,,,,,,,,,,,,0.010309,
1,1,,,,,,14.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,14.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2,,,,3.0,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,5.0,,,,0.6,,0.4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,4,,,,6.0,,2.0,,4.0,,,,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5,18.0,,,,0.333333,,0.111111,,0.222222,,,,0.277778,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.055556,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [11]:
%%time
df_from = create_pivot(pays_df, categotical_df, 
                 direction='from',
                 agg_col='count',
                 groupby_col = 'okved2',
                 cnt_not_null_cols=0)

len of data: 189772
CPU times: user 32.1 s, sys: 25.8 s, total: 57.9 s
Wall time: 58.2 s


In [12]:
df_from.head(5)

okved2,hash_inn,0_count_dt_dt,1_count_dt_dt,10_count_dt_dt,1000_count_dt_dt,11_count_dt_dt,12_count_dt_dt,13_count_dt_dt,14_count_dt_dt,15_count_dt_dt,16_count_dt_dt,17_count_dt_dt,18_count_dt_dt,19_count_dt_dt,2_count_dt_dt,20_count_dt_dt,21_count_dt_dt,22_count_dt_dt,23_count_dt_dt,24_count_dt_dt,25_count_dt_dt,26_count_dt_dt,27_count_dt_dt,28_count_dt_dt,29_count_dt_dt,3_count_dt_dt,30_count_dt_dt,31_count_dt_dt,32_count_dt_dt,33_count_dt_dt,34_count_dt_dt,35_count_dt_dt,36_count_dt_dt,37_count_dt_dt,38_count_dt_dt,39_count_dt_dt,4_count_dt_dt,40_count_dt_dt,41_count_dt_dt,42_count_dt_dt,43_count_dt_dt,44_count_dt_dt,45_count_dt_dt,46_count_dt_dt,47_count_dt_dt,48_count_dt_dt,49_count_dt_dt,5_count_dt_dt,50_count_dt_dt,51_count_dt_dt,52_count_dt_dt,53_count_dt_dt,54_count_dt_dt,55_count_dt_dt,56_count_dt_dt,57_count_dt_dt,58_count_dt_dt,59_count_dt_dt,6_count_dt_dt,60_count_dt_dt,61_count_dt_dt,62_count_dt_dt,63_count_dt_dt,64_count_dt_dt,65_count_dt_dt,66_count_dt_dt,67_count_dt_dt,68_count_dt_dt,69_count_dt_dt,7_count_dt_dt,70_count_dt_dt,71_count_dt_dt,72_count_dt_dt,73_count_dt_dt,74_count_dt_dt,75_count_dt_dt,76_count_dt_dt,77_count_dt_dt,78_count_dt_dt,79_count_dt_dt,8_count_dt_dt,9_count_dt_dt,cnt_not_null_dt,total_dt,share_0_dt_dt,share_1_dt_dt,share_10_dt_dt,share_1000_dt_dt,share_11_dt_dt,share_12_dt_dt,share_13_dt_dt,share_14_dt_dt,share_15_dt_dt,share_16_dt_dt,share_17_dt_dt,share_18_dt_dt,share_19_dt_dt,share_2_dt_dt,share_20_dt_dt,share_21_dt_dt,share_22_dt_dt,share_23_dt_dt,share_24_dt_dt,share_25_dt_dt,share_26_dt_dt,share_27_dt_dt,share_28_dt_dt,share_29_dt_dt,share_3_dt_dt,share_30_dt_dt,share_31_dt_dt,share_32_dt_dt,share_33_dt_dt,share_34_dt_dt,share_35_dt_dt,share_36_dt_dt,share_37_dt_dt,share_38_dt_dt,share_39_dt_dt,share_4_dt_dt,share_40_dt_dt,share_41_dt_dt,share_42_dt_dt,share_43_dt_dt,share_44_dt_dt,share_45_dt_dt,share_46_dt_dt,share_47_dt_dt,share_48_dt_dt,share_49_dt_dt,share_5_dt_dt,share_50_dt_dt,share_51_dt_dt,share_52_dt_dt,share_53_dt_dt,share_54_dt_dt,share_55_dt_dt,share_56_dt_dt,share_57_dt_dt,share_58_dt_dt,share_59_dt_dt,share_6_dt_dt,share_60_dt_dt,share_61_dt_dt,share_62_dt_dt,share_63_dt_dt,share_64_dt_dt,share_65_dt_dt,share_66_dt_dt,share_67_dt_dt,share_68_dt_dt,share_69_dt_dt,share_7_dt_dt,share_70_dt_dt,share_71_dt_dt,share_72_dt_dt,share_73_dt_dt,share_74_dt_dt,share_75_dt_dt,share_76_dt_dt,share_77_dt_dt,share_78_dt_dt,share_79_dt_dt,share_8_dt_dt,share_9_dt_dt
0,0,,,,281.0,7.0,112.0,,15.0,,11.0,,4.0,5.0,,34.0,,,,,,4.0,,,2.0,2.0,,,1.0,,2.0,,,,,,91.0,,,2.0,2.0,19.0,,,8.0,,,,,,,30.0,10.0,,18.0,,,,,,9.0,26.0,,,,,,,,2.0,2.0,10.0,6.0,,5.0,6.0,56.0,,4.0,8.0,251.0,32.0,34,1077.0,,,,0.26091,0.0065,0.103993,,0.013928,,0.010214,,0.003714,0.004643,,0.031569,,,,,,0.003714,,,0.001857,0.001857,,,0.000929,,0.001857,,,,,,0.084494,,,0.001857,0.001857,0.017642,,,0.007428,,,,,,,0.027855,0.009285,,0.016713,,,,,,0.008357,0.024141,,,,,,,,0.001857,0.001857,0.009285,0.005571,,0.004643,0.005571,0.051996,,0.003714,0.007428,0.233055,0.029712
1,2,,,,,,29.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,29.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,5,,,,24.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.0,,,,,,,,,,,,,,,,,,,,10.0,,,,,,,,3,38.0,,,,0.631579,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.105263,,,,,,,,,,,,,,,,,,,,0.263158,,,,,,,
3,7,,,,1.0,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,4,17.0,,,,0.058824,,0.235294,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.588235,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.117647,,
4,10,,,,31.0,,16.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,24.0,,,,,,,,4,72.0,,,,0.430556,,0.222222,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.013889,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.333333,,,,,,,


In [13]:
%%time
result_df = pd.merge(categotical_df, df_to,
                       on='hash_inn',
                       how='left')

result_df = pd.merge(result_df, df_from,
                       on='hash_inn',
                       how='left')

CPU times: user 1.65 s, sys: 1.08 s, total: 2.73 s
Wall time: 2.73 s


In [33]:
result_df.iloc[3].dropna()

hash_inn           218005
okved2                 12
region                 86
1000_count_kt          20
cnt_not_null_kt         1
total_kt               20
share_1000_kt           1
Name: 3, dtype: object

In [35]:
pays_df.loc[pays_df.hash_inn_dt==218005]

Unnamed: 0,hash_inn_kt,hash_inn_dt,week,count,sum


In [15]:
result_df.shape

(240069, 331)

In [16]:
result_df[(result_df['cnt_not_null_kt'].isna()) & (result_df['cnt_not_null_dt'].isna())].shape

(0, 331)

In [14]:
result_df.to_csv('../data/result_df.csv', index=False, header=True)

In [17]:
result_df

Unnamed: 0,hash_inn,okved2,region,0_count_kt,1_count_kt,10_count_kt,1000_count_kt,11_count_kt,12_count_kt,13_count_kt,14_count_kt,15_count_kt,16_count_kt,17_count_kt,18_count_kt,19_count_kt,2_count_kt,20_count_kt,21_count_kt,22_count_kt,23_count_kt,24_count_kt,25_count_kt,26_count_kt,27_count_kt,28_count_kt,29_count_kt,3_count_kt,30_count_kt,31_count_kt,32_count_kt,33_count_kt,34_count_kt,35_count_kt,36_count_kt,37_count_kt,38_count_kt,39_count_kt,4_count_kt,40_count_kt,41_count_kt,42_count_kt,43_count_kt,44_count_kt,45_count_kt,46_count_kt,47_count_kt,48_count_kt,49_count_kt,5_count_kt,50_count_kt,51_count_kt,52_count_kt,53_count_kt,54_count_kt,55_count_kt,56_count_kt,57_count_kt,58_count_kt,59_count_kt,6_count_kt,60_count_kt,61_count_kt,62_count_kt,63_count_kt,64_count_kt,65_count_kt,66_count_kt,67_count_kt,68_count_kt,69_count_kt,7_count_kt,70_count_kt,71_count_kt,72_count_kt,73_count_kt,74_count_kt,75_count_kt,76_count_kt,77_count_kt,78_count_kt,79_count_kt,8_count_kt,9_count_kt,cnt_not_null_kt,total_kt,share_0_kt,share_1_kt,share_10_kt,share_1000_kt,share_11_kt,share_12_kt,share_13_kt,share_14_kt,share_15_kt,share_16_kt,share_17_kt,share_18_kt,share_19_kt,share_2_kt,share_20_kt,share_21_kt,share_22_kt,share_23_kt,share_24_kt,share_25_kt,share_26_kt,share_27_kt,share_28_kt,share_29_kt,share_3_kt,share_30_kt,share_31_kt,share_32_kt,share_33_kt,share_34_kt,share_35_kt,share_36_kt,share_37_kt,share_38_kt,share_39_kt,share_4_kt,share_40_kt,share_41_kt,share_42_kt,share_43_kt,share_44_kt,share_45_kt,share_46_kt,share_47_kt,share_48_kt,share_49_kt,share_5_kt,share_50_kt,share_51_kt,share_52_kt,share_53_kt,share_54_kt,share_55_kt,share_56_kt,share_57_kt,share_58_kt,share_59_kt,share_6_kt,share_60_kt,share_61_kt,share_62_kt,share_63_kt,share_64_kt,share_65_kt,share_66_kt,share_67_kt,share_68_kt,share_69_kt,share_7_kt,share_70_kt,share_71_kt,share_72_kt,share_73_kt,share_74_kt,share_75_kt,share_76_kt,share_77_kt,share_78_kt,share_79_kt,share_8_kt,share_9_kt,0_count_dt_dt,1_count_dt_dt,10_count_dt_dt,1000_count_dt_dt,11_count_dt_dt,12_count_dt_dt,13_count_dt_dt,14_count_dt_dt,15_count_dt_dt,16_count_dt_dt,17_count_dt_dt,18_count_dt_dt,19_count_dt_dt,2_count_dt_dt,20_count_dt_dt,21_count_dt_dt,22_count_dt_dt,23_count_dt_dt,24_count_dt_dt,25_count_dt_dt,26_count_dt_dt,27_count_dt_dt,28_count_dt_dt,29_count_dt_dt,3_count_dt_dt,30_count_dt_dt,31_count_dt_dt,32_count_dt_dt,33_count_dt_dt,34_count_dt_dt,35_count_dt_dt,36_count_dt_dt,37_count_dt_dt,38_count_dt_dt,39_count_dt_dt,4_count_dt_dt,40_count_dt_dt,41_count_dt_dt,42_count_dt_dt,43_count_dt_dt,44_count_dt_dt,45_count_dt_dt,46_count_dt_dt,47_count_dt_dt,48_count_dt_dt,49_count_dt_dt,5_count_dt_dt,50_count_dt_dt,51_count_dt_dt,52_count_dt_dt,53_count_dt_dt,54_count_dt_dt,55_count_dt_dt,56_count_dt_dt,57_count_dt_dt,58_count_dt_dt,59_count_dt_dt,6_count_dt_dt,60_count_dt_dt,61_count_dt_dt,62_count_dt_dt,63_count_dt_dt,64_count_dt_dt,65_count_dt_dt,66_count_dt_dt,67_count_dt_dt,68_count_dt_dt,69_count_dt_dt,7_count_dt_dt,70_count_dt_dt,71_count_dt_dt,72_count_dt_dt,73_count_dt_dt,74_count_dt_dt,75_count_dt_dt,76_count_dt_dt,77_count_dt_dt,78_count_dt_dt,79_count_dt_dt,8_count_dt_dt,9_count_dt_dt,cnt_not_null_dt,total_dt,share_0_dt_dt,share_1_dt_dt,share_10_dt_dt,share_1000_dt_dt,share_11_dt_dt,share_12_dt_dt,share_13_dt_dt,share_14_dt_dt,share_15_dt_dt,share_16_dt_dt,share_17_dt_dt,share_18_dt_dt,share_19_dt_dt,share_2_dt_dt,share_20_dt_dt,share_21_dt_dt,share_22_dt_dt,share_23_dt_dt,share_24_dt_dt,share_25_dt_dt,share_26_dt_dt,share_27_dt_dt,share_28_dt_dt,share_29_dt_dt,share_3_dt_dt,share_30_dt_dt,share_31_dt_dt,share_32_dt_dt,share_33_dt_dt,share_34_dt_dt,share_35_dt_dt,share_36_dt_dt,share_37_dt_dt,share_38_dt_dt,share_39_dt_dt,share_4_dt_dt,share_40_dt_dt,share_41_dt_dt,share_42_dt_dt,share_43_dt_dt,share_44_dt_dt,share_45_dt_dt,share_46_dt_dt,share_47_dt_dt,share_48_dt_dt,share_49_dt_dt,share_5_dt_dt,share_50_dt_dt,share_51_dt_dt,share_52_dt_dt,share_53_dt_dt,share_54_dt_dt,share_55_dt_dt,share_56_dt_dt,share_57_dt_dt,share_58_dt_dt,share_59_dt_dt,share_6_dt_dt,share_60_dt_dt,share_61_dt_dt,share_62_dt_dt,share_63_dt_dt,share_64_dt_dt,share_65_dt_dt,share_66_dt_dt,share_67_dt_dt,share_68_dt_dt,share_69_dt_dt,share_7_dt_dt,share_70_dt_dt,share_71_dt_dt,share_72_dt_dt,share_73_dt_dt,share_74_dt_dt,share_75_dt_dt,share_76_dt_dt,share_77_dt_dt,share_78_dt_dt,share_79_dt_dt,share_8_dt_dt,share_9_dt_dt
0,61058,34,86,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,12.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,12.0,,,,1.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,8311,18,86,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,11.0,,,,1.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,64081,43,86,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.0,,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,12.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,22.0,,,,0.181818,,0.272727,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.545455,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,218005,12,86,,,,20.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,20.0,,,,1.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,159729,34,86,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,,,1.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,33.0,,22.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,55.0,,,,0.600000,,0.400000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240064,217719,1000,69,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
240065,156954,1000,49,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2.0,,,,1.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
240066,249271,1000,49,,,,,,,,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,6.0,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
240067,196702,1000,12,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,4.0,,,,1.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Хлам (не используется)

In [17]:
okveds = public_df[['hash_inn', 'okved2']]
okveds['hash_inn_kt'] = okveds['hash_inn']
okveds['okved_kt'] = okveds['okved2']
okveds = okveds.rename(columns={'hash_inn': 'hash_inn_dt', 'okved2': 'okved_dt'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [18]:
pays_df = pd.merge(pays_dfpays_df, okveds[['hash_inn_kt', 'okved_kt']], how='left', on='hash_inn_kt')
pays_df = pd.merge(pays_df, okveds[['hash_inn_dt', 'okved_dt']], how='left', on='hash_inn_dt')
pays_df = pays_df.dropna()
pays_df['okved_dt'] = pays_df['okved_dt'].astype('int32')
pays_df['hash_inn_kt'] = pays_df['hash_inn_kt'].astype(str)
pays_df['hash_inn_dt'] = pays_df['hash_inn_dt'].astype(str)
pays_df.head(2)

NameError: name 'pays_dfpays_df' is not defined

In [None]:
kt_df = pays_df \
    .groupby(['hash_inn_kt', 'okved_dt']) \
    .agg({'count': 'sum'}) \
    .reset_index(drop=False) \
    .rename(columns={'hash_inn_kt': 'inn', 'okved_dt': 'okved'})
kt_df = kt_df[kt_df.okved > 0]
kt_df.head(2)

In [None]:
kt_df.shape

In [None]:
kt_df.to_csv('../data/okved_kt_cnt.csv', index=False, header=True)

In [None]:
kt_df