In [1]:
import numpy as np
import pandas as pd

In [2]:
inns_df = pd.read_csv('./inn_info_public.csv')
pays_df = pd.read_csv('./pays.csv')

In [3]:
inns_df.head()

Unnamed: 0,hash_inn,okved2,region,is_public
0,61058,34,86,True
1,8311,18,86,True
2,130273,-1,86,False
3,64081,43,86,True
4,218005,12,86,True


In [4]:
inns_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240069 entries, 0 to 240068
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   hash_inn   240069 non-null  int64
 1   okved2     240069 non-null  int64
 2   region     240069 non-null  int64
 3   is_public  240069 non-null  bool 
dtypes: bool(1), int64(3)
memory usage: 5.7 MB


In [5]:
pays_df.head()

Unnamed: 0,hash_inn_kt,hash_inn_dt,week,count,sum
0,0,0,1,4,38399.6
1,0,0,5,2,399.6
2,0,0,7,2,79.6
3,0,0,9,2,239.6
4,0,0,12,2,79.6


In [6]:
pays_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5430150 entries, 0 to 5430149
Data columns (total 5 columns):
 #   Column       Dtype  
---  ------       -----  
 0   hash_inn_kt  int64  
 1   hash_inn_dt  int64  
 2   week         int64  
 3   count        int64  
 4   sum          float64
dtypes: float64(1), int64(4)
memory usage: 207.1 MB


In [7]:
print(f'Кол-во уникальных инн в датафрейме инн: {len(inns_df["hash_inn"].unique())}')

print(f'Кол-во уникальных инн в датафрейме платежей по кт: {len(pays_df["hash_inn_kt"].unique())}\n'
      f'Кол-во уникальных инн в датафрейме платежей по дт: {len(pays_df["hash_inn_dt"].unique())}')

Кол-во уникальных инн в датафрейме инн: 240069
Кол-во уникальных инн в датафрейме платежей по кт: 152160
Кол-во уникальных инн в датафрейме платежей по дт: 189772


In [8]:
okveds = inns_df[['hash_inn', 'okved2']]
okveds['hash_inn_kt'] = okveds['hash_inn']
okveds['okved_kt'] = okveds['okved2']

okveds = okveds.rename(columns={'hash_inn': 'hash_inn_dt', 'okved2': 'okved_dt'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [9]:
pays_df = pd.merge(pays_df, okveds[['hash_inn_kt', 'okved_kt']], how='left', on='hash_inn_kt')
pays_df = pd.merge(pays_df, okveds[['hash_inn_dt', 'okved_dt']], how='left', on='hash_inn_dt')
pays_df.head()

Unnamed: 0,hash_inn_kt,hash_inn_dt,week,count,sum,okved_kt,okved_dt
0,0,0,1,4,38399.6,-1,-1.0
1,0,0,5,2,399.6,-1,-1.0
2,0,0,7,2,79.6,-1,-1.0
3,0,0,9,2,239.6,-1,-1.0
4,0,0,12,2,79.6,-1,-1.0


In [10]:
pays_df = pays_df.dropna()
pays_df['okved_dt'] = pays_df['okved_dt'].astype('int32')
pays_df['hash_inn_kt'] = pays_df['hash_inn_kt'].astype(str)
pays_df['hash_inn_dt'] = pays_df['hash_inn_dt'].astype(str)

In [11]:
pays_df.head(10)

Unnamed: 0,hash_inn_kt,hash_inn_dt,week,count,sum,okved_kt,okved_dt
0,0,0,1,4,38399.6,-1,-1
1,0,0,5,2,399.6,-1,-1
2,0,0,7,2,79.6,-1,-1
3,0,0,9,2,239.6,-1,-1
4,0,0,12,2,79.6,-1,-1
5,0,0,15,2,399.6,-1,-1
6,0,0,16,2,0.92,-1,-1
7,0,0,20,4,2399.6,-1,-1
8,0,0,21,4,0.8,-1,-1
9,0,0,22,2,239.6,-1,-1


In [12]:
dfs = []

In [13]:
kt_dt_sum = pd.crosstab(pays_df['hash_inn_kt'], 
                        pays_df['okved_dt'], 
                        values=pays_df['sum'], 
                        aggfunc='sum')\
              .fillna(0)\
              .add_prefix('dt_sum_')\
              .reset_index()\
              .rename(columns={'hash_inn_kt': 'hash_inn'})

dfs.append(kt_dt_sum)

dt_kt_sum = pd.crosstab(pays_df['hash_inn_dt'], 
                        pays_df['okved_kt'], 
                        values=pays_df['sum'], 
                        aggfunc='sum')\
              .fillna(0)\
              .add_prefix('kt_sum_')\
              .reset_index()\
              .rename(columns={'hash_inn_dt': 'hash_inn'})

dfs.append(dt_kt_sum)

kt_dt_sum_ratio = pd.crosstab(pays_df['hash_inn_kt'], 
                              pays_df['okved_dt'], 
                              values=pays_df['sum'], 
                              aggfunc='sum')\
                    .fillna(0)\
                    .add_prefix('dt_sum_')\
                    .apply(lambda x: x/x.sum(), axis=1)\
                    .add_suffix('_ratio')\
                    .reset_index()\
                    .rename(columns={'hash_inn_kt': 'hash_inn'})

dfs.append(kt_dt_sum_ratio)

dt_kt_sum_ratio = pd.crosstab(pays_df['hash_inn_dt'], 
                              pays_df['okved_kt'], 
                              values=pays_df['sum'], 
                              aggfunc='sum')\
                    .fillna(0)\
                    .add_prefix('kt_sum_')\
                    .apply(lambda x: x/x.sum(), axis=1)\
                    .add_suffix('_ratio')\
                    .reset_index()\
                    .rename(columns={'hash_inn_dt': 'hash_inn'})
    
dfs.append(dt_kt_sum_ratio)

In [14]:
kt_dt_cnt = pd.crosstab(pays_df['hash_inn_kt'], 
                        pays_df['okved_dt'], 
                        values=pays_df['sum'], 
                        aggfunc='count')\
              .fillna(0)\
              .add_prefix('dt_cnt_')\
              .reset_index()\
              .rename(columns={'hash_inn_kt': 'hash_inn'})

dfs.append(kt_dt_cnt)

dt_kt_cnt = pd.crosstab(pays_df['hash_inn_dt'], 
                        pays_df['okved_kt'], 
                        values=pays_df['sum'], 
                        aggfunc='count')\
              .fillna(0)\
              .add_prefix('kt_cnt_')\
              .reset_index()\
              .rename(columns={'hash_inn_dt': 'hash_inn'})

dfs.append(dt_kt_cnt)

kt_dt_cnt_ratio = pd.crosstab(pays_df['hash_inn_kt'], 
                              pays_df['okved_dt'], 
                              values=pays_df['sum'], 
                              aggfunc='count')\
                    .fillna(0)\
                    .add_prefix('dt_cnt_')\
                    .apply(lambda x: x/x.sum(), axis=1)\
                    .add_suffix('_ratio')\
                   .reset_index()\
                   .rename(columns={'hash_inn_kt': 'hash_inn'})

dfs.append(kt_dt_cnt_ratio)

dt_kt_cnt_ratio = pd.crosstab(pays_df['hash_inn_dt'], 
                              pays_df['okved_kt'], 
                              values=pays_df['sum'], 
                              aggfunc='count')\
                    .fillna(0)\
                    .add_prefix('kt_cnt_')\
                    .apply(lambda x: x/x.sum(), axis=1)\
                    .add_suffix('_ratio')\
                    .reset_index()\
                    .rename(columns={'hash_inn_dt': 'hash_inn'})

dfs.append(dt_kt_cnt_ratio)

In [15]:
kt_dt_week_sum = pd.crosstab(pays_df['hash_inn_kt'], 
                             pays_df['week'], 
                             values=pays_df['sum'], 
                             aggfunc='sum')\
                   .fillna(0)\
                   .add_prefix('dt_week_sum_')\
                   .reset_index()\
                   .rename(columns={'hash_inn_kt': 'hash_inn'})

dfs.append(kt_dt_week_sum)

dt_kt_week_sum = pd.crosstab(pays_df['hash_inn_dt'], 
                             pays_df['week'], 
                             values=pays_df['sum'], 
                             aggfunc='sum')\
                   .fillna(0)\
                   .add_prefix('kt_week_sum_')\
                   .reset_index()\
                   .rename(columns={'hash_inn_dt': 'hash_inn'})

dfs.append(dt_kt_week_sum)

kt_dt_week_sum_ratio = pd.crosstab(pays_df['hash_inn_kt'], 
                                   pays_df['week'], 
                                   values=pays_df['sum'], 
                                   aggfunc='sum')\
                         .fillna(0)\
                         .add_prefix('dt_week_sum_')\
                         .apply(lambda x: x/x.sum(), axis=1)\
                         .add_suffix('_ratio')\
                         .reset_index()\
                         .rename(columns={'hash_inn_kt': 'hash_inn'})

dfs.append(kt_dt_week_sum_ratio)

dt_kt_week_sum_ratio = pd.crosstab(pays_df['hash_inn_dt'], 
                                   pays_df['week'], 
                                   values=pays_df['sum'], 
                                   aggfunc='sum')\
                         .fillna(0)\
                         .add_prefix('kt_week_sum_')\
                         .apply(lambda x: x/x.sum(), axis=1)\
                         .add_suffix('_ratio')\
                         .reset_index()\
                         .rename(columns={'hash_inn_dt': 'hash_inn'})

dfs.append(dt_kt_week_sum_ratio)

In [16]:
%%time

kt_dt_week_cnt = pd.crosstab(pays_df['hash_inn_kt'], 
                             pays_df['week'], 
                             values=pays_df['sum'], 
                             aggfunc='count')\
                   .fillna(0)\
                   .add_prefix('dt_week_cnt_')\
                   .reset_index()\
                   .rename(columns={'hash_inn_kt': 'hash_inn'})

dfs.append(kt_dt_week_cnt)

dt_kt_week_cnt = pd.crosstab(pays_df['hash_inn_dt'], 
                             pays_df['week'], 
                             values=pays_df['sum'], 
                             aggfunc='count')\
                   .fillna(0)\
                   .add_prefix('kt_week_cnt_')\
                   .reset_index()\
                   .rename(columns={'hash_inn_dt': 'hash_inn'})

dfs.append(dt_kt_week_cnt)

kt_dt_week_cnt_ratio = pd.crosstab(pays_df['hash_inn_kt'], 
                                   pays_df['week'], 
                                   values=pays_df['sum'], 
                                   aggfunc='count')\
                         .fillna(0)\
                         .add_prefix('dt_week_cnt_')\
                         .apply(lambda x: x/x.sum(), axis=1)\
                         .add_suffix('_ratio')\
                         .reset_index()\
                         .rename(columns={'hash_inn_kt': 'hash_inn'})

dfs.append(kt_dt_week_cnt_ratio)

dt_kt_week_cnt_ratio = pd.crosstab(pays_df['hash_inn_dt'], 
                                   pays_df['week'], 
                                   values=pays_df['sum'], 
                                   aggfunc='count')\
                         .fillna(0)\
                         .add_prefix('kt_week_cnt_')\
                         .apply(lambda x: x/x.sum(), axis=1)\
                         .add_suffix('_ratio')\
                         .reset_index()\
                         .rename(columns={'hash_inn_dt': 'hash_inn'})

dfs.append(dt_kt_week_cnt_ratio)

CPU times: user 1min 35s, sys: 1.86 s, total: 1min 37s
Wall time: 1min 37s


In [17]:
%%time

df = kt_dt_sum

for i in dfs[1:]:
    df = pd.merge(df, i, how='outer', on='hash_inn')

CPU times: user 10.1 s, sys: 5.34 s, total: 15.4 s
Wall time: 15.6 s


In [18]:
cols_to_del = [i for i in df.columns if '-1' in i]
df.fillna(0, inplace=True)
df.drop(cols_to_del, axis=1, inplace=True)

In [19]:
dt = set(pays_df['hash_inn_dt'].tolist())
kt = set(pays_df['hash_inn_kt'].tolist())

In [20]:
dt & kt

{'7091',
 '75457',
 '243061',
 '194197',
 '248644',
 '105730',
 '129715',
 '127205',
 '147012',
 '132265',
 '148104',
 '57890',
 '100096',
 '241563',
 '248640',
 '166632',
 '77609',
 '243977',
 '137110',
 '104129',
 '62551',
 '22542',
 '58601',
 '76526',
 '140927',
 '245447',
 '21139',
 '203343',
 '206360',
 '57202',
 '16388',
 '162616',
 '55684',
 '40700',
 '116767',
 '177892',
 '17080',
 '146465',
 '102578',
 '80407',
 '35266',
 '128203',
 '188835',
 '199957',
 '132178',
 '2350',
 '210565',
 '131194',
 '175760',
 '144919',
 '34048',
 '181631',
 '167055',
 '4375',
 '202180',
 '212092',
 '215474',
 '235902',
 '204759',
 '244989',
 '245670',
 '151258',
 '228375',
 '61371',
 '194963',
 '38364',
 '192476',
 '255496',
 '3146',
 '250197',
 '55672',
 '93496',
 '150479',
 '72799',
 '170154',
 '152242',
 '243453',
 '77165',
 '76887',
 '190683',
 '258170',
 '17139',
 '42493',
 '116905',
 '53389',
 '191061',
 '202953',
 '2103',
 '239823',
 '259054',
 '231883',
 '15028',
 '65062',
 '68678',
 '259

In [21]:
df[df['hash_inn']=='176393']

Unnamed: 0,hash_inn,dt_sum_0,dt_sum_1,dt_sum_2,dt_sum_3,dt_sum_4,dt_sum_5,dt_sum_6,dt_sum_7,dt_sum_8,...,kt_week_cnt_14_ratio,kt_week_cnt_15_ratio,kt_week_cnt_16_ratio,kt_week_cnt_17_ratio,kt_week_cnt_18_ratio,kt_week_cnt_19_ratio,kt_week_cnt_20_ratio,kt_week_cnt_21_ratio,kt_week_cnt_22_ratio,kt_week_cnt_23_ratio
49613,176393,0.0,0.0,0.0,0.0,0.588,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
inns_df['hash_inn'] = inns_df['hash_inn'].astype(str)

In [23]:
df = pd.merge(inns_df[['hash_inn', 'okved2']], df, on='hash_inn', how='left')

In [24]:
df.rename(columns={'okved2': 'target'}, inplace = True)

In [25]:
len([col for col in df.columns if '_ratio' in col])

416

In [None]:
df[['hash_inn', 'target']+[col for col in df.columns if '_ratio' in col]] \
    .to_csv('okved_pivot_ratio.csv', index=False, header=True)

In [26]:
df.columns.to_list()

['hash_inn',
 'target',
 'dt_sum_0',
 'dt_sum_1',
 'dt_sum_2',
 'dt_sum_3',
 'dt_sum_4',
 'dt_sum_5',
 'dt_sum_6',
 'dt_sum_7',
 'dt_sum_8',
 'dt_sum_9',
 'dt_sum_10',
 'dt_sum_11',
 'dt_sum_12',
 'dt_sum_13',
 'dt_sum_14',
 'dt_sum_15',
 'dt_sum_16',
 'dt_sum_17',
 'dt_sum_18',
 'dt_sum_19',
 'dt_sum_20',
 'dt_sum_21',
 'dt_sum_22',
 'dt_sum_23',
 'dt_sum_24',
 'dt_sum_25',
 'dt_sum_26',
 'dt_sum_27',
 'dt_sum_28',
 'dt_sum_29',
 'dt_sum_30',
 'dt_sum_31',
 'dt_sum_32',
 'dt_sum_33',
 'dt_sum_34',
 'dt_sum_35',
 'dt_sum_36',
 'dt_sum_37',
 'dt_sum_38',
 'dt_sum_39',
 'dt_sum_40',
 'dt_sum_41',
 'dt_sum_42',
 'dt_sum_43',
 'dt_sum_44',
 'dt_sum_45',
 'dt_sum_46',
 'dt_sum_47',
 'dt_sum_48',
 'dt_sum_49',
 'dt_sum_50',
 'dt_sum_51',
 'dt_sum_52',
 'dt_sum_53',
 'dt_sum_54',
 'dt_sum_55',
 'dt_sum_56',
 'dt_sum_57',
 'dt_sum_58',
 'dt_sum_59',
 'dt_sum_60',
 'dt_sum_61',
 'dt_sum_62',
 'dt_sum_63',
 'dt_sum_64',
 'dt_sum_65',
 'dt_sum_66',
 'dt_sum_67',
 'dt_sum_68',
 'dt_sum_69',
 'dt_s

In [29]:
df[['hash_inn', 'target']+[col for col in df.columns if 'kt_cnt' in col and 'ratio' not in col]] \
    .to_csv('okved_kt_cnt.csv', index=False, header=True)