In [3]:
import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 500)

In [4]:
public_df = pd.read_csv('../data/inn_info_public.csv', dtype={'okved2': 'str', 'region': 'str'})
private_df = pd.read_csv('../data/inn_info_private.csv', dtype={'okved2': 'str'})
pays_df = pd.read_csv('../data/pays.csv')

In [5]:
public_df.head(2)

Unnamed: 0,hash_inn,okved2,region,is_public
0,61058,34,86,True
1,8311,18,86,True


In [6]:
pays_df.head(2)

Unnamed: 0,hash_inn_kt,hash_inn_dt,week,count,sum
0,0,0,1,4,38399.6
1,0,0,5,2,399.6


In [7]:
unknown_targets = public_df[public_df['is_public']==False][['hash_inn', 'region']]

In [8]:
unknown_targets = pd.merge(unknown_targets, 
                           private_df, 
                           on = 'hash_inn', 
                           how = 'left')

In [9]:
categotical_df = pd.concat([public_df[public_df['is_public']==True][['hash_inn', 'okved2', 'region']], 
                           unknown_targets[['hash_inn', 'okved2', 'region']]
                           ], axis = 0)

In [10]:
def create_pivot(pays_df, categotical_df, 
                 direction: str,
                 groupby_col: str,
                 agg_col: str,
                 cnt_not_null_cols = 1):
    """
    :cold_start = False - if False: убираем объекты, у которых было взаимодействие только с 1 категорией (groupby_col)
    :direction - направление платежа: kt - входящий, dt - исходящий
    :groupby_col - категорий, по которой группируем (okved2 или region)
    :agg_col - колонка для группировки
    """
    if direction == '_kt':
        postfix = '_dt'
    elif direction == '_dt':
        postfix = '_kt'
        
        pays_df = pd.merge(pays_df, categotical_df,
                           left_on = 'hash_inn' + direction,
                           right_on = 'hash_inn',
                           how = 'left')
        
        pays_df = pd.pivot_table(pays_df, 
                                 values=agg_col, 
                                 index='hash_inn'+direction,
                                 columns=[groupby_col],
                                 aggfunc='sum')
        
        pays_df['cnt_not_null'] = pays_df.count(axis=1)
        pays_df = pays_df[pays_df['cnt_not_null']>cnt_not_null_cols]
        print('len of data: {}'.format(pays_df.shape[0]))
        pays_df['total'] = pays_df.sum(axis=1)
        cols_list = [col for col in pays_df.columns if col not in ['total', 'cnt_not_null']]
            
        for col in pays_df[cols_list]:
            pays_df.rename(columns={col: col+'_'+agg_col+postfix}, inplace=True)
            pays_df['share_' + col + postfix] = pays_df[col] / pays_df['total']
        
        pays_df = pays_df \
                    .reset_index(drop=False) \
                    .rename(columns={'hash_inn' + direction: 'inn', groupby_col: 'okved' + postfix}, 
                            inplace=True)
    return pays_df

In [11]:
newDf = create_pivot(pays_df, categotical_df, 
                 direction='_kt',
                 agg_col='count',
                 groupby_col = 'okved2')

In [12]:
newDf.head()

Unnamed: 0,hash_inn_kt,hash_inn_dt,week,count,sum
0,0,0,1,4,38399.6
1,0,0,5,2,399.6
2,0,0,7,2,79.6
3,0,0,9,2,239.6
4,0,0,12,2,79.6


In [54]:
newDf.count(axis=1).head()

0    4
1    4
2    4
3    4
4    4
dtype: int64

In [23]:
okveds = public_df[['hash_inn', 'okved2']]
okveds['hash_inn_kt'] = okveds['hash_inn']
okveds['okved_kt'] = okveds['okved2']
okveds = okveds.rename(columns={'hash_inn': 'hash_inn_dt', 'okved2': 'okved_dt'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [24]:
pays_df = pd.merge(pays_df, okveds[['hash_inn_kt', 'okved_kt']], how='left', on='hash_inn_kt')
pays_df = pd.merge(pays_df, okveds[['hash_inn_dt', 'okved_dt']], how='left', on='hash_inn_dt')
pays_df = pays_df.dropna()
pays_df['okved_dt'] = pays_df['okved_dt'].astype('int32')
pays_df['hash_inn_kt'] = pays_df['hash_inn_kt'].astype(str)
pays_df['hash_inn_dt'] = pays_df['hash_inn_dt'].astype(str)
pays_df.head(2)

Unnamed: 0,hash_inn_kt,hash_inn_dt,week,count,sum,okved_kt,okved_dt
0,0,0,1,4,38399.6,-1,-1
1,0,0,5,2,399.6,-1,-1


In [25]:
kt_df = pays_df \
    .groupby(['hash_inn_kt', 'okved_dt']) \
    .agg({'count': 'sum'}) \
    .reset_index(drop=False) \
    .rename(columns={'hash_inn_kt': 'inn', 'okved_dt': 'okved'})
kt_df = kt_df[kt_df.okved > 0]
kt_df.head(2)

Unnamed: 0,inn,okved,count
1,0,8,2
2,0,12,7


In [26]:
kt_df.shape

(416970, 3)

In [29]:
kt_df.to_csv('../data/okved_kt_cnt.csv', index=False, header=True)