<a id="top"></a> <br>
## Contents
1. [Data ETL](#1)
1. [Create Vertical Federation](#2)
1. [Create Horizontal Federation](#3)

<a id="1"></a> <br>
## 1- Data ETL

###### [Go to top](#top)

This notebook is largely adapted from [this notebook](https://www.kaggle.com/code/chauhuynh/my-first-kernel-3-699). All credit belongs to the original author.

In [None]:
import numpy as np
import pandas as pd
import datetime
import gc
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm_notebook as tqdm

import random
# Seed both the stdlib and NumPy global RNGs so the random column split
# (random.sample) is repeatable; `seed` is also passed to DataFrame.sample later on.
seed = 1414
random.seed(seed)
np.random.seed(seed)

In [None]:
! ls /kaggle/input/elofederatedlearningdataetltraintest

In [None]:
# Load the labelled card table and the two transaction tables
# (historical and new-merchant) from the competition input directory.
df_train = pd.read_csv('../input/elo-merchant-category-recommendation/train.csv')
#df_test = pd.read_csv('../input/test.csv')
df_hist_trans = pd.read_csv('../input/elo-merchant-category-recommendation/historical_transactions.csv')
df_new_merchant_trans = pd.read_csv('../input/elo-merchant-category-recommendation/new_merchant_transactions.csv')

In [None]:
# Card ids listed in the pre-computed split file are flagged with test == 1;
# only the id and the flag are kept for the later merge.
test_df = (
    pd.read_csv('../input/elofederatedlearningdataetltraintest/horizontalsplit-0-0-lower_table.table.csv')
    .loc[:, ['card_id']]
    .assign(test=1)
)

In [None]:
# Impute missing categorical values in both transaction tables.
# The filled column is assigned back to the frame instead of calling
# fillna(..., inplace=True) on the selected column: that chained in-place form
# is deprecated and leaves the frame unchanged once pandas Copy-on-Write is on.
for df in [df_hist_trans, df_new_merchant_trans]:
    df['category_2'] = df['category_2'].fillna(1.0)
    df['category_3'] = df['category_3'].fillna('A')
    df['merchant_id'] = df['merchant_id'].fillna('M_ID_00a6ca8a8a')

In [None]:
# Flag cards whose target is below -30 as outliers (1), everything else as 0.
df_train['outliers'] = (df_train['target'] < -30).astype('int64')
df_train['outliers'].value_counts()

In [None]:
# Keep the outlier cards aside; they are appended to the 60% vertical party later.
extra_train = df_train.loc[df_train['outliers'].eq(1)]

In [None]:
# Continue with the non-outlier cards only.
df_train = df_train.loc[df_train['outliers'].ne(1)]

In [None]:
extra_train.shape

In [None]:
df_train.shape

In [None]:
def get_new_columns(name,aggs):
    """Build flat column names for a dict-style groupby aggregation.

    For every key of ``aggs`` (in iteration order) and every aggregation name
    listed under it, produce ``"<name>_<key>_<agg>"``.

    Args:
        name: Prefix put in front of every generated column name.
        aggs: Mapping of source column -> list of aggregation names.

    Returns:
        List of generated names, ordered by key and then by aggregation.
    """
    columns = []
    for key, funcs in aggs.items():
        for func in funcs:
            columns.append('_'.join((name, key, func)))
    return columns

In [None]:
# Derive calendar parts and 0/1 flags from the raw transaction columns of both tables.
for df in [df_hist_trans,df_new_merchant_trans]:
    df['purchase_date'] = pd.to_datetime(df['purchase_date'])
    df['year'] = df['purchase_date'].dt.year
    # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is its
    # replacement (it yields a nullable UInt32 column with the same week numbers).
    df['weekofyear'] = df['purchase_date'].dt.isocalendar().week
    df['month'] = df['purchase_date'].dt.month
    df['dayofweek'] = df['purchase_date'].dt.dayofweek
    df['weekend'] = (df.purchase_date.dt.weekday >=5).astype(int)
    df['hour'] = df['purchase_date'].dt.hour
    df['authorized_flag'] = df['authorized_flag'].map({'Y':1, 'N':0})
    df['category_1'] = df['category_1'].map({'Y':1, 'N':0})
    #https://www.kaggle.com/c/elo-merchant-category-recommendation/discussion/73244
    # NOTE: month_diff is measured from the moment the cell runs, so its value
    # changes with the execution date.
    df['month_diff'] = ((datetime.datetime.today() - df['purchase_date']).dt.days)//30
    df['month_diff'] += df['month_lag']

In [None]:
# Aggregate the historical transactions to one row per card_id and merge the
# result into df_train.
# `aggs` maps a source column to the list of aggregations applied to it; the
# insertion order matters because get_new_columns() derives the flat output
# column names from it in the same order.
aggs = {}
for col in ['month','hour','weekofyear','dayofweek','year','subsector_id','merchant_id','merchant_category_id']:
    aggs[col] = ['nunique']

aggs['purchase_amount'] = ['sum','max','min','mean','var']
aggs['installments'] = ['sum','max','min','mean','var']
aggs['purchase_date'] = ['max','min']
aggs['month_lag'] = ['max','min','mean','var']
aggs['month_diff'] = ['mean']
aggs['authorized_flag'] = ['sum', 'mean']
aggs['weekend'] = ['sum', 'mean']
aggs['category_1'] = ['sum', 'mean']
aggs['card_id'] = ['size']

# Add, per transaction, the mean purchase_amount of its category_2 / category_3
# group, then average that per card.
for col in ['category_2','category_3']:
    df_hist_trans[col+'_mean'] = df_hist_trans.groupby([col])['purchase_amount'].transform('mean')
    aggs[col+'_mean'] = ['mean']    

new_columns = get_new_columns('hist',aggs)
df_hist_trans_group = df_hist_trans.groupby('card_id').agg(aggs)
# Replace the two-level (column, aggregation) header with the flat 'hist_*' names.
df_hist_trans_group.columns = new_columns
df_hist_trans_group.reset_index(drop=False,inplace=True)
# Days between a card's first and last purchase, that span divided by its
# transaction count, and days from the last purchase to the moment the cell runs.
df_hist_trans_group['hist_purchase_date_diff'] = (df_hist_trans_group['hist_purchase_date_max'] - df_hist_trans_group['hist_purchase_date_min']).dt.days
df_hist_trans_group['hist_purchase_date_average'] = df_hist_trans_group['hist_purchase_date_diff']/df_hist_trans_group['hist_card_id_size']
df_hist_trans_group['hist_purchase_date_uptonow'] = (datetime.datetime.today() - df_hist_trans_group['hist_purchase_date_max']).dt.days
# Left merge: every card in df_train is kept, with or without aggregated rows.
df_train = df_train.merge(df_hist_trans_group,on='card_id',how='left')
#df_test = df_test.merge(df_hist_trans_group,on='card_id',how='left')
del df_hist_trans_group;gc.collect()

In [None]:
# Same per-card aggregation as for the historical transactions, applied to the
# new-merchant transactions and prefixed 'new_hist_'. This spec has no
# 'authorized_flag' entry.
# NOTE(review): presumably because the flag is constant in this table — confirm.
aggs = {}
for col in ['month','hour','weekofyear','dayofweek','year','subsector_id','merchant_id','merchant_category_id']:
    aggs[col] = ['nunique']
aggs['purchase_amount'] = ['sum','max','min','mean','var']
aggs['installments'] = ['sum','max','min','mean','var']
aggs['purchase_date'] = ['max','min']
aggs['month_lag'] = ['max','min','mean','var']
aggs['month_diff'] = ['mean']
aggs['weekend'] = ['sum', 'mean']
aggs['category_1'] = ['sum', 'mean']
aggs['card_id'] = ['size']

# Per-transaction mean purchase_amount of its category_2 / category_3 group,
# later averaged per card.
for col in ['category_2','category_3']:
    df_new_merchant_trans[col+'_mean'] = df_new_merchant_trans.groupby([col])['purchase_amount'].transform('mean')
    aggs[col+'_mean'] = ['mean']
    
new_columns = get_new_columns('new_hist',aggs)
# The name df_hist_trans_group is reused here for the new-merchant aggregate.
df_hist_trans_group = df_new_merchant_trans.groupby('card_id').agg(aggs)
# Flatten the (column, aggregation) header into the 'new_hist_*' names.
df_hist_trans_group.columns = new_columns
df_hist_trans_group.reset_index(drop=False,inplace=True)
# Purchase span in days, span per transaction, and days since the last purchase
# measured from the moment the cell runs.
df_hist_trans_group['new_hist_purchase_date_diff'] = (df_hist_trans_group['new_hist_purchase_date_max'] - df_hist_trans_group['new_hist_purchase_date_min']).dt.days
df_hist_trans_group['new_hist_purchase_date_average'] = df_hist_trans_group['new_hist_purchase_date_diff']/df_hist_trans_group['new_hist_card_id_size']
df_hist_trans_group['new_hist_purchase_date_uptonow'] = (datetime.datetime.today() - df_hist_trans_group['new_hist_purchase_date_max']).dt.days
# Left merge: cards without any new-merchant transaction get missing values.
df_train = df_train.merge(df_hist_trans_group,on='card_id',how='left')
#df_test = df_test.merge(df_hist_trans_group,on='card_id',how='left')
del df_hist_trans_group;gc.collect()

In [None]:
# The raw transaction tables are no longer needed; release them one at a time.
del df_hist_trans
gc.collect()
del df_new_merchant_trans
gc.collect()
df_train.head()

In [None]:
df_train.shape

In [None]:
# Card-level features derived from first_active_month and the aggregated
# purchase dates.
for df in [df_train]:
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])
    df['dayofweek'] = df['first_active_month'].dt.dayofweek
    # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is its
    # replacement (it yields a nullable UInt32 column with the same week numbers).
    df['weekofyear'] = df['first_active_month'].dt.isocalendar().week
    df['month'] = df['first_active_month'].dt.month
    # NOTE: elapsed_time is measured from the moment the cell runs.
    df['elapsed_time'] = (datetime.datetime.today() - df['first_active_month']).dt.days
    df['hist_first_buy'] = (df['hist_purchase_date_min'] - df['first_active_month']).dt.days
    df['new_hist_first_buy'] = (df['new_hist_purchase_date_min'] - df['first_active_month']).dt.days
    # Turn the datetime columns into plain floats.
    # NOTE(review): the 1e-9 factor presumably converts nanoseconds to seconds —
    # confirm the datetime resolution of these columns.
    for f in ['hist_purchase_date_max','hist_purchase_date_min','new_hist_purchase_date_max',
              'new_hist_purchase_date_min']:
        df[f] = df[f].astype(np.int64) * 1e-9
    # Totals across the historical and new-merchant aggregates.
    df['card_id_total'] = df['new_hist_card_id_size']+df['hist_card_id_size']
    df['purchase_amount_total'] = df['new_hist_purchase_amount_sum']+df['hist_purchase_amount_sum']
In [None]:
# Keep every column except the raw activation date and the outlier flag.
df_train_columns = []
for column in df_train.columns:
    if column in ('first_active_month', 'outliers'):
        continue
    df_train_columns.append(column)
df_train = df_train.loc[:, df_train_columns]

In [None]:
df_train.shape

In [None]:
# Attach the test flag; cards absent from test_df get a missing value in `test`.
df_train = pd.merge(df_train, test_df, how='left', on='card_id')

In [None]:
df_train['test'].value_counts()

In [None]:
df_train.shape

In [None]:
df_train.head()

In [None]:
# Persist the full feature table, still including the `test` flag column.
df_train.to_csv('elo-ETL-data.csv')

In [None]:
# Write the flagged (test == 1) rows and all remaining rows to separate files,
# both without the `test` column.
test_mask = df_train['test'] == 1
df_train.loc[test_mask].drop(columns=['test']).to_csv('elo-ETL-data-test.csv')
df_train.loc[~test_mask].drop(columns=['test']).to_csv('elo-ETL-data-train.csv')

<a id="2"></a> <br>
## 2- Create Vertical Federation

###### [Go to top](#top)

In [None]:
# Candidate feature columns: everything except the identifier, the label and the split flag.
df_train_columns_without_id = []
for column in df_train.columns:
    if column not in ('card_id', 'target', 'test'):
        df_train_columns_without_id.append(column)
# Number of feature columns that will go to the 60% party.
round(0.6 * len(df_train_columns_without_id))

In [None]:
# Randomly assign 60% of the feature columns to one party; the other party gets
# every remaining column of df_train (including card_id, target and test).
n_columns_60 = round(len(df_train_columns_without_id)*0.6)
df_train_columns_60 = random.sample(df_train_columns_without_id, n_columns_60)
sampled_columns = set(df_train_columns_60)
df_train_columns_40 = [c for c in df_train.columns if c not in sampled_columns]
# The 60% party also needs the join key and the split flag.
df_train_columns_60 = [*df_train_columns_60, 'card_id', 'test']
print(df_train_columns_60)

In [None]:
print(df_train_columns_40)

In [None]:
df_train[df_train_columns_60].shape

In [None]:
# Columns of the outlier rows that may be shared: no activation date, flag, label or split marker.
extra_train_columns = list(filter(
    lambda c: c not in ('first_active_month', 'outliers', 'target', 'test'),
    extra_train.columns,
))

In [None]:
extra_train[extra_train_columns].shape

In [None]:
extra_train[extra_train_columns].head()

In [None]:
# Append the outlier cards to the 60% vertical party.
# Only columns that belong to this party's partition are taken from the outlier
# rows. pd.concat builds the union of columns, so passing every raw feature
# column would add columns assigned to the 40% party to this frame.
# The outlier rows have no `test` value, so the `test != 1` filter later puts
# them in the training part.
fed_60_v = pd.concat([
    df_train[df_train_columns_60],
    extra_train[[c for c in extra_train_columns if c in df_train_columns_60]],
])
fed_60_v.shape

In [None]:
# Test rows of the 60% vertical party, saved under their original column names.
fed_60_v_test = fed_60_v.loc[fed_60_v['test'] == 1].drop(columns=['test'])
fed_60_v_test.to_csv('elo-ETL-data-60-vertical-test.csv')

# Rename every column except card_id/target to x0, x1, ... by its position in
# df_train_columns_60; only the in-memory frame is renamed.
fed_60_v_test = fed_60_v_test.rename(columns={
    name: f'x{pos}'
    for pos, name in enumerate(c for c in df_train_columns_60 if c not in ('card_id', 'target'))
})

#fed_60_v_test.to_csv('elo-ETL-data-60-vertical-test-x.csv')

In [None]:
fed_60_v_test.shape

In [None]:
# Training rows of the 60% vertical party (everything not flagged test == 1),
# saved under their original column names.
fed_60_v_train = fed_60_v.loc[fed_60_v['test'] != 1].drop(columns=['test'])
fed_60_v_train.to_csv('elo-ETL-data-60-vertical-train.csv')

# Rename every column except card_id/target to x0, x1, ... by its position in
# df_train_columns_60; only the in-memory frame is renamed.
fed_60_v_train = fed_60_v_train.rename(columns={
    name: f'x{pos}'
    for pos, name in enumerate(c for c in df_train_columns_60 if c not in ('card_id', 'target'))
})

#fed_60_v_train.to_csv('elo-ETL-data-60-vertical-train-x.csv')

In [None]:
fed_60_v_train.shape

In [None]:
# The 40% vertical party holds the remaining columns of df_train.
fed_40_v = df_train[df_train_columns_40]

# Its test rows, saved under their original column names.
fed_40_v_test = fed_40_v.loc[fed_40_v['test'] == 1].drop(columns=['test'])
fed_40_v_test.to_csv('elo-ETL-data-40-vertical-test.csv')

# Rename every column except card_id/target to x0, x1, ... by its position in
# df_train_columns_40; only the in-memory frame is renamed.
fed_40_v_test = fed_40_v_test.rename(columns={
    name: f'x{pos}'
    for pos, name in enumerate(c for c in df_train_columns_40 if c not in ('card_id', 'target'))
})

#fed_40_v_test.to_csv('elo-ETL-data-40-vertical-test-x.csv')

In [None]:
fed_40_v_test.shape

In [None]:
# Training rows of the 40% vertical party, saved under their original column names.
fed_40_v_train = fed_40_v.loc[fed_40_v['test'] != 1].drop(columns=['test'])
fed_40_v_train.to_csv('elo-ETL-data-40-vertical-train.csv')

# Rename every column except card_id/target to x0, x1, ... by its position in
# df_train_columns_40; only the in-memory frame is renamed.
fed_40_v_train = fed_40_v_train.rename(columns={
    name: f'x{pos}'
    for pos, name in enumerate(c for c in df_train_columns_40 if c not in ('card_id', 'target'))
})

#fed_40_v_train.to_csv('elo-ETL-data-40-vertical-train-x.csv')

In [None]:
fed_40_v_train.shape

<a id="3"></a> <br>
## 3- Create Horizontal Federation

###### [Go to top](#top)

In [None]:
df_train.shape

In [None]:
# Shared test set for the horizontal federation: the rows flagged test == 1,
# written without the flag column.
df_t = df_train.loc[df_train['test'].eq(1)]
df_t.drop(columns=['test']).to_csv('elo-ETL-data-horizontal-test.csv')

In [None]:
df_t.drop(['test'],axis=1).shape

In [None]:
# From here on df_train holds only the training rows, without the `test` column.
df_train = df_train.loc[df_train['test'].ne(1)].drop(columns=['test'])

In [None]:
# Draw 60% of the training rows (seeded) for the first horizontal party.
n_rows_60 = round(df_train.shape[0]*0.6)
fed_60_h = df_train.sample(n=n_rows_60, random_state=seed, axis=0)
fed_60_h.to_csv('elo-ETL-data-60-horizontal-train.csv')

In [None]:
fed_60_h.shape

In [None]:
# The second horizontal party gets every training row that was not sampled into
# fed_60_h (df_train minus fed_60_h).
# The complement is taken by index label: fed_60_h was drawn from df_train with
# DataFrame.sample, so it keeps df_train's labels. This avoids the
# concat + drop_duplicates(keep=False) trick, which also discards rows that are
# genuinely duplicated in df_train and temporarily holds more than twice the data.
# It relies on df_train's index labels being unique.
fed_40_h = df_train.drop(index=fed_60_h.index)
fed_40_h.to_csv('elo-ETL-data-40-horizontal-train.csv')

In [None]:
fed_40_h.shape

In [None]:
# Rename every column except card_id/target to x0, x1, ... by its position in
# fed_60_h, using the same mapping for both horizontal parties.
horizontal_names = {
    name: f'x{pos}'
    for pos, name in enumerate(c for c in fed_60_h.columns if c not in ('card_id', 'target'))
}
fed_60_h = fed_60_h.rename(columns=horizontal_names)
fed_40_h = fed_40_h.rename(columns=horizontal_names)

In [None]:
#fed_60_h.to_csv('elo-ETL-data-60-horizontal-train-x.csv')
#fed_40_h.to_csv('elo-ETL-data-40-horizontal-train-x.csv')