In [1]:
import pandas as pd 
import numpy as np
from tqdm import tqdm

pd.options.display.max_columns = None
pd.options.display.max_rows = 100

In [2]:
version = "f04"

In [3]:
train_identity = pd.read_csv("../data/raw/train_identity.csv")
train_transaction = pd.read_csv("../data/raw/train_transaction.csv")
test_identity = pd.read_csv("../data/raw/test_identity.csv")
test_transaction = pd.read_csv("../data/raw/test_transaction.csv")

## concat

In [5]:
train_transaction["istrain"] = 1
test_transaction["istrain"] = 0
test_transaction["isFraud"] = np.nan

concat_transaction = pd.concat([train_transaction, test_transaction], axis=0, sort=False)
concat_identity = pd.concat([train_identity, test_identity], axis=0, sort=False)

# とりあえず、マージ

In [6]:
all_data = pd.merge(concat_transaction,concat_identity, on ='TransactionID', how='left')

# 効いていそうな特徴を追加(カーネル参考)

In [7]:
all_data['TransactionAmt_to_mean_card1'] = all_data['TransactionAmt'] / all_data.groupby(['card1'])['TransactionAmt'].transform('mean')
all_data['TransactionAmt_to_mean_card4'] = all_data['TransactionAmt'] / all_data.groupby(['card4'])['TransactionAmt'].transform('mean')
all_data['TransactionAmt_to_std_card1'] = all_data['TransactionAmt'] / all_data.groupby(['card1'])['TransactionAmt'].transform('std')
all_data['TransactionAmt_to_std_card4'] = all_data['TransactionAmt'] / all_data.groupby(['card4'])['TransactionAmt'].transform('std')


all_data['id_02_to_mean_card1'] = all_data['id_02'] / all_data.groupby(['card1'])['id_02'].transform('mean')
all_data['id_02_to_mean_card4'] = all_data['id_02'] / all_data.groupby(['card4'])['id_02'].transform('mean')
all_data['id_02_to_std_card1'] = all_data['id_02'] / all_data.groupby(['card1'])['id_02'].transform('std')
all_data['id_02_to_std_card4'] = all_data['id_02'] / all_data.groupby(['card4'])['id_02'].transform('std')


all_data['D15_to_mean_card1'] = all_data['D15'] / all_data.groupby(['card1'])['D15'].transform('mean')
all_data['D15_to_mean_card4'] = all_data['D15'] / all_data.groupby(['card4'])['D15'].transform('mean')
all_data['D15_to_std_card1'] = all_data['D15'] / all_data.groupby(['card1'])['D15'].transform('std')
all_data['D15_to_std_card4'] = all_data['D15'] / all_data.groupby(['card4'])['D15'].transform('std')


all_data['D15_to_mean_addr1'] = all_data['D15'] / all_data.groupby(['addr1'])['D15'].transform('mean')
all_data['D15_to_mean_addr2'] = all_data['D15'] / all_data.groupby(['addr2'])['D15'].transform('mean')
all_data['D15_to_std_addr1'] = all_data['D15'] / all_data.groupby(['addr1'])['D15'].transform('std')
all_data['D15_to_std_addr2'] = all_data['D15'] / all_data.groupby(['addr2'])['D15'].transform('std')


# card1を個人のIDと仮定して、前回の情報を抽出

In [8]:
card1_list = all_data['card1'].drop_duplicates().tolist()

In [9]:
# card1がひとつ前に行った時間と金額を特徴に入れる
card_tra = pd.DataFrame()

for card_num in tqdm(card1_list):
    card_temp = all_data.query('card1 == @card_num')
    card_temp_shift = card_temp[['TransactionDT','TransactionAmt']].shift(1)
    card_temp_shift = card_temp_shift.rename(columns={'TransactionDT':'TransactionDT_shift','TransactionAmt':'TransactionAmt_shift'})
    card_tra = pd.concat([card_tra, card_temp_shift],axis = 0)

all_data = pd.concat([all_data,card_tra],axis=1)

100%|██████████| 17091/17091 [05:17<00:00, 53.89it/s]


In [12]:
all_data['TransactionDT_diff'] = all_data['TransactionDT'] -  all_data['TransactionDT_shift']
all_data['TransactionAmt_diff'] = all_data['TransactionAmt'] -  all_data['TransactionAmt_shift']

In [15]:
all_data = all_data.drop(['TransactionDT_shift','TransactionAmt_shift'],axis=1)

# TransactionAmtの非整数部分

In [16]:
all_data["TransactionAmtmod"] = all_data["TransactionAmt"] - all_data["TransactionAmt"].astype(int)

In [17]:
for column in ['TransactionAmt','TransactionAmtmod']:
    all_data[column] = all_data[column].astype('float')

### TransactionDT

In [18]:
import datetime
all_data['TransactionDT'] = pd.to_datetime('2017-12-01') + pd.to_timedelta(all_data['TransactionDT'],unit='s')

### 時間特徴追加

In [19]:
# concat_transaction["day"] = concat_transaction["TransactionDT"].dt.day
all_data["hour"] = all_data["TransactionDT"].dt.hour
# concat_transaction["minute"] = concat_transaction["TransactionDT"].dt.minute
#concat_transaction["second"] = concat_transaction["TransactionDT"].dt.second
all_data["dayofweek"] = all_data["TransactionDT"].dt.dayofweek

# email特徴追加

In [20]:
all_data[["P_emaildomain_1","P_emaildomain_2","P_emaildomain_3"]] = all_data['P_emaildomain'].str.split('.',expand=True)
all_data[["R_emaildomain_1","R_emaildomain_2","R_emaildomain_3"]] = all_data['R_emaildomain'].str.split('.',expand=True)

### カテゴリとして扱う特徴

In [21]:
cat_cols = ['id_12',
            'id_13', 
            'id_14', 
            'id_15', 
            'id_16',
            'id_17', 
            'id_18',
            'id_19',
            'id_20', 
            'id_21',
            'id_22',
            'id_23',
            'id_24',
            'id_25', 
            'id_26',
            'id_27', 
            'id_28', 
            'id_29',
            'id_30',
            'id_31',
            'id_32', 
            'id_33',
            'id_34',
            'id_35', 
            'id_36', 
            'id_37', 
            'id_38',
            'DeviceType', 
            'DeviceInfo',
            'ProductCD',
            'card4',
            'card6',
            'M4',
            'P_emaildomain',
            'R_emaildomain',
            'card1', 
            'card2', 
            'card3', 
            'card5',
            'addr1',
            'addr2',
            'M1',
            'M2', 
            'M3',
            'M5',
            'M6',
            'M7',
            'M8',
            'M9',
            'P_emaildomain_1', 
            'P_emaildomain_2',
            'P_emaildomain_3', 
            'R_emaildomain_1',
            'R_emaildomain_2', 
            'R_emaildomain_3',
            'ProductCD']

# カウントエンコーディング

In [22]:
def count_encoder(df, column):
    df[column] = df[column].fillna(-1)
    count_enc = df.groupby(column)[column].count()
    df[f'count_enc_{column}'] = df[column].map(count_enc)
    return df

In [23]:
for col in cat_cols:
    all_data = count_encoder(all_data, col)

In [24]:
# for column in cat_cols:
#     all_data[column] = all_data[column].astype('category')

# 結合していたtrainとtestを分割

In [25]:
train_processed = all_data.query('istrain == 1')
test_processed = all_data.query('istrain == 0')

In [26]:
test_processed = test_processed.drop(["isFraud"], axis=1)

In [27]:
import feather

In [28]:
# 書き込み

In [30]:
# feather.write_dataframe(train_processed, f'../data/processed/train_processed_{version}.feather')
# feather.write_dataframe(test_processed, f'../data/processed/test_processed_{version}.feather')

In [31]:
train_processed.to_csv('../data/processed/train_processed_f04.csv',index=False,header=True)
test_processed.to_csv('../data/processed/test_processed_f04.csv',index=False,header=True)