In [44]:
import pandas as pd 
import numpy as np

In [45]:
version = "08"

In [46]:
train_identity = pd.read_csv("../data/raw/train_identity.csv")
train_transaction = pd.read_csv("../data/raw/train_transaction.csv")
test_identity = pd.read_csv("../data/raw/test_identity.csv")
test_transaction = pd.read_csv("../data/raw/test_transaction.csv")

## concat

In [47]:
train_transaction["istrain"] = 1
test_transaction["istrain"] = 0
test_transaction["isFraud"] = np.nan

concat_transaction = pd.concat([train_transaction, test_transaction], axis=0, sort=False)
concat_identity = pd.concat([train_identity, test_identity], axis=0, sort=False)

## transaction

In [48]:
concat_transaction.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V331,V332,V333,V334,V335,V336,V337,V338,V339,istrain
0,2987000,0.0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,1
1,2987001,0.0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,1
2,2987002,0.0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,1
3,2987003,0.0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,1
4,2987004,0.0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


### 特徴量削除

In [49]:
#many_null_cols = [col for col in concat_transaction.columns if concat_transaction[col].isnull().sum() / concat_transaction.shape[0] > 0.6]
#print(many_null_cols)

### TransactionDT

In [50]:
import datetime
START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
concat_transaction['TransactionDT'] = concat_transaction['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x)))

In [51]:
concat_transaction["TransactionDT"].head()

0   2017-12-02 00:00:00
1   2017-12-02 00:00:01
2   2017-12-02 00:01:09
3   2017-12-02 00:01:39
4   2017-12-02 00:01:46
Name: TransactionDT, dtype: datetime64[ns]

* 時間特徴追加

In [52]:
#concat_transaction["day"] = concat_transaction["TransactionDT"].dt.day
concat_transaction["hour"] = concat_transaction["TransactionDT"].dt.hour
#concat_transaction["minute"] = concat_transaction["TransactionDT"].dt.minute
#concat_transaction["second"] = concat_transaction["TransactionDT"].dt.second
concat_transaction["dayofweek"] = concat_transaction["TransactionDT"].dt.dayofweek

In [53]:
#concat_transaction = concat_transaction.drop(["TransactionDT"], axis=1)

### TransactionAmt

In [54]:
concat_transaction["TransactionAmt_mod"] = concat_transaction["TransactionAmt"] - concat_transaction["TransactionAmt"].astype(int)

In [55]:
concat_transaction.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V334,V335,V336,V337,V338,V339,istrain,hour,dayofweek,TransactionAmt_mod
0,2987000,0.0,2017-12-02 00:00:00,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,1,0,5,0.5
1,2987001,0.0,2017-12-02 00:00:01,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,1,0,5,0.0
2,2987002,0.0,2017-12-02 00:01:09,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,1,0,5,0.0
3,2987003,0.0,2017-12-02 00:01:39,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,1,0,5,0.0
4,2987004,0.0,2017-12-02 00:01:46,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,0,5,0.0


In [56]:
concat_transaction = concat_transaction[concat_transaction["TransactionAmt"] < 30000]

In [57]:
def count_encoder(df, column):
    df[column] = df[column].fillna(-1)
    count_enc = df.groupby(column)[column].count()
    df[f'count_enc_{column}'] = df[column].map(count_enc)
    return df

In [58]:
from sklearn.preprocessing import LabelEncoder
def label_encoder(df, column):
    label_enc = LabelEncoder()
    df[column] = df[column].astype(str)
    df[column] = label_enc.fit_transform(df[column])
    return df

### ProductCD

In [59]:
concat_transaction = count_encoder(concat_transaction, "ProductCD")
concat_transaction = label_encoder(concat_transaction, "ProductCD")

### card

In [60]:
for column in ["card1", "card2", "card3", "card4", "card5", "card6"]:
    concat_transaction = count_encoder(concat_transaction, column)
    concat_transaction = label_encoder(concat_transaction, column)

In [61]:
concat_transaction.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,hour,dayofweek,TransactionAmt_mod,count_enc_ProductCD,count_enc_card1,count_enc_card2,count_enc_card3,count_enc_card4,count_enc_card5,count_enc_card6
0,2987000,0.0,2017-12-02 00:00:00,68.5,4,4248,0,51,2,43,...,0,5,0.5,800655,56,17587,956843,9524,309,267646
1,2987001,0.0,2017-12-02 00:00:01,29.0,4,9979,305,51,3,3,...,0,5,0.0,800655,1338,5593,956843,347384,49489,267646
2,2987002,0.0,2017-12-02 00:01:09,59.0,4,11850,391,51,4,67,...,0,5,0.0,800655,1794,70496,956843,719649,102930,824959
3,2987003,0.0,2017-12-02 00:01:39,50.0,4,8796,468,51,3,18,...,0,5,0.0,800655,7635,11287,956843,347384,47061,824959
4,2987004,0.0,2017-12-02 00:01:46,50.0,1,11687,415,51,3,3,...,0,5,0.0,62397,30,27223,956843,347384,49489,267646


* 集約

In [62]:
agg_col = [["card1"], ["card2"], ["card3"], ["card5"],
           ["ProductCD", "card1"], ["ProductCD", "card2"], ["ProductCD", "card3"], ["ProductCD", "card5"],
           ["card1", "card2", "card3", "card5"], ["addr1"], ["C13"], ["hour"]]

In [63]:
for col_name in agg_col:
    concat_transaction[f'{col_name}_count'] = concat_transaction.groupby(columns)[columns[0]].transform("count")
    for target_col in ["TransactionAmt", "D15", "D10", "D4", "D2", "D1"]:    
        col_name = '-'.join(columns)
        concat_transaction[f'{col_name}_{target_col}_mean'] = concat_transaction.groupby(columns)[target_col].transform('mean')
        concat_transaction[f'{col_name}_{target_col}_std'] = concat_transaction.groupby(columns)[target_col].transform('std')
        
        concat_transaction[f'{col_name}_{target_col}_div_mean'] = concat_transaction[target_col] / concat_transaction[f'{col_name}_{target_col}_mean']
        concat_transaction[f'{col_name}_{target_col}_subtract_mean'] = concat_transaction[target_col] - concat_transaction[f'{col_name}_{target_col}_mean']
        concat_transaction[f'{col_name}_{target_col}_div_std'] = concat_transaction[target_col] / concat_transaction[f'{col_name}_{target_col}_std']
        concat_transaction[f'{col_name}_{target_col}_sum'] = concat_transaction.groupby(columns)[target_col].transform('sum')
        concat_transaction[f'{col_name}_{target_col}_deviation_value'] = (concat_transaction[target_col] - \
                                                                            concat_transaction[f'{col_name}_{target_col}_mean'] / \
                                                                            concat_transaction[f'{col_name}_{target_col}_std'])

In [64]:
concat_transaction.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,['card3']_count,['card5']_count,"['ProductCD', 'card1']_count","['ProductCD', 'card2']_count","['ProductCD', 'card3']_count","['ProductCD', 'card5']_count","['card1', 'card2', 'card3', 'card5']_count",['addr1']_count,['C13']_count,['hour']_count
0,2987000,0.0,2017-12-02 00:00:00,68.5,4,4248,0,51,2,43,...,56,56,56,56,56,56,56,56,56,56
1,2987001,0.0,2017-12-02 00:00:01,29.0,4,9979,305,51,3,3,...,1338,1338,1338,1338,1338,1338,1338,1338,1338,1338
2,2987002,0.0,2017-12-02 00:01:09,59.0,4,11850,391,51,4,67,...,1794,1794,1794,1794,1794,1794,1794,1794,1794,1794
3,2987003,0.0,2017-12-02 00:01:39,50.0,4,8796,468,51,3,18,...,7635,7635,7635,7635,7635,7635,7635,7635,7635,7635
4,2987004,0.0,2017-12-02 00:01:46,50.0,1,11687,415,51,3,3,...,30,30,30,30,30,30,30,30,30,30


In [65]:
concat_transaction["card_no"] = (concat_transaction["card1"]).astype(str) + "_" + (concat_transaction["card2"]).astype(str) + "_" + (concat_transaction["card3"]).astype(str) + "_" + (concat_transaction["card5"]).astype(str)

In [66]:
#concat_transaction = concat_transaction.sort_values(["card_no", "TransactionDT"], ascending=True)

In [67]:
#concat_transaction_groupby = concat_transaction.groupby(["card_no"])

In [68]:
"""
for i in range(1, 4):
    concat_transaction[f'TransactionAmt_by_card_no_shift{i}'] = concat_transaction_groupby["TransactionAmt"].shift(i)
    concat_transaction[f'TransactionAmt_by_card_no_shift{i}'] = concat_transaction[f'TransactionAmt_by_card_no_shift{i}'].fillna(-1)
"""

'\nfor i in range(1, 4):\n    concat_transaction[f\'TransactionAmt_by_card_no_shift{i}\'] = concat_transaction_groupby["TransactionAmt"].shift(i)\n    concat_transaction[f\'TransactionAmt_by_card_no_shift{i}\'] = concat_transaction[f\'TransactionAmt_by_card_no_shift{i}\'].fillna(-1)\n'

In [69]:
concat_transaction.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,['card5']_count,"['ProductCD', 'card1']_count","['ProductCD', 'card2']_count","['ProductCD', 'card3']_count","['ProductCD', 'card5']_count","['card1', 'card2', 'card3', 'card5']_count",['addr1']_count,['C13']_count,['hour']_count,card_no
0,2987000,0.0,2017-12-02 00:00:00,68.5,4,4248,0,51,2,43,...,56,56,56,56,56,56,56,56,56,4248_0_51_43
1,2987001,0.0,2017-12-02 00:00:01,29.0,4,9979,305,51,3,3,...,1338,1338,1338,1338,1338,1338,1338,1338,1338,9979_305_51_3
2,2987002,0.0,2017-12-02 00:01:09,59.0,4,11850,391,51,4,67,...,1794,1794,1794,1794,1794,1794,1794,1794,1794,11850_391_51_67
3,2987003,0.0,2017-12-02 00:01:39,50.0,4,8796,468,51,3,18,...,7635,7635,7635,7635,7635,7635,7635,7635,7635,8796_468_51_18
4,2987004,0.0,2017-12-02 00:01:46,50.0,1,11687,415,51,3,3,...,30,30,30,30,30,30,30,30,30,11687_415_51_3


In [70]:
concat_transaction = concat_transaction.sort_values(["TransactionDT"], ascending=True)

### addr

In [71]:
for column in ["addr1", "addr2"]:
    concat_transaction = count_encoder(concat_transaction, column)
    concat_transaction = label_encoder(concat_transaction, column)

In [72]:
concat_transaction.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,"['ProductCD', 'card2']_count","['ProductCD', 'card3']_count","['ProductCD', 'card5']_count","['card1', 'card2', 'card3', 'card5']_count",['addr1']_count,['C13']_count,['hour']_count,card_no,count_enc_addr1,count_enc_addr2
0,2987000,0.0,2017-12-02 00:00:00,68.5,4,4248,0,51,2,43,...,56,56,56,56,56,56,56,4248_0_51_43,43035,956413
1,2987001,0.0,2017-12-02 00:00:01,29.0,4,9979,305,51,3,3,...,1338,1338,1338,1338,1338,1338,1338,9979_305_51_3,76902,956413
2,2987002,0.0,2017-12-02 00:01:09,59.0,4,11850,391,51,4,67,...,1794,1794,1794,1794,1794,1794,1794,11850_391_51_67,48387,956413
3,2987003,0.0,2017-12-02 00:01:39,50.0,4,8796,468,51,3,18,...,7635,7635,7635,7635,7635,7635,7635,8796_468_51_18,17455,956413
4,2987004,0.0,2017-12-02 00:01:46,50.0,1,11687,415,51,3,3,...,30,30,30,30,30,30,30,11687_415_51_3,7107,956413


### dist

In [73]:
for column in ["dist1", "dist2"]:
    concat_transaction = count_encoder(concat_transaction, column)

In [74]:
concat_transaction.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,"['ProductCD', 'card5']_count","['card1', 'card2', 'card3', 'card5']_count",['addr1']_count,['C13']_count,['hour']_count,card_no,count_enc_addr1,count_enc_addr2,count_enc_dist1,count_enc_dist2
0,2987000,0.0,2017-12-02 00:00:00,68.5,4,4248,0,51,2,43,...,56,56,56,56,56,4248_0_51_43,43035,956413,4707,1023166
1,2987001,0.0,2017-12-02 00:00:01,29.0,4,9979,305,51,3,3,...,1338,1338,1338,1338,1338,9979_305_51_3,76902,956413,643488,1023166
2,2987002,0.0,2017-12-02 00:01:09,59.0,4,11850,391,51,4,67,...,1794,1794,1794,1794,1794,11850_391_51_67,48387,956413,38,1023166
3,2987003,0.0,2017-12-02 00:01:39,50.0,4,8796,468,51,3,18,...,7635,7635,7635,7635,7635,8796_468_51_18,17455,956413,643488,1023166
4,2987004,0.0,2017-12-02 00:01:46,50.0,1,11687,415,51,3,3,...,30,30,30,30,30,11687_415_51_3,7107,956413,643488,1023166


### emaildomain

In [75]:
for col in ["P_emaildomain", "R_emaildomain"]:
    concat_transaction[col] = concat_transaction[col].fillna("nan.nan")
    concat_transaction[f'{col}_0'] = concat_transaction[col].apply(lambda x : x.split(".")[0])
    #concat_transaction[f'{col}_1'] = concat_transaction[col].apply(lambda x : x.split(".")[1])
    concat_transaction[f'{col}_last'] = concat_transaction[col].apply(lambda x : x.split(".")[-1])

    concat_transaction[f'{col}_name_identification'] = "nan"
    for domain_name in ["yahoo", "ymail", "frontier", "rocketmail"]:
        concat_transaction.loc[concat_transaction[f'{col}_0'].str.contains(domain_name, na=False) ,f'{col}_name_identification'] = "Yahoo"
    for domain_name in ["hotmail", "outlook", "live", "msn"]:
        concat_transaction.loc[concat_transaction[f'{col}_0'].str.contains(domain_name, na=False) ,f'{col}_name_identification'] = "Microsoft"
    for domain_name in ["icloud", "mac", "me"]:
        concat_transaction.loc[concat_transaction[f'{col}_0'].str.contains(domain_name, na=False) ,f'{col}_name_identification'] = "Apple"
    for domain_name in ["prodigy", "att", "sbcglobal"]:
        concat_transaction.loc[concat_transaction[f'{col}_0'].str.contains(domain_name, na=False) ,f'{col}_name_identification'] = "AT&T"
    for domain_name in ["centurylink", "embarqmail", "q"]:
        concat_transaction.loc[concat_transaction[f'{col}_0'].str.contains(domain_name, na=False) ,f'{col}_name_identification'] = "Centurylink"
    for domain_name in ["aim", "aol"]:
        concat_transaction.loc[concat_transaction[f'{col}_0'].str.contains(domain_name, na=False) ,f'{col}_name_identification'] = "AOL"
    for domain_name in ["twc", "charter"]:
        concat_transaction.loc[concat_transaction[f'{col}_0'].str.contains(domain_name, na=False) ,f'{col}_name_identification'] = "Spectrum"
    
    for col_name in [col, f'{col}_0', f'{col}_last', f'{col}_name_identification']:
        concat_transaction = count_encoder(concat_transaction, col_name)
        concat_transaction = label_encoder(concat_transaction, col_name)

In [76]:
for column in ["P_emaildomain", "R_emaildomain"]:
    concat_transaction = count_encoder(concat_transaction, column)
    concat_transaction = label_encoder(concat_transaction, column)

In [77]:
#concat_transaction = concat_transaction.drop(["P_emaildomain", "R_emaildomain"], axis=1)

### C1~C14

### D1~D15

In [78]:
def count_nan(df, columns, names):
    df[f'{names}_nan_count'] = df[columns].isnull().sum(axis=1)
    return df

In [79]:
d_columns = []
for i in range(1, 16):
    d_columns.append(f'D{i}')
concat_transaction = count_nan(concat_transaction, d_columns, "D")
concat_transaction[d_columns] = concat_transaction[d_columns].fillna(-1)

In [80]:
concat_transaction.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,count_enc_P_emaildomain_last,count_enc_P_emaildomain_name_identification,R_emaildomain_0,R_emaildomain_last,R_emaildomain_name_identification,count_enc_R_emaildomain,count_enc_R_emaildomain_0,count_enc_R_emaildomain_last,count_enc_R_emaildomain_name_identification,D_nan_count
0,2987000,0.0,2017-12-02 00:00:00,68.5,4,4248,0,51,2,43,...,163648,705998,25,8,7,824068,824068,824068,991654,10
1,2987001,0.0,2017-12-02 00:00:01,29.0,4,9979,305,51,3,3,...,877699,705998,25,8,7,824068,824068,824068,991654,11
2,2987002,0.0,2017-12-02 00:01:09,59.0,4,11850,391,51,4,67,...,877699,112987,25,8,7,824068,824068,824068,991654,10
3,2987003,0.0,2017-12-02 00:01:39,50.0,4,8796,468,51,3,18,...,877699,192737,25,8,7,824068,824068,824068,991654,8
4,2987004,0.0,2017-12-02 00:01:46,50.0,1,11687,415,51,3,3,...,877699,705998,25,8,7,824068,824068,824068,991654,14


### M1~M9

In [81]:
m_columns = []
for i in range(1, 10):
    m_columns.append(f'M{i}')
concat_transaction = count_nan(concat_transaction, d_columns, "M")
for column in m_columns:
    concat_transaction = label_encoder(concat_transaction, column)
    concat_transaction[column] = concat_transaction[column].fillna("-1")

In [82]:
m_columns

['M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']

### V1~V339

In [83]:
v_columns = []
for i in range(1, 340):
    d_columns.append(f'V{i}')
concat_transaction = count_nan(concat_transaction, v_columns, "V")
concat_transaction[v_columns] = concat_transaction[v_columns].fillna(-1)

In [84]:
concat_transaction.columns

Index(['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt',
       'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5',
       ...
       'R_emaildomain_0', 'R_emaildomain_last',
       'R_emaildomain_name_identification', 'count_enc_R_emaildomain',
       'count_enc_R_emaildomain_0', 'count_enc_R_emaildomain_last',
       'count_enc_R_emaildomain_name_identification', 'D_nan_count',
       'M_nan_count', 'V_nan_count'],
      dtype='object', length=481)

In [85]:
concat_transaction = concat_transaction.replace([np.inf, -np.inf], np.nan)
concat_transaction = concat_transaction.fillna(-1)

In [86]:
concat_transaction = concat_transaction.drop(["TransactionDT"], axis=1)

## identity

In [87]:
concat_identity.head()

Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987004,0.0,70787.0,,,,,,,,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
1,2987008,-5.0,98945.0,,,0.0,-5.0,,,,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2,2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
3,2987011,-5.0,221832.0,,,0.0,-6.0,,,,...,chrome 62.0,,,,F,F,T,T,desktop,
4,2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,...,chrome 62.0,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS


### id_30

In [91]:
concat_identity["OS"] = concat_identity["id_30"].apply(lambda  x: str(x).split(" ")[0])
concat_identity = count_encoder(concat_identity, "OS")
concat_identity = label_encoder(concat_identity, "OS")

### id_31

In [92]:
concat_identity["browser"] = concat_identity["id_31"].apply(lambda x : str(x).split(" ")[0])
concat_identity = count_encoder(concat_identity, "browser")
concat_identity = label_encoder(concat_identity, "browser")

### id_01~id_11

In [93]:
for i in range(1, 10):
    concat_identity[f'id_0{i}'] = concat_identity[f'id_0{i}'].fillna(-1)
for i in [10,11]:
    concat_identity[f'id_{i}'] = concat_identity[f'id_{i}'].fillna(-1)

### id_12~id38

In [94]:
for i in range(12,39):
    concat_identity[f'id_{i}'] = concat_identity[f'id_{i}'].fillna(-1)
    concat_identity = count_encoder(concat_identity, f'id_{i}')
    concat_identity = label_encoder(concat_identity, f'id_{i}')

### DeviceType

In [95]:
concat_identity = count_encoder(concat_identity, "DeviceType")
concat_identity = label_encoder(concat_identity, "DeviceType")

### DeviceInfo

In [96]:
concat_identity = count_encoder(concat_identity, "DeviceInfo")
concat_identity = label_encoder(concat_identity, "DeviceInfo")

## merge

In [97]:
concat_merge = pd.merge(concat_transaction, concat_identity, on="TransactionID", how="left")

In [98]:
concat_merge = concat_merge.fillna(-999)

In [99]:
concat_merge.shape

(1097229, 553)

In [100]:
concat_merge.head()

Unnamed: 0,TransactionID,isFraud,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,count_enc_id_31,count_enc_id_32,count_enc_id_33,count_enc_id_34,count_enc_id_35,count_enc_id_36,count_enc_id_37,count_enc_id_38,count_enc_DeviceType,count_enc_DeviceInfo
0,2987000,0.0,68.5,4,4248,0,51,2,43,2,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
1,2987001,0.0,29.0,4,9979,305,51,3,3,2,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
2,2987002,0.0,59.0,4,11850,391,51,4,67,3,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
3,2987003,0.0,50.0,4,8796,468,51,3,18,3,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
4,2987004,0.0,50.0,1,11687,415,51,3,3,2,...,1067.0,44077.0,1430.0,132185.0,149464.0,267353.0,215149.0,108982.0,118173.0,9.0


In [101]:
concat_merge.isnull().sum()

TransactionID           0
isFraud                 0
TransactionAmt          0
ProductCD               0
card1                   0
card2                   0
card3                   0
card4                   0
card5                   0
card6                   0
addr1                   0
addr2                   0
dist1                   0
dist2                   0
P_emaildomain           0
R_emaildomain           0
C1                      0
C2                      0
C3                      0
C4                      0
C5                      0
C6                      0
C7                      0
C8                      0
C9                      0
C10                     0
C11                     0
C12                     0
C13                     0
C14                     0
                       ..
count_enc_browser       0
count_enc_id_12         0
count_enc_id_13         0
count_enc_id_14         0
count_enc_id_15         0
count_enc_id_16         0
count_enc_id_17         0
count_enc_id

In [102]:
train_processed = concat_merge[concat_merge["istrain"] == 1]
test_processed = concat_merge[concat_merge["istrain"] == 0]

In [103]:
test_processed = test_processed.drop(["isFraud"], axis=1)

In [104]:
import feather

In [105]:
train_processed.dtypes

TransactionID             int64
isFraud                 float64
TransactionAmt          float64
ProductCD                 int64
card1                     int64
card2                     int64
card3                     int64
card4                     int64
card5                     int64
card6                     int64
addr1                     int64
addr2                     int64
dist1                   float64
dist2                   float64
P_emaildomain             int64
R_emaildomain             int64
C1                      float64
C2                      float64
C3                      float64
C4                      float64
C5                      float64
C6                      float64
C7                      float64
C8                      float64
C9                      float64
C10                     float64
C11                     float64
C12                     float64
C13                     float64
C14                     float64
                         ...   
count_en

In [106]:
feather.write_dataframe(train_processed, f'../data/processed/train_processed_{version}.feather')
feather.write_dataframe(test_processed, f'../data/processed/test_processed_{version}.feather')