# 2. Feature Engineering (Strategy 2)

In [None]:
import gc
import os
import joblib
import random
import warnings
import itertools
from tqdm import tqdm
from itertools import combinations
import warnings; warnings.filterwarnings('ignore')

import scipy as sp

import numpy as np
import pandas as pd
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

import scipy as sp

import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split

In [None]:
def get_difference(data, num_features):
    """Compute lagged differences at each customer's final row.

    For every ``customer_ID`` group the columns in ``num_features`` are
    differenced with lags 1, 3 and 6 (``DataFrame.diff``) and only the value
    at the group's last row is kept. Rows are used in the order in which they
    appear in ``data``; no sorting is done here.

    Args:
        data: DataFrame containing a ``customer_ID`` column and every column
            listed in ``num_features``.
        num_features: Names of the columns to difference.

    Returns:
        DataFrame with one row per customer (in group-key order) holding the
        float32 columns ``<col>_diff1``, ``<col>_diff3`` and ``<col>_diff6``
        followed by ``customer_ID``. A value is NaN when the customer has too
        few rows for that lag. Empty input yields an empty frame with the
        same columns.
    """
    num_features = list(num_features)
    lags = (1, 3, 6)
    last_diffs = {lag: [] for lag in lags}
    customer_ids = []

    # Group by the bare column name. A one-element list key makes pandas >= 2.0
    # yield 1-tuples as group keys, which would fill customer_ID with tuples
    # and break every later merge on that column.
    for customer_id, group in tqdm(data.groupby('customer_ID')):
        values = group[num_features]
        for lag in lags:
            # iloc[[-1]] keeps a 2-D (1, n_features) slice so the per-customer
            # rows can be stacked with a single concatenate below.
            last_diffs[lag].append(
                values.diff(lag).iloc[[-1]].values.astype(np.float32)
            )
        customer_ids.append(customer_id)

    frames = []
    for lag in lags:
        if last_diffs[lag]:
            stacked = np.concatenate(last_diffs[lag], axis=0)
        else:
            # np.concatenate rejects an empty list; keep the expected shape.
            stacked = np.empty((0, len(num_features)), dtype=np.float32)
        frames.append(
            pd.DataFrame(stacked, columns=[f'{col}_diff{lag}' for col in num_features])
        )

    df_all = pd.concat(frames, axis=1)
    df_all['customer_ID'] = customer_ids
    return df_all

In [None]:
def _build_customer_features(data, num_features, cat_features):
    """Collapse row-level data into one engineered feature row per customer.

    Shared by the train and test pipelines so both receive exactly the same
    transformations.

    Args:
        data: Raw DataFrame with a ``customer_ID`` column plus every column in
            ``num_features`` and ``cat_features``.
        num_features: Names of the numerical columns.
        cat_features: Names of the categorical columns.

    Returns:
        DataFrame with one row per customer containing numerical aggregates,
        categorical aggregates and the lagged differences produced by
        ``get_difference``, inner-joined on ``customer_ID``.
    """
    by_customer = data.groupby("customer_ID")

    num_agg = by_customer[num_features].agg(['first', 'mean', 'std', 'min', 'max'])
    num_agg.columns = ['_'.join(x) for x in num_agg.columns]
    num_agg.reset_index(inplace = True)

    # "<col>_last" is the mean over each customer's final two rows, not the
    # raw last value. NOTE(review): this presumes rows are already in
    # chronological order per customer -- confirm against the input files.
    tail2_num_agg = by_customer.tail(2).groupby("customer_ID")[num_features].agg(['mean'])
    tail2_num_agg.columns = ['_'.join([xx.replace('mean','last') for xx in x]) for x in tail2_num_agg.columns]
    tail2_num_agg.reset_index(inplace = True)
    num_agg = num_agg.merge(tail2_num_agg, how = 'inner', on = 'customer_ID')
    del tail2_num_agg

    for col in num_features:
        num_agg[f'{col}_last_mean_diff'] = num_agg[f'{col}_last'] - num_agg[f'{col}_mean']
        num_agg[f'{col}_last_first_diff'] = num_agg[f'{col}_last'] - num_agg[f'{col}_first']

    cat_agg = by_customer[cat_features].agg(['count', 'first', 'last', 'nunique'])
    cat_agg.columns = ['_'.join(x) for x in cat_agg.columns]
    cat_agg.reset_index(inplace = True)

    # Downcast column by column to halve memory without copying the whole frame.
    cols = list(num_agg.dtypes[num_agg.dtypes == 'float64'].index)
    for col in tqdm(cols):
        num_agg[col] = num_agg[col].astype(np.float32)
    cols = list(cat_agg.dtypes[cat_agg.dtypes == 'int64'].index)
    for col in tqdm(cols):
        cat_agg[col] = cat_agg[col].astype(np.int32)

    diff = get_difference(data, num_features)
    return num_agg.merge(cat_agg, how = 'inner', on = 'customer_ID').\
            merge(diff, how = 'inner', on = 'customer_ID')


def read_preprocess_data():
    """Build the per-customer train and test feature tables and save them.

    Reads ``train.parquet`` / ``test.parquet`` and ``train_labels.csv`` from
    the fixed ``../input/...`` paths below, engineers one feature row per
    customer for each set, joins the labels onto the train features, adds
    ``_round2`` copies of selected columns, prints both shapes and writes
    ``train_fe_v2.parquet`` and ``test_fe_v2.parquet`` to the current working
    directory.

    Returns:
        None. The results are the two parquet files.
    """
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]

    train = pd.read_parquet('../input/amex-data-integer-dtypes-parquet-format/train.parquet')
    # Read the column names directly; DataFrame.drop would copy the whole frame.
    features = [col for col in train.columns if col not in ('customer_ID', 'S_2')]
    num_features = [col for col in features if col not in cat_features]
    print('Starting training feature engineer...')
    # Rebinding `train` lets the raw frame be freed as soon as the helper returns.
    train = _build_customer_features(train, num_features, cat_features)
    train_labels = pd.read_csv('../input/amex-default-prediction/train_labels.csv')
    train = train.merge(train_labels, how = 'inner', on = 'customer_ID')
    del train_labels
    gc.collect()

    test = pd.read_parquet('../input/amex-data-integer-dtypes-parquet-format/test.parquet')
    print('Starting test feature engineer...')
    # The test set reuses the column lists derived from the train file.
    test = _build_customer_features(test, num_features, cat_features)
    gc.collect()

    # Add 2-decimal rounded copies of the "last"/"mean" columns.
    # NOTE: the filter runs on the aggregated names, so categorical aggregates
    # such as "B_30_last" also match; kept as-is so the output columns stay the same.
    engineered = [col for col in train.columns if col != 'customer_ID']
    candidates = [col for col in engineered if col not in cat_features]
    num_cols = [col for col in candidates if (('last' in col or 'mean' in col) and 'diff' not in col)]
    for col in num_cols:
        train[col + '_round2'] = train[col].round(2)
        test[col + '_round2'] = test[col].round(2)

    print('train.shape:',train.shape)
    print('test.shape:',test.shape)
    train.to_parquet('train_fe_v2.parquet')
    test.to_parquet('test_fe_v2.parquet')

In [None]:
# Run the whole feature-engineering pipeline; it writes train_fe_v2.parquet
# and test_fe_v2.parquet to the current working directory.
read_preprocess_data()