# AMEX Competition Feature engineering

This notebook builds on insights from [EDA which makes sense ⭐️⭐️⭐️⭐️⭐️](https://www.kaggle.com/code/ambrosm/amex-eda-which-makes-sense)
and [AMEX - Feature Engineering](https://www.kaggle.com/code/lucasmorin/amex-feature-engineering).

In [None]:
import pandas as pd
import numpy as np
import pickle
from matplotlib import pyplot as plt
import random
import datetime
import math
import gc
import warnings
import seaborn as sns
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, RobustScaler

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype covering their range.

    Integer columns are narrowed to the first of int8/int16/int32/int64 whose
    range contains the column's min and max (bounds inclusive). Float columns
    are narrowed to float16 or float32 on the same range test and otherwise
    stored as float64. Only the value *range* is checked, so narrowing floats
    can discard precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        When True, print the resulting memory usage and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's min/max still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range too wide, or min/max are NaN (comparisons are False).
                df[col] = df[col].astype(np.float64)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        # Guard the ratio so a zero-byte frame cannot raise ZeroDivisionError.
        if start_mem > 0:
            print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

# Reading and preprocessing the training data

We read the data from @munumbutt's [AMEX-Feather-Dataset](https://www.kaggle.com/datasets/munumbutt/amexfeather). Then we create some groups of features:
- Selected features taken as minimums, maximums, averages, and standard deviations over all statements of a customer
- Selected features taken from the last statement of a customer
- Other features, including the number of unique values of each categorical feature per customer and the number of statements each customer has

We ordinal-encode the categorical features, one-hot encode their last-statement values, and fill any missing values that remain after aggregation with -1.

The code has been optimized for memory efficiency rather than readability. In particular, `.iloc[mask_array, columns]` needs much less RAM than the groupby construction used in previous versions of the notebook.

We process train and test data separately


In [None]:
# Read a single row of the raw *training* CSV (despite the variable name);
# the next cell uses it only for its column names.
test_read = pd.read_csv('../input/amex-default-prediction/train_data.csv',nrows=1)

test_read

In [None]:
# %%time

# Every raw column except the customer key (and the helper column 'S_2_max').
all_features = [
    name for name in test_read.columns
    if name not in ('customer_ID', 'S_2_max')
]

# Columns treated as categorical.
cat_features = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]

# Names of the per-customer "last value" aggregates that get one-hot encoded.
ohe_cat_features = [name + '_last' for name in cat_features]

# Everything that is not categorical is aggregated numerically.
num_features = [name for name in all_features if name not in cat_features]

def read_file(path, i):
    """Load one feather file and apply row-level preprocessing.

    Steps: turn ``S_2`` into "days before the customer's latest statement",
    round float16 columns to two decimals, ordinal-encode the categorical
    columns with the module-level ``enc``, then impute missing values
    (per-column mode for categoricals, per-column median for numerics).

    Parameters
    ----------
    path : str
        Feather file to read.
    i : int
        Dataset index. ``0`` fits ``enc`` on this file before transforming;
        any other value reuses the already-fitted encoder.

    Returns
    -------
    pandas.DataFrame
        The preprocessed statement-level frame.
    """
    df = pd.read_feather(path)
    df['S_2'] = pd.to_datetime(df['S_2'])
    df['S_2_max'] = df[['S_2','customer_ID']].groupby('customer_ID').S_2.transform('max')
    # Days between a row and the previous row of the same customer (NaN on each first row).
    # NOTE(review): S_2_diff and S_2_max are not in num_features (built from the raw
    # CSV header), so preprocess() does not aggregate them - confirm this is intended.
    df['S_2_diff'] = df[['S_2','customer_ID']].groupby('customer_ID').S_2.transform('diff').dt.days
    df['S_2'] = (df['S_2_max']-df['S_2']).dt.days

    #de-noising
    for col in df.columns:
        if df[col].dtype=='float16':
            # Round in float32, then store back as float16.
            df[col] = df[col].astype('float32').round(decimals=2).astype('float16')

    if i==0:
        enc.fit(df[cat_features])
    df[cat_features] = enc.transform(df[cat_features])

    # Impute each column group from its own columns. Filling with a Series keyed by
    # column name fills every row; filling with the DataFrame returned by mode()
    # would align on the row index and touch only its first row(s).
    cat_modes = df[cat_features].mode()
    if not cat_modes.empty:
        df[cat_features] = df[cat_features].fillna(cat_modes.iloc[0])
    df[num_features] = df[num_features].fillna(df[num_features].median())

    print('shape of data:', df.shape)
    gc.collect()
    return df
def preprocess(df, i, j):
    """Aggregate statement-level rows into one feature row per customer.

    Builds four column groups and concatenates them side by side:
    numeric aggregates (mean/std/min/max/last), categorical ``nunique``
    counts, one-hot columns for the categorical ``last`` values, and the
    number of rows per customer.

    The module-level ``ohe`` is fitted only on the very first chunk
    (``i == 0 and j == 0``); every other call just transforms.

    Parameters
    ----------
    df : pandas.DataFrame
        Statement-level rows for a subset of customers.
    i : int
        Dataset index passed through from ``process``.
    j : int
        Chunk index within the dataset.

    Returns
    -------
    pandas.DataFrame
        Customer-level features indexed by ``customer_ID``.
    """
    def _flatten(frame):
        # ('P_2', 'mean') -> 'P_2_mean'
        frame.columns = ['_'.join(pair) for pair in frame.columns]
        return frame

    per_customer = df.groupby("customer_ID")

    num_stats = _flatten(
        per_customer[num_features].agg(['mean', 'std', 'min', 'max', 'last'])
    )
    cat_stats = _flatten(
        per_customer[cat_features].agg(['last', 'nunique']).astype('int')
    )

    is_first_chunk = (i == 0 and j == 0)
    if is_first_chunk:
        ohe.fit(cat_stats[ohe_cat_features])
    encoded = ohe.transform(cat_stats[ohe_cat_features]).astype(np.int16)
    cat_onehot = pd.DataFrame(
        encoded,
        index=cat_stats.index,
        columns=ohe.get_feature_names_out(),
    )
    # The raw "last" columns are now represented by their one-hot expansion.
    cat_stats = cat_stats.drop(columns=ohe_cat_features)

    row_counts = _flatten(
        per_customer[['customer_ID']].agg(['count']).astype('int')
    )

    features = pd.concat([num_stats, cat_stats, cat_onehot, row_counts], axis=1)

    del num_stats, cat_stats, row_counts, cat_onehot
    gc.collect()
    print('shape after engineering', features.shape )
    return features

              
# Shared encoder/scaler instances. read_file() fits `enc` and preprocess() fits `ohe`
# on the first (training) pass; later passes reuse the fitted objects.
try:
    # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`.
    ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=np.int16, handle_unknown='ignore')
except TypeError:
    # Older scikit-learn releases only know the `sparse` keyword.
    ohe = OneHotEncoder(drop='first', sparse=False, dtype=np.int16, handle_unknown='ignore')
enc = OrdinalEncoder()
sca = RobustScaler()
# imp = SimpleImputer(strategy = 'median')
def process(path, i, splits):
    """Run the full pipeline for one dataset and return customer-level features.

    Reads and cleans the file with ``read_file``, aggregates the customers in
    ``splits`` consecutive chunks with ``preprocess`` (to bound peak memory),
    stacks the chunk results, drops the constant ``S_2`` aggregates and fills
    the remaining missing values with -1.

    Parameters
    ----------
    path : str
        Feather file to process.
    i : int
        Dataset index; ``0`` additionally drops the ``target`` column and
        triggers encoder fitting in the helpers.
    splits : int
        Number of customer chunks to aggregate separately.

    Returns
    -------
    pandas.DataFrame
        One row per customer, with ``customer_ID`` as a string column.
    """
    def _chunks(seq, n_parts):
        # Near-equal consecutive slices; the first `extra` slices get one more item.
        base, extra = divmod(len(seq), n_parts)
        start = 0
        for part in range(n_parts):
            stop = start + base + (1 if part < extra else 0)
            yield seq[start:stop]
            start = stop

    raw = read_file(path, i)
    if i == 0:
        raw = raw.drop(columns='target')

    customer_chunks = _chunks(raw.customer_ID.unique(), splits)

    parts = []
    for j, ids in enumerate(customer_chunks):
        print(j)
        chunk_rows = raw[raw.customer_ID.isin(ids)]
        chunk_features = preprocess(chunk_rows, i, j)
        parts.append(chunk_features)
        del chunk_rows, chunk_features
        gc.collect()

    # Stack the chunks, then copy to defragment the resulting frame.
    features = pd.concat(parts, axis=0).copy()

    # customer_ID moves from the index into a regular column.
    features = features.reset_index()

    #drop constant columns
    features = features.drop(columns=['S_2_min', 'S_2_last'])

    # (dtype downcasting via reduce_mem_usage is intentionally left disabled here)

    # Impute missing values
    features = features.fillna(value=-1)
    features['customer_ID'] = features['customer_ID'].astype(str)
    return features

In [None]:
# Build the training features: i=0 makes the helpers fit the encoders and drop
# the 'target' column; the customers are aggregated in 2 chunks.
train = process('../input/amexfeather/train_data.ftr', 0, 2)
gc.collect()
print('Shapes:', train.shape)
train.head()

In [None]:
# Attach the labels to the training features.
# Look each label up by customer_ID instead of assigning positionally, so the
# result does not depend on both frames happening to share the same row order.
target = pd.read_csv('../input/amex-default-prediction/train_labels.csv')

print(target.shape)
train['target'] = train['customer_ID'].map(target.set_index('customer_ID')['target'])
# Fail loudly if any customer did not receive a label.
assert train['target'].notna().all(), 'some customers in train have no label'
target.head()

In [None]:
# Persist the processed training set in both feather and CSV form.
# to_csv is called with its defaults, so the row index is written as an extra first column.
train.to_feather('train_processed.ftr')
train.to_csv('train_processed.csv')

# Test Data

In [None]:
# Release the training frames before the test file is loaded.
del train, target
gc.collect()

In [None]:
# Build the test features: i=1 reuses the encoders fitted during the training pass;
# the customers are aggregated in 30 chunks.
test = process('../input/amexfeather/test_data.ftr', 1, 30)

print('Shapes:', test.shape)
gc.collect()

In [None]:
# The fitted encoders are not used past this point.
# After this cell, process() cannot be called again without re-running the cell that creates them.
del enc, ohe
gc.collect()

In [None]:
# Display the processed test frame.
test

In [None]:
# Persist the processed test set in both feather and CSV form.
# to_csv is called with its defaults, so the row index is written as an extra first column.
test.to_feather('test_processed.ftr')
test.to_csv('test_processed.csv')