I created this notebook to **visualize the time trend (pattern) across statements 1 to 13 for each feature**. Specifically, I only use the customers in the training set that have exactly 13 statements. First, I rank each customer's statements by their datetime. Then, for each feature (e.g., B_2), I select all the statements with the same rank (e.g., 1st) and average them to get the mean value of that feature at the 1st statement. This lets me plot the mean value of the feature from the 1st to the 13th statement, separately for the non-default and default groups. Many of the features show clearly distinguishable trends for the two groups. For example, for the risk feature R_1, the default group shows an increasing trend over time, whereas the non-default group stays on a low, flat line. My guess is that this indicates that the risk of the default group grows as the number of statements increases.

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

import gc

In [None]:
# Read the training labels shipped in the official competition folder.
labels = pd.read_csv('../input/amex-default-prediction/train_labels.csv')

# Read the training data preprocessed by raddar, then parse the S_2
# column into datetime values so it can be sorted and ranked later on.
train = pd.read_parquet('../input/amex-data-integer-dtypes-parquet-format/train.parquet')
train = train.assign(S_2=pd.to_datetime(train['S_2']))

# Attach the label columns to every row by joining on customer_ID
# (DataFrame.merge performs an inner join by default).
train_with_labels = train.merge(labels, on='customer_ID')

# The two source frames are no longer needed once merged; drop the
# references and ask the garbage collector to reclaim the memory.
del train, labels
gc.collect()

In [None]:
# Number of statements per customer: count the non-null S_2 values that
# belong to each customer_ID (the result is indexed by customer_ID).
statement_num = train_with_labels['S_2'].groupby(train_with_labels['customer_ID']).count()

# Customers with 13 statements

In [None]:
# From here on the analysis is restricted to customers that have
# exactly 13 statements.
has_13 = statement_num == 13
customers_13 = statement_num[has_13].index.tolist()

# Keep only the rows that belong to one of those customers.
in_13 = train_with_labels['customer_ID'].isin(customers_13)
train_with_labels_13 = train_with_labels.loc[in_13, :]

In [None]:
# Order the rows by customer and, within each customer, by statement date.
train_with_labels_13 = train_with_labels_13.sort_values(by=['customer_ID', 'S_2'])

# Rank every customer's statements by their datetime and store the rank
# in a new column called 'statement_idx'. method='first' breaks ties by
# order of appearance, so each statement of a customer gets its own rank.
dates_per_customer = train_with_labels_13.groupby(['customer_ID'])['S_2']
train_with_labels_13['statement_idx'] = dates_per_customer.rank(method='first')

In [None]:
# Reposition 'statement_idx' so it becomes the third column
# (index 2), which places it right after S_2 -- assuming customer_ID and
# S_2 are the first two columns of the frame.
cols = [col for col in train_with_labels_13.columns if col != 'statement_idx']
cols.insert(2, 'statement_idx')
train_with_labels_13 = train_with_labels_13[cols]

In [None]:
# For each target group (non-default or default) and each statement
# position (1st, 2nd, ..., 13th), average every feature over all customers.
#
# numeric_only=True is passed explicitly: older pandas silently skipped
# non-numeric columns here, but since pandas 2.0 the default is
# numeric_only=False and mean() raises on columns it cannot average
# (customer_ID is presumably a string ID -- confirm against the data).
# Being explicit keeps the cell working on both old and new pandas.
statement_wise_mean = (
    train_with_labels_13
    .groupby(['target', 'statement_idx'])
    .mean(numeric_only=True)
)

# Turn the (target, statement_idx) group keys back into ordinary columns;
# they become the first two columns of the frame.
statement_wise_mean = statement_wise_mean.reset_index()

# Separate the means into the two groups (target == 0 and target == 1)
# and renumber the rows of each group from 0.
statement_wise_mean_target_0 = (
    statement_wise_mean
    .loc[statement_wise_mean['target'] == 0, :]
    .reset_index(drop=True)
)
statement_wise_mean_target_1 = (
    statement_wise_mean
    .loc[statement_wise_mean['target'] == 1, :]
    .reset_index(drop=True)
)

# The combined frame is not needed once it has been split.
del statement_wise_mean

In [None]:
# Give every feature column a group-specific name: '_Pos' is appended for
# the target == 1 frame and '_Neg' for the target == 0 frame. The first two
# columns of each frame are skipped and keep their names.
pos_cols = list(statement_wise_mean_target_1.columns[2:])
new_pos_cols = [name + '_Pos' for name in pos_cols]
cols_for_Pos = dict(zip(pos_cols, new_pos_cols))

neg_cols = list(statement_wise_mean_target_0.columns[2:])
new_neg_cols = [name + '_Neg' for name in neg_cols]
cols_for_Neg = dict(zip(neg_cols, new_neg_cols))

# Apply the old-name -> new-name mappings to the two frames.
statement_wise_mean_target_0 = statement_wise_mean_target_0.rename(columns=cols_for_Neg)
statement_wise_mean_target_1 = statement_wise_mean_target_1.rename(columns=cols_for_Pos)

In [None]:
# Features flagged as categorical; a note is printed above their plot.
cat_features = [
    "B_30", "B_38", "D_114", "D_116", "D_117", "D_120",
    "D_126", "D_63", "D_64", "D_66", "D_68",
]

# Go through the features in alphabetical order (pos_cols is sorted in place).
pos_cols.sort()
for feat in pos_cols:
    # Banner line with the feature name.
    print(f'---------- {feat} ----------')

    if feat in cat_features:
        print('categorical feature')

    # Put the per-statement means of both groups side by side and draw
    # them as two lines in one figure.
    neg_curve = statement_wise_mean_target_0[feat + '_Neg']
    pos_curve = statement_wise_mean_target_1[feat + '_Pos']
    comparision = pd.concat([neg_curve, pos_curve], axis=1)
    comparision.plot(figsize=(9, 4.5))
    plt.show()

    # Two empty lines to separate consecutive features in the output.
    print('')
    print('')

---