In [None]:
import numpy as np
import pandas as pd
from datetime import datetime,timedelta
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from scipy.stats import rankdata,ks_2samp
from collections import Counter,OrderedDict
import warnings
warnings.filterwarnings('ignore')

import os
# Walk /kaggle/input recursively and print the full path of every file found,
# so the dataset paths used by the load cell below can be checked by eye.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# This notebook covers the following analyses

1. Check the relationship between the transaction table and the identity table  
2. Compare the distribution of each feature when its target is 1 (fraud) versus 0 (normal),  
using the Kolmogorov-Smirnov test  
3. Analyze the time consistency of each feature
4. Analyze the correlation between all pairs of features

In [None]:
# ================================================================================
# Column names of the two raw tables, in file order.
# The numbered families (card*, C*, D*, M*, V*, id_*) are generated rather than typed out.

train_transaction_all_columns=(
  ['TransactionID','isFraud','TransactionDT','TransactionAmt','ProductCD']
  +['card{}'.format(i) for i in range(1,7)]
  +['addr1','addr2','dist1','dist2','P_emaildomain','R_emaildomain']
  +['C{}'.format(i) for i in range(1,15)]
  +['D{}'.format(i) for i in range(1,16)]
  +['M{}'.format(i) for i in range(1,10)]
  +['V{}'.format(i) for i in range(1,340)]
)
train_identity_all_columns=(
  ['TransactionID']
  +['id_{:02d}'.format(i) for i in range(1,39)]
  +['DeviceType','DeviceInfo']
)

# ================================================================================
# Categorical columns in transaction

categorical_columns_in_train_transaction=(
  ['ProductCD']
  +['card{}'.format(i) for i in range(1,7)]
  +['addr1','addr2']
  +['P_emaildomain','R_emaildomain']
  +['M{}'.format(i) for i in range(1,10)]
)

In [None]:
# ================================================================================
# Numerical columns in transaction: every transaction column that is not listed as categorical

numerical_columns_in_train_transaction=list(filter(
  lambda column_name:column_name not in categorical_columns_in_train_transaction,
  train_transaction_all_columns))

In [None]:
# ================================================================================
# Categorical columns in identity: the two device columns followed by id_12 ... id_38

categorical_columns_in_train_identity=(
  ['DeviceType','DeviceInfo']
  +['id_{}'.format(i) for i in range(12,39)]
)

In [None]:
# ================================================================================
# Numerical columns in identity: every identity column that is not listed as categorical

numerical_columns_in_train_identity=list(filter(
  lambda column_name:column_name not in categorical_columns_in_train_identity,
  train_identity_all_columns))

In [None]:
# ================================================================================
# Setting datatype in loading dataframe:
# numerical columns are read as float32, categorical columns as pandas category

dtypes = dict.fromkeys(
  numerical_columns_in_train_transaction+numerical_columns_in_train_identity,
  'float32')
dtypes.update(dict.fromkeys(
  categorical_columns_in_train_transaction+categorical_columns_in_train_identity,
  'category'))

In [None]:
# ================================================================================
# Load train data
# Both CSV files are parsed with the `dtypes` mapping built in the previous cell,
# so each listed column gets the dtype string assigned to it there.

train_transaction=pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv',dtype=dtypes)
train_identity=pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv',dtype=dtypes)

In [None]:
# ================================================================================
# 1. Check the relationship of transaction table data and identity table data  

# At first, I thought these 2 tables have 1 (identity) : N (transaction) relationship but it is 1:1 relationship  
# TransactionID in transaction is consecutive but TransactionID in identity is not consecutive  

In [None]:
# Are TransactionID values of train transaction table data unique?
# Yes, 590540 rows are unique in TransactionID column
# (inner Counter: occurrences per ID; outer Counter: how many IDs share each occurrence count)
Counter(Counter(train_transaction["TransactionID"].tolist()).values())

In [None]:
# Are TransactionID values of train identity table data unique?
# Yes, 144233 rows are unique in TransactionID column
# (inner Counter: occurrences per ID; outer Counter: how many IDs share each occurrence count)
Counter(Counter(train_identity["TransactionID"].tolist()).values())

In [None]:
# ================================================================================
# See the data by using Python's set form

def investigate_frequency(train_transaction,train_identity):
  """Compare the TransactionID sets of the two tables.

  Args:
    train_transaction: frame with a "TransactionID" column.
    train_identity: frame with a "TransactionID" column.

  Returns:
    Tuple of three counts: IDs present in both tables, IDs only in
    `train_transaction`, and IDs only in `train_identity`.
  """
  transaction_ids=set(train_transaction["TransactionID"])
  identity_ids=set(train_identity["TransactionID"])

  shared_ids=transaction_ids&identity_ids
  only_in_transaction=transaction_ids-identity_ids
  only_in_identity=identity_ids-transaction_ids
  return len(shared_ids),len(only_in_transaction),len(only_in_identity)

In [None]:
# Count the TransactionIDs shared by both tables and those found in only one of them.
# The bare tuple on the last line displays the three counts as the cell output.
intersection_of_transaction_and_identity,transaction_minus_identity,identity_minus_transaction=investigate_frequency(train_transaction,train_identity)
intersection_of_transaction_and_identity,transaction_minus_identity,identity_minus_transaction

In [None]:
def frequency_visualization(
  frequency_distribution_of_transaction_TransactionID,
  frequency_distribution_of_idendity_TransactionID,
  number_of_transaction_rows,
  number_of_identity_rows,
  intersection_of_transaction_and_identity,
  transaction_minus_identity,
  identity_minus_transaction):
  """Draw two pie charts and one bar chart summarising TransactionID overlap.

  Args:
    frequency_distribution_of_transaction_TransactionID: mapping
      {occurrence count: number of IDs} for the transaction table.
    frequency_distribution_of_idendity_TransactionID: the same mapping for the
      identity table (parameter name kept as-is for existing callers).
    number_of_transaction_rows: row count of the transaction table.
    number_of_identity_rows: row count of the identity table.
    intersection_of_transaction_and_identity: number of IDs in both tables.
    transaction_minus_identity: number of IDs only in the transaction table.
    identity_minus_transaction: number of IDs only in the identity table.

  Returns:
    None. The figure is created through pyplot and left open for display.
  """

  def my_fmt(x):
    return '{:.4f}%'.format(x)

  # Raw string: "\c" is not a valid escape sequence. The $...$ makes matplotlib
  # render the intersection symbol instead of the literal text "\cap".
  bar_labels=["transaction","identity",r"Transaction $\cap$ identity","Transaction-Identity","Identity-Transaction"]
  bar_heights=[
    number_of_transaction_rows,
    number_of_identity_rows,
    intersection_of_transaction_and_identity,
    transaction_minus_identity,
    identity_minus_transaction]

  plt.figure(figsize=(11,11))
  ax1=plt.subplot2grid((2,2),(0,0),colspan=1)
  ax2=plt.subplot2grid((2,2),(0,1),colspan=1)
  ax3=plt.subplot2grid((2,2),(1,0),colspan=2)

  # Materialise the dict views so the plotting calls always receive plain sequences.
  ax1.pie(
    list(frequency_distribution_of_transaction_TransactionID.values()),
    labels=list(frequency_distribution_of_transaction_TransactionID.keys()),
    autopct=my_fmt)
  ax1.set_title('TransactionID distribution from transaction')
  ax2.pie(
    list(frequency_distribution_of_idendity_TransactionID.values()),
    labels=list(frequency_distribution_of_idendity_TransactionID.keys()),
    autopct=my_fmt)
  ax2.set_title('TransactionID distribution from identity')

  ax3.bar(bar_labels,bar_heights)
  ax3.set_ylabel('Number of rows')
  # Annotate each bar on ax3 explicitly instead of relying on pyplot's current axes.
  for index,data in enumerate(bar_heights):
    ax3.text(x=index,y=data+1,s=f"{data}",fontdict=dict(fontsize=11))

In [None]:
# matplotlib is imported here; frequency_visualization looks up `plt` when it is called.
import matplotlib.pyplot as plt

# Arguments, in order: the count-of-counts for TransactionID in each table,
# the row count of each table, and the three overlap counts computed earlier.
frequency_visualization(
    Counter(list(Counter(list(train_transaction["TransactionID"])).values())),
    Counter(list(Counter(list(train_identity["TransactionID"])).values())),
    train_transaction.shape[0],
    train_identity.shape[0],
    intersection_of_transaction_and_identity,
    transaction_minus_identity,
    identity_minus_transaction)

In [None]:
# ================================================================================
# Merge the tables and sort it
# (left join keeps every transaction row, including those without identity data)

csv_train=(
  train_transaction
  .merge(train_identity,on=['TransactionID'],how='left')
  .sort_values(by=['TransactionID'],axis=0)
)
csv_train.head()

In [None]:
# ================================================================================
# 2. Compare the distribution of each feature data when its target is 1(fraud) and 0(normal)  
# by using Kolmogorov-Smirnov test

In [None]:
def investigate_difference_of_target0_and_target1_numerical_date_distribution(csv_train,plot=True,alpha=0.05):
  """Flag numerical columns whose distribution differs between isFraud==0 and isFraud==1.

  For every numerical feature column present in `csv_train`, values outside the
  0.3%-99.7% quantile range of the whole column are discarded (NaN is discarded
  too, because the bound comparisons are False for NaN). The two classes are then
  compared with a two-sample Kolmogorov-Smirnov test.

  The test is applied to the filtered sample values themselves. Feeding it
  binned, min-max-normalised histogram heights would compare two 10-element
  arrays of bar heights rather than the two distributions.

  Args:
    csv_train: merged training frame containing an 'isFraud' column.
    plot: when True, draw one density plot per column (blue: isFraud==0,
      red: isFraud==1) on bins shared by both classes.
    alpha: significance level; columns with p-value below it are returned.

  Returns:
    List of [column_name, p_value] for the columns with p_value < alpha.
    Columns where either class has no value left after filtering are skipped.

  Note:
    With hundreds of thousands of rows, even tiny differences become
    statistically significant, so most columns may be flagged.
  """
  # Numerical feature columns (ID, target and time columns excluded), generated
  # instead of typed out: transaction features first, then identity features.
  numerical_feature_columns=(
    ['TransactionAmt','dist1','dist2']
    +['C{}'.format(i) for i in range(1,15)]
    +['D{}'.format(i) for i in range(1,16)]
    +['V{}'.format(i) for i in range(1,340)]
    +['id_{:02d}'.format(i) for i in range(1,12)]
  )
  # Tolerate frames from which some of these columns were already dropped.
  numerical_columns_all=[c for c in numerical_feature_columns if c in csv_train.columns]

  # ================================================================================
  is_class0=csv_train['isFraud']==0
  is_class1=csv_train['isFraud']==1

  # ================================================================================
  large_difference_columns=[]
  for one_numerical_column in numerical_columns_all:
    column_values=csv_train[one_numerical_column]

    # ================================================================================
    # Remove outliers (bounds come from the whole column, not per class)

    lower_bound=column_values.quantile(0.003)
    upper_bound=column_values.quantile(0.997)
    within_bounds=(column_values>=lower_bound)&(column_values<=upper_bound)

    class0_values=column_values[within_bounds&is_class0].to_numpy(dtype='float64')
    class1_values=column_values[within_bounds&is_class1].to_numpy(dtype='float64')

    # Nothing to compare when one class has no usable value in this column.
    if len(class0_values)==0 or len(class1_values)==0:
      continue

    # ================================================================================
    if plot:
      # Shared bin edges and density scaling make the two curves directly
      # comparable even though the classes have very different sizes.
      bin_edges=np.histogram_bin_edges(np.concatenate([class0_values,class1_values]),bins=10)
      class0_density,_=np.histogram(class0_values,bins=bin_edges,density=True)
      class1_density,_=np.histogram(class1_values,bins=bin_edges,density=True)
      bin_centers=(bin_edges[:-1]+bin_edges[1:])/2

      fig,ax=plt.subplots(1,1,figsize=(20,5))
      ax.plot(bin_centers,class0_density,color="blue",label="isFraud=0")
      ax.plot(bin_centers,class1_density,color="red",label="isFraud=1")
      ax.set_title(one_numerical_column)
      ax.set_ylabel("density")
      ax.legend()

    # ================================================================================
    result_from_komogorov_smirnov=ks_2samp(class0_values,class1_values)

    if result_from_komogorov_smirnov.pvalue<alpha:
      large_difference_columns.append([one_numerical_column,result_from_komogorov_smirnov.pvalue])
  return large_difference_columns

In [None]:
# Run the per-column class comparison on the merged frame; one figure is drawn per column.
large_difference_columns=investigate_difference_of_target0_and_target1_numerical_date_distribution(csv_train)

In [None]:
# Display the [column name, p-value] pairs returned by the comparison above.
large_difference_columns

# Note  
- I thought I could use aggregations like "mean", "standard deviation", "min" and "max" on the columns  
whose distributions differ strongly between data with target 1 (fraud) and data with target 0 (normal)
- To compare the distributions I tried the Kolmogorov-Smirnov test, and it says that, for example, C7 has a very different distribution  
- But looking at the distribution of C7 in the images above, the red line and the blue line look similar,  
and I wonder why the test and the plot disagree

In [None]:
# ================================================================================
# Calculate ratio of NaN from all columns, and we will discard columns which have NaN over 50%

def check_nan(merged,NAN_CRITERION):
  """Compute the NaN percentage of every column and plot it as a bar chart.

  Args:
    merged: frame to inspect.
    NAN_CRITERION: threshold in percent (0-100). Columns whose NaN percentage
      is strictly greater than this value are reported for dropping. The same
      value is drawn as the dashed red line on the plot.

  Returns:
    Tuple (df, columns_to_be_dropped):
      df: frame with columns 'column_name' and 'nan_percent', one row per
        column of `merged`, in the original column order.
      columns_to_be_dropped: list of the column names above the threshold.
  """
  number_of_rows_from_data=merged.shape[0]

  # ================================================================================
  number_of_nan_in_column=merged.isnull().sum(axis=0)

  # ================================================================================
  # Build the result frame directly, so it does not depend on the default
  # "index"/0 names produced by to_frame().reset_index().
  df=pd.DataFrame({
    'column_name':list(merged.columns),
    'nan_percent':(number_of_nan_in_column/number_of_rows_from_data*100).to_numpy()})

  # ================================================================================
  columns_to_be_dropped=list((df[df['nan_percent']>NAN_CRITERION])['column_name'])

  # ================================================================================
  plt.figure(figsize=(21,11),dpi=500)
  plt.bar(list(df["column_name"]),list(df["nan_percent"]))
  plt.xticks(rotation=90,fontsize=3)
  plt.ylabel('NaN ratio (%)')
  plt.title('NaN ratio per column')
  # Draw the line at the threshold actually used, not at a fixed 50.
  plt.axhline(y=NAN_CRITERION,color='r',linestyle='--')
  return df,columns_to_be_dropped

In [None]:
# Compute the NaN percentage per column and collect the columns above the 50% criterion.
# The last line displays the per-column table.
train_nan_ratio_df,train_columns_to_be_dropped=check_nan(csv_train,NAN_CRITERION=50)
train_nan_ratio_df

In [None]:
def discard_nan_columns(merged,columns_to_be_dropped):
  """Return a copy of `merged` without the listed columns.

  The input frame is left untouched, so re-running earlier cells that display
  it still shows the same data.

  Args:
    merged: source frame.
    columns_to_be_dropped: iterable of column names to remove.

  Returns:
    New frame without those columns. Names absent from `merged` are ignored,
    which keeps the call idempotent when a cell is run again on an
    already-reduced frame.
  """
  return merged.drop(columns=list(columns_to_be_dropped),errors='ignore')

In [None]:
# Shape before the high-NaN columns are discarded, for comparison with the next cell.
csv_train.shape

In [None]:
# Remove the columns flagged by check_nan and display the resulting shape.
csv_train=discard_nan_columns(csv_train,train_columns_to_be_dropped)
csv_train.shape

In [None]:
# ================================================================================
# Add time related data for time consistency check test

def create_datetime_column(csv_train,start_datetime=datetime(2017,7,1,0,0)):
  """Return a copy of `csv_train` with two time columns added.

  Added columns:
    TransactionDT_datetime: `start_datetime` plus TransactionDT interpreted as
      seconds.
    TransactionDT_year_month: dense rank (1, 2, 3, ...) of the calendar
      year-month of TransactionDT_datetime, in chronological order.

  Args:
    csv_train: frame with a numeric 'TransactionDT' column (seconds offset).
    start_datetime: reference datetime the offsets are added to. The default
      keeps this notebook's original value.
      NOTE(review): convert_time_delte in this notebook uses 2017-11-30 as its
      reference date; confirm which one is intended.

  Returns:
    New frame; the input frame is not modified.
  """
  # Vectorised equivalent of start_datetime+timedelta(seconds=x) applied per row.
  elapsed=pd.to_timedelta(csv_train['TransactionDT'].astype('float64'),unit='s')
  transaction_datetime=elapsed+pd.Timestamp(start_datetime)

  # ================================================================================
  # year*12+month orders months chronologically, exactly like the zero-padded
  # "YYYY-MM" string, so the dense ranks are the same.
  year_month_key=transaction_datetime.dt.year*12+transaction_datetime.dt.month
  rankdata_year_month=rankdata(year_month_key.to_numpy(),method='dense')

  return csv_train.assign(
    TransactionDT_datetime=transaction_datetime,
    TransactionDT_year_month=rankdata_year_month)

In [None]:
# Add the TransactionDT_datetime and TransactionDT_year_month columns used by the time consistency check.
csv_train=create_datetime_column(csv_train)

In [None]:
# ================================================================================
# 3. Analyze time consistency of each feature data

# I got the idea from  
# https://www.kaggle.com/cdeotte/xgb-fraud-with-magic-0-9600#Feature-Selection---Time-Consistency

In [None]:
def check_time_consistency_of_each_feature(csv_train,n_splits=4):
  """Score each feature alone with month-grouped cross-validation and plot the scores.

  For every column except TransactionID, isFraud and the two helper time
  columns, a LightGBM classifier is trained on that single column and evaluated
  with GroupKFold, using TransactionDT_year_month as the group. Every validation
  fold therefore contains only months unseen during training. The ROC AUC is
  averaged over the folds.

  Features whose mean ROC AUC is below 0.5 are printed. All mean scores are
  drawn as a bar chart with reference lines at 0.5 and 0.52.

  Args:
    csv_train: frame containing 'isFraud', 'TransactionDT_year_month' and the
      feature columns.
    n_splits: number of GroupKFold splits; it must not exceed the number of
      distinct months.

  Returns:
    None.
  """
  non_feature_columns={"TransactionID","isFraud","TransactionDT_datetime","TransactionDT_year_month"}
  feature_to_be_checked=[one_column for one_column in csv_train.columns if one_column not in non_feature_columns]

  # Target and groups are the same for every feature, so take them from the frame
  # once instead of re-merging them onto each single-feature frame.
  train_y=csv_train["isFraud"].to_numpy()
  groups=csv_train["TransactionDT_year_month"].to_numpy()

  # ================================================================================
  # The fold assignment depends only on the groups, so compute it once.
  group_kfold=GroupKFold(n_splits=n_splits)
  fold_indices=list(group_kfold.split(np.zeros(len(train_y)),train_y,groups))

  # ================================================================================
  params={
    'n_estimators':100,
    'learning_rate':0.1,
    'num_leaves':31,
    'max_depth':-1,
    'boosting':'gbdt'
  }

  all_features_name_for_visualizeation=[]
  all_features_rocauc_for_visualizeation=[]
  for one_feature_to_be_checked in feature_to_be_checked:
    one_feature_df=csv_train[[one_feature_to_be_checked]]

    fold_scores=[]
    for train,test in fold_indices:

      # ================================================================================
      # Train lgb model with train dataset

      lgbclf=lgb.LGBMClassifier(**params)
      lgbclf.fit(one_feature_df.iloc[train],train_y[train])

      # ================================================================================
      # Make prediction on test dataset

      val=lgbclf.predict_proba(one_feature_df.iloc[test])[:,1]

      # ================================================================================
      fold_scores.append(roc_auc_score(train_y[test],val))

    # Average over the folds actually run (no separate hard-coded divisor).
    roc_auc_score_init=sum(fold_scores)/len(fold_scores)

    if roc_auc_score_init<0.5:
      print('one_feature_to_be_checked',one_feature_to_be_checked)
      print('roc_auc_score_init',roc_auc_score_init)

    all_features_name_for_visualizeation.append(one_feature_to_be_checked)
    all_features_rocauc_for_visualizeation.append(roc_auc_score_init)

  fig,ax=plt.subplots(1,1,figsize=(20,5),dpi=500)
  ax.bar(all_features_name_for_visualizeation,all_features_rocauc_for_visualizeation)
  ax.set_title('Time consistency test on all features')
  ax.set_xlabel('Feature names')
  ax.set_ylabel('ROC AUC score (average from {} groupkfolds)'.format(n_splits))
  ax.set_xticklabels(all_features_name_for_visualizeation,rotation=90,fontsize=2)
  ax.axhline(y=0.5,color='r',linestyle='--')
  ax.axhline(y=0.52,color='r',linestyle='--')

In [None]:
# Trains LightGBM models for every feature column, so this cell is slow on the full data.
check_time_consistency_of_each_feature(csv_train)

### Discussion  

If the threshold is set to 0.5 in ROC AUC score, 2 columns (card4, V41) seem to have no time consistency  
If the threshold is set slightly higher, e.g. 0.52 in ROC AUC score,  
more columns will be judged to have no time consistency

In [None]:
# ================================================================================
# Split full column data into numerical data and categorical data

def separate_full_column_data_into_categorical_and_numerical(csv_train):
  """Split a frame into its numerical and non-numerical columns.

  The frame is indexed by "TransactionID" first. A column counts as numerical
  when the string form of its dtype contains 'float' or 'int'; every other
  column goes to the categorical frame.

  Args:
    csv_train: frame with a "TransactionID" column.

  Returns:
    Tuple (numerical_train_df, categorical_train_df), both indexed by
    TransactionID and keeping the original column order.
  """
  indexed_train=csv_train.set_index("TransactionID")

  def _has_numeric_dtype(column_name):
    dtype_name=str(indexed_train[column_name].dtype)
    return 'float' in dtype_name or 'int' in dtype_name

  numerical_parts=[indexed_train[[name]] for name in indexed_train if _has_numeric_dtype(name)]
  categorical_parts=[indexed_train[[name]] for name in indexed_train if not _has_numeric_dtype(name)]

  return pd.concat(numerical_parts,axis=1),pd.concat(categorical_parts,axis=1)

In [None]:
# ================================================================================
# Delete useless columns which had been used for above time consistency check test
# (non-in-place drop with errors="ignore", so re-running this cell does not raise KeyError)

csv_train=csv_train.drop(columns=["TransactionDT_datetime","TransactionDT_year_month"],errors="ignore")

In [None]:
# Split the merged frame by dtype, then cast every numerical column to float32.
numerical_train_df,categorical_train_df=separate_full_column_data_into_categorical_and_numerical(csv_train)
numerical_train_df=numerical_train_df.astype("float32")
numerical_train_df.head()

In [None]:
# ================================================================================
# Add label data based on 'year and month' combination

def convert_time_delte(df,start_date='2017-11-30'):
  """Return a copy of `df` with a month label column 'DT_M' added.

  TransactionDT is interpreted as seconds after `start_date`. DT_M is
  (year-2017)*12+month of the resulting datetime, so consecutive calendar
  months get consecutive integers.

  Args:
    df: frame with a numeric 'TransactionDT' column.
    start_date: reference date as 'YYYY-MM-DD'. The default keeps this
      notebook's original value.
      NOTE(review): create_datetime_column in this notebook uses 2017-07-01 as
      its reference date; confirm which one is intended.

  Returns:
    New frame; the input frame is not modified.
  """
  START_DATE=datetime.strptime(start_date,'%Y-%m-%d')
  # Vectorised equivalent of START_DATE+timedelta(seconds=x) applied per row.
  transaction_datetime=pd.to_timedelta(df['TransactionDT'].astype('float64'),unit='s')+pd.Timestamp(START_DATE)
  month_label=(transaction_datetime.dt.year-2017)*12+transaction_datetime.dt.month
  return df.assign(DT_M=month_label)

In [None]:
# Add the DT_M month label to the numerical frame and preview the result.
numerical_train_df=convert_time_delte(numerical_train_df)
numerical_train_df.head()

In [None]:
# ================================================================================
# Impute null of categorical data by "nullstr" string

def impute_categorical_data_by_mode(under40_nan_categorical_df):
  """Return a copy of the frame with missing values replaced by the string "nullstr".

  Despite the function name, no mode is computed: every missing value becomes
  the literal category "nullstr".

  The input frame is not modified. For columns of dtype category, "nullstr" is
  added to the categories only when it is not already there, so the function
  can be applied again to its own output.

  Args:
    under40_nan_categorical_df: frame of categorical (or other non-numerical)
      columns.

  Returns:
    New frame with the same columns and index and no missing values.
  """
  filled_columns=[]
  for one_column_name in under40_nan_categorical_df.columns:
    one_column=under40_nan_categorical_df[one_column_name]
    # fillna on a category column requires the fill value to be a known category.
    if one_column.dtype.name=='category' and 'nullstr' not in one_column.cat.categories:
      one_column=one_column.cat.add_categories('nullstr')
    filled_columns.append(one_column.fillna('nullstr'))

  # pd.concat rejects an empty list; a frame without columns has nothing to impute.
  if not filled_columns:
    return under40_nan_categorical_df.copy()
  return pd.concat(filled_columns,axis=1)

In [None]:
# Replace missing categorical values with the "nullstr" category and preview the result.
imputed_categorical_train_df=impute_categorical_data_by_mode(categorical_train_df)
imputed_categorical_train_df.head()

In [None]:
# ================================================================================
# Integrate categorical values of all columns which rarely show into "others" category

def categorical_other(imputed_categorical_df,low_threshold=None):
  """Return a copy of the frame with rare category values merged into 'others'.

  For each column named in `low_threshold`, every value that occurs fewer
  times than that column's threshold is replaced by the string 'others'.
  Columns not named in `low_threshold` are returned unchanged.

  The replacement is done with one vectorised pass per column instead of one
  full-column `.replace` per rare value, which matters for columns with
  thousands of rare values. The input frame is not modified.

  Args:
    imputed_categorical_df: frame of categorical columns.
    low_threshold: optional mapping {column name: minimum count}. When omitted,
      this notebook's original thresholds are used.

  Returns:
    New frame with the same columns and index.
  """
  if low_threshold is None:
    low_threshold={
      "card1":100,
      "card2":50,
      "card3":20,
      "card5":20,
      "addr1":30,
      "id_31":10,
      "DeviceInfo":100}

  # ================================================================================
  result_df=imputed_categorical_df.copy()
  for one_column_name,minimum_count in low_threshold.items():
    if one_column_name not in result_df.columns:
      continue

    one_column=result_df[one_column_name]
    value_counts=one_column.value_counts()
    rare_values=list(value_counts[value_counts<minimum_count].index)
    if not rare_values:
      continue

    is_rare=one_column.isin(rare_values)
    if one_column.dtype.name=='category':
      # 'others' must be a known category before it can be assigned.
      if 'others' not in one_column.cat.categories:
        one_column=one_column.cat.add_categories('others')
      one_column=one_column.where(~is_rare,'others')
      # The rare categories no longer occur; drop them from the category list.
      one_column=one_column.cat.remove_categories([v for v in rare_values if v!='others'])
    else:
      one_column=one_column.where(~is_rare,'others')

    result_df[one_column_name]=one_column
  return result_df

In [None]:
# Merge rarely occurring category values into 'others' and preview the result.
imputed_categorical_train_df=categorical_other(imputed_categorical_train_df)
imputed_categorical_train_df.head()

In [None]:
# ================================================================================
# Impute null of numerical data by mean value of each column

def impute_numerical_data_by_mean(under40_nan_numerical_df):
  """Fill the missing values of every column with that column's own mean.

  Args:
    under40_nan_numerical_df: frame of numerical columns.

  Returns:
    New frame with the same columns, in the same order, and no missing values.

  Raises:
    AssertionError: if any missing value remains after filling.
  """
  def _fill_with_own_mean(column_name):
    single_column_df=under40_nan_numerical_df[[column_name]]
    return single_column_df.fillna(single_column_df.mean())

  imputed_df=pd.concat(
    [_fill_with_own_mean(column_name) for column_name in under40_nan_numerical_df],
    axis=1)

  # ================================================================================
  # Sanity check: nothing may be left missing
  remaining_nan_count=imputed_df.isnull().sum(axis=0).sum()
  assert remaining_nan_count==0,'number_of_nan_in_entire_columns!=0'
  return imputed_df

In [None]:
# Fill missing numerical values with each column's mean and preview the result.
imputed_numerical_train_df=impute_numerical_data_by_mean(numerical_train_df)
imputed_numerical_train_df.head()

In [None]:
# ================================================================================
# 4. Analyze the correlation of all features each other

In [None]:
def visualize_correlation_in_features(numerical_train_df):
  """Draw the pairwise correlation matrix of the columns as a red heat map.

  Args:
    numerical_train_df: frame of numerical columns.

  Returns:
    None. The figure is created through pyplot and left open for display.
  """
  correlation_matrix=numerical_train_df.corr()
  column_labels=correlation_matrix.columns
  tick_positions=range(len(column_labels))

  fig,ax=plt.subplots(figsize=(15,15),dpi=500)
  heat_map=ax.matshow(correlation_matrix,cmap=plt.get_cmap('Reds'))
  fig.colorbar(heat_map,ax=ax)

  # One tick per column on both axes, labelled with the column name.
  plt.xticks(tick_positions,column_labels,rotation=90,fontsize=3)
  plt.yticks(tick_positions,column_labels,fontsize=3)

In [None]:
# Plot the correlation heat map of the mean-imputed numerical features.
visualize_correlation_in_features(imputed_numerical_train_df)

### Discussion  

Features in the same group, like C1, C2, C3, ..., are highly correlated with each other  
So I think some of the highly correlated features can be removed;  
for example, if C1 and C2 are very highly correlated, I deleted whichever of C1 and C2 has more NaNs

In [None]:
# ================================================================================
# Display the correlation table after deleting useless rows like (V10,V10,1) and keeping only one row of each mirrored pair like (V10,V11,0.9), (V11,V10,0.9)

def investigate_correlation_in_features(numerical_train_df):
  """List every unordered pair of columns once with its absolute correlation.

  The correlation matrix is flattened to (level_0, level_1, value) rows and
  sorted by absolute correlation, highest first. Only rows whose first name
  sorts strictly before the second are kept. This removes the diagonal
  (X, X, 1) and one row of each mirrored pair (X, Y) / (Y, X) in a single
  vectorised filter, instead of searching the whole table once per row.

  Args:
    numerical_train_df: frame of numerical columns with string column names.

  Returns:
    Frame with columns 'level_0', 'level_1' and 0 (the absolute correlation),
    ordered from the most to the least correlated pair. Row labels are the
    positions in the full sorted table.
  """
  corr=numerical_train_df.corr()

  # A stable sort keeps the order of equally correlated pairs deterministic.
  res=corr.abs().unstack().sort_values(ascending=False,kind='mergesort').reset_index()

  # ================================================================================
  # Keep the alphabetically ordered representative of each pair, drop self-pairs
  is_ordered_pair=res['level_0']<res['level_1']
  return res[is_ordered_pair]

In [None]:
# Build the pairwise correlation table from the mean-imputed numerical features.
correlation_df=investigate_correlation_in_features(imputed_numerical_train_df)

In [None]:
# Show floats with 10 decimal places (this display option stays set for later cells), then display the table.
pd.set_option('display.float_format','{:.10f}'.format)
correlation_df