# Data overview

## Data description

Originally from [this](https://www.kaggle.com/c/ieee-fraud-detection/discussion/101203) discussion, but I've updated it to be clearer:

### Transaction Table:

-   TransactionDT: timedelta from a given reference datetime (not an actual timestamp).
-   TransactionAMT: transaction payment amount in USD.
-   ProductCD: product code, the product for each transaction.
-   card1 - card6: payment card information, such as card type, card category, issue bank, country, etc.
-   addr1: billing region.
-   addr2: billing country.
-   dist: distances between (but not limited to) billing address, mailing address, zip code, IP address, phone area, etc.
-   P_emaildomain and R_emaildomain: purchaser and recipient email domains; certain transactions don't need a recipient, so R_emaildomain is null for them.
-   C1-C14: counting, such as how many addresses are found to be associated with the payment card, etc. The actual meaning is masked.
-   D1-D15: timedelta, such as days between previous transaction, etc.
-   M1-M9: match, such as names on card and address, etc.
-   Vxxx: Vesta engineered rich features, including ranking, counting, and other entity relations.

#### Categorical Features in Transaction Table:

-   ProductCD
-   card1 - card6
-   addr1, addr2
-   P_emaildomain
-   R_emaildomain
-   M1 - M9

### Identity Table:

Variables in this table are identity information – network connection information (IP, ISP, Proxy, etc) and digital signature (UA/browser/os/version, etc) associated with transactions.
They're collected by Vesta’s fraud protection system and digital security partners.
(The field names are masked and pairwise dictionary will not be provided for privacy protection and contract agreement)

#### Categorical Features in Identity Table:

-   DeviceType
-   DeviceInfo
-   id_12 - id_38


# Data cleaning

In this section, we are going to remove some columns and correct a small number of values.

In [None]:
# Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import platform

DATA_ROOT = "../input/ieee-fraud-detection"
OUTPUT_ROOT = "."

!dir "../input/ieee-fraud-detection"

In [None]:
# Check environment: a Windows host is treated as the local development machine
local = platform.system() == "Windows"

# Load data
# A local run writes its outputs next to the inputs and reads only the first
# 10000 rows of each file; any other host reads the complete files
# (nrows=None is the read_csv default, i.e. no row limit).
if local:
    OUTPUT_ROOT = DATA_ROOT

train_transaction = pd.read_csv(f'{DATA_ROOT}/train_transaction.csv', nrows=10000 if local else None)
test_transaction = pd.read_csv(f'{DATA_ROOT}/test_transaction.csv', nrows=10000 if local else None)
# The identity tables are left disabled:
# train_identity = pd.read_csv(f'{DATA_ROOT}/train_identity.csv', nrows=10000 if local else None)
# test_identity = pd.read_csv(f'{DATA_ROOT}/test_identity.csv', nrows=10000 if local else None)

In [None]:
def compare_data_card(train_data, test_data, plot=True):
    """Print, and optionally plot, a train/test comparison of the card and address columns.

    Both frames must contain the columns card1-card6, addr1 and addr2.

    Args:
        train_data: DataFrame holding the training transactions.
        test_data: DataFrame holding the test transactions.
        plot: When true, also draw histograms of card1, card2, card3, card5
            and of addr1, addr2 for both frames.
    """
    datasets = (('train', train_data), ('test', test_data))
    tracked_columns = [f'card{n}' for n in range(1, 7)] + [f'addr{n}' for n in range(1, 3)]

    # Shape
    print(f'Train data shape: {train_data.shape}')
    print(f'Test data shape: {test_data.shape}')

    # Number of distinct values per card/address column, one ', '-separated row per frame
    for label, frame in datasets:
        # The leading newline also terminates the previous unterminated row.
        print(f'\nUnique values of card and address columns in {label} data:')
        for name in tracked_columns:
            print(f'{name}: {frame[name].nunique()}', end=', ')

    # Terminate the last ', '-separated row before the value-count tables.
    print()

    # Value counts of the two categorical card columns
    for column in ('card6', 'card4'):
        for label, frame in datasets:
            print(f'Unique values in {column} column in {label} data:')
            print(frame[column].value_counts())

    if not plot:
        return

    def draw_histograms(columns):
        # One figure, one row per column: train on the left, test on the right.
        plt.figure(figsize=(20, 10))
        for row, name in enumerate(columns):
            for offset, (title, frame) in enumerate((('Train', train_data), ('Test', test_data)), start=1):
                plt.subplot(len(columns), 2, 2 * row + offset)
                sns.histplot(frame[name])
                plt.title(title)
        plt.tight_layout()
        plt.show()

    # Plot histogram of card1, card2, card3, card5
    draw_histograms(['card1', 'card2', 'card3', 'card5'])

    # Plot histogram of addr1, addr2
    draw_histograms(['addr1', 'addr2'])


compare_data_card(train_transaction, test_transaction)


At first glance, we can see that the test data doesn't contain the value "debit or credit" in the *card6* column, and that the number of rows containing "debit or credit" or "charge" in *card6* is very small. Thus, I decided to remove the rows with the value "debit or credit" in *card6* and to change "charge" to "credit", since a charge card is a type of credit card.

There seems to be not many differences in *card1*, *card2*, *card3*, *card5*, *addr1* and *addr2* column between train data and test data.

In [None]:
def correct_card6_column(df):
    """Return a cleaned copy of *df* with a normalised card6 column.

    Rows whose card6 is "debit or credit" are dropped and every "charge"
    value is rewritten to "credit". Rows with a missing card6 are kept.
    The input frame is not modified, and the returned frame keeps the
    original index labels (it is not re-indexed after rows are dropped).

    Args:
        df: DataFrame containing a card6 column.

    Returns:
        A new DataFrame with the corrections applied.
    """
    # Filter first and copy the filtered result: this copies only the rows that
    # survive and makes the frame an independent object, so the assignment
    # below does not trigger pandas' SettingWithCopyWarning.
    result_df = df.loc[df['card6'] != 'debit or credit'].copy()

    # Change "charge" values in card6 column into "credit"
    result_df.loc[result_df['card6'] == 'charge', 'card6'] = 'credit'

    return result_df

train_transaction = correct_card6_column(train_transaction)
test_transaction = correct_card6_column(test_transaction)

compare_data_card(train_transaction, test_transaction, False)

In [None]:
def compare_product_code(train_transaction, test_transaction):
    """Print the distinct ProductCD values of the train and the test frame.

    Args:
        train_transaction: DataFrame with a ProductCD column (training data).
        test_transaction: DataFrame with a ProductCD column (test data).
    """
    reports = (
        ('\nUnique values in ProductCD column in train data:', train_transaction),
        ('Unique values in ProductCD column in test data:', test_transaction),
    )
    for header, frame in reports:
        # Header and values share one line, separated by a single space.
        print(header, frame['ProductCD'].unique())

compare_product_code(train_transaction, test_transaction)

There also seems to be no mismatch in ProductCD between train and test data.

In [None]:
def compare_email(train_transaction, test_transaction):
    """Print how the email-domain columns differ between train and test data.

    For P_emaildomain and R_emaildomain, prints the number of distinct values
    in each frame, then the set of values that occur in the test frame but
    not in the train frame.

    Args:
        train_transaction: DataFrame with P_emaildomain and R_emaildomain columns.
        test_transaction: DataFrame with the same two columns.
    """
    email_columns = ('P_emaildomain', 'R_emaildomain')
    datasets = (('train', train_transaction), ('test', test_transaction))

    # Count unique values of each email column, train first and then test
    for column in email_columns:
        for label, frame in datasets:
            print(f'Unique values of {column} column in {label} data:', frame[column].nunique())

    # Values that appear only in the test data
    for column in email_columns:
        seen_in_test = set(test_transaction[column].unique())
        seen_in_train = set(train_transaction[column].unique())
        print(
            f'Values appeared in {column} column of test data but not in train data:',
            seen_in_test - seen_in_train,
        )


compare_email(train_transaction, test_transaction)

Only one P_emaildomain value appears in the test data but not in the train data: 'scranton.edu'.
Let's see how many rows in the test data have P_emaildomain equal to 'scranton.edu'.

In [None]:
# Print the number of test rows whose P_emaildomain is scranton.edu
# (header and count go on separate lines)
print(
    '\nRows where P_emaildomain is scranton.edu in test data:',
    len(test_transaction[test_transaction['P_emaildomain'] == 'scranton.edu']),
    sep='\n',
)

So there are only 2 rows in test data where P_emaildomain is 'scranton.edu'. Hence, we don't have to do anything special about this email domain, we just need to add 'scranton.edu' into the one-hot encoder later.

The next columns to be analyzed are M1-M9 columns.

In [None]:
def inspect_M_columns(df):
    """Print the missing-value count and the value counts of columns M1 to M9.

    Args:
        df: DataFrame containing the columns M1, M2, ..., M9.
    """
    print('\nValue count of M1-M9 columns in data:')
    for name in (f'M{n}' for n in range(1, 10)):
        column = df[name]
        # Number of missing entries first, then the frequency table
        print('nan:', column.isnull().sum())
        print(column.value_counts())

In [None]:
inspect_M_columns(train_transaction)

In [None]:
inspect_M_columns(test_transaction)

We can see that M1, M2 and M3 have exactly the same nan count, in both the train and the test data, which suggests they are related somehow. Given this, maybe we can group the columns in the data by their nan count.

In [None]:
def group_column_by_nan(df, log=False):
    """Group the column names of *df* by how many missing values each column has.

    Args:
        df: DataFrame to inspect.
        log: When true, print the number of groups followed by one
            "<nan count> <column list>" line per group.

    Returns:
        A dict mapping a nan count to the list of columns having exactly that
        many missing values. Groups appear in order of first occurrence and
        columns keep their order from ``df.columns``.
    """
    group = {}
    for col in df.columns:
        # Columns sharing the same nan count end up in the same bucket
        group.setdefault(df[col].isna().sum(), []).append(col)

    if not log:
        return group

    print('Number of groups:', len(group))
    for nan_count, members in group.items():
        print(nan_count, members)

    return group


group_column_by_nan(train_transaction, True)


Now, let's check the correlations between the columns in each group. We will only inspect groups made of V and D columns, since many of these columns are likely to be redundant.

In [None]:
def check_group(group):
    """Tell whether any column name in *group* contains 'addr', 'card' or 'M'.

    The match is a case-sensitive substring test, so any name containing an
    upper-case 'M' qualifies, not only M1-M9.

    Args:
        group: Iterable of column names.

    Returns:
        True if at least one name contains one of the markers, else False.
    """
    markers = ('addr', 'card', 'M')
    return any(marker in col for col in group for marker in markers)

def correlation_V_D_col(df, plot=False):
    """Compute one correlation matrix per group of columns sharing a nan count.

    Columns are grouped with ``group_column_by_nan``. A group is skipped when
    ``check_group`` flags it (a name contains 'addr', 'card' or 'M'), when it
    has a single column, or when its nan count is zero.

    Args:
        df: DataFrame to analyse.
        plot: When true, print each group's column list and show its
            correlation matrix as an annotated heatmap.

    Returns:
        A list of correlation matrices (DataFrames), one per retained group,
        in the order the groups were discovered.
    """
    correlation_matrices = []
    for nan_count, members in group_column_by_nan(df).items():
        # Skip the groups the analysis is not interested in
        if check_group(members):
            continue
        if len(members) <= 1 or nan_count <= 0:
            continue

        # Correlate the numeric columns only: pandas >= 2.0 raises on string
        # columns instead of silently dropping them as older releases did.
        corr = df[members].select_dtypes(include='number').corr()
        correlation_matrices.append(corr)

        if plot:
            print(members)
            plt.figure(figsize=(15, 10))
            sns.heatmap(corr, annot=True)
            plt.show()

    return correlation_matrices

print(len(correlation_V_D_col(train_transaction, True)))

In [None]:
def get_correlated_group(matrix, threshold=0.7):
    """Cluster the columns of a correlation matrix into highly-correlated groups.

    Two columns are linked when the absolute value of their correlation is
    strictly greater than *threshold*; groups are the connected components of
    those links, so A~B and B~C put A, B and C together even if A and C are
    not directly linked.

    Args:
        matrix: Square correlation matrix (DataFrame) whose index and columns
            are the column names.
        threshold: Absolute-correlation cut-off. Defaults to 0.7.

    Returns:
        A list of sets of column names, each holding at least two columns.
        Columns not strongly correlated with any other column are omitted.
    """
    if matrix.empty:
        return []

    # Flatten to (col_a, col_b) -> |corr|, strongest first
    strengths = matrix.abs().unstack().sort_values(kind="quicksort", ascending=False)
    # NaN correlations compare false and are therefore dropped here
    strong_pairs = strengths[strengths > threshold].index

    # Merge every pair into the components built so far. The earliest
    # component absorbs the others, so groups stay in first-seen order.
    components = []
    for pair in strong_pairs:
        members = set(pair)  # a diagonal entry (a, a) yields a singleton
        touching = [comp for comp in components if not comp.isdisjoint(members)]
        if not touching:
            components.append(members)
            continue

        survivor = touching[0]
        survivor |= members
        if len(touching) > 1:
            absorbed_ids = set()
            for other in touching[1:]:
                survivor |= other
                absorbed_ids.add(id(other))
            components = [comp for comp in components if id(comp) not in absorbed_ids]

    # Remove single element groups
    return [comp for comp in components if len(comp) >= 2]


In [None]:
def remove_highly_correlated_columns(df):
    """Return the columns of *df* that remain after dropping redundant ones.

    Correlation matrices come from ``correlation_V_D_col`` and correlated
    groups from ``get_correlated_group``. One column is kept per group and
    the rest are dropped. Despite its name, the function does not modify
    *df*; it only returns column names.

    Args:
        df: DataFrame to analyse.

    Returns:
        A list of the column names to keep, in their ``df.columns`` order.
    """
    position = {col: idx for idx, col in enumerate(df.columns)}

    redundant = set()
    for matrix in correlation_V_D_col(df):
        for group in get_correlated_group(matrix):
            # Keep the member that comes first in the frame. Taking an
            # arbitrary element of the set would make the surviving column
            # depend on set iteration order, which can change between runs.
            ordered = sorted(group, key=position.__getitem__)
            redundant.update(ordered[1:])

    return [col for col in df.columns if col not in redundant]


len(remove_highly_correlated_columns(train_transaction))


Next we need to remove columns where there are too many nan values. We will first visualize the data to choose the threshold and then remove later.

In [None]:
def visualize_nan_values(df):
    """Plot the per-column fraction of missing values, sorted ascending.

    Args:
        df: DataFrame whose columns are inspected.
    """
    total_rows = len(df)

    # Fraction of nan values in each column, smallest first
    percentages = sorted(df[col].isna().sum() / total_rows for col in df.columns)

    # One point per column; the x axis is the rank of the column in the sort
    plt.figure(figsize=(15, 10))
    plt.plot(percentages)
    plt.show()

visualize_nan_values(train_transaction)

In [None]:
def remove_columns_with_many_nan(df, threshold=0.75):
    """Return the columns of *df* whose fraction of missing values is acceptable.

    A column is dropped when its nan fraction is strictly greater than
    *threshold*. Despite its name, the function does not modify *df*; it
    only returns column names.

    Args:
        df: DataFrame to inspect.
        threshold: Largest nan fraction (0 to 1) a column may have and still
            be kept. Defaults to 0.75.

    Returns:
        A list of the column names to keep, in their ``df.columns`` order.
    """
    # The mean of the boolean nan mask is the nan fraction of each column
    too_sparse = df.isna().mean() > threshold
    return [col for col, drop in zip(df.columns, too_sparse) if not drop]


len(remove_columns_with_many_nan(train_transaction))


In [None]:
def filter_columns(df):
    """Return the columns that survive both the correlation and the nan filter.

    A column is kept only if it is returned by both
    ``remove_highly_correlated_columns`` and ``remove_columns_with_many_nan``.

    Args:
        df: DataFrame to analyse.

    Returns:
        A list of column names, in the order produced by
        ``remove_highly_correlated_columns``. A list is returned rather than
        a set so the result is ordered reproducibly and can be used directly
        as a DataFrame indexer (pandas >= 2.0 rejects a set there).
    """
    kept_by_nan_filter = set(remove_columns_with_many_nan(df))
    return [
        col for col in remove_highly_correlated_columns(df)
        if col in kept_by_nan_filter
    ]

len(filter_columns(train_transaction))

Lets quickly summarize what we have done to the data so far:
- Correct card column values.
- Discover a P_emaildomain value appeared in test data but not in train data: "*scranton.edu*".
- Visualize the data.
- Replace highly correlated columns with a single column.
- Remove columns with many nan values.

Finally, we have a list of columns that we will use later on in our model.

However, we still need to do some more data preprocessing before we can use them.

# Data processing

In this section, we will replace some values and one-hot encoding non-numerical columns.

Lets start by finding out which columns don't contain numerical values.

In [None]:
def get_non_numeric_columns(df):
    """Return the names of the columns of *df* that do not have a numeric dtype.

    Args:
        df: DataFrame to inspect.

    Returns:
        A list of column names in their ``df.columns`` order. Preserving the
        order makes the result reproducible; a set difference would not.
    """
    numeric = set(df.select_dtypes(include="number").columns)
    return [col for col in df.columns if col not in numeric]

get_non_numeric_columns(train_transaction[filter_columns(train_transaction)])

We can see that none of the M columns are numerical: they only contain "T" and "F" values, except for M4, which contains "M0", "M1" or "M2". Let's replace "T" and "F" values with 1 and 0 respectively.

In [None]:
def replace_boolean_M_col(df):
    """Return a copy of *df* with "T"/"F" in the M1-M9 columns replaced by 1/0.

    Other values (missing entries, or strings such as "M0") are left as they
    are. M columns absent from *df*, for example because an earlier filtering
    step dropped them, are skipped. The input frame is not modified.

    Args:
        df: DataFrame that may contain columns named M1 ... M9.

    Returns:
        A new DataFrame with the flags converted.
    """
    flag_codes = {"T": 1, "F": 0}
    result_df = df.copy()
    for name in (f"M{i}" for i in range(1, 10)):
        if name not in result_df.columns:
            continue
        # Replace both flags in one pass. infer_objects() then gives a fully
        # converted column a numeric dtype without relying on replace()'s
        # implicit downcasting.
        result_df[name] = result_df[name].replace(flag_codes).infer_objects()
    return result_df


fixed_train_transaction = replace_boolean_M_col(train_transaction[filter_columns(train_transaction)])
print(fixed_train_transaction.info())


After changing M columns values, lets run the filter again to see if they are still non numerical.

In [None]:
get_non_numeric_columns(fixed_train_transaction)

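The next step is to encode the non-numerical columns. (The functions below are named `one_hot_encode_*`, but they replace each category with an integer code rather than creating one-hot columns.)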

In [None]:
def one_hot_encode_train(df, columns):
    """Encode the given columns of the training frame as integer codes.

    Despite the name, this is an integer (label) encoding, not a one-hot
    encoding. Each column's distinct non-missing values are ranked as
    ``value_counts()`` returns them, and every value is replaced by its rank.
    Missing values stay missing. The input frame is not modified.

    Args:
        df: Training DataFrame.
        columns: Names of the columns to encode.

    Returns:
        A tuple ``(encoded_df, uniques)``. ``uniques`` maps each encoded
        column to its list of categories, where a category's list position is
        its code, so the same codes can be applied to the test data later.
    """
    result_df = df.copy()
    # Save uniques values to encode test data later
    uniques = {}
    for column in columns:
        categories = list(result_df[column].value_counts().index)
        uniques[column] = categories

        # Translate every value in one pass. Running one replace() per
        # category rescans the column each time, and can re-replace a value
        # that happens to equal a code assigned earlier.
        codes = {value: code for code, value in enumerate(categories)}
        result_df[column] = result_df[column].map(codes)
    return result_df, uniques
    
final_train_transaction, uniques_val = one_hot_encode_train(fixed_train_transaction, get_non_numeric_columns(fixed_train_transaction))
print(final_train_transaction.info())
# Note that the index is off since I deleted some rows

In [None]:
def one_hot_encode_test(df, uniques, columns, label):
    """Select the model columns of the test frame and apply the train encoding.

    Despite the name, this is an integer (label) encoding, not a one-hot
    encoding. For P_emaildomain, the value "scranton.edu" is given the next
    free code if it is not already among the categories. Values absent from
    the category list are left unchanged. None of *df*, *uniques* or
    *columns* is modified.

    Args:
        df: Test DataFrame.
        uniques: Mapping of column name to its list of categories, where a
            category's list position is its code.
        columns: Column names of the encoded training frame.
        label: Name of the target column, which is excluded from the result.

    Returns:
        A new DataFrame holding *columns* minus *label*, with the columns
        listed in *uniques* encoded.
    """
    # Build new lists instead of calling remove()/append() on the arguments,
    # so repeated calls with the same arguments behave identically.
    feature_columns = [col for col in columns if col != label]
    result_df = df[feature_columns].copy()

    for column, train_categories in uniques.items():
        categories = list(train_categories)
        # Remember the scranton.edu email domain
        if column == "P_emaildomain" and "scranton.edu" not in categories:
            categories.append("scranton.edu")

        codes = {value: code for code, value in enumerate(categories)}
        if codes:
            # One replace() covers all categories; infer_objects() then gives
            # a fully converted column a numeric dtype.
            result_df[column] = result_df[column].replace(codes).infer_objects()

    return result_df


final_test_transaction = one_hot_encode_test(
    test_transaction, uniques_val, list(final_train_transaction.columns), "isFraud")
final_test_transaction = replace_boolean_M_col(final_test_transaction)
print(final_test_transaction.info())


That's all for data preprocessing; now let's export the processed data to CSV files. Note that I don't use the identity data here since it has far fewer records than the transaction data, so I think merging them would hurt our model's performance.

In [None]:
# Check if there are any non-numeric columns left (train first, then test)
for frame in (final_train_transaction, final_test_transaction):
    print(get_non_numeric_columns(frame))

In [None]:
# Export the processed train and test tables to CSV, without the index column
for file_stem, table in (
    ("final_train_transaction", final_train_transaction),
    ("final_test_transaction", final_test_transaction),
):
    table.to_csv(f"{OUTPUT_ROOT}/{file_stem}.csv", index=False)