## Problem Discussion
This competition is a binary classification problem. Our target variable is a binary attribute and our goal is to classify users into "fraudlent" or "not fraudlent" as well as possible.

In [None]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

## 1. Importing Necessary Dependencies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import xgboost as xgb

import time
import warnings

pd.set_option('display.max_columns', None)
plt.style.use(style = 'seaborn')

warnings.filterwarnings('ignore')

## 2. Data Loading & Overview

Machine is going to read four csv files. Here, %%time will show the CPU & Wall time are needed to read those files. Data is separated into two datasets: information about the identity of the customer and transaction information. Not all transactions belong to identities, which are available.

In [None]:
%%time

train_tr = pd.read_csv('../input/ieee-fraud-detection/train_transaction.csv')
train_id = pd.read_csv('../input/ieee-fraud-detection/train_identity.csv')

test_tr = pd.read_csv('../input/ieee-fraud-detection/test_transaction.csv')
test_id = pd.read_csv('../input/ieee-fraud-detection/test_identity.csv')

## 3. Exploratory Data Analysis

In [None]:
# Let's know the shape of the train & test data

print('train_transaction shape is {}'.format(train_tr.shape))
print('train_identity shape is {}'.format(train_id.shape))

print('\n')

print('test_transaction shape is {}'.format(test_tr.shape))
print('test_identity shape is {}'.format(test_id.shape))

### **Let's know about the train data**

In [None]:
print(train_tr.info())

print('\n')

print(train_id.info())

In [None]:
train_tr.head()

In [None]:
train_id.head()

### **Let's know about the test data**

In [None]:
print(test_tr.info())

print('\n')

print(test_id.info())

In [None]:
test_tr.head()

In [None]:
test_id.head()

### **Merging the transaction & identity data**

In [None]:
train = pd.merge(train_tr, train_id, how = 'left', on = 'TransactionID')

test = pd.merge(test_tr, test_id, how = 'left', on = 'TransactionID')

In [None]:
print(train.shape)
print(test.shape)

In [None]:
del train_tr, train_id, test_tr, test_id

In [None]:
# Let's see how does the train data look like

train.head()

## *Reduce Memory Use (Part-1)*

#### *Train Data*

In [None]:
num_train_cols = ['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt', 'card1', 'dist1', 'dist2', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V86', 'V87', 'V88', 'V89', 'V90', 'V91', 'V92', 'V93', 'V94', 'V95', 'V96', 'V97', 'V98', 'V99', 'V100', 'V101', 'V102', 'V103', 'V104', 'V105', 'V106', 'V107', 'V108', 'V109', 'V110', 'V111', 'V112', 'V113', 'V114', 'V115', 'V116', 'V117', 'V118', 'V119', 'V120', 'V121', 'V122', 'V123', 'V124', 'V125', 'V126', 'V127', 'V128', 'V129', 'V130', 'V131', 'V132', 'V133', 'V134', 'V135', 'V136', 'V137', 'V138', 'V139', 'V140', 'V141', 'V142', 'V143', 'V144', 'V145', 'V146', 'V147', 'V148', 'V149', 'V150', 'V151', 'V152', 'V153', 'V154', 'V155', 'V156', 'V157', 'V158', 'V159', 'V160', 'V161', 'V162', 'V163', 'V164', 'V165', 'V166', 'V167', 'V168', 'V169', 'V170', 'V171', 'V172', 'V173', 'V174', 'V175', 'V176', 'V177', 'V178', 'V179', 'V180', 'V181', 'V182', 'V183', 'V184', 'V185', 'V186', 'V187', 'V188', 'V189', 'V190', 'V191', 'V192', 'V193', 'V194', 'V195', 'V196', 'V197', 'V198', 'V199', 'V200', 'V201', 'V202', 'V203', 'V204', 'V205', 'V206', 'V207', 'V208', 'V209', 'V210', 'V211', 'V212', 'V213', 'V214', 'V215', 'V216', 'V217', 'V218', 'V219', 'V220', 'V221', 'V222', 'V223', 'V224', 'V225', 'V226', 'V227', 'V228', 'V229', 'V230', 'V231', 'V232', 'V233', 'V234', 'V235', 'V236', 'V237', 'V238', 'V239', 'V240', 'V241', 'V242', 'V243', 'V244', 'V245', 'V246', 'V247', 'V248', 'V249', 'V250', 'V251', 'V252', 'V253', 'V254', 'V255', 'V256', 'V257', 'V258', 'V259', 'V260', 'V261', 'V262', 'V263', 'V264', 'V265', 'V266', 'V267', 'V268', 'V269', 'V270', 'V271', 'V272', 'V273', 'V274', 'V275', 'V276', 'V277', 'V278', 'V279', 'V280', 'V281', 'V282', 'V283', 'V284', 'V285', 'V286', 'V287', 'V288', 'V289', 'V290', 'V291', 'V292', 'V293', 'V294', 'V295', 'V296', 'V297', 'V298', 'V299', 'V300', 'V301', 'V302', 'V303', 'V304', 'V305', 'V306', 'V307', 'V308', 'V309', 'V310', 'V311', 'V312', 'V313', 'V314', 'V315', 'V316', 'V317', 'V318', 'V319', 'V320', 'V321', 'V322', 'V323', 'V324', 'V325', 'V326', 'V327', 'V328', 'V329', 'V330', 'V331', 'V332', 'V333', 'V334', 'V335', 'V336', 'V337', 'V338', 'V339', 'id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06', 'id_07', 'id_08', 'id_09', 'id_10', 'id_11', 'id_13', 'id_14', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_24', 'id_25', 'id_26', 'id_32']

In [None]:
def detect_num_cols_to_shrink(list_of_num_cols, dataframe):
 
    convert_to_int8 = []
    convert_to_int16 = []
    convert_to_int32 = []
    
    #  sadly the datatype float8 does not exist
    convert_to_float16 = []
    convert_to_float32 = []
    
    for col in list_of_num_cols:
        
        if dataframe[col].dtype in ['int', 'int8', 'int32', 'int64']:
            
            describe_object = dataframe[col].describe()
            minimum = describe_object[3]
            maximum = describe_object[7]
            diff = abs(maximum - minimum)

            if diff < 255:
                convert_to_int8.append(col)
                
            elif diff < 65535:
                convert_to_int16.append(col)
                
            elif diff < 4294967295:
                convert_to_int32.append(col)   
                
        elif dataframe[col].dtype in ['float', 'float16', 'float32', 'float64']:
            
            describe_object = dataframe[col].describe()
            minimum = describe_object[3]
            maximum = describe_object[7]
            diff = abs(maximum - minimum)

            if diff < 65535:
                convert_to_float16.append(col)
                
            elif diff < 4294967295:
                convert_to_float32.append(col) 
        
    list_of_lists = []
    list_of_lists.append(convert_to_int8)
    list_of_lists.append(convert_to_int16)
    list_of_lists.append(convert_to_int32)
    list_of_lists.append(convert_to_float16)
    list_of_lists.append(convert_to_float32)
    
    return list_of_lists

In [None]:
num_cols_to_shrink_train = detect_num_cols_to_shrink(num_train_cols, train)

convert_to_int8 = num_cols_to_shrink_train[0]
convert_to_int16 = num_cols_to_shrink_train[1]
convert_to_int32 = num_cols_to_shrink_train[2]

convert_to_float16 = num_cols_to_shrink_train[3]
convert_to_float32 = num_cols_to_shrink_train[4]

print("convert_to_int8 :", convert_to_int8, "\n")
print("convert_to_int16 :", convert_to_int16, "\n")
print("convert_to_int32 :", convert_to_int32, "\n")

print("convert_to_float16 :", convert_to_float16, "\n")
print("convert_to_float32 :", convert_to_float32, "\n")

In [None]:
print("starting with converting process....")

for col in convert_to_int16:
    
    train[col] = train[col].astype('int16') 
    
for col in convert_to_int32:
    train[col] = train[col].astype('int32') 

for col in convert_to_float16:
    train[col] = train[col].astype('float16')
    
for col in convert_to_float32:
    train[col] = train[col].astype('float32')
    
print("successfully converted!")

#### *Test Data*

In [None]:
num_test_cols = ['TransactionID', 'TransactionDT', 'TransactionAmt', 'card1', 'dist1', 'dist2', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V86', 'V87', 'V88', 'V89', 'V90', 'V91', 'V92', 'V93', 'V94', 'V95', 'V96', 'V97', 'V98', 'V99', 'V100', 'V101', 'V102', 'V103', 'V104', 'V105', 'V106', 'V107', 'V108', 'V109', 'V110', 'V111', 'V112', 'V113', 'V114', 'V115', 'V116', 'V117', 'V118', 'V119', 'V120', 'V121', 'V122', 'V123', 'V124', 'V125', 'V126', 'V127', 'V128', 'V129', 'V130', 'V131', 'V132', 'V133', 'V134', 'V135', 'V136', 'V137', 'V138', 'V139', 'V140', 'V141', 'V142', 'V143', 'V144', 'V145', 'V146', 'V147', 'V148', 'V149', 'V150', 'V151', 'V152', 'V153', 'V154', 'V155', 'V156', 'V157', 'V158', 'V159', 'V160', 'V161', 'V162', 'V163', 'V164', 'V165', 'V166', 'V167', 'V168', 'V169', 'V170', 'V171', 'V172', 'V173', 'V174', 'V175', 'V176', 'V177', 'V178', 'V179', 'V180', 'V181', 'V182', 'V183', 'V184', 'V185', 'V186', 'V187', 'V188', 'V189', 'V190', 'V191', 'V192', 'V193', 'V194', 'V195', 'V196', 'V197', 'V198', 'V199', 'V200', 'V201', 'V202', 'V203', 'V204', 'V205', 'V206', 'V207', 'V208', 'V209', 'V210', 'V211', 'V212', 'V213', 'V214', 'V215', 'V216', 'V217', 'V218', 'V219', 'V220', 'V221', 'V222', 'V223', 'V224', 'V225', 'V226', 'V227', 'V228', 'V229', 'V230', 'V231', 'V232', 'V233', 'V234', 'V235', 'V236', 'V237', 'V238', 'V239', 'V240', 'V241', 'V242', 'V243', 'V244', 'V245', 'V246', 'V247', 'V248', 'V249', 'V250', 'V251', 'V252', 'V253', 'V254', 'V255', 'V256', 'V257', 'V258', 'V259', 'V260', 'V261', 'V262', 'V263', 'V264', 'V265', 'V266', 'V267', 'V268', 'V269', 'V270', 'V271', 'V272', 'V273', 'V274', 'V275', 'V276', 'V277', 'V278', 'V279', 'V280', 'V281', 'V282', 'V283', 'V284', 'V285', 'V286', 'V287', 'V288', 'V289', 'V290', 'V291', 'V292', 'V293', 'V294', 'V295', 'V296', 'V297', 'V298', 'V299', 'V300', 'V301', 'V302', 'V303', 'V304', 'V305', 'V306', 'V307', 'V308', 'V309', 'V310', 'V311', 'V312', 'V313', 'V314', 'V315', 'V316', 'V317', 'V318', 'V319', 'V320', 'V321', 'V322', 'V323', 'V324', 'V325', 'V326', 'V327', 'V328', 'V329', 'V330', 'V331', 'V332', 'V333', 'V334', 'V335', 'V336', 'V337', 'V338', 'V339', 'id-01', 'id-02', 'id-03', 'id-04', 'id-05', 'id-06', 'id-07', 'id-08', 'id-09', 'id-10', 'id-11', 'id-13', 'id-14', 'id-17', 'id-18', 'id-19', 'id-20', 'id-21', 'id-22', 'id-24', 'id-25', 'id-26', 'id-32']

In [None]:
num_cols_to_shrink_test = detect_num_cols_to_shrink(num_test_cols, test)

convert_to_int8 = num_cols_to_shrink_test[0]
convert_to_int16 = num_cols_to_shrink_test[1]
convert_to_int32 = num_cols_to_shrink_test[2]

convert_to_float16 = num_cols_to_shrink_test[3]
convert_to_float32 = num_cols_to_shrink_test[4]

print("convert_to_int8 :", convert_to_int8, "\n")
print("convert_to_int16 :", convert_to_int16, "\n")
print("convert_to_int32 :", convert_to_int32, "\n")

print("convert_to_float16 :", convert_to_float16, "\n")
print("convert_to_float32 :", convert_to_float32, "\n")

In [None]:
print("starting with converting process....")

for col in convert_to_int16:
    
    test[col] = test[col].astype('int16') 
    
for col in convert_to_int32:
    test[col] = test[col].astype('int32') 

for col in convert_to_float16:
    test[col] = test[col].astype('float16')
    
for col in convert_to_float32:
    test[col] = test[col].astype('float32')
    
print("successfully converted!")

### **Checking Train column names & Test column names are same or not.**

We have observed there exist difference style in column name. Therefore, need to solve this issue.

In [None]:
def different_columns(traincols, testcols):
    
    for i in traincols:
        
        if i not in testcols:
            
            print(i)
            
different_columns(train.columns, test.columns)

In [None]:
test = test.rename(columns = {"id-01": "id_01", "id-02": "id_02", "id-03": "id_03", 
                            "id-06": "id_06", "id-05": "id_05", "id-04": "id_04", 
                            "id-07": "id_07", "id-08": "id_08", "id-09": "id_09", 
                            "id-10": "id_10", "id-11": "id_11", "id-12": "id_12", 
                            "id-15": "id_15", "id-14": "id_14", "id-13": "id_13", 
                            "id-16": "id_16", "id-17": "id_17", "id-18": "id_18", 
                            "id-21": "id_21", "id-20": "id_20", "id-19": "id_19", 
                            "id-22": "id_22", "id-23": "id_23", "id-24": "id_24", 
                            "id-27": "id_27", "id-26": "id_26", "id-25": "id_25", 
                            "id-28": "id_28", "id-29": "id_29", "id-30": "id_30", 
                            "id-31": "id_31", "id-32": "id_32", "id-33": "id_33", 
                            "id-34": "id_34", "id-35": "id_35", "id-36": "id_36", 
                            "id-37": "id_37", "id-38": "id_38"})
test.head()

### **Lets check again Train column names & Test column names are same or not.**

In [None]:
different_columns(train.columns, test.columns)

### **Fraud vs. Not Fraud**

In [None]:
fig = plt.figure(figsize = (5, 5))

sns.barplot([0,1],train['isFraud'].value_counts().values)
plt.show()

### **Missing Value**

In [None]:
#count total number of NULL values in train data

tot_missing_value = train.isnull().sum().sum()
print(tot_missing_value)

In [None]:
del tot_missing_value

In [None]:
column_missing_value = train.isnull().sum()
print(column_missing_value[0 : 60])
print(column_missing_value[60 : 120])
print(column_missing_value[120 : 180])
print(column_missing_value[180 : 240])
print(column_missing_value[240 : 300])
print(column_missing_value[300 : 360])
print(column_missing_value[360 : 420])
print(column_missing_value[420 : 434])

In [None]:
del column_missing_value

### **Train vs. Test TransactionDT**


In [None]:
fig = plt.figure(figsize = (10, 5))

plt.hist(train['TransactionDT'], label = 'T rain', bins = 35, color = 'red')
plt.hist(test['TransactionDT'], label = 'T est', bins = 35, color = 'yellow')
plt.legend()

plt.title('Train vs. Test TransactionDT Distribution')

* It is visible that train and test transaction dates don't overlap.

### **TransactionAmt**

Here, log transform is taken in some of these plots to better show the distribution. Otherwise the few, very large transactions skew the distribution. Log transformation is most likely the first thing one should do to remove skewness from the predictor. It can be easily done via Numpy, just by calling the log() function on the desired column.

In [None]:
# Train Data

fig, ax = plt.subplots(1, 2, figsize = (18, 4))

time_val = train['TransactionAmt'].values

sns.distplot(time_val, ax = ax[0], color = 'red')
ax[0].set_title('Train TransactionAmt Distribution', fontsize = 16)
ax[1].set_xlim([min(time_val), max(time_val)])

sns.distplot(np.log(time_val), ax = ax[1], color = 'green')
ax[1].set_title('Train LOG TransactionAmt Distribution', fontsize = 16)
ax[1].set_xlim([min(np.log(time_val)), max(np.log(time_val))])

plt.show()

In [None]:
del time_val

In [None]:
# Test Data

fig, ax = plt.subplots(1, 2, figsize = (18,4))

time_val = test['TransactionAmt'].values

sns.distplot(time_val, ax = ax[0], color = 'red')
ax[0].set_title('Test TransactionAmt Distribution', fontsize = 16)
ax[1].set_xlim([min(time_val), max(time_val)])

sns.distplot(np.log(time_val), ax = ax[1], color = 'green')
ax[1].set_title('Test LOG TransactionAmt Distribution', fontsize = 16)
ax[1].set_xlim([min(np.log(time_val)), max(np.log(time_val))])

plt.show()

In [None]:
del time_val

### **ProductCD**

In [None]:
fig, ax = plt.subplots(1, 2, figsize = (20, 7))

sns.countplot(x = "ProductCD", ax = ax[0], hue = "isFraud", data = train)
ax[0].set_title('Train ProductCD', fontsize = 16)

sns.countplot(x = "ProductCD", ax = ax[1], data = test)
ax[1].set_title('Test ProductCD', fontsize = 16)

plt.show()

* W has the most number of observations, S the least.

In [None]:
print(train['ProductCD'].value_counts())
print('\n-----------\n')
print(test['ProductCD'].value_counts())

### **card1 - card6**

* These are all categorical, though some appear numeric.

In [None]:
# Train Data

card_cols = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6']
color_pal = ['cyan', 'yellow', 'blue', 'green', 'red']

color_idx = 0

for c in card_cols:
    
    if train[c].dtype in ['float64','int64']:
        
        train[c].plot(kind = 'hist',
                      title = c,
                      bins = 70,
                      figsize = (10, 2),
                      color = color_pal[color_idx])
    color_idx += 1
    
    plt.show()

In [None]:
# Test Data

card_cols = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6']
color_pal = ['cyan', 'yellow', 'blue', 'green', 'red']

color_idx = 0

for c in card_cols:
    
    if test[c].dtype in ['float64','int64']:
        
        test[c].plot( kind = 'hist',
                      title = c,
                      bins = 70,
                      figsize = (10, 2),
                      color = color_pal[color_idx])
    color_idx += 1
    
    plt.show()

In [None]:
# card4

fig, ax = plt.subplots(1, 2, figsize = (18, 4))

sns.countplot(x = "card4", ax = ax[0], data = train.loc[train['isFraud'] == 0])
ax[0].set_title('card4 isFraud = 0', fontsize = 16)

sns.countplot(x = "card4", ax = ax[1], data = train.loc[train['isFraud'] == 1])
ax[1].set_title('card4 isFraud = 1', fontsize = 16)

plt.show()

In [None]:
# card6

fig, ax = plt.subplots(1, 2, figsize = (18, 4))

sns.countplot(x = "card6", ax = ax[0], data = train.loc[train['isFraud'] == 0])
ax[0].set_title('card6 isFraud = 0', fontsize = 16)

sns.countplot(x = "card6", ax = ax[1], data = train.loc[train['isFraud'] == 1])
ax[1].set_title('card6 isFraud = 1', fontsize = 16)

plt.show()

**card2, card3, card5**

Column Name = Total Number of Unique Value, dtype

Train Data

* card2 = 500, int
* card3 = 114, int (150 >> 521287)
* card5 = 119, int

Test Data

* card2 = 497, int
* card3 = 115, int (150, 435558)
* card5 = 102, int

### **addr1, addr2**

In [None]:
train['addr1'].plot(kind = 'hist', bins = 500, figsize = (10, 2), title = 'addr1 distribution', color = 'red')
plt.show()

train['addr2'].plot(kind = 'hist', bins = 500, figsize = (10, 2), title = 'addr2 distribution', color = 'green')
plt.show()

Column Name = Total Number of Unique Values, dtype

Train Data

* addr1 = 332, int
* addr2 = 74, int (87 >> 520481)

Test Data

* addr1 = 292, int
* addr2 = 72, int (87 >> 435934)

### **P_emaildomain**

In [None]:
train['P_emaildomain'].value_counts()

In [None]:
test['P_emaildomain'].value_counts()

### **C1-C14**

Column Name = Total Number of Unique Value, dtype

Train Data

* C1 = 1657, int (1 >> 316791)
* C2 = 1216, int (1 >> 316261)
* C3 = 27, int (0 >> 588111)
* C4 = 1260, int (0 >> 451883)
* C5 = 319, int (0 >> 372435)
* C6 = 1328, int (1 >> 341552)
* C7 = 1103, int (0 >> 523142)
* C8 = 1253, int (0 >> 447667)
* C9 = 205, int (1 >> 228938)
* C10 = 1231, int (0 >> 453442)
* C11 = 1476, int (1 >> 389692)
* C12 = 1199, int (0 >> 489199)
* C13 = 1597, int (1 >> 199337)
* C14 = 1108, int (1 >> 320189)

Test Data

* C1 = 1174, int (1 >> 269951)
* C2 = 1123, int (1 >> 265435)
* C3 = 31, int  (0 >> 493779)
* C4 = 728, int (0 >> 371571)
* C5 = 353, int (0 >> 334573)
* C6 = 834, int (1 >> 289685)
* C7 = 518, int (0 >> 438095)
* C8 = 442, int (0 >> 368637)
* C9 = 354, int (1 >> 191416)
* C10 = 456, int (0 >> 370978)
* C11 = 937, int (1 >> 342100)
* C12 = 663, int (0 >> 366881)
* C13 = 1069, int (1 >> 168280)
* C14 = 569, int (1 >> 278331)

### **D1, D10, D15**

Column Name = Total Number of Unique Values, dtype

Train Data

* D1 = 641, int (0 >> 280130)
* D4 = 808, int (0 >> 166571)
* D10 = 818, int (0 >> 221930)
* D15 = 859, int (0 >> 174246)

Test Data

* D1 = 642, int (0 >> 227724)
* D4 = 808, int (0 >> 178278)
* D10 = 1075, int (0 >> 227999)
* D15 = 1078, int (0 >> 183720)

### **Missing Value**

In [None]:
def getNulls(data):
    
    total = data.isnull().sum()
    percent = data.isnull().sum() / data.isnull().count()
    missing_data = pd.concat([total, percent], axis = 1, keys = ['total', 'precent'])
    
    return missing_data

In [None]:
# Train Data Missing Values

missing_data_train = getNulls(train)
missing_data_train.head(434).T

In [None]:
# Test Data Missing Values

missing_data_test = getNulls(test)
missing_data_test.head(434).T

In [None]:
del missing_data_test

## 4. Feature Selection (Part-1)

In [None]:
# Get the columns that we have to drop

sel_cols = missing_data_train[missing_data_train['total'] > 100000].index

In [None]:
del missing_data_train

In [None]:
# Drop the columns

train.drop(sel_cols, axis = 1, inplace = True)
test.drop(sel_cols, axis = 1, inplace = True)

In [None]:
print(train.shape)
print(test.shape)

In [None]:
missing_data_train = getNulls(train)
missing_data_train.head(182).T

In [None]:
missing_data_test = getNulls(test)
missing_data_test.head(181).T

## **5. Handle Missing Values**

In [None]:
print(list(train.columns))

In [None]:
# Merging Training & Testing Dataset to handle missing value

ntrain = train.shape[0]
ntest = test.shape[0]
all_data = pd.concat([train, test], axis = 0, sort = False)
all_data.shape

In [None]:
all_data_cols = all_data.columns

for i in all_data_cols:
    
    if all_data[i].dtype == 'object':
        
        all_data[i] = all_data[i].fillna(all_data[i].mode()[0])

Column those are Object type:

* ProductCD
* card4
* card6
* P_emaildomain

In [None]:
for i in all_data_cols:
    
    if (i.startswith("C") or (i.startswith("V"))) and all_data[i].isnull().sum() > 0:
        
        all_data[i] = all_data[i].fillna(all_data[i].mode()[0])
        
missing_data = getNulls(all_data)
missing_data.head(182).T

In [None]:
all_data['card2'] = all_data['card2'].fillna(all_data['card2'].mean())
all_data['card3'] = all_data['card3'].fillna(all_data['card3'].mean())
all_data['card5'] = all_data['card5'].fillna(all_data['card5'].mean())

In [None]:
all_data['D1'] = all_data['D1'].fillna(all_data['D1'].mode()[0])
all_data['D10'] = all_data['D10'].fillna(all_data['D10'].mode()[0])
all_data['D15'] = all_data['D15'].fillna(all_data['D15'].mode()[0])

In [None]:
all_data['addr1'] = all_data['addr1'].fillna(all_data['addr1'].mean())
all_data['addr2'] = all_data['addr2'].fillna(all_data['addr2'].mode()[0])

In [None]:
missing_data = getNulls(all_data)
missing_data.head(182).T

## *Split all_data*

In [None]:
train = all_data[ : ntrain]
test = all_data[ntrain : ]

In [None]:
print(train.shape)
print(test.shape)

In [None]:
test.head()

In [None]:
test.drop(['isFraud'], axis = 1, inplace = True)

In [None]:
test.shape

## *Set Target Column*

In [None]:
target = train['isFraud']

## 6. Feature Selection (Part-2)
#### *Correlation*

In [None]:
print(list(train.columns))

In [None]:
train.corr()['isFraud'].to_csv("correlation.csv")

In [None]:
lst1 = ['TransactionID', 'TransactionDT', 'TransactionAmt', 'ProductCD', 'isFraud']

corr1 = train[lst1].corr()
plt.subplots(figsize = (10, 6))
sns.heatmap(corr1, annot = True, cmap = "Blues")

In [None]:
lst2 = ['card1', 'card2', 'card3', 'card5', 'isFraud']

corr2 = train[lst2].corr()
plt.subplots(figsize = (12, 8))
sns.heatmap(corr2, annot = True, cmap = "Blues")

In [None]:
lst3 = ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'isFraud']

corr3 = train[lst3].corr()
plt.subplots(figsize = (20, 16))
sns.heatmap(corr3, annot = True, cmap = "Blues")

In [None]:
lst4 = ['V95', 'V96', 'V97', 'V98', 'V99', 'V100', 'V101', 'V102', 'V103', 'V104', 'V105', 'V106', 'V107', 'V108', 'isFraud']

corr4 = train[lst4].corr()
plt.subplots(figsize = (20, 16))
sns.heatmap(corr4, annot = True, cmap = "Blues")

In [None]:
lst5 = ['V109', 'V110', 'V111', 'V112', 'V113', 'V114', 'V115', 'V116', 'V117', 'V118', 'V119', 'V120', 'V121', 'V122', 'isFraud']

corr5 = train[lst5].corr()
plt.subplots(figsize = (20, 16))
sns.heatmap(corr5, annot = True, cmap = "Blues")

In [None]:
lst6 = ['V123', 'V124', 'V125', 'V126', 'V127', 'V128', 'V129', 'V130', 'V131', 'V132', 'V133', 'V134', 'V135', 'V136', 'isFraud']

corr6 = train[lst6].corr()
plt.subplots(figsize = (20, 16))
sns.heatmap(corr6, annot = True, cmap = "Blues")

In [None]:
lst7 = ['V137', 'V279', 'V280', 'V281', 'V282', 'V283', 'V284', 'V285', 'V286', 'V287', 'V288', 'V289', 'V290', 'V291', 'isFraud']

corr7 = train[lst7].corr()
plt.subplots(figsize = (20, 16))
sns.heatmap(corr7, annot = True, cmap = "Blues")

In [None]:
lst8 = ['V292', 'V293', 'V294', 'V295', 'V296', 'V297', 'V298', 'V299', 'V300', 'V301', 'V302', 'V303', 'V304', 'V305', 'V306', 'isFraud']

corr8 = train[lst8].corr()
plt.subplots(figsize = (20, 16))
sns.heatmap(corr8, annot = True, cmap = "Blues")

In [None]:
lst9 = ['V307', 'V308', 'V309', 'V310', 'V311', 'V312', 'V313', 'V314', 'V315', 'V316', 'V317', 'V318', 'V319', 'V320', 'V321', 'isFraud']

corr9 = train[lst9].corr()
plt.subplots(figsize = (20, 16))
sns.heatmap(corr9, annot = True, cmap = "Blues")

## *Encode the categorical features*

In [None]:
ntrain = train.shape[0]
ntest = test.shape[0]
all_data = pd.concat([train, test], axis = 0, sort = False)
print(all_data.shape)

In [None]:
ntrain + ntest

In [None]:
# Encoding the data

all_data = pd.get_dummies(all_data)
print(all_data.shape)
all_data.head()

In [None]:
# Split the data again and get ready for training

train = all_data[ : ntrain]
test = all_data[ntrain : ]

In [None]:
test_id = test['TransactionID']

In [None]:
train.drop(['TransactionID', 'isFraud'], axis = 1, inplace = True)

test.drop(['TransactionID', 'isFraud'], axis = 1, inplace = True)

print(train.shape)
print(test.shape)

In [None]:
different_columns(train.columns, test.columns)

In [None]:
del ntrain
del ntest

## *Reduce Memory Use*

#### *Train Data*

In [None]:
n = (train.dtypes != 'object')

num_train_cols = list(n[n].index) 

print(num_train_cols)

In [None]:
num_cols_to_shrink_train = detect_num_cols_to_shrink(num_train_cols, train)

convert_to_int8 = num_cols_to_shrink_train[0]
convert_to_int16 = num_cols_to_shrink_train[1]
convert_to_int32 = num_cols_to_shrink_train[2]

convert_to_float16 = num_cols_to_shrink_train[3]
convert_to_float32 = num_cols_to_shrink_train[4]

print("convert_to_int8 :", convert_to_int8, "\n")
print("convert_to_int16 :", convert_to_int16, "\n")
print("convert_to_int32 :", convert_to_int32, "\n")

print("convert_to_float16 :", convert_to_float16, "\n")
print("convert_to_float32 :", convert_to_float32, "\n")

In [None]:
print("starting with converting process....")

for col in convert_to_int16:
    
    train[col] = train[col].astype('int16') 
    
for col in convert_to_int32:
    train[col] = train[col].astype('int32') 

for col in convert_to_float16:
    train[col] = train[col].astype('float16')
    
for col in convert_to_float32:
    train[col] = train[col].astype('float32')
    
print("successfully converted!")

#### *Test Data*

In [None]:
n = (test.dtypes != 'object')

num_test_cols = list(n[n].index) 

print(num_test_cols)

In [None]:
num_cols_to_shrink_test = detect_num_cols_to_shrink(num_test_cols, test)

convert_to_int8 = num_cols_to_shrink_test[0]
convert_to_int16 = num_cols_to_shrink_test[1]
convert_to_int32 = num_cols_to_shrink_test[2]

convert_to_float16 = num_cols_to_shrink_test[3]
convert_to_float32 = num_cols_to_shrink_test[4]

print("convert_to_int8 :", convert_to_int8, "\n")
print("convert_to_int16 :", convert_to_int16, "\n")
print("convert_to_int32 :", convert_to_int32, "\n")

print("convert_to_float16 :", convert_to_float16, "\n")
print("convert_to_float32 :", convert_to_float32, "\n")

In [None]:
print("starting with converting process....")

for col in convert_to_int16:
    
    test[col] = test[col].astype('int16') 
    
for col in convert_to_int32:
    test[col] = test[col].astype('int32') 

for col in convert_to_float16:
    test[col] = test[col].astype('float16')
    
for col in convert_to_float32:
    test[col] = test[col].astype('float32')
    
print("successfully converted!")

## 7. Model Building

### **XGBoost**

In [None]:
xgmodel = xgb.XGBClassifier(n_estimators = 5000,
                            max_depth = 12,
                            learning_rate = 0.02,
                            subsample = 0.8,
                            colsample_bytree = 0.4,
                            missing = -1,
                            random_state = 42,
                            tree_method = 'gpu_hist')
xgmodel.fit(train, target)

In [None]:
y_pred = xgmodel.predict_proba(test)

In [None]:
print(y_pred)

In [None]:
sub = pd.DataFrame()
sub['TransactionID'] = test_id
sub['isFraud'] = y_pred[:, 1]
sub.to_csv('submission1.csv', index = False)