In [None]:
# Preprocessing
from sklearn import preprocessing
import xgboost as xgb

# modules to handle data
import os
import pandas as pd
import numpy as np
import seaborn as sns
import csv
import pickle
import gc

%matplotlib inline
import matplotlib.pyplot as plt

def create_file_for_submission(filename, classifier_predictions,
                               sample_submission_path='./datasets/IEEEFraudDetection/sample_submission.csv'):
    """Write a Kaggle submission CSV from predicted class probabilities.

    Parameters
    ----------
    filename : str
        Path of the CSV file to write.
    classifier_predictions : 2-D array-like supporting ``[:, 1]`` indexing
        Probability matrix; column 1 is used as the 'isFraud' probability.
        It needs one row per row of the sample submission, in the same order.
    sample_submission_path : str, optional
        CSV used as the template (read with 'TransactionID' as index).
        Defaults to the path the notebook has always used.
    """
    kaggle_submission = pd.read_csv(sample_submission_path, index_col='TransactionID')

    # Probabilities of class 2 ('isFraud'=1) are in column 1 of the matrix
    kaggle_submission['isFraud'] = classifier_predictions[:, 1]
    kaggle_submission.to_csv(filename)
    
def load_classifier_from_picklefile(filename):
    """Return the object stored in the pickle file ``filename``.

    The file is opened in a ``with`` block so the handle is closed even when
    unpickling raises.

    NOTE: ``pickle.load`` can execute arbitrary code; only load files that
    were produced by a trusted source.
    """
    with open(filename, 'rb') as infile:
        return pickle.load(infile)
    
def save_classifier_to_picklefile(filename, save_Classifier):
    """Pickle ``save_Classifier`` into the file ``filename`` (overwriting it).

    The file is opened in a ``with`` block so the handle is closed (and the
    data flushed) even when pickling raises.
    """
    with open(filename, 'wb') as model_file:
        pickle.dump(save_Classifier, model_file)

In [None]:
# Load the four raw competition tables, each indexed by TransactionID
def _read_by_transaction_id(csv_path):
    """Read one competition CSV using its 'TransactionID' column as the index."""
    return pd.read_csv(csv_path, index_col='TransactionID')

train_identity = _read_by_transaction_id('./datasets/IEEEFraudDetection/train_identity.csv')
train_transaction = _read_by_transaction_id('./datasets/IEEEFraudDetection/train_transaction.csv')

test_identity = _read_by_transaction_id('./datasets/IEEEFraudDetection/test_identity.csv')
test_transaction = _read_by_transaction_id('./datasets/IEEEFraudDetection/test_transaction.csv')

In [None]:
# Report the (rows, columns) shape of every raw table before merging
for shape_label, raw_table in (
        ("Shape of train_identity : ", train_identity),
        ("Shape of train_transaction : ", train_transaction),
        ("Shape of test_identity : ", test_identity),
        ("Shape of test_transaction : ", test_transaction)):
    print(shape_label, raw_table.shape)

In [None]:
# Left-join the identity table onto the transaction table via the shared
# TransactionID index, so every transaction row is kept even without identity data.
train_merged = train_transaction.merge(train_identity,
                                       how='left', left_index=True, right_index=True)
test_merged = test_transaction.merge(test_identity,
                                     how='left',left_index=True, right_index=True)

# Labels now name the frames actually being reported (they previously read
# "train_merged_identity" / "train_merged_transaction").
print("Shape of train_merged : ", train_merged.shape)
print("Shape of test_merged : ", test_merged.shape)

# Transaction columns (394 items including 'TransactionID')

1. TransactionID, isFraud, TransactionDT, TransactionAmt, ProductCD,
2. card1 - card6,
3. addr1 - addr2,
4. dist1 - dist2,
5. P_emaildomain, R_emaildomain,
6. C1 - C14,
7. D1 - D15,
8. M1 - M9,
9. V1 - V339

# Identity columns (41 items including 'TransactionID')
1. TransactionID,
2. id_01 - id_38,
3. DeviceType, DeviceInfo

In [None]:
# test_merged[19900:19910]
# Confirm the merged training frame's dimensions, then preview its first rows.
print("Merged DataFrame shape :", train_merged.shape)
train_merged.head()

# Categorical Features - Transaction
1. ProductCD
2. card1 - card6
3. addr1, addr2
4. P_emaildomain
5. R_emaildomain
6. M1 - M9

# Categorical Features - Identity
7. DeviceType
8. DeviceInfo
9. id_12 - id_38<BR>
<BR>**The TransactionDT feature is a timedelta from a given reference datetime (not an actual timestamp).**

### <B>1. ProductCD</B>

In [None]:
# Denominator for every percentage label below: number of training rows
total_counts = len(train_merged)

# --- Figure 1: ProductCD and card4 counts, each bar labelled with its share of all rows ---
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))
fig.suptitle('Categorical Features'.upper(), y=1.0, fontsize=14)

df_dType = pd.DataFrame({'ProductCD' : train_merged['ProductCD']})
df_id12 = pd.DataFrame({'id_12' : train_merged['id_12']})

ProductCD = sns.countplot(x='ProductCD', data=df_dType, ax=axes[0])
ProductCD.set_title('ProductCD', fontsize=14)
Card4 = sns.countplot(x=train_merged['card4'], data=train_merged, ax=axes[1])
Card4.set_title('card4', fontsize=14)

# Write "<share of all rows>%" just above every bar of both panels
for count_axis in (ProductCD, Card4):
    for bar in count_axis.patches:
        bar_height = bar.get_height()
        count_axis.text(bar.get_x() + bar.get_width()/2.,
                        bar_height + 1000,
                        '{:1.2f}%'.format(bar_height/total_counts*100),
                        ha="center", fontsize=11)

# Eg. To change the x-labels
# plt.xticks(np.arange(3), ("Missing", "Yes", "No"))
plt.show()

productcd_missing = train_merged['ProductCD'].isnull().sum() / total_counts * 100
card3_missing = train_merged['card3'].isnull().sum() / total_counts * 100
print("Missing Data for ProductCD :" + '{:1.2f}%'.format(productcd_missing))
print("Missing Data for card 3 :" + '{:1.2f}%'.format(card3_missing))


# --- Figure 2: raw count plots for six categorical columns on a 3x2 grid ---
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 8))
fig.subplots_adjust(hspace=1.0)
fig.suptitle('Categorical Feature counts'.upper(), y=1.02, fontsize=14)

isFraud = sns.countplot(x='isFraud', data=train_merged, ax=axes[0, 0])
ProductCD = sns.countplot(x='ProductCD', data=train_merged, ax=axes[0, 1])
DeviceType = sns.countplot(x='DeviceType', data=train_merged, ax=axes[1, 0])

# These three names end up holding the title Text objects, not the axes
card4 = sns.countplot(x='card4', data=train_merged, ax=axes[1, 1]).set_title("Card 4 by count")
card6 = sns.countplot(x='card6', data=train_merged, ax=axes[2, 0]).set_title("Card 6 by count")
m1 = sns.countplot(x='M1', data=train_merged, ax=axes[2, 1]).set_title("M1 by count")

plt.tight_layout()

### <B>2. card4 and card6</B>
(Missing Data : 0.27%)

In [None]:
# Columns whose name starts with a lowercase 'c'
c_cols = list(filter(lambda name: name[0] == 'c', train_merged))

# Per column: number of distinct values and share of missing rows
# Example : train_merged['id_01'].value_counts()
for col_name in c_cols:
    column = train_merged[col_name]
    df_unique = column.nunique()
    df_unique_counts = column.value_counts()
    print ("No. of unique values in", col_name, " :", df_unique)
    #print ("No. of counts per unique value in", col_name, " :\n", df_unique_counts)
    missing_share = column.isnull().sum() / total_counts * 100
    print("Missing Data : " + '{:1.2f}%'.format(missing_share))

train_merged[c_cols].head()

In [None]:
# Count plots for card4 and card6, each bar labelled with its share of all training rows
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))
fig.suptitle('Categorical Features'.upper(), y=1.0, fontsize=14)
df_card4 = pd.DataFrame({'card4' : train_merged['card4']})
# Fixed: the card6 frame used to be built under the column name 'card4'
df_card6 = pd.DataFrame({'card6' : train_merged['card6']})

#df_dType['ProductCD'] = df_dType['ProductCD'].fillna(-999)

Card4 = sns.countplot(x=train_merged['card4'], data=train_merged, ax=axes[0])
Card4.set_title('card4', fontsize=14)
Card6 = sns.countplot(x=train_merged['card6'], data=train_merged, ax=axes[1])
Card6.set_title('card6', fontsize=14)

# Write "<share of all rows>%" just above every bar of both panels
for count_axis in (Card4, Card6):
    for bar in count_axis.patches:
        bar_height = bar.get_height()
        count_axis.text(bar.get_x() + bar.get_width()/2.,
                        bar_height + 1000,
                        '{:1.2f}%'.format(bar_height/total_counts*100),
                        ha="center", fontsize=11)

# Eg. To change the x-labels
# plt.xticks(np.arange(3), ("Missing", "Yes", "No"))
plt.show()

print("Missing Data for card 4 :" + '{:1.2f}%'.format(train_merged['card4'].isnull().sum() / total_counts * 100))
print("Missing Data for card 6 :" + '{:1.2f}%'.format(train_merged['card6'].isnull().sum() / total_counts * 100))

In [None]:
# Tally how many training rows carry each card1 value; keep the value as a column too
card1_Grp = pd.DataFrame()
card1_Grp['card1Count'] = train_merged.groupby(['card1'])['card1'].count()
card1_Grp['card1'] = card1_Grp.index

# Too many distinct card1 values to draw, so keep only the 20 most frequent
card1_grp_top = card1_Grp.sort_values(by='card1Count', ascending=False).head(20)
order_card1 = card1_grp_top.sort_values(by='card1Count', ascending=False)['card1']

plt.figure(figsize=(25, 10))
sns.set(color_codes=True, font_scale=1.3)
ax = sns.barplot(x='card1', y='card1Count', data=card1_grp_top, order=order_card1)
ax.set_title('Top 20 ranking of card1', fontsize=18)
# Label each bar with its share of all training rows
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width()/2.
    ax.text(label_x, bar_height + 100,
            '{:1.2f}%'.format(bar_height/total_counts*100),
            ha="center", fontsize=20)

xt = plt.xticks(rotation=60)
plt.show()

print("Total no. of card1 :", len(card1_Grp))
missing_share = train_merged['card1'].isnull().sum() / total_counts * 100
print("Missing Data for card1 : " + '{:1.2f}%'.format(missing_share))
print("Top 20 :\n", card1_grp_top['card1'])

# Report the -999 fill value if it appears among the top 20 entries
for position, (row_label, row) in enumerate(card1_grp_top.iterrows()):
    if row['card1'] == -999:
        print("Found in Index", str(position) + ":", row['card1Count'])
        print("Missing Data : " + '{:1.2f}%'.format(row['card1Count'] / total_counts * 100))

In [None]:
# Tally how many training rows carry each card2 value; keep the value as a column too
card2_Grp = pd.DataFrame()
card2_Grp['card2Count'] = train_merged.groupby(['card2'])['card2'].count()
card2_Grp['card2'] = card2_Grp.index

# Too many distinct card2 values to draw, so keep only the 20 most frequent
card2_grp_top = card2_Grp.sort_values(by='card2Count', ascending=False).head(20)
order_card2 = card2_grp_top.sort_values(by='card2Count', ascending=False)['card2']

plt.figure(figsize=(25, 10))
sns.set(color_codes=True, font_scale=1.3)
ax = sns.barplot(x='card2', y='card2Count', data=card2_grp_top, order=order_card2)
ax.set_title('Top 20 ranking of card2', fontsize=18)
# Label each bar with its share of all training rows
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width()/2.
    ax.text(label_x, bar_height + 1000,
            '{:1.2f}%'.format(bar_height/total_counts*100),
            ha="center", fontsize=20)

xt = plt.xticks(rotation=60)
plt.show()

print("Total no. of card2 :", len(card2_Grp))
missing_share = train_merged['card2'].isnull().sum() / total_counts * 100
print("Missing Data for card2 : " + '{:1.2f}%'.format(missing_share))
print("Top 20 :\n", card2_grp_top['card2'])

# Report the -999 fill value if it appears among the top 20 entries
for position, (row_label, row) in enumerate(card2_grp_top.iterrows()):
    if row['card2'] == -999:
        print("Found in Index", str(position) + ":", row['card2Count'])
        print("Missing Data : " + '{:1.2f}%'.format(row['card2Count'] / total_counts * 100))

In [None]:
# Tally how many training rows carry each card3 value; keep the value as a column too
card3_Grp = pd.DataFrame()
card3_Grp['card3Count'] = train_merged.groupby(['card3'])['card3'].count()
card3_Grp['card3'] = card3_Grp.index

# Too many distinct card3 values to draw, so keep only the 20 most frequent
card3_grp_top = card3_Grp.sort_values(by='card3Count', ascending=False).head(20)
order_card3 = card3_grp_top.sort_values(by='card3Count', ascending=False)['card3']

plt.figure(figsize=(25, 10))
sns.set(color_codes=True, font_scale=1.3)
ax = sns.barplot(x='card3', y='card3Count', data=card3_grp_top, order=order_card3)
ax.set_title('Top 20 ranking of card3', fontsize=18)
# Label each bar with its share of all training rows
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width()/2.
    ax.text(label_x, bar_height + 1000,
            '{:1.2f}%'.format(bar_height/total_counts*100),
            ha="center", fontsize=20)

xt = plt.xticks(rotation=60)
plt.show()

print("Total no. of card3 :", len(card3_Grp))
missing_share = train_merged['card3'].isnull().sum() / total_counts * 100
print("Missing Data for card3 : " + '{:1.2f}%'.format(missing_share))
print("Top 20 :\n", card3_grp_top['card3'])

# Report the -999 fill value if it appears among the top 20 entries
for position, (row_label, row) in enumerate(card3_grp_top.iterrows()):
    if row['card3'] == -999:
        print("Found in Index", str(position) + ":", row['card3Count'])
        print("Missing Data : " + '{:1.2f}%'.format(row['card3Count'] / total_counts * 100))

In [None]:
# Tally how many training rows carry each card5 value; keep the value as a column too
card5_Grp = pd.DataFrame()
card5_Grp['card5Count'] = train_merged.groupby(['card5'])['card5'].count()
card5_Grp['card5'] = card5_Grp.index

# Too many distinct card5 values to draw, so keep only the 20 most frequent
card5_grp_top = card5_Grp.sort_values(by='card5Count', ascending=False).head(20)
order_card5 = card5_grp_top.sort_values(by='card5Count', ascending=False)['card5']

plt.figure(figsize=(25, 10))
sns.set(color_codes=True, font_scale=1.3)
ax = sns.barplot(x='card5', y='card5Count', data=card5_grp_top, order=order_card5)
ax.set_title('Top 20 ranking of card5', fontsize=18)
# Label each bar with its share of all training rows
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width()/2.
    ax.text(label_x, bar_height + 1000,
            '{:1.2f}%'.format(bar_height/total_counts*100),
            ha="center", fontsize=20)

xt = plt.xticks(rotation=60)
plt.show()

print("Total no. of card5 :", len(card5_Grp))
missing_share = train_merged['card5'].isnull().sum() / total_counts * 100
print("Missing Data for card5 : " + '{:1.2f}%'.format(missing_share))
print("Top 20 :\n", card5_grp_top['card5'])

# Report the -999 fill value if it appears among the top 20 entries
for position, (row_label, row) in enumerate(card5_grp_top.iterrows()):
    if row['card5'] == -999:
        print("Found in Index", str(position) + ":", row['card5Count'])
        print("Missing Data : " + '{:1.2f}%'.format(row['card5Count'] / total_counts * 100))

### <B>3. addr1 and addr2</B>

In [None]:
# Build a frame with two columns: the addr1 value and how many training rows carry it
addr1_Grp = pd.DataFrame()
addr1_Grp['addr1Count'] = train_merged.groupby(['addr1'])['addr1'].count()
addr1_Grp['addr1'] = addr1_Grp.index

# Too many distinct addr1 values to draw, so keep only the 20 most frequent
addr1_grp_top = addr1_Grp.sort_values(by='addr1Count', ascending=False).head(20)
order_addr1 = addr1_grp_top.sort_values(by='addr1Count', ascending=False)['addr1']

plt.figure(figsize=(25, 10))
sns.set(color_codes=True, font_scale=1.3)
ax = sns.barplot(x='addr1', y='addr1Count', data=addr1_grp_top, order=order_addr1)
ax.set_title('Top 20 ranking of addr1', fontsize=18)
# Label each bar with its share of all training rows
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width()/2.
    ax.text(label_x, bar_height + 100,
            '{:1.2f}%'.format(bar_height/total_counts*100),
            ha="center", fontsize=20)

xt = plt.xticks(rotation=60)
plt.show()

print("Total no. of addr1 :", len(addr1_Grp))
missing_share = train_merged['addr1'].isnull().sum() / total_counts * 100
print("Missing Data for addr1 : " + '{:1.2f}%'.format(missing_share))
print("Top 20 :\n", addr1_grp_top['addr1'])

# Report the -999 fill value if it appears among the top 20 entries
for position, (row_label, row) in enumerate(addr1_grp_top.iterrows()):
    if row['addr1'] == -999:
        print("Found in Index", str(position) + ":", row['addr1Count'])
        print("Missing Data : " + '{:1.2f}%'.format(row['addr1Count'] / total_counts * 100))

In [None]:
# Number of distinct (non-missing) addr1 values, as a one-entry Series keyed by column name
df = train_merged[['addr1']].nunique()
print (df)

In [None]:
# Build a frame with two columns: the addr2 value and how many training rows carry it
addr2_Grp = pd.DataFrame()
addr2_Grp['addr2Count'] = train_merged.groupby(['addr2'])['addr2'].count()
addr2_Grp['addr2'] = addr2_Grp.index

# Too many distinct addr2 values to draw, so keep only the 20 most frequent
addr2_grp_top = addr2_Grp.sort_values(by='addr2Count', ascending=False).head(20)
order_addr2 = addr2_grp_top.sort_values(by='addr2Count', ascending=False)['addr2']

plt.figure(figsize=(25, 10))
sns.set(color_codes=True, font_scale=1.3)
ax = sns.barplot(x='addr2', y='addr2Count', data=addr2_grp_top, order=order_addr2)
ax.set_title('Top 20 ranking of addr2', fontsize=18)
# Label each bar with its share of all training rows
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width()/2.
    ax.text(label_x, bar_height + 1000,
            '{:1.2f}%'.format(bar_height/total_counts*100),
            ha="center", fontsize=20)

xt = plt.xticks(rotation=60)
plt.show()

print("Total no. of addr2 :", len(addr2_Grp))
missing_share = train_merged['addr2'].isnull().sum() / total_counts * 100
print("Missing Data for addr2 : " + '{:1.2f}%'.format(missing_share))
print("Top 20 :\n", addr2_grp_top['addr2'])

# Report the -999 fill value if it appears among the top 20 entries
for position, (row_label, row) in enumerate(addr2_grp_top.iterrows()):
    if row['addr2'] == -999:
        print("Found in Index", str(position) + ":", row['addr2Count'])
        print("Missing Data : " + '{:1.2f}%'.format(row['addr2Count'] / total_counts * 100))

In [None]:
# Number of distinct (non-missing) addr2 values, as a one-entry Series keyed by column name
df = train_merged[['addr2']].nunique()
print (df)

### <B>4. P_emaildomain</B>

In [None]:
# Build a frame with two columns: the P_emaildomain value and how many training rows carry it
P_EmailGrp = pd.DataFrame()
P_EmailGrp['P_emailCount'] = train_merged.groupby(['P_emaildomain'])['P_emaildomain'].count()
P_EmailGrp['P_emaildomain'] = P_EmailGrp.index

# Too many distinct P_emaildomain values to draw, so keep only the 10 most frequent
group_top = P_EmailGrp.sort_values(by='P_emailCount', ascending=False).head(10)

plt.figure(figsize=(25, 10))
sns.set(color_codes=True, font_scale=1.3)
ax = sns.barplot(x="P_emaildomain", y="P_emailCount", data=group_top)
ax.set_title('P_emaildomain', fontsize=18)
# Label each bar with its share of all training rows
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width()/2.
    ax.text(label_x, bar_height + 1000,
            '{:1.2f}%'.format(bar_height/total_counts*100),
            ha="center", fontsize=20)

xt = plt.xticks(rotation=60)
plt.show()

print("Total no. of P_emaildomain :", len(P_EmailGrp))
missing_share = train_merged['P_emaildomain'].isnull().sum() / total_counts * 100
print("Missing Data for P_emaildomain : " + '{:1.2f}%'.format(missing_share))

# Report the -999 fill value if it appears among the top 10 entries
for position, (row_label, row) in enumerate(group_top.iterrows()):
    if row['P_emaildomain'] == -999:
        print("Found in Index", str(position) + ":", row['P_emailCount'])
        print("Missing Data : " + '{:1.2f}%'.format(row['P_emailCount'] / total_counts * 100))

### <B>5. R_emaildomain</B>

In [None]:
# Build a frame with two columns: the R_emaildomain value and how many training rows carry it
R_EmailGrp = pd.DataFrame()
R_EmailGrp['R_emailCount'] = train_merged.groupby(['R_emaildomain'])['R_emaildomain'].count()
R_EmailGrp['R_emaildomain'] = R_EmailGrp.index

# Too many distinct R_emaildomain values to draw, so keep only the 10 most frequent
group_top = R_EmailGrp.sort_values(by='R_emailCount', ascending=False).head(10)

plt.figure(figsize=(25, 10))
sns.set(color_codes=True, font_scale=1.3)
ax = sns.barplot(x="R_emaildomain", y="R_emailCount", data=group_top)
ax.set_title('R_emaildomain', fontsize=18)
# Label each bar with its share of all training rows
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width()/2.
    ax.text(label_x, bar_height + 1000,
            '{:1.2f}%'.format(bar_height/total_counts*100),
            ha="center", fontsize=20)

xt = plt.xticks(rotation=60)
plt.show()

print("Total no. of R_emaildomain :", len(R_EmailGrp))
missing_share = train_merged['R_emaildomain'].isnull().sum() / total_counts * 100
print("Missing Data for R_emaildomain : " + '{:1.2f}%'.format(missing_share))

# Report the -999 fill value if it appears among the top 10 entries
for position, (row_label, row) in enumerate(group_top.iterrows()):
    if row['R_emaildomain'] == -999:
        print("Found in Index", str(position) + ":", row['R_emailCount'])
        print("Missing Data : " + '{:1.2f}%'.format(row['R_emailCount'] / total_counts * 100))

### <B>6. M1 - M9</B>

In [None]:
# Subset the contiguous M1..M9 columns by position and attach the target for hue-splitting.
M1_loc = train_merged.columns.get_loc("M1")
M9_loc = train_merged.columns.get_loc("M9")
# .copy() makes df_m an independent frame, so adding a column below does not
# trigger SettingWithCopyWarning on a slice of train_merged.
df_m = train_merged.iloc[:, M1_loc:M9_loc+1].copy()
df_m['isFraud'] = train_merged.isFraud

df_m_cols = df_m.columns
figure, axes = plt.subplots(3, 3, figsize=(16, 12))
# axes.flat walks the 3x3 grid row by row; zip stops after the nine panels,
# so the trailing 'isFraud' column is never plotted against itself.
for axis, col_name in zip(axes.flat, df_m_cols):
    mplot = sns.countplot(x=col_name, hue='isFraud', data=df_m, ax=axis)
plt.tight_layout()

In [None]:
# List the column names of the M-feature subset (including the appended 'isFraud')
print(df_m_cols.values)

In [None]:
# Per column of df_m: distinct-value count, value frequencies and share of missing rows
for col_name in df_m_cols:
    column = df_m[col_name]
    df_m1_unique = column.nunique()
    df_m1_unique_counts = column.value_counts()
    print ("No. of unique values in", col_name, " :", df_m1_unique)
    print ("No. of counts per unique value in ", col_name, " :\n", df_m1_unique_counts)
    missing_share = train_merged[col_name].isnull().sum() / total_counts * 100
    print("Missing Data for", col_name + " : " + '{:1.2f}%'.format(missing_share))
    print("----------------------------------------")

### <B>7. DeviceType</B>

In [None]:
# Copy DeviceType into its own frame with missing values replaced by -999,
# so that missing rows get their own bar in the count plot
df_dType = train_merged[['DeviceType']].fillna(-999)

ax = sns.countplot(x='DeviceType', data=df_dType)
ax.set_title('DeviceType', fontsize=14)
# Print each bar's count and label the bar with its share of all training rows
for bar in ax.patches:
    bar_height = bar.get_height()
    print("Counts :", bar_height)

    label_x = bar.get_x() + bar.get_width()/2.
    ax.text(label_x, bar_height + 1000,
            '{:1.2f}%'.format(bar_height/total_counts*100),
            ha="center", fontsize=11)

# Change the x-labels
#plt.xticks(np.arange(3), ("Missing", "Yes", "No"))
plt.show()

missing_share = train_merged['DeviceType'].isnull().sum() / total_counts * 100
print("Missing Data : " + '{:1.2f}%'.format(missing_share))

### <B>8. DeviceInfo</B>

In [None]:
# Build a frame with two columns: the DeviceInfo value and how many training rows carry it
D_InfoGrp = pd.DataFrame()
D_InfoGrp['DeviceCount'] = train_merged.groupby(['DeviceInfo'])['DeviceInfo'].count()
D_InfoGrp['DeviceInfo'] = D_InfoGrp.index

# There are too many devices to draw, so keep only the 20 most frequent
group_top = D_InfoGrp.sort_values(by='DeviceCount', ascending=False).head(20)

plt.figure(figsize=(25, 10))
sns.set(color_codes=True, font_scale=1.3)
ax = sns.barplot(x="DeviceInfo", y="DeviceCount", data=group_top)
ax.set_title('DeviceInfo', fontsize=18)

# Label each bar with its share of all training rows
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width()/2.
    ax.text(label_x, bar_height + 1000,
            '{:1.2f}%'.format(bar_height/total_counts*100),
            ha="center", fontsize=20)

xt = plt.xticks(rotation=60)
plt.show()

print("Total no. of DeviceInfo :", len(D_InfoGrp))
missing_share = train_merged['DeviceInfo'].isnull().sum() / total_counts * 100
print("Missing Data for DeviceInfo : " + '{:1.2f}%'.format(missing_share))

# Report the -999 fill value if it appears among the top 20 entries
for position, (row_label, row) in enumerate(group_top.iterrows()):
    if row['DeviceInfo'] == -999:
        print("Found in Index", str(position) + ":", row['DeviceCount'])
        print("Missing Data : " + '{:1.2f}%'.format(row['DeviceCount'] / total_counts * 100))

### <B>9. id_01 - id_38</B>

In [None]:
# Denominator for the missing-data percentages: number of training rows
total_counts = len(train_merged)

# Columns whose name starts with a lowercase 'i'
i_cols = list(filter(lambda name: name[0] == 'i', train_merged))

# Per column: number of distinct values and share of missing rows
# Example : train_merged['id_01'].value_counts()
for col_name in i_cols:
    column = train_merged[col_name]
    df_unique = column.nunique()
    df_unique_counts = column.value_counts()
    print ("No. of unique values in", col_name, " :", df_unique)
    #print ("No. of counts per unique value in", col_name, " :\n", df_unique_counts)
    missing_share = column.isnull().sum() / total_counts * 100
    print("Missing Data : " + '{:1.2f}%'.format(missing_share))

train_merged[i_cols].head()

In [None]:
# Share of missing rows per column, rendered as text such as "12.34%"
null_fraction = train_merged.isnull().sum() / total_counts
missing_data_pc = (null_fraction * 100).round(2).astype(str) + "%"
missing_data_pc.index

In [None]:
# Print all columns ordered from most to least missing data.
# missing_data_pc holds strings such as "9.5%"; sorting those directly is
# lexicographic (e.g. "9.5%" would rank above "77.0%"), so order by the numeric
# value and then pick the formatted strings in that order.
numeric_missing_pc = missing_data_pc.str.rstrip('%').astype(float)
sorted_missing_data_pc = missing_data_pc.loc[numeric_missing_pc.sort_values(ascending=False).index]

print("Missing values in data :\n" + sorted_missing_data_pc.to_string())
print("\nTotal number of columns :", len(sorted_missing_data_pc))

In [None]:
# List every column whose share of missing rows exceeds 77%, formatted so the
# output can be pasted into a Python list of column names.
total_num_features = len(sorted_missing_data_pc)
selected_num_features = 0
# Iterate label/value pairs instead of indexing the label-indexed Series with an
# integer, which relied on pandas' deprecated positional fallback.
for feature_name, pc_text in sorted_missing_data_pc.items():
    float_val = float(pc_text.rstrip('%'))
    if float_val > 77.00 :
        selected_num_features += 1
        print("'" + feature_name + "',")

print("Total selected features :", selected_num_features)

In [None]:
# Raw count plots for six categorical columns on a 3x2 grid
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(16, 10))
fig.subplots_adjust(hspace=1.0)
fig.suptitle('Categorical Feature counts'.upper(), y=1.02, fontsize=14)

isFraud = sns.countplot(x='isFraud', data=train_merged, ax=axes[0, 0])
ProductCD = sns.countplot(x='ProductCD', data=train_merged, ax=axes[0, 1])
DeviceType = sns.countplot(x='DeviceType', data=train_merged, ax=axes[1, 0])
# These three names end up holding the title Text objects, not the axes
card4 = sns.countplot(x='card4', data=train_merged, ax=axes[1, 1]).set_title("Card 4 by count")
card6 = sns.countplot(x='card6', data=train_merged, ax=axes[2, 0]).set_title("Card 6 by count")
m1 = sns.countplot(x='M1', data=train_merged, ax=axes[2, 1]).set_title("M1 by count")

plt.tight_layout()

# Inspecting parameters in fraud instances

In [None]:
# Column under inspection and the name used for its per-value count column
addr = 'addr1'
addrC = 'addr1Count'

# Subset fraud dataset: transactions per `addr` value among fraudulent rows
fraud = pd.DataFrame()
is_fraud = train_merged[train_merged['isFraud']==1]
fraud[addrC] = is_fraud.groupby([addr])[addr].count()
fraud[addr] = fraud.index

# Subset NOT fraud dataset: the same tally among legitimate rows
NOfraud = pd.DataFrame()
no_fraud = train_merged[train_merged['isFraud']==0]
NOfraud[addrC] = no_fraud.groupby([addr])[addr].count()
NOfraud[addr] = NOfraud.index

# There are too many addr values, so we will subset the top 20 of each group
group_top_f = fraud.sort_values(by=addrC,ascending=False).head(20)
order_f = group_top_f.sort_values(by=addrC,ascending=False)[addr]

group_top_l = NOfraud.sort_values(by=addrC,ascending=False).head(20)
order_l = group_top_l.sort_values(by=addrC,ascending=False)[addr]

f, axes = plt.subplots(4, 1, figsize=(18, 20))

sns.set(color_codes=True)
sns.set(font_scale = 1.3)
# Panels 0-1 are ordered by count; panels 2-3 use seaborn's default ordering
ax = sns.barplot(x=addr, y=addrC, data=group_top_f, order = order_f, ax=axes[0])
bx = sns.barplot(x=addr, y=addrC, data=group_top_l, order = order_l, ax=axes[1])

az = sns.barplot(x=addr, y=addrC, data=group_top_f, ax=axes[2])
bz = sns.barplot(x=addr, y=addrC, data=group_top_l, ax=axes[3])

# Titles follow `addr`, so they stay correct if another column is inspected
font_size= {'size': 'x-large'}
ax.set_title("Fraud transactions by " + addr + " (ranked)", **font_size)
bx.set_title("Legit transactions by " + addr + " (ranked)", **font_size)

az.set_title("Fraud transactions by " + addr, **font_size)
bz.set_title("Legit transactions by " + addr, **font_size)

# plt.xticks() only acts on the current (last) axes; rotate the labels on all four panels
for panel in axes:
    panel.tick_params(axis='x', labelrotation=60)
plt.tight_layout()

In [None]:
# Each line prints the full (rows, columns) shape tuple of the subset, not just the row count
print("No. of Frauds :", is_fraud.shape)
print("No. of Non-Frauds :", no_fraud.shape)

In [None]:
# Preview the first rows of the fraud-only subset
is_fraud.head()

In [None]:
# All column labels of the merged training frame (the 'isFraud' target is still included here)
features = train_merged.columns

print("Train Features : ", features)

In [None]:
# card4 and card6 counts side by side, bars split by the isFraud label
f, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 10))
sns.set(color_codes=True)
card4 = sns.countplot(data=train_merged, x='card4', hue="isFraud", ax=axes[0])
card6 = sns.countplot(data=train_merged, x='card6', hue="isFraud", ax=axes[1])

In [None]:
# The 10 most frequent purchaser (P) and recipient (R) e-mail domains, most frequent first
order_p = train_merged['P_emaildomain'].value_counts().index[:10]
order_r = train_merged['R_emaildomain'].value_counts().index[:10]

f, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))

sns.set(color_codes=True)
# Horizontal count plots restricted to those domains, bars split by the isFraud label
p_email = sns.countplot(data=train_merged, y='P_emaildomain', hue="isFraud", order=order_p, ax=axes[0])
r_email = sns.countplot(data=train_merged, y='R_emaildomain', hue="isFraud", order=order_r, ax=axes[1])
plt.tight_layout()

In [None]:
# Collect the columns whose name starts with an uppercase 'C' and preview them
c_cols = list(filter(lambda name: name[0] == 'C', train_merged))
train_merged[c_cols].head()

In [None]:
# Number of distinct (non-missing) values in each of the C columns
df = train_merged[c_cols].nunique()

print (df)

In [None]:
# Columns whose name starts with an uppercase 'D'. The prefix test alone decides,
# so names such as 'DeviceType' and 'DeviceInfo' match as well as D1..D15.
d_cols = list(filter(lambda name: name[0] == 'D', train_merged))
train_merged[d_cols].head()

In [None]:
# Columns whose name starts with an uppercase 'M', previewed
m_cols = list(filter(lambda name: name[0] == 'M', train_merged))
train_merged[m_cols].head()

In [None]:
# Columns whose name starts with an uppercase 'V', previewed
v_cols = list(filter(lambda name: name[0] == 'V', train_merged))
train_merged[v_cols].head()

In [None]:
# Columns whose name starts with an uppercase 'T', previewed
T_cols = list(filter(lambda name: name[0] == 'T', train_merged))
train_merged[T_cols].head()

# Fill NaNs

In [None]:
# Preview the merged training frame before any missing values are filled
train_merged.head()

In [None]:
# Replace missing R_emaildomain entries with the -999 sentinel.
# NOTE(review): this overwrites the column on train_merged itself, so cells above
# that read R_emaildomain give different results if re-run after this one.
train_merged['R_emaildomain'] = train_merged['R_emaildomain'].fillna(-999)

In [None]:
# Target vector
y_train = train_merged['isFraud'].copy()

# Feature matrices: drop the target from train, then replace every missing value with -999.
# Text columns that had missing entries now hold a mix of strings and the integer
# -999, so they remain dtype 'object' until they are label-encoded.
X_train = train_merged.drop(columns='isFraud').fillna(-999)
X_test = test_merged.fillna(-999)

y_train.value_counts()

In [None]:
import seaborn as sns

# Bar chart of the class balance in the target (rows per isFraud value)
class_counts = y_train.value_counts()
y = class_counts.values
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
sns.barplot(x=class_counts.index, y=y)

plt.title('Ground Truth count')

y_train.value_counts()

In [None]:
# The shapes do not count "TransactionID", because it is used as the index (index_col)
X_train.shape, X_test.shape, y_train.shape

In [None]:
# Preview the training features after missing values were filled with -999
X_train.head()

In [None]:
# card4 values of the first ten training rows, for comparison after label encoding
X_train.iloc[0:10]['card4']

# Examine different Machine Learning models 

# Preprocessing

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# One-step pipeline wrapping StandardScaler
std_scaler = Pipeline(steps=[("standard_scaler", StandardScaler())])

In [None]:
# Delete the raw and merged frames to free up memory; X_train / X_test / y_train
# were derived from them in earlier cells and are the frames used from here on.
del train_merged, test_merged, train_transaction, train_identity
del test_transaction, test_identity

# Garbage Collection - the function has to be *called*; a bare `gc.collect`
# only displays the function object and collects nothing.
gc.collect()

In [None]:
# List every training column that still has dtype 'object' and therefore needs encoding
object_columns = [name for name in X_train.columns if X_train[name].dtype == 'object']
for column_name in object_columns:
    print(column_name)
num_obj_types = len(object_columns)

print("No of object types to be encoded :", num_obj_types, "out of", len(X_train.columns))

In [None]:
# Label Encoding before model training.
# Every column that has dtype 'object' in either frame is replaced, in both frames,
# by integer codes from one LabelEncoder fitted on the train and test values together,
# so a given value gets the same code in X_train and X_test.
# NOTE(review): the loop looks up each X_train column name in X_test as well, so it
# assumes both frames share identical column names - confirm for the identity columns.
for index in X_train.columns:
    if X_train[index].dtype=='object' or X_test[index].dtype=='object': 
        # print(X_train[index])
        # Encode labels with value between 0 and n_classes-1
        lbl = preprocessing.LabelEncoder()
        # Fit on the concatenated value lists of both frames
        lbl.fit(list(X_train[index].values) + list(X_test[index].values))
        X_train[index] = lbl.transform(list(X_train[index].values))
        X_test[index] = lbl.transform(list(X_test[index].values))  

In [None]:
# card4 of the first ten training rows again, now showing the label-encoded values
X_train.iloc[0:10]['card4']

In [None]:
# Standardize features by removing the mean and scaling to unit variance
#X_train = std_scaler.fit_transform(X_train)
#X_test = std_scaler.fit_transform(X_test)

###  If XGBClassifier model exists, load it

In [None]:
import os.path

# Location of a previously pickled XGBClassifier, if one was saved
retrieve_xgbc_file = './datasets/IEEEFraudDetection/XGBClassifier_model_pickle'
model_file_found = os.path.exists(retrieve_xgbc_file)

# Only when the file exists is loaded_XGBClassifier defined; otherwise this cell does nothing.
if model_file_found:
    # xgb_Classifier.save_model and load_model give an "le" error when trying to obtain score,
    # so the model is pickled/unpickled instead; loading it means training need not be redone.
    loaded_XGBClassifier = load_classifier_from_picklefile(retrieve_xgbc_file)
    print("Unpickling existing XGBClassifier model...")
    print("Loaded Classifier :\n", loaded_XGBClassifier)
    print("with type\n", type(loaded_XGBClassifier))

In [None]:
# Score the loaded model on the training data it is evaluated against here
# (presumably mean accuracy, the scikit-learn classifier convention - TODO confirm).
loaded_XGBClassifier.score(X_train, y_train)

# Value recorded from an earlier run:
# 0.9897060317675348

In [None]:
# 3-fold cross-validated ROC AUC of the loaded classifier on the training data
from sklearn.model_selection import cross_val_score

loaded_XGBClassifier_auc_scores = cross_val_score(
    loaded_XGBClassifier, X_train, y_train, scoring='roc_auc', cv=3)
loaded_XGBClassifier_auc_scores, loaded_XGBClassifier_auc_scores.mean()

# Result recorded from an earlier run:
# (array([0.85256255, 0.74023   , 0.80202556]), 0.798272703167005)

In [None]:
# Predict class probabilities for the test set and write them out as a submission file
# (create_file_for_submission uses column 1 of the matrix as the 'isFraud' probability).
xgb_predictions = loaded_XGBClassifier.predict_proba(X_test)
create_file_for_submission("./datasets/IEEEFraudDetection/loaded_simple_xgboost.csv", xgb_predictions)

# Approach 
1. Drop the following 60 columns (`TransactionDT`, 50 `V`/`C` columns, and `M1`–`M9`); this gives a better score
2. fill the missing information

In [None]:
# Columns removed from both feature frames: TransactionDT, 50 V/C columns and M1..M9
drop_col = (
    ['TransactionDT']
    + ['V300', 'V309', 'V111', 'C3', 'V124', 'V106', 'V125', 'V315', 'V134', 'V102',
       'V123', 'V316', 'V113', 'V136', 'V305', 'V110', 'V299', 'V289', 'V286', 'V318',
       'V103', 'V304', 'V116', 'V298', 'V284', 'V293', 'V137', 'V295', 'V301', 'V104',
       'V311', 'V115', 'V109', 'V119', 'V321', 'V114', 'V133', 'V122', 'V319', 'V105',
       'V112', 'V118', 'V117', 'V121', 'V108', 'V135', 'V320', 'V303', 'V297', 'V120']
    + ['M' + str(number) for number in range(1, 10)]
)
#drop_col = ['TransactionDT']

# Re-running this cell raises KeyError, because the columns are already gone
X_train = X_train.drop(columns=drop_col)
X_test = X_test.drop(columns=drop_col)
X_train.head()

In [None]:
# Sanity check: both frames should report the same number of columns after the drop
X_train.shape, X_test.shape

In [None]:
#drop_col = ['TransactionDT',
#            'V300', 'V309', 'V111', 'C3', 'V124', 'V106', 'V125', 'V315', 'V134', 'V102',
#            'V123', 'V316', 'V113', 'V136', 'V305', 'V110', 'V299', 'V289', 'V286', 'V318',
#            'V103', 'V304', 'V116', 'V298', 'V284', 'V293', 'V137', 'V295', 'V301', 'V104',
#            'V311', 'V115', 'V109', 'V119', 'V321', 'V114', 'V133', 'V122', 'V319', 'V105',
#            'V112', 'V118', 'V117', 'V121', 'V108', 'V135', 'V320', 'V303', 'V297', 'V120']

# The 123 columns to drop; per the original note these have more than 77% missing data
drop_col = [
    'id_24', 'id_26', 'id_21', 'id_07', 'id_25', 'id_08', 'id_23', 'id_27', 'id_22', 'dist2',
    'D7', 'id_18', 'D13', 'D14', 'D12', 'id_04', 'id_03', 'D6', 'id_33', 'D8',
    'D9', 'id_10', 'id_09', 'id_30', 'id_32', 'id_34', 'id_14', 'V151', 'V152', 'V153',
    'V154', 'V155', 'V156', 'V164', 'V157', 'V158', 'V159', 'V160', 'V161', 'V162',
    'V149', 'V150', 'V163', 'V165', 'V138', 'V139', 'V140', 'V141', 'V142', 'V143',
    'V144', 'V145', 'V146', 'V166', 'V147', 'V148', 'V338', 'V339', 'V336', 'V335',
    'V322', 'V323', 'V324', 'V325', 'V326', 'V327', 'V328', 'V329', 'V330', 'V331',
    'V332', 'V333', 'V334', 'V337', 'DeviceInfo', 'id_13', 'id_16', 'V254', 'V244', 'V246',
    'V247', 'V248', 'V249', 'V252', 'V253', 'V257', 'V242', 'V258', 'V260', 'V261',
    'V219', 'V263', 'V264', 'V243', 'V241', 'V266', 'V229', 'V217', 'V223', 'V224',
    'V225', 'V226', 'V228', 'V230', 'V240', 'V231', 'V232', 'V233', 'V235', 'V236',
    'V237', 'V265', 'V262', 'V267', 'V274', 'V268', 'V278', 'V277', 'V276', 'V275',
    'V218', 'V273', 'V269',
]

# Remove the sparse columns from both frames in place (no NaN replacement happens here).
# Re-running the cell raises a KeyError because the columns are already gone.
for frame in (X_train, X_test):
    frame.drop(columns=drop_col, inplace=True)
X_train.head()

# Classifiers and Predictions
## (1) XGBoost Classifier

In [None]:
# Confirm the feature-matrix dimensions that the classifiers below will be trained on
X_train.shape, X_test.shape

In [None]:
# Preview the first rows of the training features after the column drops
X_train.head()

### Else create XGBClassifier model and save it

In [None]:
# Hyperparameters of the gradient-boosted model, gathered in one place
xgb_params = dict(
    n_estimators=500,
    n_jobs=4,
    max_depth=9,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    missing=-999,   # sentinel that XGBoost should treat as a missing value
    gamma=0.1,
    alpha=4,
)
xgb_Classifier = xgb.XGBClassifier(**xgb_params)

# Fit on the full training set; the fitted estimator is the cell's displayed output
xgb_Classifier.fit(X_train, y_train)

In [None]:
# Returns the mean accuracy on the given data and labels.
# Here the data is the training set itself, so this is not a held-out estimate.
xgb_Classifier.score(X_train, y_train)

# Recorded results from earlier runs:
# 0.9897669929217326 for full dataset
# 0.9893317980153757 for dropping 51 columns
# 0.9891607681105429 for dropping 'TransactionDT'
# 0.9888322552240323 for dropping columns with missing data of more than 75%
# 0.989768686287127 for dropping columns with missing data of more than 77%

In [None]:
# Class probabilities for every test row (one column per class)
xgb_predictions = xgb_Classifier.predict_proba(X_test)
# xgb_predictions.shape gives (506691, 2)
# xgb_predictions[0:2, 1] gives 2 values in the second column

# Pickling files: persist the trained model so that the "load if exists" cell can reuse it
print("Pickling XGBClassifier model...")
filename = './datasets/IEEEFraudDetection/XGBClassifier_model_pickle'
save_classifier_to_picklefile(filename, xgb_Classifier)

In [None]:
# Write the submission CSV; column 1 of xgb_predictions becomes the 'isFraud' value
create_file_for_submission("./datasets/IEEEFraudDetection/XGBClassifier_partial_dataset.csv", xgb_predictions)

In [None]:
# Search the best combination of hyperparameter values
# ** This will take more than 5+ hours **
from sklearn.model_selection import GridSearchCV

# loaded_XGBClassifier.get_params().keys()
# Number of cross-validation folds
kfold=3

# Candidate values per hyperparameter. Only 'gamma' and 'n_estimators' are active in the
# grid below; the other lists are kept for the commented-out entries.
colsample_bytrees = [0.9, 0.95]
gammas = [0.001, 0.01, 0.1, 1]
learning_rates = [0.05, 0.10]
max_depths = [9, 10]
missings = [-999]
n_estimators = [100, 200, 500]
subsamples = [0.9, 0.95]

# Active grid: 4 gammas x 3 n_estimators = 12 candidates, i.e. 36 fits with 3 folds
xgb_Classifier_param_grid = {#'colsample_bytree': colsample_bytrees,
                             'gamma': gammas,
                             #'learning_rate': learning_rates,
                             #'max_depth': max_depths,
                             #'missing': missings,
                             'n_estimators': n_estimators,
                             #'subsample': subsamples
                            }

# Tuning the param for GridSearch and performance has been increased to approx. x% 
# TODO: replace the "x%" placeholder above with the measured improvement.
# NOTE(review): n_jobs=4 here is combined with whatever n_jobs xgb_Classifier itself
# uses, which may oversubscribe the CPU - confirm.
grid_search = GridSearchCV(xgb_Classifier, xgb_Classifier_param_grid,
                           cv=kfold, scoring="roc_auc", n_jobs= 4, verbose = 1)

grid_search.fit(X_train, y_train)
# Best parameter combination found by ROC AUC
grid_search.best_params_

## (2) XGBoost Random Forest Classifier

###  If XGBRFClassifier model exists, load it

In [None]:
# Path of the pickle file that may hold a previously trained XGBRFClassifier
retrieve_xgbrfc_file = './datasets/IEEEFraudDetection/XGBRFClassifier_model_pickle'

# Load the model only when the pickle exists. Otherwise loaded_XGBRFClassifier stays
# undefined and the model has to be trained and saved by the cells further down.
if os.path.exists(retrieve_xgbrfc_file):
    # Unpickle the saved binary file so that training does not need to be redone
    loaded_XGBRFClassifier = load_classifier_from_picklefile(retrieve_xgbrfc_file)
    # The message names the model that was actually loaded (it used to say XGBClassifier)
    print("Unpickling existing XGBRFClassifier model...")
    print("Loaded Classifier :\n", loaded_XGBRFClassifier)
    print("with type\n", type(loaded_XGBRFClassifier))

In [None]:
# Score of the loaded random-forest model on the training set (not a held-out estimate)
loaded_XGBRFClassifier.score(X_train, y_train)

# Recorded output from an earlier run:
# 0.9757188336099164

In [None]:
# Class probabilities for the test set; column 1 is written as 'isFraud' in the submission
xgbrf_predictions = loaded_XGBRFClassifier.predict_proba(X_test)
create_file_for_submission("./datasets/IEEEFraudDetection/loaded_simple_xgbrf.csv", xgbrf_predictions)

### Else create XGBRFClassifier model and save it

In [None]:
# Random-forest variant of XGBoost, configured with the same tree settings as the
# XGBClassifier cell (500 trees, depth 9, 0.9 row/column subsampling, -999 as missing).
# NOTE(review): the XGBoost random-forest documentation recommends learning_rate=1 for
# random-forest mode - confirm that learning_rate=0.05 is intended here.
xgb_RFClassifier = xgb.XGBRFClassifier(n_estimators=500,
                        n_jobs=4,
                        max_depth=9,
                        learning_rate=0.05,
                        subsample=0.9,
                        colsample_bytree=0.9,
                        missing=-999)

# scale_pos_weight=1
xgb_RFClassifier.fit(X_train, y_train)

In [None]:
# Score of the freshly trained random-forest model on the training set
xgb_RFClassifier.score(X_train, y_train)

In [None]:
# Class probabilities for every test row
xgbrf_predictions = xgb_RFClassifier.predict_proba(X_test)

# Pickling files: persist the trained model so that the "load if exists" cell can reuse it
print("Pickling XGBRandomForestClassifier model...")
filename = './datasets/IEEEFraudDetection/XGBRFClassifier_model_pickle'
save_classifier_to_picklefile(filename, xgb_RFClassifier)

In [None]:
# Write the submission CSV; column 1 of xgbrf_predictions becomes the 'isFraud' value
create_file_for_submission("./datasets/IEEEFraudDetection/XGBRFClassifier.csv", xgbrf_predictions)

## (3) Bagging Classifier

In [None]:
# The import previously named no symbol, which is a syntax error;
# BaggingClassifier is the class used below.
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging ensemble of 500 decision trees, each fitted on a bootstrap sample of
# 50,000 rows; random_state is fixed on both levels for reproducibility.
bag_Classifier = BaggingClassifier(DecisionTreeClassifier(random_state=42), n_estimators=500,
                                   max_samples=50000, bootstrap=True, n_jobs=-1, random_state=42)

In [None]:
# Train the bagging ensemble, then predict the test set.
# predict() yields one predicted class label per row, not class probabilities.
bag_Classifier.fit(X_train, y_train)
bag_predictions = bag_Classifier.predict(X_test)

In [None]:
# bag_predictions should have one entry per X_test row
X_train.shape, X_test.shape, y_train.shape, bag_predictions.shape

In [None]:
# Score of the bagging ensemble on the training set (not a held-out estimate)
bag_Classifier.score(X_train, y_train)

# Recorded results from earlier runs:
# 0.9781183323737596 partial dataset
# 0.9780556778541674 full dataset

In [None]:
# Turn the predictions and the X_test index into column vectors so they can be
# placed side by side: column 0 = index, column 1 = prediction.
classifier_predictions = bag_predictions.reshape(-1, 1)
classifier_index = np.asarray(X_test.index).reshape(-1, 1)

print("The reshape of Prediction numpy array : ", classifier_predictions.shape)
classifier_predicted_results = np.hstack((classifier_index, classifier_predictions))

classifier_predicted_results.shape

In [None]:
# create_file_for_submission reads column 1, which holds the bagging predictions here
create_file_for_submission("./datasets/IEEEFraudDetection/Bag_Classifier_partial_dataset.csv", classifier_predicted_results)

## (4) SVM Classifiers

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM; class_weight='balanced' is requested because of the class imbalance
# NOTE(review): the features appear to be unscaled at this point - confirm, since
# LinearSVC is sensitive to feature scale.
svm_classifier = LinearSVC(class_weight='balanced')
svm_classifier.fit(X_train, y_train)

In [None]:
# Using Cross Validation to check the performance of Support Vector Machine
# (3 folds, default scoring; cross_val_score is imported in an earlier cell)
svm_scores = cross_val_score(svm_classifier, X_train, y_train, cv=3)
svm_scores.mean()

In [None]:
# Predicted class label per test row (LinearSVC is used via predict, not probabilities)
svm_predictions = svm_classifier.predict(X_test)
svm_predictions.shape

In [None]:
# Length of the test index; it should match svm_predictions.shape
X_test.index.shape

In [None]:
# Turn the predictions and the X_test index into column vectors so they can be
# placed side by side: column 0 = index, column 1 = prediction.
classifier_predictions = svm_predictions.reshape(-1, 1)
classifier_index = np.asarray(X_test.index).reshape(-1, 1)

print("The reshape of Prediction numpy array : ", classifier_predictions.shape)
classifier_predicted_results = np.hstack((classifier_index, classifier_predictions))

classifier_predicted_results.shape

In [None]:
# create_file_for_submission reads column 1, which holds the SVM predictions here
create_file_for_submission("./datasets/IEEEFraudDetection/svm_classifier.csv", classifier_predicted_results)

# Ensemble Learning : Voting Classifier

In [None]:
import os.path
# Path of the pickle file that may hold a previously trained soft-voting classifier
retrieve_sftvc_file = './datasets/IEEEFraudDetection/SoftVotingClassifier_model_pickle'

# Load the model only when the pickle exists. Otherwise loaded_SoftVoteClassifier stays
# undefined and the ensemble has to be built and saved by the cells further down.
if os.path.exists(retrieve_sftvc_file):
    # xgb_Classifier.save_model and load_model gave an "le" error when trying to obtain the score
    # Unpickle the saved binary file, if it exists, so that training does not need to be redone
    loaded_SoftVoteClassifier = load_classifier_from_picklefile(retrieve_sftvc_file)
    print("Unpickling existing SoftVoteClassifier model...")
    print("Loaded Classifier :\n", loaded_SoftVoteClassifier)
    print("with type\n", type(loaded_SoftVoteClassifier))

In [None]:
# Score of the loaded voting ensemble on the training set (not a held-out estimate)
loaded_SoftVoteClassifier.score(X_train, y_train)

# Recorded output from an earlier run:
# 0.9894046127273343

In [None]:
# Class probabilities for the test set; column 1 is written as 'isFraud' in the submission
sftvc_predictions = loaded_SoftVoteClassifier.predict_proba(X_test)
create_file_for_submission("./datasets/IEEEFraudDetection/loaded_simple_sftvc.csv", sftvc_predictions)

In [None]:
from sklearn.ensemble import VotingClassifier

# Ensemble members: the two models loaded from pickle in the earlier cells
soft_vote_members = [
    ('XGB_classifier', loaded_XGBClassifier),
    ('XGB_RF_classifier', loaded_XGBRFClassifier),
]

# voting='soft' combines the members' predicted class probabilities
voting_classifier = VotingClassifier(estimators=soft_vote_members, voting='soft', n_jobs=4)

voting_classifier.fit(X_train, y_train)

voting_classifier.get_params() # gives parameters of the VotingClassifier

In [None]:
# Score of the voting ensemble on the training set (not a held-out estimate)
voting_classifier.score(X_train, y_train)

# Recorded output from an earlier run:
# 0.9894046127273343

In [None]:
# Pickling files: persist the fitted ensemble so that the "load if exists" cell can reuse it
print("Pickling VotingClassifier model...")
filename = './datasets/IEEEFraudDetection/SoftVotingClassifier_model_pickle'
save_classifier_to_picklefile(filename, voting_classifier)

In [None]:
# 3-fold cross-validated ROC AUC of the voting ensemble on the training data
# (cross_val_score is imported in an earlier cell)
voting_classifier_auc_scores = cross_val_score(voting_classifier, X_train, y_train, cv=3, scoring='roc_auc')
# Display the per-fold scores together with their mean
voting_classifier_auc_scores, voting_classifier_auc_scores.mean()

# Recorded output from an earlier run:
# (array([0.87149802, 0.74444081, 0.78814395]), 0.8013609304639683)

In [None]:
# Compute probabilities of possible outcomes for the test samples
voting_classifier_predictions = voting_classifier.predict_proba(X_test)

# One row per test sample, one column per class
voting_classifier_predictions.shape

In [None]:
# Compare column 1 (the value submitted as 'isFraud') for rows 10-19 across the ensemble
# and its two members. xgbrf_predictions and xgb_predictions come from earlier cells.
print("Sample of Predictions")
print("SoftVoting :\n", voting_classifier_predictions[10:20, 1])
print("XGBRF : \n", xgbrf_predictions[10:20, 1])
print("XGB : \n", xgb_predictions[10:20, 1])

In [None]:
# Write the submission CSV; column 1 of the ensemble probabilities becomes 'isFraud'
create_file_for_submission("./datasets/IEEEFraudDetection/Soft_Voting_Classifier.csv", voting_classifier_predictions)

In [None]:
# Count the test rows whose column-1 probability exceeds 0.5
fraud_mask = voting_classifier_predictions[:, 1] > 0.5
predicted_fraud_count = int(fraud_mask.sum())
print("Predicted {} frauds".format(predicted_fraud_count))

In [None]:
# Search the best combination of hyperparameter values
from sklearn.model_selection import GridSearchCV

# loaded_XGBClassifier.get_params().keys()
# Number of cross-validation folds
kfold=3

# Candidate values per hyperparameter
colsample_bytrees = [0.9, 0.95]
gammas = [0.001, 0.01, 0.1, 1]
learning_rates = [0.05, 0.10]
max_depths = [9, 10]
missings = [0, -999]
n_estimators = [100, 200]
subsamples = [0.9, 0.95]

# Hyperparameters of the XGBClassifier member of the ensemble
xgb_member_grid = {'colsample_bytree': colsample_bytrees,
                   'gamma': gammas,
                   'learning_rate': learning_rates,
                   'max_depth': max_depths,
                   'missing': missings,
                   'n_estimators': n_estimators,
                   'subsample': subsamples
                  }

# VotingClassifier has no 'gamma', 'max_depth', ... parameters of its own. A member's
# parameters must be addressed as '<member name>__<param>', otherwise GridSearchCV
# raises "Invalid parameter". 'XGB_classifier' is the name given to the XGBClassifier
# member when the ensemble was built.
voting_classifier_param_grid = {'XGB_classifier__' + param_name: candidates
                                for param_name, candidates in xgb_member_grid.items()}

# Tuning the param for GridSearch and performance has been increased to approx. x% 
# TODO: replace the "x%" placeholder above with the measured improvement.
# The grid has 2*4*2*2*2*2*2 = 256 candidates, i.e. 768 ensemble fits with 3 folds.
grid_search = GridSearchCV(voting_classifier, voting_classifier_param_grid,
                           cv=kfold, scoring="roc_auc", n_jobs= 4, verbose = 1)

grid_search.fit(X_train, y_train)
# Best parameter combination found by ROC AUC
grid_search.best_params_

# Submissions

- kaggle competitions submit -c ieee-fraud-detection -f submission.csv -m "Message"
- Submitted Public score of 0.9366 at position 321 with XGBClassifier with mean score of 98.971%
- Submitted Public score of 0.8543 with scaled features for XGBClassifier with mean score of 98.964%
- Submitted Public score of 0.8757 with XGBRFClassifier having mean score of 97.572%
- Submitted Public score of 0.5240 with balanced weight SVM Linear Classifier with mean cv score of 96.437%.
- Submitted Public score of 0.9375 at position 334 using Soft Voting Classifier with XGBC and XGBRFC gives a mean score of 98.940%.
- Submitted Public score of <B>0.9376 at position 343</B> using XGBClassifier with partial dataset and score of 98.933%.
<BR>(Suspected overfitting on the skewed dataset)

# Helper Codes

In [None]:
# Demo: values common to two Series, returned as a new Series
left_values = pd.Series([1,2,3,5,42])
right_values = pd.Series([4,5,6,20,42])
pd.Series(np.intersect1d(left_values, right_values))

In [None]:
# Names of the categorical columns, assembled group by group in the same order
# as the hand-written list: product, card1-6, address/email, M1-M9, id_12-id_38, device.
cat_cols = (
    ['ProductCD']
    + ['card{}'.format(n) for n in range(1, 7)]
    + ['addr1', 'addr2', 'P_emaildomain', 'R_emaildomain']
    + ['M{}'.format(n) for n in range(1, 10)]
    + ['id_{}'.format(n) for n in range(12, 39)]
    + ['DeviceType', 'DeviceInfo']
)

len(cat_cols)

In [None]:
value = 4685.0
row_idx = 0
found_pos_idx = 0

# Scan 'C1' with a 1-based counter. found_pos_idx ends at the LAST row equal to
# `value`, or stays 0 when the value never occurs.
for row_idx, c in enumerate(X_train['C1'], start=1):
    if c == value:
        found_pos_idx = row_idx

print(found_pos_idx, "of", row_idx)

# Show the matching element (an empty array when nothing was found)
c = X_train['C1']
c[found_pos_idx-1:found_pos_idx].values

In [None]:
# Check if value exists in pandas data frame.
# The lookup value is a float, matching the float `value = 4685.0` used in the scan
# cell above; the string '4685.0' never equals a numeric entry, so the old check
# could not succeed on a numeric column.
if X_train['C1'].isin([4685.0]).any():
    print("Found 4685.0")

# First ten values of the column, for a quick look
c = X_train['C1']
c[0:10].values

In [None]:
#train_merged.dtypes
# Before LabelEncoder: first values of 'DeviceType' prior to the encoding cell below
X_train['DeviceType'].head()

In [None]:
# Fit the encoder on the values of both frames so that every category is known
lbl = preprocessing.LabelEncoder()
train_device_values = list(X_train['DeviceType'].values)
test_device_values = list(X_test['DeviceType'].values)
lbl.fit(train_device_values + test_device_values)

# Only the training column is replaced by its integer codes here;
# X_test['DeviceType'] is left untouched by this cell.
X_train['DeviceType'] = lbl.transform(train_device_values)

In [None]:
# After LabelEncoder: the same column, now holding the encoder's integer codes
X_train['DeviceType'].head()

In [None]:
# Correlation Matrix
# Restrict to numeric/bool columns first: since pandas 2.0 DataFrame.corr() no longer
# drops string columns silently and raises on them instead. The selection works on
# older pandas versions as well.
corr_matrix = train_merged.select_dtypes(include=['number', 'bool']).corr()
correlation_fraud = corr_matrix['isFraud'].sort_values(ascending=False)

total_num_features = len(correlation_fraud)

# Features whose correlation with 'isFraud' lies strictly inside (-0.04, 0.04).
# NaN correlations fail both comparisons and are therefore not selected.
# A boolean mask replaces the integer lookups on a label-indexed Series, which
# relied on a deprecated positional fallback.
weak_mask = (correlation_fraud > -0.04) & (correlation_fraud < 0.04)
weak_features = correlation_fraud[weak_mask].index
selected_num_features = len(weak_features)

# Print the names as quoted, comma-terminated lines ready to paste into a drop list
for feature_name in weak_features:
    print("'" + feature_name + "',")

print("Total selected features :", selected_num_features)

In [None]:
# Build the column names 'id_12' ... 'id_38'
id_ctr = list(range(12, 39)) # Range from 12 - 38
id_list = ["id_" + str(num) for num in id_ctr]

print(id_list)

for identifier in id_list:
    df = pd.DataFrame(train_merged[identifier], index = train_merged.index) # TransactionID is the index
    # Getting the unqiue values
    df = df.nunique()
    #print (df)