In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_curve, auc
import lightgbm as lgb

In [None]:
train = pd.read_csv('input/train_transaction.csv')
#train.head()

In [None]:
test = pd.read_csv('input/test_transaction.csv')
#test.head()

In [None]:
train_identity = pd.read_csv('input/train_identity.csv')
test_identity = pd.read_csv('input/test_identity.csv')
#train_identity.head()

In [None]:
train = pd.merge(train, train_identity, on='TransactionID', how='left')
test = pd.merge(test, test_identity, on='TransactionID', how='left')

# Data Overview

In [6]:
train.head()

In [7]:
list(train.columns)

In [8]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_features = train.select_dtypes(include=numerics).columns

In [9]:
categorical_features = list(set(train.columns) - set(numerical_features))

In [10]:
card_columns = ['card'+str(i+1) for i in range(6)]
train[card_columns].head()

In [11]:
train[['P_emaildomain', 'R_emaildomain', 'addr1', 'addr2', 'dist1', 'dist2']].head()

Unnamed: 0,P_emaildomain,R_emaildomain,addr1,addr2,dist1,dist2
0,,,315.0,87.0,19.0,
1,gmail.com,,325.0,87.0,,
2,outlook.com,,330.0,87.0,287.0,
3,yahoo.com,,476.0,87.0,,
4,gmail.com,,420.0,87.0,,


In [12]:
C_columns = ['C'+str(i+1) for i in range(14)]
train[C_columns].head()

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14
0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0
1,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
2,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
3,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0
4,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0


In [13]:
D_columns = ['D'+str(i+1) for i in range(15)]
train[D_columns].head()

Unnamed: 0,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15
0,14.0,,13.0,,,,,,,13.0,13.0,,,,0.0
1,0.0,,,0.0,,,,,,0.0,,,,,0.0
2,0.0,,,0.0,,,,,,0.0,315.0,,,,315.0
3,112.0,112.0,0.0,94.0,0.0,,,,,84.0,,,,,111.0
4,0.0,,,,,,,,,,,,,,


In [14]:
train['M1'].unique()

array(['T', nan, 'F'], dtype=object)

In [15]:
M_columns = ['M'+str(i+1) for i in range(9)]
train[M_columns].head()

Unnamed: 0,M1,M2,M3,M4,M5,M6,M7,M8,M9
0,T,T,T,M2,F,T,,,
1,,,,M0,T,T,,,
2,T,T,T,M0,F,F,F,F,F
3,,,,M0,T,F,,,
4,,,,,,,,,


In [16]:
train[['DeviceType', 'DeviceInfo']].head()

Unnamed: 0,DeviceType,DeviceInfo
0,,
1,,
2,,
3,,
4,mobile,SAMSUNG SM-G892A Build/NRD90M


In [17]:
id_columns_numerical = [column_name if train[column_name].dtype in numerics else 'id_01'
                        for column_name in ['id_'+str(i+1)  if i>8 else 'id_0'+str(i+1) for i in range(38)]]
id_columns_numerical = list(set(id_columns_numerical))
train[id_columns_numerical].head()

Unnamed: 0,id_21,id_17,id_19,id_05,id_01,id_02,id_07,id_08,id_04,id_10,...,id_03,id_25,id_32,id_26,id_20,id_06,id_24,id_18,id_14,id_13
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,166.0,542.0,,0.0,70787.0,,,,,...,,,32.0,,144.0,,,,-480.0,


In [18]:
id_columns_categorical = [column_name if train[column_name].dtype not in numerics else 'id_16'
                        for column_name in ['id_'+str(i+1)  if i>8 else 'id_0'+str(i+1) for i in range(38)]]
id_columns_categorical = list(set(id_columns_categorical))
train[id_columns_categorical].head()

Unnamed: 0,id_35,id_12,id_28,id_23,id_34,id_38,id_36,id_31,id_33,id_16,id_37,id_29,id_15,id_30,id_27
0,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,
4,T,NotFound,New,,match_status:2,T,F,samsung browser 6.2,2220x1080,NotFound,T,NotFound,New,Android 7.0,


In [19]:
train.shape

(590540, 434)

# EDA

In [None]:
plt.style.use('seaborn')

In [20]:
bin_num = 0
bin_size = 5000
num_bins = int(len(train)/bin_size) + 1
fraud_frequency = np.zeros(num_bins)
for i, isFraud in enumerate(train.isFraud):
    fraud_frequency[bin_num] += isFraud
    if (i+1) % bin_size == 0:
        bin_num += 1
fraud_frequency = fraud_frequency[:-1]

In [None]:
plt.figure()
fraud_frequency_series = pd.Series(fraud_frequency, index=pd.Series(range(num_bins-1)))
fraud_frequency_series.plot()

In [21]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_features = train.select_dtypes(include=numerics).columns

categorical_features = list(set(train.columns) - set(numerical_features))

In [22]:
len(categorical_features)

31

In [None]:
#%%time
fig, ax = plt.subplots(15, 2, figsize=(15,60))
k = l = 0
for i in range(30):
    feature = categorical_features[i]
    categories = train[feature].unique()
    counts = [(train[feature] == category).sum() if category == category else train[feature].isnull().sum() for category in categories]
    ax[k,l].bar(x=list(range(len(counts))), height=counts, tick_label=list(categories))
    ax[k,l].set_title(feature)
    l += 1
    if l>1:
        l = 0
        k += 1

In [24]:
OS_feature = train['id_30'].unique()
id_30_value_counts = train['id_30'].value_counts()
OS_isFraud_fractions = []
OS_isFraud_fractions.append((train[train['id_30'] != OS_feature[0]]['isFraud'] == 1).sum()
                         /train[train['id_30'] != train['id_30']]['TransactionID'].count()) # for nan
for i in range(1, len(OS_feature)):
    OS_isFraud_fractions.append((train[train['id_30'] == OS_feature[i]]['isFraud'] == 1).sum()/id_30_value_counts[OS_feature[i]])

In [None]:
plt.figure(figsize=(20,20))
plt.bar(x=list(range(len(os_feature))), height=OS_isFraud_fractions)
plt.xticks(list(range(len(os_feature))), os_feature, rotation='vertical')
plt.show()

In [25]:
ProductCD = train['ProductCD'].unique()
ProductCD_value_counts = train['ProductCD'].value_counts()
ProductCD_isFraud_fractions = []
for i in range(0, len(ProductCD)):
    ProductCD_isFraud_fractions.append((train[train['ProductCD'] == ProductCD[i]]['isFraud'] == 1).sum()/ProductCD_value_counts[ProductCD[i]])

In [None]:
plt.figure(figsize=(10,5))
plt.bar(x=list(range(len(ProductCD))), height=ProductCD_isFraud_fractions)
plt.xticks(list(range(len(ProductCD))), ProductCD, rotation='vertical')
plt.show()

# Feature Engineering

In [26]:
start_date = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d')

In [27]:
#from https://www.kaggle.com/ysjf13/cis-fraud-detection-visualize-feature-engineering

def addNewFeatures(data): 
    
#     data['is_proton_mail'] = (data['P_emaildomain'] == 'protonmail.com') | (data['R_emaildomain']  == 'protonmail.com')
    
    data['TransactionAmt_Log'] = np.log(data['TransactionAmt'])

    # New feature - decimal part of the transaction amount.
    data['TransactionAmt_decimal'] = ((data['TransactionAmt'] - data['TransactionAmt'].astype(int)) * 1000).astype(int)

    # New feature - day of week in which a transaction happened.
    data['Transaction_day_of_week'] = np.floor((data['TransactionDT'] / (3600 * 24) - 1) % 7)

    # New feature - hour of the day in which a transaction happened.
    data['Transaction_hour'] = np.floor(data['TransactionDT'] / 3600) % 24

    data['uid'] = data['card1'].astype(str)+'_'+data['card2'].astype(str)

    data['uid2'] = data['uid'].astype(str)+'_'+data['card3'].astype(str)+'_'+data['card5'].astype(str)

    data['uid3'] = data['uid2'].astype(str)+'_'+data['addr1'].astype(str)+'_'+data['addr2'].astype(str)

    data['D9'] = np.where(data['D9'].isna(),0,1)
    
    return data

In [28]:
train = addNewFeatures(train)
test = addNewFeatures(test)

### ProductCD

In [29]:
dict_ProductCD_isFraud_fractions = dict(zip(ProductCD, ProductCD_isFraud_fractions))

In [30]:
train['ProductCD_isFraud_fraction'] = train['ProductCD'].apply(lambda x: dict_ProductCD_isFraud_fractions[x])
test['ProductCD_isFraud_fraction'] = test['ProductCD'].apply(lambda x: dict_ProductCD_isFraud_fractions[x])

### Email suffix features

In [31]:
for emaildomain in ['P_emaildomain', 'R_emaildomain']:
    train[emaildomain+'_suffix'] = train[emaildomain].apply(lambda x: str(x).split('.')[-1])
    test[emaildomain+'_suffix'] = test[emaildomain].apply(lambda x: str(x).split('.')[-1])

In [32]:
train['P_emaildomain_suffix'].unique(), train['R_emaildomain_suffix'].unique()

(array(['nan', 'com', 'net', 'mx', 'gmail', 'es', 'de', 'fr', 'uk', 'jp'],
       dtype=object),
 array(['nan', 'com', 'net', 'mx', 'es', 'de', 'edu', 'fr', 'gmail', 'uk',
        'jp'], dtype=object))

### Device Info

In [33]:
list(train['DeviceInfo'].unique())

[nan,
 'SAMSUNG SM-G892A Build/NRD90M',
 'iOS Device',
 'Windows',
 'MacOS',
 'SM-G930V Build/NRD90M',
 'BLADE A602 Build/MRA58K',
 'XT1635-02 Build/NPN26.118-22-2',
 'Z970',
 'SM-N920V Build/NRD90M',
 'Redmi Note 4 Build/MMB29M',
 'Lenovo PB1-750M Build/S100',
 'LT22i Build/6.2.A.1.100',
 'rv:52.0',
 'SM-G950U Build/NRD90M',
 'LG-H872 Build/NRD90U',
 'LG-K500 Build/MMB29M',
 'SM-P550 Build/MMB29M',
 'SM-J700M Build/MMB29K',
 'Trident/7.0',
 'rv:57.0',
 'SAMSUNG SM-G930T Build/NRD90M',
 'Blade V6 Plus Build/MRA58K',
 'BLL-L23 Build/HUAWEIBLL-L23',
 'KYOCERA-C6742A Build/LMY47V',
 'F3113 Build/33.2.A.4.70',
 'D5306 Build/19.4.A.0.182',
 'M4 SS4457 Build/MRA58K',
 'SM-G955U Build/NRD90M',
 'SM-G610M Build/MMB29K',
 'SAMSUNG SM-G935F Build/NRD90M',
 'XT1635-01',
 'rv:56.0',
 'VS500',
 'CAM-L03 Build/HUAWEICAM-L03',
 'RCT6303W87M7 Build/MRA58K',
 'M4 SS4451 Build/LMY47D',
 'KFFOWI Build/LVY48F',
 'Moto E (4) Build/NMA26.42-19',
 'SAMSUNG',
 'E2306 Build/26.3.A.1.33',
 'Ilium L910 Build/MRA

In [34]:
def add_device_features(df):
    df['Device'] = df['DeviceInfo'].apply(lambda x: str(x).split()[0])
    df['DeviceBuild'] = df['DeviceInfo'].apply(lambda x: str(x).split()[-1].split('/')[-1])
    return df

In [35]:
train = add_device_features(train)
test = add_device_features(test)

### Features from id features

#### OS Feature

In [36]:
OS_feature

array([nan, 'Android 7.0', 'iOS 11.1.2', 'Mac OS X 10_11_6', 'Windows 10',
       'Android', 'Linux', 'iOS 11.0.3', 'Mac OS X 10_7_5',
       'Mac OS X 10_12_6', 'Mac OS X 10_13_1', 'iOS 11.1.0',
       'Mac OS X 10_9_5', 'Windows 7', 'Windows 8.1', 'Mac', 'iOS 10.3.3',
       'Mac OS X 10.12', 'Mac OS X 10_10_5', 'Mac OS X 10_11_5',
       'iOS 9.3.5', 'Android 5.1.1', 'Android 7.1.1', 'Android 6.0',
       'iOS 10.3.1', 'Mac OS X 10.9', 'iOS 11.1.1', 'Windows Vista',
       'iOS 10.3.2', 'iOS 11.0.2', 'Mac OS X 10.11', 'Android 8.0.0',
       'iOS 10.2.0', 'iOS 10.2.1', 'iOS 11.0.0', 'Mac OS X 10.10',
       'Mac OS X 10_12_3', 'Mac OS X 10_12', 'Android 6.0.1', 'iOS',
       'Mac OS X 10.13', 'Mac OS X 10_12_5', 'Mac OS X 10_8_5',
       'iOS 11.0.1', 'iOS 10.0.2', 'Android 5.0.2', 'Windows XP',
       'iOS 11.2.0', 'Mac OS X 10.6', 'Windows 8', 'Mac OS X 10_6_8',
       'Mac OS X 10_11_4', 'Mac OS X 10_12_1', 'iOS 10.1.1',
       'Mac OS X 10_11_3', 'Mac OS X 10_12_4', 'Mac OS X 10

In [37]:
OS_dict_isFraud_fractions = dict(zip(OS_feature, OS_isFraud_fractions))

In [38]:
def add_os_features(df):
    df['OS_name'] = df['id_30'].apply(lambda x: str(x).split()[0])
    df['OS_isFraud_fraction'] = df['id_30'].apply(lambda x: (OS_dict_isFraud_fractions[x] if (x==x and x in OS_feature) 
                                                          else OS_isFraud_fractions[0]))
    return df

In [39]:
train = add_os_features(train)
test = add_os_features(test)

In [40]:
list(train.columns)

['TransactionID',
 'isFraud',
 'TransactionDT',
 'TransactionAmt',
 'ProductCD',
 'card1',
 'card2',
 'card3',
 'card4',
 'card5',
 'card6',
 'addr1',
 'addr2',
 'dist1',
 'dist2',
 'P_emaildomain',
 'R_emaildomain',
 'C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'V29',
 'V30',
 'V31',
 'V32',
 'V33',
 'V34',
 'V35',
 'V36',
 'V37',
 'V38',
 'V39',
 'V40',
 'V41',
 'V42',
 'V43',
 'V44',
 'V45',
 'V46',
 'V47',
 'V48',
 'V49',
 'V50',
 'V51',
 'V52',
 'V53',
 'V54',
 'V55',
 'V56',
 'V57',
 'V58',
 'V59',
 'V60',
 'V61',
 'V62',
 'V63',
 'V64',
 'V

## Mean and std features

In [41]:
#from https://www.kaggle.com/ysjf13/cis-fraud-detection-visualize-feature-engineering

i_cols = id_columns_categorical+['ProductCD', 'addr1', 'addr2']+C_columns+D_columns+['card1','card2','card3','card5','uid','uid2','uid3']

for col in i_cols:
    for agg_type in ['mean','std']:
        new_col_name = col+'_TransactionAmt_'+agg_type
        temp_df = pd.concat([train[[col, 'TransactionAmt']], test[[col,'TransactionAmt']]])
        #temp_df['TransactionAmt'] = temp_df['TransactionAmt'].astype(int)
        temp_df = temp_df.groupby([col])['TransactionAmt'].agg([agg_type]).reset_index().rename(
                                                columns={agg_type: new_col_name})

        temp_df.index = list(temp_df[col])
        temp_df = temp_df[new_col_name].to_dict()   

        train[new_col_name] = train[col].map(temp_df)
        test[new_col_name]  = test[col].map(temp_df)

In [42]:
train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,card3_TransactionAmt_mean,card3_TransactionAmt_std,card5_TransactionAmt_mean,card5_TransactionAmt_std,uid_TransactionAmt_mean,uid_TransactionAmt_std,uid2_TransactionAmt_mean,uid2_TransactionAmt_std,uid3_TransactionAmt_mean,uid3_TransactionAmt_std
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,147.65346,255.330369,185.236343,322.134467,257.916667,210.732868,257.916667,210.732868,193.0,176.069589
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,147.65346,255.330369,212.7937,396.390243,213.629639,392.797197,213.629639,392.797197,239.981619,503.767457
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,147.65346,255.330369,98.77496,141.059909,104.827829,130.363122,104.827829,130.363122,65.685652,55.170209
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,147.65346,255.330369,124.389514,191.8809,120.967279,196.723219,120.967279,196.723219,113.086685,211.762109
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,147.65346,255.330369,212.7937,396.390243,99.811667,69.829736,99.811667,69.829736,50.0,


# Preprocessing

In [43]:
train = train.replace(np.inf,999)
test = test.replace(np.inf,999)

In [44]:
list(train.columns)

['TransactionID',
 'isFraud',
 'TransactionDT',
 'TransactionAmt',
 'ProductCD',
 'card1',
 'card2',
 'card3',
 'card4',
 'card5',
 'card6',
 'addr1',
 'addr2',
 'dist1',
 'dist2',
 'P_emaildomain',
 'R_emaildomain',
 'C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'V29',
 'V30',
 'V31',
 'V32',
 'V33',
 'V34',
 'V35',
 'V36',
 'V37',
 'V38',
 'V39',
 'V40',
 'V41',
 'V42',
 'V43',
 'V44',
 'V45',
 'V46',
 'V47',
 'V48',
 'V49',
 'V50',
 'V51',
 'V52',
 'V53',
 'V54',
 'V55',
 'V56',
 'V57',
 'V58',
 'V59',
 'V60',
 'V61',
 'V62',
 'V63',
 'V64',
 'V

In [45]:
numerical_features = train.select_dtypes(include=numerics).columns
categorical_features = list(set(train.columns) - set(numerical_features))

In [46]:
categorical_features

['id_35',
 'M6',
 'card4',
 'id_23',
 'M9',
 'R_emaildomain_suffix',
 'id_36',
 'M2',
 'P_emaildomain_suffix',
 'uid',
 'R_emaildomain',
 'id_37',
 'M3',
 'M8',
 'id_28',
 'DeviceInfo',
 'OS_name',
 'uid2',
 'Device',
 'id_27',
 'P_emaildomain',
 'M5',
 'ProductCD',
 'id_33',
 'id_16',
 'id_29',
 'id_30',
 'id_12',
 'DeviceBuild',
 'id_34',
 'M4',
 'id_38',
 'uid3',
 'M1',
 'M7',
 'card6',
 'DeviceType',
 'id_31',
 'id_15']

In [47]:
for column in list(categorical_features):
    le = LabelEncoder()
    le.fit(list(train[column]) + list(test[column]))
    train[column] = le.transform(list(train[column]))
    test[column] = le.transform(list(test[column]))

In [48]:
train[categorical_features].head()

Unnamed: 0,id_35,M6,card4,id_23,M9,R_emaildomain_suffix,id_36,M2,P_emaildomain_suffix,uid,...,id_34,M4,id_38,uid3,M1,M7,card6,DeviceType,id_31,id_15
0,2,1,1,3,2,8,2,1,8,4655,...,4,2,2,14931,1,2,1,2,136,3
1,2,1,2,3,2,8,2,2,0,10937,...,4,0,2,34958,2,2,1,2,136,3
2,2,0,4,3,0,8,2,1,0,13001,...,4,0,2,41169,1,0,2,2,136,3
3,2,0,2,3,2,8,2,2,0,9637,...,4,0,2,30530,2,2,2,2,136,3
4,1,2,2,3,2,8,0,2,0,12826,...,3,3,1,40650,2,2,1,1,162,1


In [49]:
train[categorical_features].isnull().sum().sum()

0

In [50]:
test[categorical_features].isnull().sum().sum()

0

In [51]:
def preprocess(df):
    df = df.fillna(0)
    try:
        df = df.drop('TransactionID', axis=1)
        df = df.drop('TransactionDT', axis=1)
    except:
        pass
    return df

In [52]:
(train[numerical_features].isnull().sum() !=0 ).sum() 

467

In [53]:
train = preprocess(train)
test = preprocess(test)

In [54]:
corr_matrix = train.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

In [None]:
#f = plt.figure(figsize=(100, 100))
#corr_matrix.style.background_gradient(cmap='coolwarm')

In [None]:
f = plt.figure(figsize=(100, 100))
corr_matrix[['isFraud']].min()

### Drop correlated columns

In [75]:
# select columns with correlations above threshold
threshold = 0.99
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
to_drop

['C2',
 'C10',
 'C11',
 'C12',
 'V18',
 'V97',
 'V101',
 'V102',
 'V103',
 'V132',
 'V133',
 'V134',
 'V154',
 'V155',
 'V164',
 'V177',
 'V179',
 'V182',
 'V211',
 'V213',
 'V231',
 'V232',
 'V233',
 'V241',
 'V244',
 'V251',
 'V269',
 'V279',
 'V293',
 'V295',
 'V306',
 'V316',
 'V317',
 'V318',
 'V322',
 'V323',
 'V324',
 'V332',
 'V333',
 'id_29',
 'id_36',
 'uid2',
 'uid3',
 'Device',
 'OS_name',
 'id_35_TransactionAmt_std',
 'id_12_TransactionAmt_mean',
 'id_12_TransactionAmt_std',
 'id_28_TransactionAmt_mean',
 'id_28_TransactionAmt_std',
 'id_23_TransactionAmt_mean',
 'id_23_TransactionAmt_std',
 'id_34_TransactionAmt_std',
 'id_38_TransactionAmt_mean',
 'id_38_TransactionAmt_std',
 'id_36_TransactionAmt_mean',
 'id_36_TransactionAmt_std',
 'id_31_TransactionAmt_std',
 'id_16_TransactionAmt_std',
 'id_37_TransactionAmt_mean',
 'id_37_TransactionAmt_std',
 'id_29_TransactionAmt_mean',
 'id_29_TransactionAmt_std',
 'id_15_TransactionAmt_mean',
 'id_15_TransactionAmt_std',
 'id_27

In [68]:
already_dropped = ['C12','V101', 'V177', 'V241','V293','V322','uid2','uid3','id_35_TransactionAmt_std','id_12_TransactionAmt_std','id_23_TransactionAmt_mean','id_36_TransactionAmt_std','id_37_TransactionAmt_std','id_29_TransactionAmt_mean','id_29_TransactionAmt_std','id_27_TransactionAmt_mean','id_27_TransactionAmt_std','D9_TransactionAmt_mean','D9_TransactionAmt_std']
to_drop = list(set(to_drop)-set(already_dropped))
train = train.drop(to_drop, axis=1)
test = test.drop(to_drop, axis=1)

In [69]:
train.head()

Unnamed: 0,isFraud,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,card1_TransactionAmt_mean,card1_TransactionAmt_std,card2_TransactionAmt_mean,card2_TransactionAmt_std,card3_TransactionAmt_mean,card5_TransactionAmt_mean,card5_TransactionAmt_std,uid_TransactionAmt_mean,uid3_TransactionAmt_mean,uid3_TransactionAmt_std
0,0,68.5,4,13926,0.0,150.0,1,142.0,1,315.0,...,316.570357,351.513997,0.0,0.0,147.65346,185.236343,322.134467,257.916667,193.0,176.069589
1,0,29.0,4,2755,404.0,150.0,2,102.0,1,325.0,...,213.053819,391.543884,227.107106,373.703941,147.65346,212.7937,396.390243,213.629639,239.981619,503.767457
2,0,59.0,4,4663,490.0,150.0,4,166.0,2,330.0,...,104.87694,130.380968,136.179809,228.571548,147.65346,98.77496,141.059909,104.827829,65.685652,55.170209
3,0,50.0,4,18132,567.0,150.0,2,117.0,2,476.0,...,120.958705,196.463487,133.628801,226.771834,147.65346,124.389514,191.8809,120.967279,113.086685,211.762109
4,0,50.0,1,4497,514.0,150.0,2,102.0,1,420.0,...,99.811667,69.829736,223.770752,457.894839,147.65346,212.7937,396.390243,99.811667,50.0,0.0


# Modeling

In [70]:
x_train, x_val, y_train, y_val = train_test_split(train.drop('isFraud', axis=1), train['isFraud'], test_size=0.3)

In [71]:
x_train.shape, y_train.shape

((413378, 474), (413378,))

gnb = GaussianNB()
gnb.fit(x_train, y_train)

pred = gnb.predict(x_test)
fpr, tpr, thresholds = roc_curve(y_val, pred, pos_label=1)
auc(fpr, tpr)

In [72]:
param = {'boosting': 'gbdt', 'colsample_bytree': 1, 
          'learning_rate': 0.1, 'max_depth': 10, 'metric': 'auc',
          'min_child_samples': 50, 'num_leaves': 500, 
          'objective': 'binary', 'reg_alpha': 0.5, 
          'reg_lambda': 0.8, 'subsample': 0.5 }


param2 = {
                    'objective':'binary',
                    'boosting_type':'gbdt',
                    'metric':'auc',
                    'n_jobs':-1,
                    'learning_rate':0.01,
                    'num_leaves': 2**8,
                    'max_depth':-1,
                    'tree_learner':'serial',
                    'colsample_bytree': 0.7,
                    'subsample_freq':1,
                    'subsample':0.7,
                    'n_estimators':1800,
                    'max_bin':255,
                    'verbose':100,
                    'early_stopping_rounds':100, 
                } 

lgtrain = lgb.Dataset(x_train, label=y_train)
lgval = lgb.Dataset(x_val[:int(len(x_val)*0.67)], label=y_val[:int(len(x_val)*0.67)])

model_lgb = lgb.train(param2, lgtrain, 1000, 
                    valid_sets=[lgtrain, lgval], early_stopping_rounds=250, 
                    verbose_eval=500)



Training until validation scores don't improve for 100 rounds.
[500]	training's auc: 0.992848	valid_1's auc: 0.959031
[1000]	training's auc: 0.99932	valid_1's auc: 0.967447
[1500]	training's auc: 0.999929	valid_1's auc: 0.970276
Did not meet early stopping. Best iteration is:
[1800]	training's auc: 0.999986	valid_1's auc: 0.971137


In [73]:
pred = model_lgb.predict(x_val[int(len(x_val)*0.67):])

fpr, tpr, thresholds = roc_curve(y_val[int(len(x_val)*0.67):], pred, pos_label=1)
auc(fpr, tpr)

0.9696430805321583

best = 0.9702220660615989

In [62]:
submission = pd.read_csv('input/sample_submission.csv')

In [63]:
submission['isFraud'] = model_lgb.predict(test)

In [None]:
#submission['isFraud'] = [i if i>0 else 0 for i in submission['isFraud']]

In [64]:
submission.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.000452
1,3663550,0.001779
2,3663551,0.000672
3,3663552,0.000271
4,3663553,0.000854


In [65]:
submission.to_csv('submission.csv', index=False)

In [None]:
pred = model_lgb.predict(x_val)
pred = [i if i>0 else 0 for i in pred]

fpr, tpr, thresholds = roc_curve(y_val, pred2, pos_label=1)
auc(fpr, tpr)

In [None]:
submission.shape

In [None]:
#lgb.plot_tree()