In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import gc

from utils.schemas import *

In [2]:
data_folder = 'input'
schema_train_transaction.update(schema_train_identity)
schema_test_transaction.update(schema_test_identity)

In [3]:
train = pd.read_csv(data_folder+'/train_merged.csv', dtype = schema_train_transaction)
test = pd.read_csv(data_folder+'/test_merged.csv', dtype = schema_test_transaction)

In [5]:
train.shape, test.shape

((590540, 433), (506691, 432))

In [6]:
cat_ft = ['ProductCD', 'P_emaildomain', 'R_emaildomain'] + ['card{}'.format(i) for i in range(1, 7)] \
+ ['addr1', 'addr2'] + ['M{}'.format(i) for i in range(1, 10)] + ['DeviceType', 'DeviceInfo'] \
+ ['id_{}'.format(i) for i in range(12, 39)]

In [7]:
num_ft = [x for x in train.columns if x not in cat_ft]

In [8]:
len(cat_ft), len(num_ft)

(49, 384)

In [9]:
test['isFraud'] = np.nan

In [10]:
train_index = train.shape[0]
test_index = test.shape[0]
df = pd.concat([train, test], axis=0)
df.shape

(1097231, 433)

In [11]:
del train
del test
gc.collect()

21

In [20]:
for ft in cat_ft:
    v_c = df[ft].value_counts(dropna=False)
    print('LEN de {0}: {1}'.format(ft, len(v_c)))
    print('Suma de NA: {}'.format(df[ft].isna().sum()))
    print(v_c,'\n')

LEN de ProductCD: 5
Suma de NA: 0
W    800657
C    137785
R     73346
H     62397
S     23046
Name: ProductCD, dtype: int64 

LEN de P_emaildomain: 61
Suma de NA: 163648
gmail.com           435803
yahoo.com           182784
NaN                 163648
hotmail.com          85649
anonymous.com        71062
aol.com              52337
comcast.net          14474
icloud.com           12316
outlook.com           9934
att.net               7647
msn.com               7480
sbcglobal.net         5767
live.com              5720
verizon.net           5011
ymail.com             4075
bellsouth.net         3437
yahoo.com.mx          2827
me.com                2713
cox.net               2657
optonline.net         1937
live.com.mx           1470
charter.net           1443
mail.com              1156
rocketmail.com        1105
gmail                  993
earthlink.net          979
outlook.es             863
mac.com                862
hotmail.fr             674
hotmail.es             627
                    

Suma de NA: 447739
T      588323
NaN    447739
F       61169
Name: M2, dtype: int64 

LEN de M3: 3
Suma de NA: 447739
T      518244
NaN    447739
F      131248
Name: M3, dtype: int64 

LEN de M4: 4
Suma de NA: 519189
NaN    519189
M0     357789
M2     122947
M1      97306
Name: M4, dtype: int64 

LEN de M5: 3
Suma de NA: 660114
NaN    660114
F      240155
T      196962
Name: M5, dtype: int64 

LEN de M6: 3
Suma de NA: 328299
F      419433
T      349499
NaN    328299
Name: M6, dtype: int64 

LEN de M7: 3
Suma de NA: 581283
NaN    581283
F      444604
T       71344
Name: M7, dtype: int64 

LEN de M8: 3
Suma de NA: 581256
NaN    581256
F      323650
T      192325
Name: M8, dtype: int64 

LEN de M9: 3
Suma de NA: 581256
NaN    581256
T      441935
F       74040
Name: M9, dtype: int64 

LEN de DeviceType: 3
Suma de NA: 819490
NaN        819490
desktop    159568
mobile     118173
Name: DeviceType, dtype: int64 

LEN de DeviceInfo: 2800
Suma de NA: 863508
NaN                               863

Suma de NA: 1087000
NaN                     1087000
IP_PROXY:TRANSPARENT       7203
IP_PROXY:ANONYMOUS         2010
IP_PROXY:HIDDEN            1018
Name: id_23, dtype: int64 

LEN de id_24: 18
Suma de NA: 1087744
NaN     1087744
11.0       5666
15.0       2948
16.0        315
21.0        222
24.0        141
18.0        104
12.0         26
19.0         24
26.0         14
17.0          9
25.0          9
20.0          4
22.0          1
23.0          1
14.0          1
13.0          1
10.0          1
Name: id_24, dtype: int64 

LEN de id_25: 441
Suma de NA: 1087060
NaN      1087060
321.0       5233
205.0        569
426.0        469
442.0        188
501.0        151
371.0        132
509.0        115
524.0        114
123.0         97
126.0         64
479.0         59
236.0         56
463.0         55
143.0         54
432.0         53
385.0         51
505.0         51
247.0         48
239.0         45
472.0         45
486.0         43
356.0         38
270.0         37
511.0         36
359.0   

LEN de id_35: 3
Suma de NA: 819269
NaN    819269
T      149464
F      128498
Name: id_35, dtype: int64 

LEN de id_36: 3
Suma de NA: 819269
NaN    819269
F      267353
T       10609
Name: id_36, dtype: int64 

LEN de id_37: 3
Suma de NA: 819269
NaN    819269
T      215149
F       62813
Name: id_37, dtype: int64 

LEN de id_38: 3
Suma de NA: 819269
NaN    819269
F      168980
T      108982
Name: id_38, dtype: int64 



In [12]:
first_na_impute_cat = {
    'P_emaildomain': 'na.na',
    'R_emaildomain': 'na.na',
    'card2': -1,
    'card3': -1,
    'card4': 'U',
    'card5': -1,
    'card6': 'U',
    'addr1': -1,
    'addr2': -1,
    'M1': 'U',
    'M2': 'U',
    'M3': 'U',
    'M4': 'M3',
    'M5': 'U',
    'M6': 'U',
    'M7': 'U',
    'M8': 'U',
    'M9': 'U',
    'DeviceType': 'U',
    'DeviceInfo': 'U',
    'id_12': 'U',
    'id_13': -1,
    'id_14': -1,
    'id_15': 'Unknown',
    'id_16': 'U',
    'id_17': -1,
    'id_18': -1,
    'id_19': -1,
    'id_20': -1,
    'id_21': -1,
    'id_22': -1,
    'id_23': 'IP_PROXY:NA',
    'id_24': -1,
    'id_25': -1,
    'id_26': -1,
    'id_27': 'U',
    'id_28': 'U',
    'id_29': 'U',
    'id_30': 'U',
    'id_31': 'U',
    'id_32': 99,
    'id_33': '9999x9999',
    'id_34': 'match_status:-2',
    'id_35': 'U',
    'id_36': 'U',
    'id_37': 'U',
    'id_38': 'U',
}

In [13]:
df_imputed = df.fillna(first_na_impute_cat)
# df_imputed = df

In [14]:
df_imputed.head()

Unnamed: 0,C1,C10,C11,C12,C13,C14,C2,C3,C4,C5,...,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,isFraud
0,1.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,U,U,99.0,9999x9999,match_status:-2,U,U,U,U,0.0
1,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,U,U,99.0,9999x9999,match_status:-2,U,U,U,U,0.0
2,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,U,U,99.0,9999x9999,match_status:-2,U,U,U,U,0.0
3,2.0,0.0,1.0,0.0,25.0,1.0,5.0,0.0,0.0,0.0,...,U,U,99.0,9999x9999,match_status:-2,U,U,U,U,0.0
4,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,Android 7.0,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,0.0


In [15]:
df_imputed['P_emaildomain_0'] = df_imputed['P_emaildomain'].apply(lambda x: x.split('.')[0])
df_imputed['P_emaildomain_1'] = df_imputed['P_emaildomain'].apply(lambda x: '.'.join(x.split('.')[1:]))
df_imputed['R_emaildomain_0'] = df_imputed['R_emaildomain'].apply(lambda x: x.split('.')[0])
df_imputed['R_emaildomain_1'] = df_imputed['R_emaildomain'].apply(lambda x: '.'.join(x.split('.')[1:]))

In [16]:
def proc_id_30_0(x):
    x = x.lower()
    try:
        if 'windows' in x or 'android' in x:
            return x.split()[1]
        elif 'ios' in x:
            return x.split()[1].split('.')[0]
        elif 'mac' in x:
            return x.split()[-1]
        elif 'linux' in x:
            return 'linux'
        else:
            return 'other'
    except:
        return 'other'
    
def proc_id_30_1(x):
    x = x.lower()
    if 'windows' in x:
        return 'windows'
    elif 'ios' in x:
        return 'ios'
    elif 'android' in x:
        return 'android'
    elif 'mac' in x:
        return 'mac'
    elif 'linux' in x:
        return 'linux'
    else:
        return 'other'

In [17]:
df_imputed['proc_id_30_0'] = df_imputed['id_30'].apply(lambda x: proc_id_30_0(x))
df_imputed['proc_id_30_1'] = df_imputed['id_30'].apply(lambda x: proc_id_30_1(x))

In [18]:
def proc_id_31_0(x):
    x = x.lower()
    if 'chrome' in x and not 'android' in x:
        return 'chrome'
    elif 'mobile' in x and 'safari' in x:
        return 'safari_mobile'
    elif 'ie' in x:
        return 'ie'
    elif 'safari' in x and not 'mobile' in x:
        return 'safari'
    elif 'chrome' in x and 'android' in x:
        return 'chrome_android'
    elif 'edge' in x:
        return 'edge'
    elif 'firefox' in x:
        return 'firefox'
    elif 'samsung' in x:
        return 'samsung'
    elif 'other' in x:
        return 'other'
    elif 'opera' in x:
        return 'opera'
    elif 'android' in x and not 'chrome' in x:
        return 'android'
    elif 'chrome' in x and 'ios' in x:
        return 'chrome_ios'
    elif 'google' in x:
        return 'google'
    else:
        return 'other'

In [19]:
df_imputed['proc_id_31_0'] = df_imputed['id_31'].apply(lambda x: proc_id_31_0(x))

In [20]:
df_imputed.head()

Unnamed: 0,C1,C10,C11,C12,C13,C14,C2,C3,C4,C5,...,id_37,id_38,isFraud,P_emaildomain_0,P_emaildomain_1,R_emaildomain_0,R_emaildomain_1,proc_id_30_0,proc_id_30_1,proc_id_31_0
0,1.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,U,U,0.0,na,na,na,na,other,other,other
1,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,U,U,0.0,gmail,com,na,na,other,other,other
2,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,U,U,0.0,outlook,com,na,na,other,other,other
3,2.0,0.0,1.0,0.0,25.0,1.0,5.0,0.0,0.0,0.0,...,U,U,0.0,yahoo,com,na,na,other,other,other
4,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,T,T,0.0,gmail,com,na,na,7.0,android,samsung


In [21]:
new_cat_ft = list()
for i, j in zip(df_imputed.dtypes, df_imputed.columns):
    if i in ['O', 'object']:
        new_cat_ft.append(j)
        
new_cat_ft = list(set(new_cat_ft)-set(cat_ft))
new_cat_ft = cat_ft + new_cat_ft
len(new_cat_ft)

56

In [22]:
# new_cat_ft

In [35]:
num_ft_imput_dict = dict()
for c in df_imputed.columns:
    if c not in new_cat_ft and df_imputed[c].isna().sum() > 0:
        mdn = df_imputed[c].median()
        num_ft_imput_dict[c] = mdn

In [91]:
df_imputed2 = df_imputed.fillna(num_ft_imput_dict)

In [23]:
def freq_encoder(df, label, new_label, min_freq = 0.001):
    rows = df.shape[0]
    n = 0
    dict_fe = dict()
    vc = df[label].value_counts()
    for i, j in zip(vc.index, vc):
        ratio = j/rows
        if ratio > min_freq:
            dict_fe[i] = n
            n += 1
        else:
            dict_fe[i] = n
        
    if n < 2**8:
        _d_type = 'uint8'
    elif n >= 2**8 and n < 8**16:
        _d_type = 'uint16'
    elif n >= 2**16 and n < 8**32:
        _d_type = 'uint32'
    else:
        _d_type = 'uint64'
        
    df[new_label] = df[label].apply(lambda x: dict_fe[x]).astype(_d_type)
    
    n = 0
    dict_fe = dict()
    vc = df[label].value_counts()
    for i, j in zip(vc.index, vc):
        ratio = j/rows
        if ratio > min_freq:
            dict_fe[i] = n
            n += 1
        else:
            dict_fe[i] = n
            
    df[new_label] = df[label].apply(lambda x: dict_fe[x]).astype(_d_type)
    
    return df

In [24]:
df_imputed2 = df_imputed
df_imputed2.head()

Unnamed: 0,C1,C10,C11,C12,C13,C14,C2,C3,C4,C5,...,id_37,id_38,isFraud,P_emaildomain_0,P_emaildomain_1,R_emaildomain_0,R_emaildomain_1,proc_id_30_0,proc_id_30_1,proc_id_31_0
0,1.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,U,U,0.0,na,na,na,na,other,other,other
1,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,U,U,0.0,gmail,com,na,na,other,other,other
2,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,U,U,0.0,outlook,com,na,na,other,other,other
3,2.0,0.0,1.0,0.0,25.0,1.0,5.0,0.0,0.0,0.0,...,U,U,0.0,yahoo,com,na,na,other,other,other
4,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,T,T,0.0,gmail,com,na,na,7.0,android,samsung


In [25]:
fr = 0.0005
print('frecuencia minima: {}'.format(fr))
for ft in new_cat_ft:
    df_imputed2 = freq_encoder(df_imputed2, ft, str(ft+'_fe1'), min_freq = fr)
    print('\t{0}: {1}'.format(ft, len(df_imputed2[ft+'_fe1'].value_counts())))
    
fr = 0.001
print('frecuencia minima: {}'.format(fr))
for ft in new_cat_ft:
    df_imputed2 = freq_encoder(df_imputed2, ft, str(ft+'_fe2'), min_freq = fr)
    print('\t{0}: {1}'.format(ft, len(df_imputed2[ft+'_fe2'].value_counts())))
    
fr = 0
print('frecuencia minima: {}'.format(fr))
for ft in new_cat_ft:
    df_imputed2 = freq_encoder(df_imputed2, ft, str(ft), min_freq = fr)
    print('\t{0}: {1}'.format(ft, len(df_imputed2[ft].value_counts())))

frecuencia minima: 0.0005
	ProductCD: 5
	P_emaildomain: 35
	R_emaildomain: 23
	card1: 294
	card2: 170
	card3: 15
	card4: 5
	card5: 30
	card6: 4
	addr1: 59
	addr2: 5
	M1: 3
	M2: 3
	M3: 3
	M4: 4
	M5: 3
	M6: 3
	M7: 3
	M8: 3
	M9: 3
	DeviceType: 3
	DeviceInfo: 12
	id_12: 3
	id_13: 18
	id_14: 8
	id_15: 3
	id_16: 3
	id_17: 7
	id_18: 9
	id_19: 46
	id_20: 64
	id_21: 4
	id_22: 3
	id_23: 4
	id_24: 4
	id_25: 4
	id_26: 8
	id_27: 3
	id_28: 3
	id_29: 3
	id_30: 42
	id_31: 58
	id_32: 4
	id_33: 30
	id_34: 4
	id_35: 3
	id_36: 3
	id_37: 3
	id_38: 3
	proc_id_31_0: 12
	R_emaildomain_1: 8
	proc_id_30_1: 6
	R_emaildomain_0: 18
	P_emaildomain_1: 10
	P_emaildomain_0: 30
	proc_id_30_0: 27
frecuencia minima: 0.001
	ProductCD: 5
	P_emaildomain: 25
	R_emaildomain: 16
	card1: 172
	card2: 95
	card3: 10
	card4: 5
	card5: 25
	card6: 4
	addr1: 56
	addr2: 5
	M1: 3
	M2: 3
	M3: 3
	M4: 4
	M5: 3
	M6: 3
	M7: 3
	M8: 3
	M9: 3
	DeviceType: 3
	DeviceInfo: 7
	id_12: 3
	id_13: 14
	id_14: 6
	id_15: 3
	id_16: 3
	id_17: 5
	id_18: 7
	i

In [26]:
new_cat_ft2 = new_cat_ft + [x for x in df_imputed2.columns if '_fe1' in x or '_fe2' in x]
new_cat_ft2

['ProductCD',
 'P_emaildomain',
 'R_emaildomain',
 'card1',
 'card2',
 'card3',
 'card4',
 'card5',
 'card6',
 'addr1',
 'addr2',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'DeviceType',
 'DeviceInfo',
 'id_12',
 'id_13',
 'id_14',
 'id_15',
 'id_16',
 'id_17',
 'id_18',
 'id_19',
 'id_20',
 'id_21',
 'id_22',
 'id_23',
 'id_24',
 'id_25',
 'id_26',
 'id_27',
 'id_28',
 'id_29',
 'id_30',
 'id_31',
 'id_32',
 'id_33',
 'id_34',
 'id_35',
 'id_36',
 'id_37',
 'id_38',
 'proc_id_31_0',
 'R_emaildomain_1',
 'proc_id_30_1',
 'R_emaildomain_0',
 'P_emaildomain_1',
 'P_emaildomain_0',
 'proc_id_30_0',
 'ProductCD_fe1',
 'P_emaildomain_fe1',
 'R_emaildomain_fe1',
 'card1_fe1',
 'card2_fe1',
 'card3_fe1',
 'card4_fe1',
 'card5_fe1',
 'card6_fe1',
 'addr1_fe1',
 'addr2_fe1',
 'M1_fe1',
 'M2_fe1',
 'M3_fe1',
 'M4_fe1',
 'M5_fe1',
 'M6_fe1',
 'M7_fe1',
 'M8_fe1',
 'M9_fe1',
 'DeviceType_fe1',
 'DeviceInfo_fe1',
 'id_12_fe1',
 'id_13_fe1',
 'id_14_fe1',
 'id_15_fe1',
 'id_16_f

In [95]:
for c, d in zip(df_imputed2.columns, df_imputed2.dtypes):
    if c not in new_cat_ft+['isFraud'] and str(d)[:4] != 'uint':
        df_imputed2[c] = np.log1p(df_imputed2[c])

In [27]:
df_imputed2.head()

Unnamed: 0,C1,C10,C11,C12,C13,C14,C2,C3,C4,C5,...,id_36_fe2,id_37_fe2,id_38_fe2,proc_id_31_0_fe2,R_emaildomain_1_fe2,proc_id_30_1_fe2,R_emaildomain_0_fe2,P_emaildomain_1_fe2,P_emaildomain_0_fe2,proc_id_30_0_fe2
0,1.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,2,0
1,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,8,0
3,2.0,0.0,1.0,0.0,25.0,1.0,5.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
4,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,1,1,2,8,0,4,0,0,0,7


In [34]:
dict(df_imputed2.dtypes)

{'C1': dtype('float16'),
 'C10': dtype('float16'),
 'C11': dtype('float16'),
 'C12': dtype('float16'),
 'C13': dtype('float16'),
 'C14': dtype('float16'),
 'C2': dtype('float16'),
 'C3': dtype('float16'),
 'C4': dtype('float16'),
 'C5': dtype('float16'),
 'C6': dtype('float16'),
 'C7': dtype('float16'),
 'C8': dtype('float16'),
 'C9': dtype('float16'),
 'D1': dtype('float16'),
 'D10': dtype('float16'),
 'D11': dtype('float16'),
 'D12': dtype('float16'),
 'D13': dtype('float16'),
 'D14': dtype('float16'),
 'D15': dtype('float16'),
 'D2': dtype('float16'),
 'D3': dtype('float16'),
 'D4': dtype('float16'),
 'D5': dtype('float16'),
 'D6': dtype('float16'),
 'D7': dtype('float16'),
 'D8': dtype('float16'),
 'D9': dtype('float16'),
 'DeviceInfo': dtype('uint16'),
 'DeviceType': dtype('uint8'),
 'M1': dtype('uint8'),
 'M2': dtype('uint8'),
 'M3': dtype('uint8'),
 'M4': dtype('uint8'),
 'M5': dtype('uint8'),
 'M6': dtype('uint8'),
 'M7': dtype('uint8'),
 'M8': dtype('uint8'),
 'M9': dtype('uin

In [28]:
train = df_imputed2.iloc[:train_index, :]
test = df_imputed2.iloc[train_index:, :]

In [29]:
train.to_csv(data_folder+'/train_generated_0.csv', header=True, index=None)
print('train guardado')
test.to_csv(data_folder+'/test_generated_0.csv', header=True, index=None)
print('test guardado')

train guardado
test guardado


In [4]:
data_folder = 'input'
train = pd.read_csv(data_folder+'/train_generated_0.csv', dtype = schema_generated_0)
# test = pd.read_csv(data_folder+'/test_generated_0.csv', dtype = schema_generated_0)

In [20]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import TimeSeriesSplit
import lightgbm as lgb

In [6]:
def custom_loss(y_pred, y_true):
    precision, recall, thresholds = precision_recall_curve(np.where(y_true >= 0.5, 1, 0), y_pred)
    AUC = auc(recall, precision)
    if AUC != AUC:
        AUC = 0
    return 'PR_AUC', AUC, True

In [29]:
params = {
#     'max_depth': 11,
    'num_leaves': 361,
    'metric': ['PR_AUC'],
    'first_metric_only': True,
    'n_estimators': 20000,
    'learning_rate': 0.07,
    'colsample_bytree': 0.6,
    'objective': 'xentropy',
    'n_jobs': -1,
    'seed': 42,
    'bagging_fraction': 0.7,
    'bagging_freq': 8,
    'lambda_l1': 0.2,
    'lambda_l2': 0.2,
#     'is_unbalance': True
}

In [30]:
lgb_model = lgb.LGBMClassifier(**params)

In [15]:
X_cols = [x for x in train.columns if x not in ['isFraud', 'TransactionDT', 'TransactionID']]

In [132]:
# df_imputed2_1 = df_imputed2[df_imputed2.isFraud==1]
# df_imputed2_0 = df_imputed2[df_imputed2.isFraud==0].sample(frac=0.3)
# X = pd.concat([df_imputed2_1, df_imputed2_0])[X_cols]
# y = pd.concat([df_imputed2_1, df_imputed2_0]).isFraud

In [26]:
X = train.sort_values('TransactionDT')[X_cols]
y = train.isFraud

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
tscv = TimeSeriesSplit(n_splits=5)

In [32]:
lgb_model.feature_importances_

array([ 663,  184,  459,   88, 1286,  534,  594,   28,   75,  315,  573,
         25,  144,  424,  982, 1412, 1077,  251,  155,  294, 2002, 1603,
       1333, 1366, 1297,  204,  201, 1182,  796,  563,   16,    2,   73,
        134,  256,  121,  128,   91,   86,   70, 1399,   90,  476, 4564,
          1,   67,   41,   24,   53,   31,   12,   48,    9,    0,    1,
         26,   87,    0,    0,    5,    0,    1,    8,    3,    1,    0,
          0,   91,    0,    0,    0,    6,   61,   10,  235,  452,  238,
        115,   84,  357,  222,   87,  209,   91,   61,  120,   43,    5,
        117,    3,   65,    2,    1,  179,   98,  159,   16,   15,   15,
         26,   26,   95,  112,   43,    7,    3,   13,   18,   16,   16,
        113,   43,  162,   22,   13,   12,  174,  180,  172,   33,   28,
         14,   19,   61,   47,   11,    7,   13,   12,   21,   15,   29,
          9,   20,   24,   30,    8,   20,    8,   11,   14,   16,   10,
         23,  153,    9,    6,   10,    4,    0,   

In [31]:
n = 1
for train_index, test_index in tscv.split(X=X, y=y):
    print('FOLD {}'.format(n))
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    lgb_model.fit(X_train,
                  y_train,
                  eval_set=[(X_train, y_train), (X_test, y_test)],
                  verbose=50,
                  early_stopping_rounds=50,
                  eval_metric=custom_loss
                 )
    del X_train
    del X_test
    del y_train
    del y_test
    gc.collect()
    n += 1

FOLD 1
Training until validation scores don't improve for 50 rounds.
[50]	training's PR_AUC: 0.871842	valid_1's PR_AUC: 0.582042
[100]	training's PR_AUC: 0.968293	valid_1's PR_AUC: 0.590685
[150]	training's PR_AUC: 0.98953	valid_1's PR_AUC: 0.596291
[200]	training's PR_AUC: 0.996639	valid_1's PR_AUC: 0.601554
[250]	training's PR_AUC: 0.998221	valid_1's PR_AUC: 0.60498
Early stopping, best iteration is:
[249]	training's PR_AUC: 0.998221	valid_1's PR_AUC: 0.605395
FOLD 2


KeyboardInterrupt: 

In [19]:
lgb_model.fit(X_train,
                   y_train,
                   eval_set=[(X_train, y_train), (X_test, y_test)],
                   verbose=50,
                   early_stopping_rounds=50,
                   eval_metric=custom_loss
                  )

Training until validation scores don't improve for 50 rounds.
[50]	training's PR_AUC: 0.625616	valid_1's PR_AUC: 0.618846
[100]	training's PR_AUC: 0.662864	valid_1's PR_AUC: 0.651928
[150]	training's PR_AUC: 0.684206	valid_1's PR_AUC: 0.667586
[200]	training's PR_AUC: 0.701413	valid_1's PR_AUC: 0.681069
[250]	training's PR_AUC: 0.717748	valid_1's PR_AUC: 0.693366
[300]	training's PR_AUC: 0.730906	valid_1's PR_AUC: 0.704521
[350]	training's PR_AUC: 0.744675	valid_1's PR_AUC: 0.714907
[400]	training's PR_AUC: 0.753716	valid_1's PR_AUC: 0.722393
[450]	training's PR_AUC: 0.760865	valid_1's PR_AUC: 0.726149
[500]	training's PR_AUC: 0.770495	valid_1's PR_AUC: 0.730711


KeyboardInterrupt: 

In [111]:
n_round = lgb_model.best_iteration_
n_round

2038

In [113]:
params = {
    'max_depth': 13,
    'metric': ['PR_AUC'],
    'first_metric_only': True,
    'n_estimators': int(n_round*1.1),
    'learning_rate': 0.05,
    'colsample_bytree': 0.8,
    'objective': 'xentropy',
    'n_jobs': -1,
    'seed': 42,
    'bagging_fraction': 0.8,
    'lambda_l1': 0,
    'lambda_l2': 0,
}

In [114]:
lgb_model = lgb.LGBMClassifier(**params)

In [115]:
lgb_model.fit(X, y, verbose=100)

LGBMClassifier(bagging_fraction=0.8, boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.8, first_metric_only=True,
        importance_type='split', lambda_l1=0, lambda_l2=0,
        learning_rate=0.05, max_depth=13, metric=['PR_AUC'],
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=2241, n_jobs=-1, num_leaves=31, objective='xentropy',
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, seed=42,
        silent=True, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=0)

In [44]:
del X_train
del X_test
del y_train
del y_test

In [45]:
gc.collect()

8

In [139]:
X_test = df_imputed2[X.columns].iloc[train_index:, :]

In [140]:
y_preds = lgb_model.predict_proba(X_test)

In [141]:
y_preds[:,1]

array([0.00095012, 0.00021622, 0.00047366, ..., 0.00846984, 0.01005928,
       0.00357157])

In [142]:
df_sub = pd.read_csv(data_folder+'/sample_submission.csv')

In [143]:
df_sub['isFraud'] = y_preds[:,1]

In [144]:
df_sub.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.00095
1,3663550,0.000216
2,3663551,0.000474
3,3663552,0.003246
4,3663553,0.002967


In [145]:
df_sub.to_csv('submissions/benchmark_freq_enc_3_30porciento_sincard1.csv', sep=',', header=True, index=None)

In [146]:
df_imp = pd.DataFrame({'ft': X.columns, 'imp': lgb_model.feature_importances_}).sort_values('imp', ascending=False)

In [147]:
df_imp.head(30)

Unnamed: 0,ft,imp
385,card2,1895
43,TransactionAmt,1722
383,addr1,1595
20,D15,1027
422,id_31,1012
4,C13,919
390,dist1,800
15,D10,751
388,card5,723
23,D4,671


In [254]:
import datetime

In [264]:
START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
X_train = pd.DataFrame()
X_train['TransactionDT'] = train_imputed['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x)))

X_train['year'] = X_train['TransactionDT'].dt.year
X_train['month'] = X_train['TransactionDT'].dt.month
X_train['dow'] = X_train['TransactionDT'].dt.dayofweek
X_train['hour'] = X_train['TransactionDT'].dt.hour
X_train['day'] = X_train['TransactionDT'].dt.day
print('hello world')

In [265]:
X_train.head(30)

Unnamed: 0,TransactionDT,year,month,dow,hour,day
0,2017-12-02 00:00:00,2017,12,5,0,2
1,2017-12-02 00:00:01,2017,12,5,0,2
2,2017-12-02 00:01:09,2017,12,5,0,2
3,2017-12-02 00:01:39,2017,12,5,0,2
4,2017-12-02 00:01:46,2017,12,5,0,2
5,2017-12-02 00:01:50,2017,12,5,0,2
6,2017-12-02 00:02:02,2017,12,5,0,2
7,2017-12-02 00:02:09,2017,12,5,0,2
8,2017-12-02 00:02:15,2017,12,5,0,2
9,2017-12-02 00:02:16,2017,12,5,0,2
