In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import gc
from sklearn.decomposition import PCA
import lightgbm as lgb

from utils.schemas import *
from utils.functions import *

In [2]:
data_folder = 'input'
schema_train_transaction.update(schema_train_identity)
schema_test_transaction.update(schema_test_identity)

In [3]:
train = pd.read_csv(data_folder+'/train_merged_3.csv', dtype = schema_train_transaction)
test = pd.read_csv(data_folder+'/test_merged_3.csv', dtype = schema_test_transaction)

In [4]:
train.shape, test.shape

((590540, 440), (506691, 439))

In [5]:
cat_ft = ['ProductCD', 'P_emaildomain', 'R_emaildomain'] + ['card{}'.format(i) for i in range(1, 7)] \
+ ['addr1', 'addr2'] + ['M{}'.format(i) for i in range(1, 10)] + ['DeviceType', 'DeviceInfo'] \
+ ['id_{}'.format(i) for i in range(12, 39)]

In [6]:
len(cat_ft)

49

In [7]:
for c, t in zip(train.columns, train.dtypes):
    if t == 'object' and c not in cat_ft:
        cat_ft.append(c)

In [8]:
len(cat_ft)

50

In [9]:
num_ft = [x for x in train.columns if x not in cat_ft]

In [10]:
len(cat_ft), len(num_ft)

(50, 390)

In [11]:
train_null_list = list()
test_null_list = list()

print('Train nulls...')
for c in train.columns:
    train_null_list.append(train[c].isna().sum())

print('Test nulls...')
for c in test.columns:
    test_null_list.append(test[c].isna().sum())
    
len_train = len(train)
len_test = len(test)

train_per = [x/len_train for x in train_null_list]
test_per = [x/len_test for x in test_null_list]
    
    
train_null = pd.DataFrame({'ft': train.columns, 'nulls': train_null_list, 'percentage': train_per})\
.sort_values('nulls', ascending=False)
test_null = pd.DataFrame({'ft': test.columns, 'nulls': test_null_list, 'percentage': test_per})\
.sort_values('nulls', ascending=False)

Train nulls...
Test nulls...


In [12]:
drop_cols = ['D9']

In [13]:
for c in train_null.ft:
    if (train_null[(train_null.ft == c)].percentage > 0.9).bool() and (test_null[(test_null.ft == c)].percentage > 0.9).bool():
        drop_cols.append(c)

In [14]:
drop_cols

['D9',
 'id_24',
 'id_25',
 'id_07',
 'id_08',
 'id_21',
 'id_26',
 'id_27',
 'id_23',
 'id_22',
 'dist2']

In [15]:
train_null[(train_null.ft == c)].percentage.values[0]

0.0

In [16]:
for c in train_null.ft:
    val_train = train_null[(train_null.ft == c)].percentage.values[0]
    try:
        val_test = test_null[(test_null.ft == c)].percentage.values[0]
        if abs(val_train-val_test) > 0.12:
            print(c, val_train, val_test, val_train/val_test, abs(val_train-val_test))
    except:
        pass

D13 0.8950926270870728 0.7564906422257353 1.183217051374945 0.13860198486133757
D14 0.8946946862193924 0.77265433962711 1.1579494740833012 0.12204034659228236
D6 0.8760676668811597 0.7537295906183453 1.162310300385647 0.12233807626281434
M7 0.5863531682866528 0.46382903978953643 1.2641579504222333 0.12252412849711641
M8 0.5863311545365258 0.46380140953756827 1.2641857969365067 0.12252974499895758
M9 0.5863311545365258 0.46380140953756827 1.2641857969365067 0.12252974499895758
V4 0.47293494090154775 0.348374058351145 1.3575492478973596 0.12456088255040276
V1 0.47293494090154775 0.348374058351145 1.3575492478973596 0.12456088255040276
D11 0.47293494090154775 0.348374058351145 1.3575492478973596 0.12456088255040276
V2 0.47293494090154775 0.348374058351145 1.3575492478973596 0.12456088255040276
V11 0.47293494090154775 0.348374058351145 1.3575492478973596 0.12456088255040276
V10 0.47293494090154775 0.348374058351145 1.3575492478973596 0.12456088255040276
V9 0.47293494090154775 0.34837405835

In [17]:
cols_d = [x for x in train.columns if 'D' in x and len(x) <= 3]

In [18]:
cols_v = [x for x in train.columns if 'V' in x]

In [19]:
cols_m = [x for x in train.columns if 'M' in x]

In [20]:
cols_c = [x for x in train.columns if 'C' in x and len(x)<4]

In [101]:
# groups V
cols1 = ['V{}'.format(x) for x in range(1,12)]
cols2 = ['V{}'.format(x) for x in range(12,35)]
cols3 = ['V{}'.format(x) for x in range(35,53)]
cols4 = ['V{}'.format(x) for x in range(53,75)]
cols5 = ['V{}'.format(x) for x in range(75,95)]
cols6 = ['V{}'.format(x) for x in range(95,138)]
cols7 = ['V{}'.format(x) for x in range(138,167)]
cols8 = ['V{}'.format(x) for x in range(167,217)]
cols9 = ['V{}'.format(x) for x in range(217,279)]
cols10 = ['V{}'.format(x) for x in range(279,322)]
cols11 = ['V{}'.format(x) for x in range(322,340)]

bin_v = ['V1', 'V14', 'V41', 'V65', 'V88', 'V107', 'V305']

cols_v = [cols1,cols2,cols3,cols4,cols5,cols6,cols7,cols8,cols9,cols10,cols11]

In [22]:
def micro_train(X_fit, y_fit, X_val, y_val):
    params = {
#     'num_leaves': 256,
    'max_depth': -1,
#     'max_leaf_nodes': 45,
#     'min_sample_leaf': 20,
    'metric': ['AUC'],
    'first_metric_only': True,
    'n_estimators': 50000,
    'num_threads': 32,
    'learning_rate': 0.01,
    'colsample_bytree': 0.5,
    'objective': 'xentropy',
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'importance_type': 'gain',
    'bagging_seed': 42,
    'random_state':42,
    'seed': 42,
    'feature_fraction_seed': 42,
    'drop_seed': 42,
    'data_random_seed': 42,
    }
    model = lgb.LGBMClassifier(**params)
    model.fit(X_fit, y_fit, eval_set=[(X_val, y_val), (X_fit, y_fit)],
         early_stopping_rounds=50, verbose=50)

In [23]:
X = pd.DataFrame()
for i, j in enumerate(cols_v):
    print(i)
    pca = PCA(n_components=2, random_state=42)
    cols = [x for x in j if x not in bin_v]
    X_new = pca.fit_transform(train[j].fillna(-1)) 
    X = pd.concat([X, pd.DataFrame(X_new)], axis=1)
    nulls_ = np.where(train[j[-1]].isna(),1, 0)
    X = pd.concat([X, pd.DataFrame(nulls_)], axis=1)
    
X_final = pd.concat([X, train[bin_v]], axis=1)

0
1
2
3
4
5
6
7
8
9
10


In [24]:
cols_v = [x for x in train.columns if 'V' in x]

pca = PCA(n_components=2, random_state=42)
X_new = pca.fit_transform(train[cols_v].fillna(-1)) 
X_final = pd.concat([X_final, pd.DataFrame(X_new)], axis=1)

In [25]:
X_fit = X_final[:int(len(X)*0.8)]
y_fit = train.isFraud[:int(len(X)*0.8)]

X_val = X_final[int(len(X)*0.8):]
y_val = train.isFraud[int(len(X)*0.8):]

In [26]:
pca = PCA(n_components=2, random_state=42)
X_new = pca.fit_transform(train[cols_d].fillna(-1)) 
X_final_d = pd.DataFrame(X_new)

In [27]:
X_fit = X_final_d[:int(len(X)*0.8)]
y_fit = train.isFraud[:int(len(X)*0.8)]

X_val = X_final_d[int(len(X)*0.8):]
y_val = train.isFraud[int(len(X)*0.8):]

In [28]:
pca = PCA(n_components=2, random_state=42)
X_new = pca.fit_transform(train[cols_c].fillna(-1)) 
X_final_c = pd.DataFrame(X_new)

In [29]:
train_m_imputed = train[cols_m].fillna('U').copy()
for c in cols_m:
    train_m_imputed[c] = freq_encoder(train_m_imputed, c, c, 0)

Unnamed: 0,M1,M2,M3,M4,M5,M6,M7,M8,M9
0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,1,1,1
2,0,0,0,0,0,0,0,0,0
3,1,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1


In [31]:
pca = PCA(n_components=2, random_state=42)
X_new = pca.fit_transform(train_m_imputed)
X_final_m = pd.DataFrame(X_new)

In [70]:
X = pd.concat([X_final, X_final_d, X_final_c, X_final_m], axis=1)

In [30]:
gc.collect()

7

In [71]:
X_fit = X[:int(len(X)*0.8)]
y_fit = train.isFraud[:int(len(X)*0.8)]

X_val = X[int(len(X)*0.8):]
y_val = train.isFraud[int(len(X)*0.8):]

micro_train(X_fit, y_fit, X_val, y_val)

Training until validation scores don't improve for 50 rounds.
[50]	training's auc: 0.854244	valid_0's auc: 0.843456
[100]	training's auc: 0.864031	valid_0's auc: 0.849647
[150]	training's auc: 0.870292	valid_0's auc: 0.854255
[200]	training's auc: 0.874868	valid_0's auc: 0.857455
[250]	training's auc: 0.878671	valid_0's auc: 0.859826
[300]	training's auc: 0.882535	valid_0's auc: 0.86283
[350]	training's auc: 0.88562	valid_0's auc: 0.865222
[400]	training's auc: 0.888593	valid_0's auc: 0.867281
[450]	training's auc: 0.891397	valid_0's auc: 0.86919
[500]	training's auc: 0.893918	valid_0's auc: 0.870346
[550]	training's auc: 0.896211	valid_0's auc: 0.872078
[600]	training's auc: 0.89857	valid_0's auc: 0.873894
[650]	training's auc: 0.900114	valid_0's auc: 0.874892
[700]	training's auc: 0.901999	valid_0's auc: 0.875643
[750]	training's auc: 0.903456	valid_0's auc: 0.876399
[800]	training's auc: 0.904814	valid_0's auc: 0.877006
[850]	training's auc: 0.906317	valid_0's auc: 0.877869
[900]	tr

In [20]:
n = 1
new_cols = list()
for group_col in cols_v:
    train['V_group_{}'.format(n)] = (train.groupby(group_col)['TransactionDT'].transform('count'))
    train['V_group_{}_is_null'.format(n)] = np.where(train['V_group_{}'.format(n)].isna(), 0, 1)
    test['V_group_{}'.format(n)] = (test.groupby(group_col)['TransactionDT'].transform('count'))
    test['V_group_{}_is_null'.format(n)] = np.where(test['V_group_{}'.format(n)].isna(), 0, 1)
    new_cols.append('V_group_{}'.format(n))
    n += 1

In [21]:
train.date = train.date.astype(np.datetime64)
test.date = test.date.astype(np.datetime64)

In [22]:
train['year'] = train['date'].dt.year
test['year'] = test['date'].dt.year

In [23]:
train['card'] = train.card1.astype(str) + train.card2.astype(str) + train.card3.astype(str) + train.card4.astype(str)\
+ train.card5.astype(str) + train.card6.astype(str)

test['card'] = test.card1.astype(str) + test.card2.astype(str) + test.card3.astype(str) + test.card4.astype(str)\
+ test.card5.astype(str) + test.card6.astype(str)

In [11]:
id_cols = ['ProductCD','V157','V158','V153','V139','V140','V311',
          'V156','V146', 'V241','V154','V155','V197','V88',
          'V148','V147','V198','V94','V286','V226','V305',
           'M4','M8','M6',
           'M5','M2','M3','M9','M7',
#           'id_15','id_29','id_28','id_32','id_36','id_38','id_37',
#           'id_12','id_35','id_16'
          ]

In [12]:
# id_cols = ['M4','M8','M6',
#            'M5','M2','M3','M9','M7']

In [13]:
train.DeviceType.value_counts()

desktop    85165
mobile     55645
Name: DeviceType, dtype: int64

In [25]:
cols_c = [x for x in train.columns if 'C' in x and len(x) < 4]

In [17]:
cols_v2 = [x for x in train.columns if 'V' in x and len(x) < 5]

In [29]:
train['TransactionDT_lag'] = train[cols_c+cols_d+cols_v2+['TransactionDT']].fillna(-1).groupby(cols_c+cols_d+cols_v2)['TransactionDT'].shift(1)
gc.collect()

292

In [30]:
train['Time_diff'] = round(((train['TransactionDT'] - train['TransactionDT_lag'])/(3600*24)),0)

In [33]:
train[train.Time_diff > 1][['isFraud','Time_diff','D1','D2','D3','D4','D5','D6']].head(50)

Unnamed: 0,isFraud,Time_diff,D1,D2,D3,D4,D5,D6
9486,0,2.0,0.0,,,55.0,55.0,
9652,0,2.0,0.0,,,0.0,,0.0
9772,0,2.0,0.0,,,,,
10162,0,2.0,0.0,,,,,
10241,0,2.0,0.0,,,0.0,,
10356,0,2.0,0.0,,,,,
10414,0,2.0,0.0,,,,,
10560,0,2.0,0.0,,,0.0,,
10626,0,2.0,0.0,,,,,
10731,0,2.0,0.0,,,,,


In [27]:
for c in train.columns:
    if train[c].isna().sum()==0:
        print(c)

isFraud
TransactionDT
TransactionAmt
ProductCD
card1
C1
C2
C3
C4
C5
C6
C7
C8
C9
C10
C11
C12
C13
C14
has_identity
date
dayofweek
hour
day
month
TransactionAmt_decimal
V_group_1_is_null
V_group_2_is_null
V_group_3_is_null
V_group_4_is_null
V_group_5_is_null
V_group_6_is_null
V_group_7_is_null
V_group_8_is_null
V_group_9_is_null
V_group_10_is_null
V_group_11_is_null
year
card


# Full DF

In [43]:
test['isFraud'] = np.nan

In [44]:
train_index = train.shape[0]
test_index = test.shape[0]
df = pd.concat([train, test], axis=0)
df.shape

(1097231, 440)

In [45]:
del train
del test
gc.collect()

4247

In [102]:
X = pd.DataFrame()
for i, j in enumerate(cols_v):
    print(i, end='\r')
    pca = PCA(n_components=2, random_state=42)
    cols = [x for x in j if x not in bin_v]
    X_new = pca.fit_transform(df[j].fillna(-1)) 
    X = pd.concat([X, pd.DataFrame(X_new)], axis=1)
#     nulls_ = np.where(df[j[-1]].isna(),1, 0)
#     X = pd.concat([X, pd.DataFrame(nulls_)], axis=1)
    
print('')
# X_final = pd.concat([df[bin_v].reset_index(drop=True), X], axis=1)

10


In [103]:
cols_v2 = [x for x in df.columns if 'V' in x]

pca = PCA(n_components=2, random_state=42)
X_new = pca.fit_transform(df[cols_v2].fillna(-1)) 
X_final = pd.concat([X, pd.DataFrame(X_new)], axis=1)

In [104]:
pca = PCA(n_components=2, random_state=42)
X_new = pca.fit_transform(df[cols_d].fillna(-1)) 
X_final_d = pd.DataFrame(X_new)

In [105]:
pca = PCA(n_components=2, random_state=42)
X_new = pca.fit_transform(df[cols_c].fillna(-1)) 
X_final_c = pd.DataFrame(X_new)

In [106]:
train_m_imputed = df[cols_m].fillna('U').copy()
for c in cols_m:
    train_m_imputed[c] = freq_encoder(train_m_imputed, c, c, 0)

In [107]:
pca = PCA(n_components=2, random_state=42)
X_new = pca.fit_transform(train_m_imputed)
X_final_m = pd.DataFrame(X_new)

In [108]:
X = pd.concat([X_final, X_final_d, X_final_c, X_final_m], axis=1)

In [109]:
gc.collect()

0

In [110]:
X.head()

Unnamed: 0,0,1,0.1,1.1,0.2,1.2,0.3,1.3,0.4,1.4,...,0.5,1.5,0.6,1.6,0.7,1.7,0.8,1.8,0.9,1.9
0,2.404144,-0.650763,0.32397,-0.5589,-5.41663,0.526042,0.234551,-0.566889,-0.16298,0.425263,...,-187.456311,-1.06911,-10376.696016,-552.202093,-286.140773,-9.506381,-29.515973,-21.999515,-1.224494,-2.485306e-22
1,-3.722485,0.011006,-0.038629,0.217503,0.770855,0.251194,-0.088012,0.246138,-0.009564,-0.214485,...,-187.456311,-1.06911,-10376.663601,-652.079929,-301.923628,-2.157724,-29.844353,-22.143923,1.775506,2.67099e-20
2,2.404144,-0.650763,0.32397,-0.5589,1.142362,-0.601238,0.234551,-0.566889,0.27908,0.545363,...,-187.456311,-1.06911,-10376.66363,-652.079992,-9.871609,71.931517,-29.826001,-21.964276,-1.224494,-7.5249e-23
3,-3.722485,0.011006,0.32397,-0.5589,1.142362,-0.601238,0.234551,-0.566889,0.27908,0.545363,...,-187.456311,-1.06911,-10377.176559,1319.371217,-95.9256,-56.243696,-18.673952,-2.354387,1.775506,-1.30927e-22
4,-3.722485,0.011006,-6.570013,0.752694,-5.41663,0.526042,-6.517419,1.024724,-6.254782,-0.651203,...,-185.428142,-0.025041,159993.746672,-1232.14733,-303.297541,-2.928574,-29.320634,-22.517405,1.775506,4.179158e-22


In [114]:
X.columns = ['PCA_{}'.format(i) for i in range(X.shape[1])]
X.head()

Unnamed: 0,PCA_0,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5,PCA_6,PCA_7,PCA_8,PCA_9,...,PCA_20,PCA_21,PCA_22,PCA_23,PCA_24,PCA_25,PCA_26,PCA_27,PCA_28,PCA_29
0,2.404144,-0.650763,0.32397,-0.5589,-5.41663,0.526042,0.234551,-0.566889,-0.16298,0.425263,...,-187.456311,-1.06911,-10376.696016,-552.202093,-286.140773,-9.506381,-29.515973,-21.999515,-1.224494,-2.485306e-22
1,-3.722485,0.011006,-0.038629,0.217503,0.770855,0.251194,-0.088012,0.246138,-0.009564,-0.214485,...,-187.456311,-1.06911,-10376.663601,-652.079929,-301.923628,-2.157724,-29.844353,-22.143923,1.775506,2.67099e-20
2,2.404144,-0.650763,0.32397,-0.5589,1.142362,-0.601238,0.234551,-0.566889,0.27908,0.545363,...,-187.456311,-1.06911,-10376.66363,-652.079992,-9.871609,71.931517,-29.826001,-21.964276,-1.224494,-7.5249e-23
3,-3.722485,0.011006,0.32397,-0.5589,1.142362,-0.601238,0.234551,-0.566889,0.27908,0.545363,...,-187.456311,-1.06911,-10377.176559,1319.371217,-95.9256,-56.243696,-18.673952,-2.354387,1.775506,-1.30927e-22
4,-3.722485,0.011006,-6.570013,0.752694,-5.41663,0.526042,-6.517419,1.024724,-6.254782,-0.651203,...,-185.428142,-0.025041,159993.746672,-1232.14733,-303.297541,-2.928574,-29.320634,-22.517405,1.775506,4.179158e-22


In [20]:
for ft in cat_ft:
    v_c = df[ft].value_counts(dropna=False)
    print('LEN de {0}: {1}'.format(ft, len(v_c)))
    print('Suma de NA: {}'.format(df[ft].isna().sum()))
    print(v_c,'\n')

LEN de ProductCD: 5
Suma de NA: 0
W    800657
C    137785
R     73346
H     62397
S     23046
Name: ProductCD, dtype: int64 

LEN de P_emaildomain: 61
Suma de NA: 163648
gmail.com           435803
yahoo.com           182784
NaN                 163648
hotmail.com          85649
anonymous.com        71062
aol.com              52337
comcast.net          14474
icloud.com           12316
outlook.com           9934
att.net               7647
msn.com               7480
sbcglobal.net         5767
live.com              5720
verizon.net           5011
ymail.com             4075
bellsouth.net         3437
yahoo.com.mx          2827
me.com                2713
cox.net               2657
optonline.net         1937
live.com.mx           1470
charter.net           1443
mail.com              1156
rocketmail.com        1105
gmail                  993
earthlink.net          979
outlook.es             863
mac.com                862
hotmail.fr             674
hotmail.es             627
                    

Suma de NA: 447739
T      588323
NaN    447739
F       61169
Name: M2, dtype: int64 

LEN de M3: 3
Suma de NA: 447739
T      518244
NaN    447739
F      131248
Name: M3, dtype: int64 

LEN de M4: 4
Suma de NA: 519189
NaN    519189
M0     357789
M2     122947
M1      97306
Name: M4, dtype: int64 

LEN de M5: 3
Suma de NA: 660114
NaN    660114
F      240155
T      196962
Name: M5, dtype: int64 

LEN de M6: 3
Suma de NA: 328299
F      419433
T      349499
NaN    328299
Name: M6, dtype: int64 

LEN de M7: 3
Suma de NA: 581283
NaN    581283
F      444604
T       71344
Name: M7, dtype: int64 

LEN de M8: 3
Suma de NA: 581256
NaN    581256
F      323650
T      192325
Name: M8, dtype: int64 

LEN de M9: 3
Suma de NA: 581256
NaN    581256
T      441935
F       74040
Name: M9, dtype: int64 

LEN de DeviceType: 3
Suma de NA: 819490
NaN        819490
desktop    159568
mobile     118173
Name: DeviceType, dtype: int64 

LEN de DeviceInfo: 2800
Suma de NA: 863508
NaN                               863

Suma de NA: 1087000
NaN                     1087000
IP_PROXY:TRANSPARENT       7203
IP_PROXY:ANONYMOUS         2010
IP_PROXY:HIDDEN            1018
Name: id_23, dtype: int64 

LEN de id_24: 18
Suma de NA: 1087744
NaN     1087744
11.0       5666
15.0       2948
16.0        315
21.0        222
24.0        141
18.0        104
12.0         26
19.0         24
26.0         14
17.0          9
25.0          9
20.0          4
22.0          1
23.0          1
14.0          1
13.0          1
10.0          1
Name: id_24, dtype: int64 

LEN de id_25: 441
Suma de NA: 1087060
NaN      1087060
321.0       5233
205.0        569
426.0        469
442.0        188
501.0        151
371.0        132
509.0        115
524.0        114
123.0         97
126.0         64
479.0         59
236.0         56
463.0         55
143.0         54
432.0         53
385.0         51
505.0         51
247.0         48
239.0         45
472.0         45
486.0         43
356.0         38
270.0         37
511.0         36
359.0   

LEN de id_35: 3
Suma de NA: 819269
NaN    819269
T      149464
F      128498
Name: id_35, dtype: int64 

LEN de id_36: 3
Suma de NA: 819269
NaN    819269
F      267353
T       10609
Name: id_36, dtype: int64 

LEN de id_37: 3
Suma de NA: 819269
NaN    819269
T      215149
F       62813
Name: id_37, dtype: int64 

LEN de id_38: 3
Suma de NA: 819269
NaN    819269
F      168980
T      108982
Name: id_38, dtype: int64 



In [118]:
first_na_impute_cat = {
    'P_emaildomain': 'na.na',
    'R_emaildomain': 'na.na',
    'card2': -1,
    'card3': -1,
    'card4': 'U',
    'card5': -1,
    'card6': 'U',
    'addr1': -1,
    'addr2': -1,
    'M1': 'U',
    'M2': 'U',
    'M3': 'U',
    'M4': 'M3',
    'M5': 'U',
    'M6': 'U',
    'M7': 'U',
    'M8': 'U',
    'M9': 'U',
    'DeviceType': 'U',
    'DeviceInfo': 'U',
    'id_12': 'U',
    'id_13': -1,
    'id_14': -1,
    'id_15': 'Unknown',
    'id_16': 'U',
    'id_17': -1,
    'id_18': -1,
    'id_19': -1,
    'id_20': -1,
    'id_21': -1,
    'id_22': -1,
    'id_23': 'IP_PROXY:NA',
    'id_24': -1,
    'id_25': -1,
    'id_26': -1,
    'id_27': 'U',
    'id_28': 'U',
    'id_29': 'U',
    'id_30': 'U',
    'id_31': 'U',
    'id_32': 99,
    'id_33': '9999x9999',
    'id_34': 'match_status:-2',
    'id_35': 'U',
    'id_36': 'U',
    'id_37': 'U',
    'id_38': 'U',
}

In [119]:
df_imputed = df.fillna(first_na_impute_cat)
# df_imputed = df

In [120]:
df_imputed.head()

Unnamed: 0,C1,C10,C11,C12,C13,C14,C2,C3,C4,C5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,isFraud,month
0,1.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,U,99.0,9999x9999,match_status:-2,U,U,U,U,0.0,12
1,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,U,99.0,9999x9999,match_status:-2,U,U,U,U,0.0,12
2,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,U,99.0,9999x9999,match_status:-2,U,U,U,U,0.0,12
3,2.0,0.0,1.0,0.0,25.0,1.0,5.0,0.0,0.0,0.0,...,U,99.0,9999x9999,match_status:-2,U,U,U,U,0.0,12
4,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,0.0,12


In [121]:
df_imputed['P_emaildomain_0'] = df_imputed['P_emaildomain'].apply(lambda x: x.split('.')[0])
df_imputed['P_emaildomain_1'] = df_imputed['P_emaildomain'].apply(lambda x: '.'.join(x.split('.')[1:]))
df_imputed['R_emaildomain_0'] = df_imputed['R_emaildomain'].apply(lambda x: x.split('.')[0])
df_imputed['R_emaildomain_1'] = df_imputed['R_emaildomain'].apply(lambda x: '.'.join(x.split('.')[1:]))

In [122]:
def proc_id_30_0(x):
    x = x.lower()
    try:
        if 'windows' in x or 'android' in x:
            return x.split()[1]
        elif 'ios' in x:
            return x.split()[1].split('.')[0]
        elif 'mac' in x:
            return x.split()[-1]
        elif 'linux' in x:
            return 'linux'
        else:
            return 'other'
    except:
        return 'other'
    
def proc_id_30_1(x):
    x = x.lower()
    if 'windows' in x:
        return 'windows'
    elif 'ios' in x:
        return 'ios'
    elif 'android' in x:
        return 'android'
    elif 'mac' in x:
        return 'mac'
    elif 'linux' in x:
        return 'linux'
    else:
        return 'other'

In [123]:
df_imputed['proc_id_30_0'] = df_imputed['id_30'].apply(lambda x: proc_id_30_0(x))
df_imputed['proc_id_30_1'] = df_imputed['id_30'].apply(lambda x: proc_id_30_1(x))

In [124]:
def proc_id_31_0(x):
    x = x.lower()
    if 'chrome' in x and not 'android' in x:
        return 'chrome'
    elif 'mobile' in x and 'safari' in x:
        return 'safari_mobile'
    elif 'ie' in x:
        return 'ie'
    elif 'safari' in x and not 'mobile' in x:
        return 'safari'
    elif 'chrome' in x and 'android' in x:
        return 'chrome_android'
    elif 'edge' in x:
        return 'edge'
    elif 'firefox' in x:
        return 'firefox'
    elif 'samsung' in x:
        return 'samsung'
    elif 'other' in x:
        return 'other'
    elif 'opera' in x:
        return 'opera'
    elif 'android' in x and not 'chrome' in x:
        return 'android'
    elif 'chrome' in x and 'ios' in x:
        return 'chrome_ios'
    elif 'google' in x:
        return 'google'
    else:
        return 'other'

In [125]:
df_imputed['proc_id_31_0'] = df_imputed['id_31'].apply(lambda x: proc_id_31_0(x))

In [126]:
df_imputed.head()

Unnamed: 0,C1,C10,C11,C12,C13,C14,C2,C3,C4,C5,...,id_38,isFraud,month,P_emaildomain_0,P_emaildomain_1,R_emaildomain_0,R_emaildomain_1,proc_id_30_0,proc_id_30_1,proc_id_31_0
0,1.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,U,0.0,12,na,na,na,na,other,other,other
1,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,U,0.0,12,gmail,com,na,na,other,other,other
2,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,U,0.0,12,outlook,com,na,na,other,other,other
3,2.0,0.0,1.0,0.0,25.0,1.0,5.0,0.0,0.0,0.0,...,U,0.0,12,yahoo,com,na,na,other,other,other
4,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,T,0.0,12,gmail,com,na,na,7.0,android,samsung


In [127]:
def dev_name(dataframe):
    dataframe['device_name'] = dataframe['DeviceInfo'].str.split('/', expand=True)[0]
    dataframe['device_version'] = dataframe['DeviceInfo'].str.split('/', expand=True)[1]

    dataframe.loc[dataframe['device_name'].str.contains('SM', na=False), 'device_name'] = 'Samsung'
    dataframe.loc[dataframe['device_name'].str.contains('SAMSUNG', na=False), 'device_name'] = 'Samsung'
    dataframe.loc[dataframe['device_name'].str.contains('GT-', na=False), 'device_name'] = 'Samsung'
    dataframe.loc[dataframe['device_name'].str.contains('Moto G', na=False), 'device_name'] = 'Motorola'
    dataframe.loc[dataframe['device_name'].str.contains('Moto', na=False), 'device_name'] = 'Motorola'
    dataframe.loc[dataframe['device_name'].str.contains('moto', na=False), 'device_name'] = 'Motorola'
    dataframe.loc[dataframe['device_name'].str.contains('LG-', na=False), 'device_name'] = 'LG'
    dataframe.loc[dataframe['device_name'].str.contains('rv:', na=False), 'device_name'] = 'RV'
    dataframe.loc[dataframe['device_name'].str.contains('HUAWEI', na=False), 'device_name'] = 'Huawei'
    dataframe.loc[dataframe['device_name'].str.contains('ALE-', na=False), 'device_name'] = 'Huawei'
    dataframe.loc[dataframe['device_name'].str.contains('-L', na=False), 'device_name'] = 'Huawei'
    dataframe.loc[dataframe['device_name'].str.contains('Blade', na=False), 'device_name'] = 'ZTE'
    dataframe.loc[dataframe['device_name'].str.contains('BLADE', na=False), 'device_name'] = 'ZTE'
    dataframe.loc[dataframe['device_name'].str.contains('Linux', na=False), 'device_name'] = 'Linux'
    dataframe.loc[dataframe['device_name'].str.contains('XT', na=False), 'device_name'] = 'Sony'
    dataframe.loc[dataframe['device_name'].str.contains('HTC', na=False), 'device_name'] = 'HTC'
    dataframe.loc[dataframe['device_name'].str.contains('ASUS', na=False), 'device_name'] = 'Asus'

    dataframe.loc[dataframe.device_name.isin(dataframe.device_name.value_counts()[dataframe.device_name.value_counts() < 200].index), 'device_name'] = "Others"
    gc.collect()
    
    return dataframe

In [128]:
df_imputed = dev_name(df_imputed)

In [129]:
df_imputed.head()

Unnamed: 0,C1,C10,C11,C12,C13,C14,C2,C3,C4,C5,...,month,P_emaildomain_0,P_emaildomain_1,R_emaildomain_0,R_emaildomain_1,proc_id_30_0,proc_id_30_1,proc_id_31_0,device_name,device_version
0,1.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,12,na,na,na,na,other,other,other,U,
1,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,12,gmail,com,na,na,other,other,other,U,
2,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,12,outlook,com,na,na,other,other,other,U,
3,2.0,0.0,1.0,0.0,25.0,1.0,5.0,0.0,0.0,0.0,...,12,yahoo,com,na,na,other,other,other,U,
4,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,12,gmail,com,na,na,7.0,android,samsung,Samsung,NRD90M


In [130]:
new_cat_ft = list()
for i, j in zip(df_imputed.dtypes, df_imputed.columns):
    if i in ['O', 'object']:
        new_cat_ft.append(j)
        
new_cat_ft = list(set(new_cat_ft)-set(cat_ft))
new_cat_ft = cat_ft + new_cat_ft
len(new_cat_ft)

59

In [131]:
# num_ft_imput_dict = dict()
# for c in df_imputed.columns:
#     if c not in new_cat_ft and df_imputed[c].isna().sum() > 0:
#         mdn = df_imputed[c].median()
#         num_ft_imput_dict[c] = mdn

# df_imputed2 = df_imputed.fillna(num_ft_imput_dict)

In [132]:
def freq_encoder(df, label, new_label, min_freq = 0.001):
    rows = df.shape[0]
    n = 0
    dict_fe = dict()
    vc = df[label].value_counts()
    for i, j in zip(vc.index, vc):
        ratio = j/rows
        if ratio > min_freq:
            dict_fe[i] = n
            n += 1
        else:
            dict_fe[i] = n
        
    if n < 2**8:
        _d_type = 'uint8'
    elif n >= 2**8 and n < 8**16:
        _d_type = 'uint16'
    elif n >= 2**16 and n < 8**32:
        _d_type = 'uint32'
    else:
        _d_type = 'uint64'
        
    df[new_label] = df[label].apply(lambda x: dict_fe[x]).astype(_d_type)
    
    n = 0
    dict_fe = dict()
    vc = df[label].value_counts()
    for i, j in zip(vc.index, vc):
        ratio = j/rows
        if ratio > min_freq:
            dict_fe[i] = n
            n += 1
        else:
            dict_fe[i] = n
            
    df[new_label] = df[label].apply(lambda x: dict_fe[x]).astype(_d_type)
    
    return df

In [133]:
df_imputed2 = df_imputed

In [134]:
df_imputed2['device_version'] = df_imputed2['device_version'].replace(np.nan, 'U')

In [135]:
df_imputed2.head()

Unnamed: 0,C1,C10,C11,C12,C13,C14,C2,C3,C4,C5,...,month,P_emaildomain_0,P_emaildomain_1,R_emaildomain_0,R_emaildomain_1,proc_id_30_0,proc_id_30_1,proc_id_31_0,device_name,device_version
0,1.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,12,na,na,na,na,other,other,other,U,U
1,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,12,gmail,com,na,na,other,other,other,U,U
2,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,12,outlook,com,na,na,other,other,other,U,U
3,2.0,0.0,1.0,0.0,25.0,1.0,5.0,0.0,0.0,0.0,...,12,yahoo,com,na,na,other,other,other,U,U
4,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,12,gmail,com,na,na,7.0,android,samsung,Samsung,NRD90M


In [136]:
fr = 0.0005
print('frecuencia minima: {}'.format(fr))
for ft in new_cat_ft:
    df_imputed2 = freq_encoder(df_imputed2, ft, str(ft+'_fe1'), min_freq = fr)
    print('\t{0}: {1}'.format(ft, len(df_imputed2[ft+'_fe1'].value_counts())))
    
# fr = 0.001
# print('frecuencia minima: {}'.format(fr))
# for ft in new_cat_ft:
#     df_imputed2 = freq_encoder(df_imputed2, ft, str(ft+'_fe2'), min_freq = fr)
#     print('\t{0}: {1}'.format(ft, len(df_imputed2[ft+'_fe2'].value_counts())))
    
fr = 0
print('frecuencia minima: {}'.format(fr))
for ft in new_cat_ft:
    df_imputed2 = freq_encoder(df_imputed2, ft, str(ft), min_freq = fr)
    print('\t{0}: {1}'.format(ft, len(df_imputed2[ft].value_counts())))

frecuencia minima: 0.0005
	ProductCD: 5
	P_emaildomain: 35
	R_emaildomain: 23
	card1: 294
	card2: 170
	card3: 15
	card4: 5
	card5: 30
	card6: 4
	addr1: 59
	addr2: 5
	M1: 3
	M2: 3
	M3: 3
	M4: 4
	M5: 3
	M6: 3
	M7: 3
	M8: 3
	M9: 3
	DeviceType: 3
	DeviceInfo: 12
	id_12: 3
	id_13: 18
	id_14: 8
	id_15: 3
	id_16: 3
	id_17: 7
	id_18: 9
	id_19: 46
	id_20: 64
	id_21: 4
	id_22: 3
	id_23: 4
	id_24: 4
	id_25: 4
	id_26: 8
	id_27: 3
	id_28: 3
	id_29: 3
	id_30: 42
	id_31: 58
	id_32: 4
	id_33: 30
	id_34: 4
	id_35: 3
	id_36: 3
	id_37: 3
	id_38: 3
	date: 1
	R_emaildomain_1: 8
	proc_id_31_0: 12
	proc_id_30_1: 6
	device_name: 15
	R_emaildomain_0: 18
	P_emaildomain_0: 30
	device_version: 15
	P_emaildomain_1: 10
	proc_id_30_0: 27
frecuencia minima: 0
	ProductCD: 5
	P_emaildomain: 61
	R_emaildomain: 61
	card1: 17091
	card2: 502
	card3: 134
	card4: 5
	card5: 139
	card6: 5
	addr1: 442
	addr2: 94
	M1: 3
	M2: 3
	M3: 3
	M4: 4
	M5: 3
	M6: 3
	M7: 3
	M8: 3
	M9: 3
	DeviceType: 3
	DeviceInfo: 2799
	id_12: 3
	id_13: 56


In [137]:
new_cat_ft2 = new_cat_ft + [x for x in df_imputed2.columns if '_fe1' in x or '_fe2' in x]
new_cat_ft2

['ProductCD',
 'P_emaildomain',
 'R_emaildomain',
 'card1',
 'card2',
 'card3',
 'card4',
 'card5',
 'card6',
 'addr1',
 'addr2',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'DeviceType',
 'DeviceInfo',
 'id_12',
 'id_13',
 'id_14',
 'id_15',
 'id_16',
 'id_17',
 'id_18',
 'id_19',
 'id_20',
 'id_21',
 'id_22',
 'id_23',
 'id_24',
 'id_25',
 'id_26',
 'id_27',
 'id_28',
 'id_29',
 'id_30',
 'id_31',
 'id_32',
 'id_33',
 'id_34',
 'id_35',
 'id_36',
 'id_37',
 'id_38',
 'date',
 'R_emaildomain_1',
 'proc_id_31_0',
 'proc_id_30_1',
 'device_name',
 'R_emaildomain_0',
 'P_emaildomain_0',
 'device_version',
 'P_emaildomain_1',
 'proc_id_30_0',
 'ProductCD_fe1',
 'P_emaildomain_fe1',
 'R_emaildomain_fe1',
 'card1_fe1',
 'card2_fe1',
 'card3_fe1',
 'card4_fe1',
 'card5_fe1',
 'card6_fe1',
 'addr1_fe1',
 'addr2_fe1',
 'M1_fe1',
 'M2_fe1',
 'M3_fe1',
 'M4_fe1',
 'M5_fe1',
 'M6_fe1',
 'M7_fe1',
 'M8_fe1',
 'M9_fe1',
 'DeviceType_fe1',
 'DeviceInfo_fe1',
 'id_12_fe1',
 'id_13

In [95]:
# for c, d in zip(df_imputed2.columns, df_imputed2.dtypes):
#     if c not in new_cat_ft+['isFraud'] and str(d)[:4] != 'uint':
#         df_imputed2[c] = np.log1p(df_imputed2[c])

In [53]:
df_imputed2.head()

Unnamed: 0,C1,C10,C11,C12,C13,C14,C2,C3,C4,C5,...,R_emaildomain_0_fe2,proc_id_31_0_fe2,P_emaildomain_0_fe2,proc_id_30_0_fe2,P_emaildomain_1_fe2,R_emaildomain_1_fe2,proc_id_30_1_fe2,card_fe2,device_name_fe2,device_version_fe2
0,1.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0,0,2,0,1,0,0,170,0,0
1,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,138,0,0
2,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0,0,8,0,0,0,0,100,0,0
3,2.0,0.0,1.0,0.0,25.0,1.0,5.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,14,0,0
4,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0,8,0,7,0,0,4,170,3,2


In [138]:
old_cols = df.columns

In [145]:
df_imputed2 = pd.concat([X.reset_index(drop=True), df_imputed2.reset_index(drop=True)], axis=1)

In [146]:
len(old_cols), len(df_imputed2.columns)

(440, 538)

In [147]:
new_cols = list(set(df_imputed2.columns) - set(old_cols))
dict_dtypes = reduce_memory2(df_imputed2[new_cols])

Reduce_memory...


In [148]:
df_imputed2[new_cols] = df_imputed2[new_cols].astype(dict_dtypes)

In [149]:
dict(df_imputed2.dtypes)

{'PCA_0': dtype('float16'),
 'PCA_1': dtype('float16'),
 'PCA_2': dtype('float16'),
 'PCA_3': dtype('float16'),
 'PCA_4': dtype('float16'),
 'PCA_5': dtype('float16'),
 'PCA_6': dtype('float16'),
 'PCA_7': dtype('float16'),
 'PCA_8': dtype('float16'),
 'PCA_9': dtype('float16'),
 'PCA_10': dtype('float32'),
 'PCA_11': dtype('float32'),
 'PCA_12': dtype('float32'),
 'PCA_13': dtype('float32'),
 'PCA_14': dtype('float32'),
 'PCA_15': dtype('float32'),
 'PCA_16': dtype('float32'),
 'PCA_17': dtype('float32'),
 'PCA_18': dtype('float32'),
 'PCA_19': dtype('float32'),
 'PCA_20': dtype('float32'),
 'PCA_21': dtype('float32'),
 'PCA_22': dtype('float32'),
 'PCA_23': dtype('float32'),
 'PCA_24': dtype('float16'),
 'PCA_25': dtype('float16'),
 'PCA_26': dtype('float16'),
 'PCA_27': dtype('float16'),
 'PCA_28': dtype('float16'),
 'PCA_29': dtype('float16'),
 'C1': dtype('float16'),
 'C10': dtype('float16'),
 'C11': dtype('float16'),
 'C12': dtype('float16'),
 'C13': dtype('float16'),
 'C14': dty

In [150]:
train = df_imputed2[df_imputed2.isFraud.notnull()]
test = df_imputed2[df_imputed2.isFraud.isnull()]

In [151]:
train.shape, test.shape

((590540, 538), (506691, 538))

In [153]:
train.to_csv(data_folder+'/train_generated_4.csv', header=True, index=None)
print('train guardado')
test.to_csv(data_folder+'/test_generated_4.csv', header=True, index=None)
print('test guardado')

train guardado
test guardado


# Synthetic DF

#### Splitting train equal to train / test split

In [8]:
max_dt_train = train.TransactionDT.max()
min_dt_test = test.TransactionDT.min()
dt_gap = min_dt_test - max_dt_train
print(max_dt_train, min_dt_test, dt_gap)

15811131 18403224 2592093


In [9]:
dt_gap_train = dt_gap/2
dt_gap_train

1296046.5

In [10]:
split_line_train = (train.TransactionDT.max() - train.TransactionDT.min())/2
split_line_train

7862365.5

In [11]:
train_syn = train[train.TransactionDT < (split_line_train-dt_gap_train)]
test_syn = train[train.TransactionDT > (split_line_train+dt_gap_train)]
train_syn.shape, test_syn.shape

((270901, 433), (221908, 433))

In [12]:
len_train_syn = train_syn.shape[0]
len_train_syn

270901

In [13]:
df = pd.concat([train_syn, test_syn], axis=0)
df.shape

(492809, 433)

In [13]:
# for ft in cat_ft:
#     v_c = df[ft].value_counts(dropna=False)
#     print('LEN de {0}: {1}'.format(ft, len(v_c)))
#     print('Suma de NA: {}'.format(df[ft].isna().sum()))
#     print(v_c,'\n')

In [14]:
first_na_impute_cat = {
    'P_emaildomain': 'na.na',
    'R_emaildomain': 'na.na',
    'card2': -1,
    'card3': -1,
    'card4': 'U',
    'card5': -1,
    'card6': 'U',
    'addr1': -1,
    'addr2': -1,
    'M1': 'U',
    'M2': 'U',
    'M3': 'U',
    'M4': 'M3',
    'M5': 'U',
    'M6': 'U',
    'M7': 'U',
    'M8': 'U',
    'M9': 'U',
    'DeviceType': 'U',
    'DeviceInfo': 'U',
    'id_12': 'U',
    'id_13': -1,
    'id_14': -1,
    'id_15': 'Unknown',
    'id_16': 'U',
    'id_17': -1,
    'id_18': -1,
    'id_19': -1,
    'id_20': -1,
    'id_21': -1,
    'id_22': -1,
    'id_23': 'IP_PROXY:NA',
    'id_24': -1,
    'id_25': -1,
    'id_26': -1,
    'id_27': 'U',
    'id_28': 'U',
    'id_29': 'U',
    'id_30': 'U',
    'id_31': 'U',
    'id_32': 99,
    'id_33': '9999x9999',
    'id_34': 'match_status:-2',
    'id_35': 'U',
    'id_36': 'U',
    'id_37': 'U',
    'id_38': 'U',
}

In [15]:
# train_syn = train_syn.fillna(first_na_impute_cat)
# test_syn = test_syn.fillna(first_na_impute_cat)

df = df.fillna(first_na_impute_cat)

In [16]:
# train_syn.head()

In [18]:
train_syn['P_emaildomain_0'] = train_syn['P_emaildomain'].apply(lambda x: x.split('.')[0])
train_syn['P_emaildomain_1'] = train_syn['P_emaildomain'].apply(lambda x: '.'.join(x.split('.')[1:]))
train_syn['R_emaildomain_0'] = train_syn['R_emaildomain'].apply(lambda x: x.split('.')[0])
train_syn['R_emaildomain_1'] = train_syn['R_emaildomain'].apply(lambda x: '.'.join(x.split('.')[1:]))

test_syn['P_emaildomain_0'] = test_syn['P_emaildomain'].apply(lambda x: x.split('.')[0])
test_syn['P_emaildomain_1'] = test_syn['P_emaildomain'].apply(lambda x: '.'.join(x.split('.')[1:]))
test_syn['R_emaildomain_0'] = test_syn['R_emaildomain'].apply(lambda x: x.split('.')[0])
test_syn['R_emaildomain_1'] = test_syn['R_emaildomain'].apply(lambda x: '.'.join(x.split('.')[1:]))

In [16]:
def proc_id_30_0(x):
    x = x.lower()
    try:
        if 'windows' in x or 'android' in x:
            return x.split()[1]
        elif 'ios' in x:
            return x.split()[1].split('.')[0]
        elif 'mac' in x:
            return x.split()[-1]
        elif 'linux' in x:
            return 'linux'
        else:
            return 'other'
    except:
        return 'other'
    
def proc_id_30_1(x):
    x = x.lower()
    if 'windows' in x:
        return 'windows'
    elif 'ios' in x:
        return 'ios'
    elif 'android' in x:
        return 'android'
    elif 'mac' in x:
        return 'mac'
    elif 'linux' in x:
        return 'linux'
    else:
        return 'other'

In [20]:
train_syn['proc_id_30_0'] = train_syn['id_30'].apply(lambda x: proc_id_30_0(x))
train_syn['proc_id_30_1'] = train_syn['id_30'].apply(lambda x: proc_id_30_1(x))

test_syn['proc_id_30_0'] = test_syn['id_30'].apply(lambda x: proc_id_30_0(x))
test_syn['proc_id_30_1'] = test_syn['id_30'].apply(lambda x: proc_id_30_1(x))

In [17]:
df['proc_id_30_0'] = df['id_30'].apply(lambda x: proc_id_30_0(x))
df['proc_id_30_1'] = df['id_30'].apply(lambda x: proc_id_30_1(x))

In [18]:
def proc_id_31_0(x):
    x = x.lower()
    if 'chrome' in x and not 'android' in x:
        return 'chrome'
    elif 'mobile' in x and 'safari' in x:
        return 'safari_mobile'
    elif 'ie' in x:
        return 'ie'
    elif 'safari' in x and not 'mobile' in x:
        return 'safari'
    elif 'chrome' in x and 'android' in x:
        return 'chrome_android'
    elif 'edge' in x:
        return 'edge'
    elif 'firefox' in x:
        return 'firefox'
    elif 'samsung' in x:
        return 'samsung'
    elif 'other' in x:
        return 'other'
    elif 'opera' in x:
        return 'opera'
    elif 'android' in x and not 'chrome' in x:
        return 'android'
    elif 'chrome' in x and 'ios' in x:
        return 'chrome_ios'
    elif 'google' in x:
        return 'google'
    else:
        return 'other'

In [22]:
train_syn['proc_id_31_0'] = train_syn['id_31'].apply(lambda x: proc_id_31_0(x))
test_syn['proc_id_31_0'] = test_syn['id_31'].apply(lambda x: proc_id_31_0(x))

In [19]:
df['proc_id_31_0'] = df['id_31'].apply(lambda x: proc_id_31_0(x))

In [20]:
df.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,proc_id_30_0,proc_id_30_1,proc_id_31_0
0,0,86400,68.5,W,13926,-1.0,150.0,discover,142.0,credit,...,match_status:-2,U,U,U,U,U,U,other,other,other
1,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,match_status:-2,U,U,U,U,U,U,other,other,other
2,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,match_status:-2,U,U,U,U,U,U,other,other,other
3,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,match_status:-2,U,U,U,U,U,U,other,other,other
4,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M,7.0,android,samsung


In [21]:
def dev_name(dataframe):
    dataframe['device_name'] = dataframe['DeviceInfo'].str.split('/', expand=True)[0]
    dataframe['device_version'] = dataframe['DeviceInfo'].str.split('/', expand=True)[1]

    dataframe.loc[dataframe['device_name'].str.contains('SM', na=False), 'device_name'] = 'Samsung'
    dataframe.loc[dataframe['device_name'].str.contains('SAMSUNG', na=False), 'device_name'] = 'Samsung'
    dataframe.loc[dataframe['device_name'].str.contains('GT-', na=False), 'device_name'] = 'Samsung'
    dataframe.loc[dataframe['device_name'].str.contains('Moto G', na=False), 'device_name'] = 'Motorola'
    dataframe.loc[dataframe['device_name'].str.contains('Moto', na=False), 'device_name'] = 'Motorola'
    dataframe.loc[dataframe['device_name'].str.contains('moto', na=False), 'device_name'] = 'Motorola'
    dataframe.loc[dataframe['device_name'].str.contains('LG-', na=False), 'device_name'] = 'LG'
    dataframe.loc[dataframe['device_name'].str.contains('rv:', na=False), 'device_name'] = 'RV'
    dataframe.loc[dataframe['device_name'].str.contains('HUAWEI', na=False), 'device_name'] = 'Huawei'
    dataframe.loc[dataframe['device_name'].str.contains('ALE-', na=False), 'device_name'] = 'Huawei'
    dataframe.loc[dataframe['device_name'].str.contains('-L', na=False), 'device_name'] = 'Huawei'
    dataframe.loc[dataframe['device_name'].str.contains('Blade', na=False), 'device_name'] = 'ZTE'
    dataframe.loc[dataframe['device_name'].str.contains('BLADE', na=False), 'device_name'] = 'ZTE'
    dataframe.loc[dataframe['device_name'].str.contains('Linux', na=False), 'device_name'] = 'Linux'
    dataframe.loc[dataframe['device_name'].str.contains('XT', na=False), 'device_name'] = 'Sony'
    dataframe.loc[dataframe['device_name'].str.contains('HTC', na=False), 'device_name'] = 'HTC'
    dataframe.loc[dataframe['device_name'].str.contains('ASUS', na=False), 'device_name'] = 'Asus'

    dataframe.loc[dataframe.device_name.isin(dataframe.device_name.value_counts()[dataframe.device_name.value_counts() < 200].index), 'device_name'] = "Others"
    gc.collect()
    
    return dataframe

In [25]:
train_syn = dev_name(train_syn)
test_syn = dev_name(test_syn)

In [22]:
df = dev_name(df)

In [23]:
df.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_36,id_37,id_38,DeviceType,DeviceInfo,proc_id_30_0,proc_id_30_1,proc_id_31_0,device_name,device_version
0,0,86400,68.5,W,13926,-1.0,150.0,discover,142.0,credit,...,U,U,U,U,U,other,other,other,U,
1,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,U,U,U,U,U,other,other,other,U,
2,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,U,U,U,U,U,other,other,other,U,
3,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,U,U,U,U,U,other,other,other,U,
4,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M,7.0,android,samsung,Samsung,NRD90M


In [24]:
new_cat_ft = list()
for i, j in zip(train_syn.dtypes, train_syn.columns):
    if i in ['O', 'object']:
        new_cat_ft.append(j)
        
new_cat_ft = list(set(new_cat_ft)-set(cat_ft))
new_cat_ft = cat_ft + new_cat_ft
len(new_cat_ft)

49

In [29]:
# num_ft_imput_dict = dict()
# for c in df_imputed.columns:
#     if c not in new_cat_ft and df_imputed[c].isna().sum() > 0:
#         mdn = df_imputed[c].median()
#         num_ft_imput_dict[c] = mdn

In [30]:
# df_imputed2 = df_imputed.fillna(num_ft_imput_dict)

In [34]:
train_syn['device_version'] = train_syn['device_version'].replace([np.nan, None], 'U')
test_syn['device_version'] = test_syn['device_version'].replace([np.nan, None], 'U')

In [25]:
df['device_version'] = df['device_version'].replace([np.nan, None], 'U')

In [26]:
df.device_version.head()

0         U
1         U
2         U
3         U
4    NRD90M
Name: device_version, dtype: object

In [27]:
def freq_encoder2(df, label, min_freq = 0.001):
    rows = df.shape[0]
    n = 0
    dict_fe = dict()
    vc = df[label].value_counts()
    for i, j in zip(vc.index, vc):
        ratio = j/rows
        if ratio > min_freq:
            dict_fe[i] = n
            n += 1
        else:
            dict_fe[i] = n
        
    if n < 2**8:
        _d_type = 'uint8'
    elif n >= 2**8 and n < 8**16:
        _d_type = 'uint16'
    elif n >= 2**16 and n < 8**32:
        _d_type = 'uint32'
    else:
        _d_type = 'uint64'
        
#     df[new_label] = df[label].apply(lambda x: dict_fe[x]).astype(_d_type)
    
    return dict_fe, _d_type

In [28]:
def freq_encoder2_test(df, label, dict_fe):
    list_unique = df[label].unique()
    max_val = max(dict_fe.values())
    train_values = dict_fe.keys()
    for v in list_unique:
        if v not in train_values:
            dict_fe[v] = max_val+1
    
    return dict_fe

In [106]:
fr = 0.0005
print('frecuencia minima: {}'.format(fr))
for ft in new_cat_ft:
    dict_fe, _d_type = freq_encoder2(train_syn, ft, fr)
    dict_fe_test = freq_encoder2_test(test_syn, ft, dict_fe)
    train_syn[ft+'_fe1'] = train_syn[ft].apply(lambda x: dict_fe[x]).astype(_d_type)
    test_syn[ft+'_fe1'] = test_syn[ft].apply(lambda x: dict_fe_test[x]).astype(_d_type)
#     print('\t{0}: {1}'.format(ft, len(df_imputed2[ft+'_fe1'].value_counts())))
    
fr = 0.001
print('frecuencia minima: {}'.format(fr))
for ft in new_cat_ft:
    dict_fe, _d_type = freq_encoder2(train_syn, ft, fr)
    dict_fe_test = freq_encoder2_test(test_syn, ft, dict_fe)
    train_syn[ft+'_fe2'] = train_syn[ft].apply(lambda x: dict_fe[x]).astype(_d_type)
    test_syn[ft+'_fe2'] = test_syn[ft].apply(lambda x: dict_fe_test[x]).astype(_d_type)
#     df_imputed2 = freq_encoder(df_imputed2, ft, str(ft+'_fe2'), min_freq = fr)
#     print('\t{0}: {1}'.format(ft, len(df_imputed2[ft+'_fe2'].value_counts())))
    
fr = 0
print('frecuencia minima: {}'.format(fr))
for ft in new_cat_ft:
    dict_fe, _d_type = freq_encoder2(train_syn, ft, fr)
    dict_fe_test = freq_encoder2_test(test_syn, ft, dict_fe)
    train_syn[ft] = train_syn[ft].apply(lambda x: dict_fe[x]).astype(_d_type)
    test_syn[ft] = test_syn[ft].apply(lambda x: dict_fe_test[x]).astype(_d_type)
#     df_imputed2 = freq_encoder(df_imputed2, ft, str(ft), min_freq = fr)
#     print('\t{0}: {1}'.format(ft, len(df_imputed2[ft].value_counts())))

frecuencia minima: 0.0005
frecuencia minima: 0.001
frecuencia minima: 0


In [30]:
cols_D = [x for x in df.columns if 'D' in x and x not in ['TransactionDT','ProductCD','DeviceType','DeviceInfo']]
cols_D

['D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15']

In [29]:
fr = 0.0005
print('frecuencia minima: {}'.format(fr))
for ft in new_cat_ft:
    dict_fe, _d_type = freq_encoder2(df, ft, fr)
    df[ft+'_fe1'] = df[ft].apply(lambda x: dict_fe[x]).astype(_d_type)
    
fr = 0.001
print('frecuencia minima: {}'.format(fr))
for ft in new_cat_ft:
    dict_fe, _d_type = freq_encoder2(df, ft, fr)
    df[ft+'_fe2'] = df[ft].apply(lambda x: dict_fe[x]).astype(_d_type)
    
fr = 0
print('frecuencia minima: {}'.format(fr))
for ft in new_cat_ft:
    dict_fe, _d_type = freq_encoder2(df, ft, fr)
    df[ft] = df[ft].apply(lambda x: dict_fe[x]).astype(_d_type)

frecuencia minima: 0.0005
frecuencia minima: 0.001
frecuencia minima: 0


In [30]:
df.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_29_fe2,id_30_fe2,id_31_fe2,id_32_fe2,id_33_fe2,id_34_fe2,id_35_fe2,id_36_fe2,id_37_fe2,id_38_fe2
0,0,86400,68.5,0,1572,12,0,3,30,1,...,0,0,0,0,0,0,0,0,0,0
1,0,86401,29.0,0,152,36,0,1,3,1,...,0,0,0,0,0,0,0,0,0,0
2,0,86469,59.0,0,95,3,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
3,0,86499,50.0,0,15,21,0,1,4,0,...,0,0,0,0,0,0,0,0,0,0
4,0,86506,50.0,3,2995,8,0,1,3,1,...,2,5,22,2,20,1,1,1,1,1


In [31]:
new_cat_ft2 = new_cat_ft + [x for x in df.columns if '_fe1' in x or '_fe2' in x]
new_cat_ft2[:5]

['ProductCD', 'P_emaildomain', 'R_emaildomain', 'card1', 'card2']

In [95]:
# for c, d in zip(df_imputed2.columns, df_imputed2.dtypes):
#     if c not in new_cat_ft+['isFraud'] and str(d)[:4] != 'uint':
#         df_imputed2[c] = np.log1p(df_imputed2[c])

In [110]:
train_syn.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_38_fe2,device_version_fe2,R_emaildomain_0_fe2,P_emaildomain_1_fe2,proc_id_30_0_fe2,proc_id_30_1_fe2,proc_id_31_0_fe2,R_emaildomain_1_fe2,P_emaildomain_0_fe2,device_name_fe2
0,0,86400,68.5,0,1749,12,0,3,35,1,...,0,0,0,1,0,0,0,0,2,0
1,0,86401,29.0,0,139,31,0,1,3,1,...,0,0,0,0,0,0,0,0,0,0
2,0,86469,59.0,0,101,3,0,0,2,0,...,0,0,0,0,0,0,0,0,8,0
3,0,86499,50.0,0,17,20,0,1,4,0,...,0,0,0,0,0,0,0,0,1,0
4,0,86506,50.0,3,5782,7,0,1,3,1,...,1,2,0,0,4,4,8,0,0,4


In [37]:
dict(df.dtypes)

{'isFraud': dtype('int8'),
 'TransactionDT': dtype('int32'),
 'TransactionAmt': dtype('float16'),
 'ProductCD': dtype('uint8'),
 'card1': dtype('uint16'),
 'card2': dtype('uint16'),
 'card3': dtype('uint8'),
 'card4': dtype('uint8'),
 'card5': dtype('uint8'),
 'card6': dtype('uint8'),
 'addr1': dtype('uint16'),
 'addr2': dtype('uint8'),
 'dist1': dtype('float16'),
 'dist2': dtype('float16'),
 'P_emaildomain': dtype('uint8'),
 'R_emaildomain': dtype('uint8'),
 'C1': dtype('float16'),
 'C2': dtype('float16'),
 'C3': dtype('float16'),
 'C4': dtype('float16'),
 'C5': dtype('float16'),
 'C6': dtype('float16'),
 'C7': dtype('float16'),
 'C8': dtype('float16'),
 'C9': dtype('float16'),
 'C10': dtype('float16'),
 'C11': dtype('float16'),
 'C12': dtype('float16'),
 'C13': dtype('float16'),
 'C14': dtype('float16'),
 'D1': dtype('float16'),
 'D2': dtype('float16'),
 'D3': dtype('float16'),
 'D4': dtype('float16'),
 'D5': dtype('float16'),
 'D6': dtype('float16'),
 'D7': dtype('float16'),
 'D8': 

In [32]:
df[:len_train_syn].shape, df[len_train_syn:].shape

((270901, 536), (221908, 536))

In [33]:
df[:len_train_syn].to_csv(data_folder+'/train_synthetic_1.csv', header=True, index=None)
print('train guardado')
df[len_train_syn:].to_csv(data_folder+'/test_synthetic_1.csv', header=True, index=None)
print('test guardado')

train guardado
test guardado


In [4]:
data_folder = 'input'
train = pd.read_csv(data_folder+'/train_generated_0.csv', dtype = schema_generated_0)
# test = pd.read_csv(data_folder+'/test_generated_0.csv', dtype = schema_generated_0)

In [20]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import TimeSeriesSplit
import lightgbm as lgb

In [6]:
def custom_loss(y_pred, y_true):
    precision, recall, thresholds = precision_recall_curve(np.where(y_true >= 0.5, 1, 0), y_pred)
    AUC = auc(recall, precision)
    if AUC != AUC:
        AUC = 0
    return 'PR_AUC', AUC, True

In [29]:
params = {
#     'max_depth': 11,
    'num_leaves': 361,
    'metric': ['PR_AUC'],
    'first_metric_only': True,
    'n_estimators': 20000,
    'learning_rate': 0.07,
    'colsample_bytree': 0.6,
    'objective': 'xentropy',
    'n_jobs': -1,
    'seed': 42,
    'bagging_fraction': 0.7,
    'bagging_freq': 8,
    'lambda_l1': 0.2,
    'lambda_l2': 0.2,
#     'is_unbalance': True
}

In [30]:
lgb_model = lgb.LGBMClassifier(**params)

In [15]:
X_cols = [x for x in train.columns if x not in ['isFraud', 'TransactionDT', 'TransactionID']]

In [132]:
# df_imputed2_1 = df_imputed2[df_imputed2.isFraud==1]
# df_imputed2_0 = df_imputed2[df_imputed2.isFraud==0].sample(frac=0.3)
# X = pd.concat([df_imputed2_1, df_imputed2_0])[X_cols]
# y = pd.concat([df_imputed2_1, df_imputed2_0]).isFraud

In [26]:
X = train.sort_values('TransactionDT')[X_cols]
y = train.isFraud

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
tscv = TimeSeriesSplit(n_splits=5)

In [32]:
lgb_model.feature_importances_

array([ 663,  184,  459,   88, 1286,  534,  594,   28,   75,  315,  573,
         25,  144,  424,  982, 1412, 1077,  251,  155,  294, 2002, 1603,
       1333, 1366, 1297,  204,  201, 1182,  796,  563,   16,    2,   73,
        134,  256,  121,  128,   91,   86,   70, 1399,   90,  476, 4564,
          1,   67,   41,   24,   53,   31,   12,   48,    9,    0,    1,
         26,   87,    0,    0,    5,    0,    1,    8,    3,    1,    0,
          0,   91,    0,    0,    0,    6,   61,   10,  235,  452,  238,
        115,   84,  357,  222,   87,  209,   91,   61,  120,   43,    5,
        117,    3,   65,    2,    1,  179,   98,  159,   16,   15,   15,
         26,   26,   95,  112,   43,    7,    3,   13,   18,   16,   16,
        113,   43,  162,   22,   13,   12,  174,  180,  172,   33,   28,
         14,   19,   61,   47,   11,    7,   13,   12,   21,   15,   29,
          9,   20,   24,   30,    8,   20,    8,   11,   14,   16,   10,
         23,  153,    9,    6,   10,    4,    0,   

In [31]:
n = 1
for train_index, test_index in tscv.split(X=X, y=y):
    print('FOLD {}'.format(n))
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    lgb_model.fit(X_train,
                  y_train,
                  eval_set=[(X_train, y_train), (X_test, y_test)],
                  verbose=50,
                  early_stopping_rounds=50,
                  eval_metric=custom_loss
                 )
    del X_train
    del X_test
    del y_train
    del y_test
    gc.collect()
    n += 1

FOLD 1
Training until validation scores don't improve for 50 rounds.
[50]	training's PR_AUC: 0.871842	valid_1's PR_AUC: 0.582042
[100]	training's PR_AUC: 0.968293	valid_1's PR_AUC: 0.590685
[150]	training's PR_AUC: 0.98953	valid_1's PR_AUC: 0.596291
[200]	training's PR_AUC: 0.996639	valid_1's PR_AUC: 0.601554
[250]	training's PR_AUC: 0.998221	valid_1's PR_AUC: 0.60498
Early stopping, best iteration is:
[249]	training's PR_AUC: 0.998221	valid_1's PR_AUC: 0.605395
FOLD 2


KeyboardInterrupt: 

In [19]:
lgb_model.fit(X_train,
                   y_train,
                   eval_set=[(X_train, y_train), (X_test, y_test)],
                   verbose=50,
                   early_stopping_rounds=50,
                   eval_metric=custom_loss
                  )

Training until validation scores don't improve for 50 rounds.
[50]	training's PR_AUC: 0.625616	valid_1's PR_AUC: 0.618846
[100]	training's PR_AUC: 0.662864	valid_1's PR_AUC: 0.651928
[150]	training's PR_AUC: 0.684206	valid_1's PR_AUC: 0.667586
[200]	training's PR_AUC: 0.701413	valid_1's PR_AUC: 0.681069
[250]	training's PR_AUC: 0.717748	valid_1's PR_AUC: 0.693366
[300]	training's PR_AUC: 0.730906	valid_1's PR_AUC: 0.704521
[350]	training's PR_AUC: 0.744675	valid_1's PR_AUC: 0.714907
[400]	training's PR_AUC: 0.753716	valid_1's PR_AUC: 0.722393
[450]	training's PR_AUC: 0.760865	valid_1's PR_AUC: 0.726149
[500]	training's PR_AUC: 0.770495	valid_1's PR_AUC: 0.730711


KeyboardInterrupt: 

In [111]:
n_round = lgb_model.best_iteration_
n_round

2038

In [113]:
params = {
    'max_depth': 13,
    'metric': ['PR_AUC'],
    'first_metric_only': True,
    'n_estimators': int(n_round*1.1),
    'learning_rate': 0.05,
    'colsample_bytree': 0.8,
    'objective': 'xentropy',
    'n_jobs': -1,
    'seed': 42,
    'bagging_fraction': 0.8,
    'lambda_l1': 0,
    'lambda_l2': 0,
}

In [114]:
lgb_model = lgb.LGBMClassifier(**params)

In [115]:
lgb_model.fit(X, y, verbose=100)

LGBMClassifier(bagging_fraction=0.8, boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.8, first_metric_only=True,
        importance_type='split', lambda_l1=0, lambda_l2=0,
        learning_rate=0.05, max_depth=13, metric=['PR_AUC'],
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=2241, n_jobs=-1, num_leaves=31, objective='xentropy',
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, seed=42,
        silent=True, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=0)

In [44]:
del X_train
del X_test
del y_train
del y_test

In [45]:
gc.collect()

8

In [139]:
X_test = df_imputed2[X.columns].iloc[train_index:, :]

In [140]:
y_preds = lgb_model.predict_proba(X_test)

In [141]:
y_preds[:,1]

array([0.00095012, 0.00021622, 0.00047366, ..., 0.00846984, 0.01005928,
       0.00357157])

In [142]:
df_sub = pd.read_csv(data_folder+'/sample_submission.csv')

In [143]:
df_sub['isFraud'] = y_preds[:,1]

In [144]:
df_sub.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.00095
1,3663550,0.000216
2,3663551,0.000474
3,3663552,0.003246
4,3663553,0.002967


In [145]:
df_sub.to_csv('submissions/benchmark_freq_enc_3_30porciento_sincard1.csv', sep=',', header=True, index=None)

In [146]:
df_imp = pd.DataFrame({'ft': X.columns, 'imp': lgb_model.feature_importances_}).sort_values('imp', ascending=False)

In [147]:
df_imp.head(30)

Unnamed: 0,ft,imp
385,card2,1895
43,TransactionAmt,1722
383,addr1,1595
20,D15,1027
422,id_31,1012
4,C13,919
390,dist1,800
15,D10,751
388,card5,723
23,D4,671


In [254]:
import datetime

In [264]:
START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
X_train = pd.DataFrame()
X_train['TransactionDT'] = train_imputed['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x)))

X_train['year'] = X_train['TransactionDT'].dt.year
X_train['month'] = X_train['TransactionDT'].dt.month
X_train['dow'] = X_train['TransactionDT'].dt.dayofweek
X_train['hour'] = X_train['TransactionDT'].dt.hour
X_train['day'] = X_train['TransactionDT'].dt.day
print('hello world')

In [265]:
X_train.head(30)

Unnamed: 0,TransactionDT,year,month,dow,hour,day
0,2017-12-02 00:00:00,2017,12,5,0,2
1,2017-12-02 00:00:01,2017,12,5,0,2
2,2017-12-02 00:01:09,2017,12,5,0,2
3,2017-12-02 00:01:39,2017,12,5,0,2
4,2017-12-02 00:01:46,2017,12,5,0,2
5,2017-12-02 00:01:50,2017,12,5,0,2
6,2017-12-02 00:02:02,2017,12,5,0,2
7,2017-12-02 00:02:09,2017,12,5,0,2
8,2017-12-02 00:02:15,2017,12,5,0,2
9,2017-12-02 00:02:16,2017,12,5,0,2
