In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import gc

pd.options.display.max_columns=None
pd.options.display.max_rows=None
plt.style.use('fivethirtyeight')

In [2]:
train_transaction=pd.read_csv('train_transaction.csv')
test_transaction=pd.read_csv('test_transaction.csv')

train_identity=pd.read_csv('train_identity.csv')
test_identity=pd.read_csv('test_identity.csv')

In [3]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [4]:
train_transaction=reduce_mem_usage(train_transaction)
test_transaction=reduce_mem_usage(test_transaction)
train_identity=reduce_mem_usage(train_identity)
test_identity=reduce_mem_usage(test_identity)

Mem. usage decreased to 542.35 Mb (69.4% reduction)
Mem. usage decreased to 472.59 Mb (68.9% reduction)
Mem. usage decreased to 25.86 Mb (42.7% reduction)
Mem. usage decreased to 25.44 Mb (42.7% reduction)


In [5]:
print("Train Transaction shape: {}".format(train_transaction.shape))
print("Test Transaction shape: {}".format(test_transaction.shape))
print("Train Identity shape: {}".format(train_identity.shape))
print("Test Identity shape: {}".format(test_identity.shape))

Train Transaction shape: (590540, 394)
Test Transaction shape: (506691, 393)
Train Identity shape: (144233, 41)
Test Identity shape: (141907, 41)


In [6]:
train=pd.merge(train_transaction,train_identity,on='TransactionID',how='left')
test=pd.merge(test_transaction,test_identity,on='TransactionID',how='left')

P_emaildomain[Purchaser Email Domain]

In [7]:
train.P_emaildomain.replace({'gmail':'gmail.com',
 'yahoo.com.mx':'yahoo.com',
 'yahoo.fr':'yahoo.com', 
 'yahoo.de':'yahoo.com', 
 'yahoo.es':'yahoo.com',
'yahoo.co.uk':'yahoo.com',
'yahoo.co.jp':'yahoo.com',
'hotmail.fr':'hotmail.com',
 'hotmail.de':'hotmail.com',
 'hotmail.co.uk':'hotmail.com'},inplace=True)

R_emaildomain[Receiver Email Domain]

In [8]:
train.R_emaildomain.replace({'gmail':'gmail.com',
 'yahoo.com.mx':'yahoo.com',
 'yahoo.fr':'yahoo.com', 
 'yahoo.de':'yahoo.com', 
 'yahoo.es':'yahoo.com',
'yahoo.co.uk':'yahoo.com',
'yahoo.co.jp':'yahoo.com',
'hotmail.fr':'hotmail.com',
 'hotmail.de':'hotmail.com',
 'hotmail.co.uk':'hotmail.com'},inplace=True)

C1-C14


Given:
    C1-C14: counting, such as how many addresses are found to be associated with the payment card, etc. The actual meaning is masked.

In [9]:
C=[c for c in train.columns if 'C' in c]
C=C[1:]

D1-D15

Given:
    D1-D15: timedelta, such as days between previous transaction, etc.

In [10]:
D=[d for d in train.columns if 'D' in d]
D=D[3:18]

M1-M9

Given:
    M1-M9: match, such as names on card and address, etc

In [11]:
M=[m for m in train.columns if 'M' in m]

V1-V339

Given:
Vxxx: Vesta engineered rich features, including ranking, counting, and other entity relations.

In [12]:
V=[v for v in train.columns if 'V' in v]

id_01- id_11

Given:
Numerical features

In [13]:
id=[c for c in train.columns if 'id_' in c]

# Missing values

In [14]:
#Columns id_01 - id_38 of the train dataset is represented as id-02 - id_38.Hence, we convert the column names to make them similar
test.rename(columns={'id-01':'id_01','id-02':'id_02','id-03':'id_03','id-04':'id_04','id-05':'id_05','id-06':'id_06',
                     'id-07':'id_07','id-08':'id_08','id-09':'id_09','id-10':'id_10','id-11':'id_11','id-12':'id_12',
                     'id-13':'id_13','id-14':'id_14','id-15':'id_15','id-16':'id_16','id-17':'id_17','id-18':'id_18',
                     'id-19':'id_19','id-20':'id_20','id-21':'id_21','id-22':'id_22','id-23':'id_23','id-24':'id_24',
                     'id-25':'id_25','id-26':'id_26','id-27':'id_27','id-28':'id_28','id-29':'id_29','id-30':'id_30',
                     'id-31':'id_31','id-32':'id_32','id-33':'id_33','id-34':'id_34','id-35':'id_35','id-36':'id_36',
                     'id-37':'id_37','id-38':'id_38'},inplace=True)

In [15]:
#Categorical features
obj=[]
for t in train.columns:
    if(train[t].dtype=='O'):
        obj.append(t)

#Numerical features
num=[]
for t in train.columns:
    if(train[t].dtype!='O'):
        num.append(t)

In [16]:
print('Numerical Features: ',len(num))
print('Categorical Features: ',len(obj))

Numerical Features:  403
Categorical Features:  31


In [17]:
#Missing values in categorical features(in percentage)
print('Categorical features with their missing values')
train[obj].isna().sum()/len(train[obj])*100

Categorical features with their missing values


ProductCD         0.000000
card4             0.267044
card6             0.266028
P_emaildomain    15.994852
R_emaildomain    76.751617
M1               45.907136
M2               45.907136
M3               45.907136
M4               47.658753
M5               59.349409
M6               28.678836
M7               58.635317
M8               58.633115
M9               58.633115
id_12            75.576083
id_15            76.126088
id_16            78.098012
id_23            99.124699
id_27            99.124699
id_28            76.127273
id_29            76.127273
id_30            86.865411
id_31            76.245132
id_33            87.589494
id_34            86.824771
id_35            76.126088
id_36            76.126088
id_37            76.126088
id_38            76.126088
DeviceType       76.155722
DeviceInfo       79.905510
dtype: float64

In [18]:
train[obj]=train[obj].fillna('Unknown')
test[obj]=test[obj].fillna('Unknown')

In [19]:
le=LabelEncoder()
for col in obj:
    train[col]=le.fit_transform(train[col])
    test[col]=le.fit_transform(test[col])

In [20]:
#Missing values in numerical features(in percentage)
print('Numerical features with their missing values')
train[num].isna().sum()/len(train[num])*100

Numerical features with their missing values


TransactionID      0.000000
isFraud            0.000000
TransactionDT      0.000000
TransactionAmt     0.000000
card1              0.000000
card2              1.512683
card3              0.265012
card5              0.721204
addr1             11.126427
addr2             11.126427
dist1             59.652352
dist2             93.628374
C1                 0.000000
C2                 0.000000
C3                 0.000000
C4                 0.000000
C5                 0.000000
C6                 0.000000
C7                 0.000000
C8                 0.000000
C9                 0.000000
C10                0.000000
C11                0.000000
C12                0.000000
C13                0.000000
C14                0.000000
D1                 0.214888
D2                47.549192
D3                44.514851
D4                28.604667
D5                52.467403
D6                87.606767
D7                93.409930
D8                87.312290
D9                87.312290
D10               12

In [21]:
for i in num:
    if train[i].isnull().sum()>0:
        train[i]=train[i].fillna(train[i].min()-2)
        test[i]=test[i].fillna(test[i].min()-2)

# LIGHTGBM

In [22]:
X = train.sort_values('TransactionDT').drop(['isFraud', 'TransactionDT','TransactionID'], axis=1)
y = train.sort_values('TransactionDT')['isFraud']

X_test = test.drop(['TransactionDT','TransactionID'], axis=1)

In [23]:
params = {'num_leaves': 500,
          'min_child_weight': 0.04,
          'feature_fraction': 0.4,
          'bagging_fraction': 0.4,
          'min_data_in_leaf': 100,
          'objective': 'binary',
          'max_depth': -1,
          'learning_rate': 0.05,
          "boosting_type": "gbdt",
          "metric": 'auc',
          "verbosity": -1,
          'random_state': 42,
         }

In [24]:
%%time

NFOLDS = 5
folds = KFold(n_splits=NFOLDS)

columns = X.columns
splits = folds.split(X, y)
y_preds = np.zeros(X_test.shape[0])
y_oof = np.zeros(X.shape[0])
score = 0

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns
  
for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)

    clf = lgb.train(params, dtrain, 10000, valid_sets = [dtrain, dvalid], verbose_eval=200, early_stopping_rounds=500)
    
    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
    
    y_pred_valid = clf.predict(X_valid)
    y_oof[valid_index] = y_pred_valid
    print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_valid, y_pred_valid)}")
    
    score += roc_auc_score(y_valid, y_pred_valid) / NFOLDS
    y_preds += clf.predict(X_test) / NFOLDS
    
    del X_train, X_valid, y_train, y_valid
    gc.collect()
    
print(f"\nMean AUC = {score}")
print(f"Out of folds AUC = {roc_auc_score(y, y_oof)}")

Training until validation scores don't improve for 500 rounds
[200]	training's auc: 0.997701	valid_1's auc: 0.914446
[400]	training's auc: 0.999867	valid_1's auc: 0.912593
[600]	training's auc: 0.999993	valid_1's auc: 0.912965
Early stopping, best iteration is:
[177]	training's auc: 0.99673	valid_1's auc: 0.91451
Fold 1 | AUC: 0.9145100025067917
Training until validation scores don't improve for 500 rounds
[200]	training's auc: 0.998249	valid_1's auc: 0.932382
[400]	training's auc: 0.999927	valid_1's auc: 0.930576
[600]	training's auc: 0.999999	valid_1's auc: 0.929132
Early stopping, best iteration is:
[270]	training's auc: 0.999486	valid_1's auc: 0.932607
Fold 2 | AUC: 0.932606552969328
Training until validation scores don't improve for 500 rounds
[200]	training's auc: 0.998373	valid_1's auc: 0.930609
[400]	training's auc: 0.99994	valid_1's auc: 0.926139
[600]	training's auc: 0.999999	valid_1's auc: 0.925436
Early stopping, best iteration is:
[137]	training's auc: 0.99495	valid_1's au

In [25]:
del X,y,X_test

# Feature Engineering

In [26]:
#TransactionDT
train['TransactionDT_minutes']=train['TransactionDT']/60
train['TransactionDT_hours']=train['TransactionDT']/(60*60)
train['TransactionDT_days']=train['TransactionDT']/(60*60*24)
train['TransactionDT_week']=train['TransactionDT']/(60*60*24*7)
train['TransactionDT_month']=train['TransactionDT']/(60*60*24*30)

test['TransactionDT_minutes']=test['TransactionDT']/60
test['TransactionDT_hours']=test['TransactionDT']/(60*60)
test['TransactionDT_days']=test['TransactionDT']/(60*60*24)
test['TransactionDT_week']=test['TransactionDT']/(60*60*24*7)
test['TransactionDT_month']=test['TransactionDT']/(60*60*24*30)

#card_1,card_2,card_3,card_5 are numerical
train['TransactionAmt_by_mean_card_1']=train['TransactionAmt']/train.groupby(['card1'])['TransactionAmt'].transform('mean')
train['TransactionAmt_by_mean_card_2']=train['TransactionAmt']/train.groupby(['card2'])['TransactionAmt'].transform('mean')
train['TransactionAmt_by_mean_card_3']=train['TransactionAmt']/train.groupby(['card3'])['TransactionAmt'].transform('mean')
train['TransactionAmt_by_mean_card_5']=train['TransactionAmt']/train.groupby(['card5'])['TransactionAmt'].transform('mean')

test['TransactionAmt_by_mean_card_1']=test['TransactionAmt']/test.groupby(['card1'])['TransactionAmt'].transform('mean')
test['TransactionAmt_by_mean_card_2']=test['TransactionAmt']/test.groupby(['card2'])['TransactionAmt'].transform('mean')
test['TransactionAmt_by_mean_card_3']=test['TransactionAmt']/test.groupby(['card3'])['TransactionAmt'].transform('mean')
test['TransactionAmt_by_mean_card_5']=test['TransactionAmt']/test.groupby(['card5'])['TransactionAmt'].transform('mean')

#card_3,card_5 are categorical
train['TransactionAmt_by_mean_card_4']=train['TransactionAmt']/train.groupby(['card4'])['TransactionAmt'].transform('mean')
train['TransactionAmt_by_mean_card_6']=train['TransactionAmt']/train.groupby(['card6'])['TransactionAmt'].transform('mean')

test['TransactionAmt_by_mean_card_4']=test['TransactionAmt']/test.groupby(['card4'])['TransactionAmt'].transform('mean')
test['TransactionAmt_by_mean_card_6']=test['TransactionAmt']/test.groupby(['card6'])['TransactionAmt'].transform('mean')

#Log feature
train['TransactionAmt_log']=np.log(train['TransactionAmt'])
test['TransactionAmt_log']=np.log(test['TransactionAmt'])

#TransactionAmt features
train['TransactionAmt_by_minutes']=train['TransactionAmt']/train['TransactionDT_minutes']
train['TransactionAmt_by_hours']=train['TransactionAmt']/train['TransactionDT_hours']
train['TransactionAmt_by_days']=train['TransactionAmt']/train['TransactionDT_days']
train['TransactionAmt_by_week']=train['TransactionAmt']/train['TransactionDT_week']
train['TransactionAmt_by_month']=train['TransactionAmt']/train['TransactionDT_month']

test['TransactionAmt_by_minutes']=test['TransactionAmt']/test['TransactionDT_minutes']
test['TransactionAmt_by_hours']=test['TransactionAmt']/test['TransactionDT_hours']
test['TransactionAmt_by_days']=test['TransactionAmt']/test['TransactionDT_days']
test['TransactionAmt_by_week']=test['TransactionAmt']/test['TransactionDT_week']
test['TransactionAmt_by_month']=test['TransactionAmt']/test['TransactionDT_month']

#Creating Unique_id of clients
train['unique_id_1']=train['card1'].astype(str)+'_'+train['card2'].astype(str)
train['unique_id_2']=train['unique_id_1'].astype(str)+'_'+train['card3'].astype(str)
train['unique_id_3']=train['unique_id_2'].astype(str)+'_'+train['addr1'].astype(str)+'_'+train['addr2'].astype(str)
train['unique_id_4']=train['unique_id_3'].astype(str)+'_'+train['P_emaildomain'].astype(str)
train['unique_id_5']=train['unique_id_4'].astype(str)+'_'+train['R_emaildomain'].astype(str)

test['unique_id_1']=test['card1'].astype(str)+'_'+test['card2'].astype(str)
test['unique_id_2']=test['unique_id_1'].astype(str)+'_'+test['card3'].astype(str)
test['unique_id_3']=test['unique_id_2'].astype(str)+'_'+test['addr1'].astype(str)+'_'+test['addr2'].astype(str)
test['unique_id_4']=test['unique_id_3'].astype(str)+'_'+test['P_emaildomain'].astype(str)
test['unique_id_5']=test['unique_id_4'].astype(str)+'_'+test['R_emaildomain'].astype(str)

train['unique_id_1_value_count']=train.unique_id_1.map(train.unique_id_1.value_counts().to_dict())
train['unique_id_2_value_count']=train.unique_id_2.map(train.unique_id_2.value_counts().to_dict())
train['unique_id_3_value_count']=train.unique_id_3.map(train.unique_id_3.value_counts().to_dict())
train['unique_id_4_value_count']=train.unique_id_4.map(train.unique_id_4.value_counts().to_dict())
train['unique_id_5_value_count']=train.unique_id_5.map(train.unique_id_5.value_counts().to_dict())

test['unique_id_1_value_count']=test.unique_id_1.map(test.unique_id_1.value_counts().to_dict())
test['unique_id_2_value_count']=test.unique_id_2.map(test.unique_id_2.value_counts().to_dict())
test['unique_id_3_value_count']=test.unique_id_3.map(test.unique_id_3.value_counts().to_dict())
test['unique_id_4_value_count']=test.unique_id_4.map(test.unique_id_4.value_counts().to_dict())
test['unique_id_5_value_count']=test.unique_id_5.map(test.unique_id_5.value_counts().to_dict())

#Creating columns by grouping and aggregation
columns=['card1','card2','card3','card4','card5','card6','addr1','addr2','P_emaildomain','R_emaildomain','unique_id_1',
         'unique_id_2','unique_id_3','unique_id_4','unique_id_5']
for col in columns:
    train[col+'_TransactionAmt_mean']=train.groupby(col)['TransactionAmt'].transform('mean')
    test[col+'_TransactionAmt_mean']=test.groupby(col)['TransactionAmt'].transform('mean')

#D features
for col in D:
    train[col+'_TransactionAmt_mean']=train.groupby(col)['TransactionAmt'].transform('mean')
    test[col+'_TransactionAmt_mean']=test.groupby(col)['TransactionAmt'].transform('mean')
    
#Value_count feature of card1,card2,card3,card4,card5,card6,addr1,addr2,dist1.dist2,P_emaildomain,R_emaildomain columns
columns=['card1','card2','card3','card4','card5','card6','addr1','addr2','dist1','dist2','P_emaildomain','R_emaildomain']
for col in columns:
    train[col+'_value_count']=train[col].map(train[col].value_counts().to_dict())
    test[col+'_value_count']=test[col].map(test[col].value_counts().to_dict())
    
#Value_count feature of C1-C14 columns
for c in C:
    train[c+'_value_count']=train[c].map(train[c].value_counts().to_dict())
    test[c+'_value_count']=test[c].map(test[c].value_counts().to_dict())
    
#Value_count feature of D1-15 columns
for d in D:
    train[d+'_value_count']=train[d].map(train[d].value_counts().to_dict())
    test[d+'_value_count']=test[d].map(test[d].value_counts().to_dict())

#Value_count feature of id
for i in id:
    train[i+'_value_count']=train[i].map(train[i].value_counts().to_dict())
    test[i+'_value_count']=test[i].map(test[i].value_counts().to_dict())

In [27]:
#Categorical features
obj=[]
for t in train.columns:
    if(train[t].dtype=='O'):
        obj.append(t)

In [28]:
for col in obj:
    train[col]=le.fit_transform(train[col])
    test[col]=le.fit_transform(test[col])

In [29]:
print('Shape of train: ',train.shape)
print('Shape of test: ',test.shape)

Shape of train:  (590540, 570)
Shape of test:  (506691, 569)


# LIGHTGBM after feature generation

In [30]:
X = train.sort_values('TransactionDT').drop(['isFraud', 'TransactionDT','TransactionID'], axis=1)
y = train.sort_values('TransactionDT')['isFraud']

X_test = test.drop(['TransactionDT','TransactionID'], axis=1)

In [31]:
%%time

NFOLDS = 5
folds = KFold(n_splits=NFOLDS)

columns = X.columns
splits = folds.split(X, y)
y_preds = np.zeros(X_test.shape[0])
y_oof = np.zeros(X.shape[0])
score = 0

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns
  
for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)

    clf = lgb.train(params, dtrain, 10000, valid_sets = [dtrain, dvalid], verbose_eval=200, early_stopping_rounds=500)
    
    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
    
    y_pred_valid = clf.predict(X_valid)
    y_oof[valid_index] = y_pred_valid
    print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_valid, y_pred_valid)}")
    
    score += roc_auc_score(y_valid, y_pred_valid) / NFOLDS
    y_preds += clf.predict(X_test) / NFOLDS
    
    del X_train, X_valid, y_train, y_valid
    gc.collect()
    
print(f"\nMean AUC after feature generation= {score}")
print(f"Out of folds AUC after feature generation = {roc_auc_score(y, y_oof)}")

Training until validation scores don't improve for 500 rounds
[200]	training's auc: 0.999935	valid_1's auc: 0.9242
[400]	training's auc: 1	valid_1's auc: 0.923343
[600]	training's auc: 1	valid_1's auc: 0.920589
Early stopping, best iteration is:
[173]	training's auc: 0.999789	valid_1's auc: 0.924675
Fold 1 | AUC: 0.9246748730854558
Training until validation scores don't improve for 500 rounds
[200]	training's auc: 0.999959	valid_1's auc: 0.936001
[400]	training's auc: 1	valid_1's auc: 0.927492
[600]	training's auc: 1	valid_1's auc: 0.924904
Early stopping, best iteration is:
[144]	training's auc: 0.999527	valid_1's auc: 0.936983
Fold 2 | AUC: 0.9369826925134723
Training until validation scores don't improve for 500 rounds
[200]	training's auc: 0.99996	valid_1's auc: 0.92983
[400]	training's auc: 1	valid_1's auc: 0.928697
[600]	training's auc: 1	valid_1's auc: 0.925398
Early stopping, best iteration is:
[229]	training's auc: 0.999988	valid_1's auc: 0.930683
Fold 3 | AUC: 0.9306827089164

In [32]:
test_pred_lgbm_cv=clf.predict(X_test)

In [33]:
final_dict_lgbm_cv = {'TransactionID' : test.TransactionID, 'isFraud': test_pred_lgbm_cv}
Result_lgbm_cv = pd.DataFrame(final_dict_lgbm_cv)

In [34]:
Result_lgbm_cv[:5]

Unnamed: 0,TransactionID,isFraud
0,3663549,6e-05
1,3663550,0.000284
2,3663551,0.000159
3,3663552,4.6e-05
4,3663553,0.000708


In [36]:
Result_lgbm_cv.to_csv('Result_lgbm.csv')