In [1]:
import lightgbm

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd 

from memory_reduction_script import reduce_mem_usage_sd as mr # We are using the memory reduction by our utility script

In [2]:
##### Functions
# 1st function

def j_mode(x, dropna=True):
    """Return the most frequent non-NaN value of a Series.

    Parameters
    ----------
    x : pandas.Series
        Values to summarize (typically one group inside ``groupby().transform``).
    dropna : bool, default True
        Forwarded to ``Series.value_counts``. With ``dropna=False`` NaN takes
        part in the count; if NaN turns out to be the most frequent entry, the
        runner-up is returned instead whenever one exists.

    Returns
    -------
    scalar or pandas.Series
        The mode as a scalar. When there is nothing to count (empty or all-NaN
        input with ``dropna=True``) the input ``x`` itself is returned, so a
        ``transform`` call leaves that group unchanged.
    """
    # Count once; value_counts sorts by descending frequency.
    counts = x.value_counts(dropna=dropna)

    # Nothing countable: hand the input back untouched.
    if counts.empty:
        return x

    mode = counts.index[0]
    # Skip a missing-value mode when another candidate exists. The check is on
    # the NUMBER of distinct entries, not on the index values themselves.
    if pd.isna(mode) and len(counts) > 1:
        mode = counts.index[1]
    return mode

# 2nd function

def normalize_columns(group, cols, df_train, df_test, verbose=True):
    """Overwrite each column in ``cols`` with its per-group mode.

    For every column, each row receives the value ``j_mode`` computes for the
    rows sharing the same ``group`` key, independently for train and test.
    Groups for which ``j_mode`` finds no non-NaN value keep their values.

    Parameters
    ----------
    group : str
        Name of the column used as the grouping key.
    cols : list of str
        Columns to normalize.
    df_train, df_test : pandas.DataFrame
        Frames to normalize. Both are modified IN PLACE.
    verbose : bool, default True
        Print progress, the shapes before/after and the per-column null counts
        before/after.

    Returns
    -------
    None
    """
    ### initialize trace variables
    if verbose:
        s_train_before    = df_train.shape
        s_train_na_before = df_train[cols].isnull().sum()
        s_test_before     = df_test.shape
        s_test_na_before  = df_test[cols].isnull().sum()

    # normalize the group with the mode
    grouped_train = df_train.groupby([group])
    grouped_test  = df_test.groupby([group])

    for col in cols:
        if verbose:
            print('normalizing ' + str(col))
        df_train[col] = grouped_train[col].transform(j_mode)
        df_test[col]  = grouped_test[col].transform(j_mode)

    ### print the traces
    if verbose:
        print(f'train shape before: {s_train_before}, after: {df_train.shape}')
        print(f'test shape before: {s_test_before}, after: {df_test.shape}')
        train_nulls_after = df_train[cols].isnull().sum()
        test_nulls_after  = df_test[cols].isnull().sum()
        # each loop is bounded by its own frame's summary
        for i in range(s_train_na_before.shape[0]):
            print(f'(train) {s_train_na_before.index[i]} nulls before: {s_train_na_before.iloc[i]}, nulls after: {train_nulls_after.iloc[i]}')
        for i in range(s_test_na_before.shape[0]):
            print(f'(test) {s_test_na_before.index[i]} nulls before: {s_test_na_before.iloc[i]}, nulls after: {test_nulls_after.iloc[i]}')

# 3rd function

def fill_na(cols, df_train, df_test, num_rep=-999, obj_rep ='Unknown', verbose=True):
    """Replace the NaNs of ``cols`` in both frames with a sentinel value.

    The sentinel is chosen per column from the dtype of the TRAIN column:
    ``obj_rep`` for object columns, ``num_rep`` otherwise. The same sentinel is
    applied to the test column.

    Parameters
    ----------
    cols : list of str
        Columns to fill.
    df_train, df_test : pandas.DataFrame
        Frames to fill. Both are modified IN PLACE.
    num_rep : scalar, default -999
        Replacement for non-object columns.
    obj_rep : scalar, default 'Unknown'
        Replacement for object columns.
    verbose : bool, default True
        Print the per-column null counts before and after filling.

    Returns
    -------
    None
    """
    ### initialize trace variables
    if verbose:
        # Use the frames passed in, not notebook globals.
        s_train_na_before = df_train[cols].isnull().sum()
        s_test_na_before  = df_test[cols].isnull().sum()

    for col in cols:
        replacement = obj_rep if df_train[col].dtype == 'O' else num_rep
        df_train[col] = df_train[col].fillna(replacement)
        df_test[col]  = df_test[col].fillna(replacement)

    ### print the traces
    if verbose:
        train_nulls_after = df_train[cols].isnull().sum()
        test_nulls_after  = df_test[cols].isnull().sum()

        for i in range(s_train_na_before.shape[0]):
            print(f'(train) {s_train_na_before.index[i]} nulls before: {s_train_na_before.iloc[i]}, nulls after: {train_nulls_after.iloc[i]}')
        for i in range(s_test_na_before.shape[0]):
            print(f'(test) {s_test_na_before.index[i]} nulls before: {s_test_na_before.iloc[i]}, nulls after: {test_nulls_after.iloc[i]}')


In [3]:
##### Load the pickled train / test frames from the Kaggle input directory.
# NOTE: read_pickle can execute arbitrary code; only load pickles from a trusted source.

train_path = '/kaggle/input/1-fraud-detection-memory-reduction/train_mred.pkl'
test_path  = '/kaggle/input/1-fraud-detection-memory-reduction/test_mred.pkl'

print('Downloading datasets...')
print(' ')

train = pd.read_pickle(train_path)
print('Train has been downloaded... (1/2)')

test = pd.read_pickle(test_path)
print('Test has been downloaded... (2/2)')

print(' ')
print('All files are downloaded')

Downloading datasets...
 
Train has been downloaded... (1/2)
Test has been downloaded... (2/2)
 
All files are downloaded


In [4]:
# (rows, columns) of train, then of test, one per line.
# Author's note: each frame covers 6 months of data.
print(train.shape, test.shape, sep='\n')

(590540, 405)
(506691, 404)


# Preprocessing:

1. Three different card keys (ways to group cards or users):
   
    * `uid1` with `card1` to `card6` + `addr1` + `addr2` + `ProductCD` + `D1achr`
    * `uid2` with `card1` + `D2achr` + `C13` + `D11achr` + `D10achr` + `D15achr` + `D4achr`
    * `uids` with `card1` + `addr1` + `D1achr`  
  
  
2. Normalize all the fields related to `card1`: within each `card1` group, replace the less frequent values and the NaNs with the group's mode.
3. Encode all categorical data - we label-encode each object column with a dictionary built from the train values, so that all NaNs are treated the same way in train and test.

In [5]:
# Columns whose dtype is object in train.
is_object = train.dtypes == 'O'
object_columns = is_object[is_object].index

# Fraction of missing values per object column, train first and then test.
n_train, n_test = train.shape[0], test.shape[0]
for col in object_columns:
    print(col, 'nulls (train):', train[col].isnull().sum() / n_train)
    print(col, 'nulls (test):', test[col].isnull().sum() / n_test)

ProductCD nulls (train): 0.0
ProductCD nulls (test): 0.0
card4 nulls (train): 0.00267043722694483
card4 nulls (test): 0.006090496969553436
card6 nulls (train): 0.0026602770345785214
card6 nulls (test): 0.00593458340487595
P_emaildomain nulls (train): 0.1599485216920107
P_emaildomain nulls (test): 0.13655659958436206
R_emaildomain nulls (train): 0.7675161716395164
R_emaildomain nulls (test): 0.7318484046489873
M1 nulls (train): 0.4590713584177194
M1 nulls (test): 0.348612862671727
M2 nulls (train): 0.4590713584177194
M2 nulls (test): 0.348612862671727
M3 nulls (train): 0.4590713584177194
M3 nulls (test): 0.348612862671727
M4 nulls (train): 0.47658753005723575
M4 nulls (test): 0.4692110181550491
M5 nulls (train): 0.5934940901547736
M5 nulls (test): 0.6110864412432824
M6 nulls (train): 0.28678836319300977
M6 nulls (test): 0.3136803298262649
M7 nulls (train): 0.5863531682866528
M7 nulls (test): 0.46382903978953643
M8 nulls (train): 0.5863311545365258
M8 nulls (test): 0.46380140953756827
M9

In [6]:
##### Group outliers for card4 and card6
# The rare categories listed below are replaced by NaN. Only train is modified in this cell.

card_labels = {
    'card6': 'Train - card6:\n',
    'card4': 'Train - card4:\n',
}
rare_values = {
    'card6': ['debit or credit', 'charge card'],
    'card4': ['american express', 'discover'],
}


def _show_card_counts(banner):
    """Print the banner, then the train value counts (NaN included) of card6 and card4."""
    print(banner.center(20, '#'))
    for card, label in card_labels.items():
        print(label, train[card].value_counts(dropna=False))
        print(' ')


_show_card_counts(" BEFORE ")

for card, rare in rare_values.items():
    train[card] = train[card].replace(rare, np.nan)

_show_card_counts(" AFTER ")

###### BEFORE ######
Train - card6:
 debit              439938
credit             148986
NaN                  1571
debit or credit        30
charge card            15
Name: card6, dtype: int64
 
Train - card4:
 visa                384767
mastercard          189217
american express      8328
discover              6651
NaN                   1577
Name: card4, dtype: int64
 
###### AFTER #######
Train - card6:
 debit     439938
credit    148986
NaN         1616
Name: card6, dtype: int64
 
Train - card4:
 visa          384767
mastercard    189217
NaN            16556
Name: card4, dtype: int64
 


In [7]:
##### Label encoding all object columns with a dictionary learned from train
for col in object_columns:
    print(f'String values from {col} are being transformed to numeric...')

    # One integer code per distinct non-null train value, in order of first appearance.
    str_to_num = {
        value: num
        for num, value in enumerate(train[col].dropna().unique())
    }

    # Entries without a key in the dictionary (NaN, or values unseen in train) map to NaN.
    train[col] = train[col].map(str_to_num)
    test[col]  = test[col].map(str_to_num)
    print(f'String values from {col} are transformed!')

print(' ')
print('Done!')

String values from ProductCD are being transformed to numeric...
String values from ProductCD are transformed!
String values from card4 are being transformed to numeric...
String values from card4 are transformed!
String values from card6 are being transformed to numeric...
String values from card6 are transformed!
String values from P_emaildomain are being transformed to numeric...
String values from P_emaildomain are transformed!
String values from R_emaildomain are being transformed to numeric...
String values from R_emaildomain are transformed!
String values from M1 are being transformed to numeric...
String values from M1 are transformed!
String values from M2 are being transformed to numeric...
String values from M2 are transformed!
String values from M3 are being transformed to numeric...
String values from M3 are transformed!
String values from M4 are being transformed to numeric...
String values from M4 are transformed!
String values from M5 are being transformed to numeric...

In [8]:
##### Replace less frequent values and NaNs with the per-card1 mode

card_cols   = [f'card{i}' for i in range(2, 7)]   # card2 .. card6
domain_cols = ['P_emaildomain', 'R_emaildomain']

# Same treatment for both column groups: cards first, then e-mail domains.
for column_group in (card_cols, domain_cols):
    normalize_columns('card1', column_group, df_train=train, df_test=test, verbose=True)

normalizing card2
normalizing card3
normalizing card4
normalizing card5
normalizing card6
train shape before: (590540, 405), after: (590540, 405)
test shape before: (506691, 404), after: (506691, 404)
(train) card2 nulls before: 8933, nulls after: 4686
(train) card3 nulls before: 1565, nulls after: 16
(train) card4 nulls before: 16556, nulls after: 15026
(train) card5 nulls before: 4259, nulls after: 793
(train) card6 nulls before: 1616, nulls after: 25
(test) card2 nulls before: 8654, nulls after: 5498
(test) card3 nulls before: 3002, nulls after: 46
(test) card4 nulls before: 13640, nulls after: 10776
(test) card5 nulls before: 4547, nulls after: 1394
(test) card6 nulls before: 3008, nulls after: 52
normalizing P_emaildomain
normalizing R_emaildomain
train shape before: (590540, 405), after: (590540, 405)
test shape before: (506691, 404), after: (506691, 404)
(train) P_emaildomain nulls before: 94456, nulls after: 1286
(train) R_emaildomain nulls before: 453249, nulls after: 49051
(t

In [9]:
##### Fill the NaNs that remain with sentinel values

card_cols_c = [f'card{i}' for i in range(1, 7)]   # card1 .. card6
addrs       = ['addr1', 'addr2']

# Order: card columns, address columns, e-mail domain columns.
for column_group in (card_cols_c, addrs, domain_cols):
    fill_na(
        column_group,
        df_train=train,
        df_test=test,
        num_rep=-999,
        obj_rep='Unknown',
        verbose=True,
    )

(train) card1 nulls before: 0, nulls after: 0
(train) card2 nulls before: 4686, nulls after: 0
(train) card3 nulls before: 16, nulls after: 0
(train) card4 nulls before: 15026, nulls after: 0
(train) card5 nulls before: 793, nulls after: 0
(train) card6 nulls before: 25, nulls after: 0
(test) card1 nulls before: 0, nulls after: 0
(test) card2 nulls before: 5498, nulls after: 0
(test) card3 nulls before: 46, nulls after: 0
(test) card4 nulls before: 10776, nulls after: 0
(test) card5 nulls before: 1394, nulls after: 0
(test) card6 nulls before: 52, nulls after: 0
(train) addr1 nulls before: 65706, nulls after: 0
(train) addr2 nulls before: 65706, nulls after: 0
(test) addr1 nulls before: 65609, nulls after: 0
(test) addr2 nulls before: 65609, nulls after: 0
(train) P_emaildomain nulls before: 1286, nulls after: 0
(train) R_emaildomain nulls before: 49051, nulls after: 0
(test) P_emaildomain nulls before: 1647, nulls after: 0
(test) R_emaildomain nulls before: 39258, nulls after: 0


In [10]:
##### Creation of UIDs
# Each uid is the '_'-joined string form of a fixed list of columns.

UID1_PARTS = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6',
              'addr1', 'addr2', 'ProductCD', 'D1achr']
UID2_PARTS = ['card1', 'D2achr', 'C13', 'D11achr', 'D10achr', 'D15achr', 'D4achr']
UIDS_PARTS = ['card1', 'addr1', 'D1achr']


def _make_uid(frame, parts):
    """Return the columns `parts` of `frame`, cast to str and joined left to right with '_'."""
    uid_values = frame[parts[0]].astype('str')
    for part in parts[1:]:
        uid_values = uid_values + '_' + frame[part].astype('str')
    return uid_values


# uid1
train['uid1'] = _make_uid(train, UID1_PARTS)
test['uid1']  = _make_uid(test, UID1_PARTS)

# uid2
train['uid2'] = _make_uid(train, UID2_PARTS)
test['uid2']  = _make_uid(test, UID2_PARTS)

# uids
train['uids'] = _make_uid(train, UIDS_PARTS)
test['uids']  = _make_uid(test, UIDS_PARTS)

# Number of distinct keys per uid, train first and then test.
print('Unique uid1 (train):', len(train['uid1'].unique()))
print('Unique uid1 (test):', len(test['uid1'].unique()))

print('Unique uid2 (train):', len(train['uid2'].unique()))
print('Unique uid2 (test):', len(test['uid2'].unique()))

print('Unique uids (train):', len(train['uids'].unique()))
print('Unique uids (test):', len(test['uids'].unique()))

Unique uid1 (train): 228233
Unique uid1 (test): 204853
Unique uid2 (train): 451356
Unique uid2 (test): 402011
Unique uids (train): 217850
Unique uids (test): 196187


In [11]:
##### Select which uid column ('uid1', 'uid2' or 'uids') is the grouping key for the normalization cells below
uid = 'uid1'

In [12]:
%%time
##### Replace less frequent values and NaNs with the per-group mode, grouping by the selected `uid`

m_cols = [f'M{i}' for i in range(1, 10)]   # M1 .. M9

# Mode-normalize first, then give the NaNs that are left a sentinel value.
normalize_columns(uid, m_cols, df_train=train, df_test=test, verbose=True)
fill_na(
    m_cols,
    df_train=train,
    df_test=test,
    num_rep=-999,
    obj_rep='Unknown',
    verbose=True,
)

normalizing M1
normalizing M2
normalizing M3
normalizing M4
normalizing M5
normalizing M6
normalizing M7
normalizing M8
normalizing M9
train shape before: (590540, 408), after: (590540, 408)
test shape before: (506691, 407), after: (506691, 407)
(train) M1 nulls before: 271100, nulls after: 182082
(train) M2 nulls before: 271100, nulls after: 182082
(train) M3 nulls before: 271100, nulls after: 182082
(train) M4 nulls before: 281444, nulls after: 233376
(train) M5 nulls before: 350482, nulls after: 301733
(train) M6 nulls before: 169360, nulls after: 161567
(train) M7 nulls before: 346265, nulls after: 219827
(train) M8 nulls before: 346252, nulls after: 219822
(train) M9 nulls before: 346252, nulls after: 219822
(test) M1 nulls before: 176639, nulls after: 155028
(test) M2 nulls before: 176639, nulls after: 155028
(test) M3 nulls before: 176639, nulls after: 155028
(test) M4 nulls before: 237745, nulls after: 204212
(test) M5 nulls before: 309632, nulls after: 274413
(test) M6 nulls b

In [13]:
%%time
##### Replace less frequent values and NaNs with the per-group mode, grouping by the selected `uid`

dist_cols = [f'dist{i}' for i in (1, 2)]   # dist1, dist2

# Mode-normalize first, then give the NaNs that are left a sentinel value.
normalize_columns(uid, dist_cols, df_train=train, df_test=test, verbose=True)
fill_na(
    dist_cols,
    df_train=train,
    df_test=test,
    num_rep=-999,
    obj_rep='Unknown',
    verbose=True,
)

normalizing dist1
normalizing dist2
train shape before: (590540, 408), after: (590540, 408)
test shape before: (506691, 407), after: (506691, 407)
(train) dist1 nulls before: 352271, nulls after: 235492
(train) dist2 nulls before: 552913, nulls after: 526091
(test) dist1 nulls before: 291217, nulls after: 226688
(test) dist2 nulls before: 470255, nulls after: 440044
(train) dist1 nulls before: 235492, nulls after: 0
(train) dist2 nulls before: 526091, nulls after: 0
(test) dist1 nulls before: 226688, nulls after: 0
(test) dist2 nulls before: 440044, nulls after: 0
CPU times: user 14min 7s, sys: 4.4 s, total: 14min 12s
Wall time: 14min 12s


In [14]:
##### Memory reduction
# `mr` is `reduce_mem_usage_sd` from the project's memory_reduction_script utility.
# Each frame is rebound to whatever the helper returns; with verbose=True it logs
# one line per column (dtype before -> after, na_count, n_uniq), as shown below.

train = mr(train, verbose=True)
test  = mr(test, verbose=True)

Column TransactionID: int32 -> int32, na_count=0, n_uniq=590540
Column isFraud: int8 -> int8, na_count=0, n_uniq=2
Column TransactionDT: int32 -> int32, na_count=0, n_uniq=573349
Column TransactionAmt: float32 -> float32, na_count=0, n_uniq=20902
Column ProductCD: int64 -> int8, na_count=0, n_uniq=5
Column card1: int16 -> int16, na_count=0, n_uniq=13553
Column card2: float16 -> int16, na_count=0, n_uniq=501
Column card3: float16 -> int16, na_count=0, n_uniq=113
Column card4: float64 -> int16, na_count=0, n_uniq=3
Column card5: float16 -> int16, na_count=0, n_uniq=112
Column card6: float64 -> int16, na_count=0, n_uniq=3
Column addr1: float16 -> int16, na_count=0, n_uniq=333
Column addr2: float16 -> int16, na_count=0, n_uniq=75
Column dist1: float32 -> int16, na_count=0, n_uniq=2442
Column dist2: float32 -> int16, na_count=0, n_uniq=1542
Column P_emaildomain: float64 -> int16, na_count=0, n_uniq=59
Column R_emaildomain: float64 -> int16, na_count=0, n_uniq=57
Column C1: int16 -> int16, n

In [15]:
##### Persist both DataFrames as pickle files so they can be reloaded quickly later.

train_out = 'train.pkl'
test_out  = 'test.pkl'

print('Saving datasets...')

train.to_pickle(train_out)
print('Train has been saved... (1/2)')

test.to_pickle(test_out)
print('Test has been saved... (2/2)')

print('Done!')

Saving datasets...
Train has been saved... (1/2)
Test has been saved... (2/2)
Done!
