## Feature Engineering

In [135]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

dataset = pd.read_csv("train_eda")
test = pd.read_csv("test_eda")

## Missing Values

In [136]:
# Remove values that have 95% missing value or higher

for feature in dataset.columns:
    if np.round(dataset[feature].isnull().mean(), 4) > .9:
        del dataset[feature]

for feature in test.columns:
    if np.round(test[feature].isnull().mean(), 4) > .9:
        del test[feature]

In [137]:
dataset.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_29,id_31,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,day,hour
0,3567540,0,16.554679,3.663562,W,7.497762,5.081404,5.010635,mastercard,4.762174,...,,,,,,,,,3,2
1,3567541,0,16.554681,3.432373,W,9.439625,5.590987,5.010635,visa,5.111988,...,,,,,,,,,3,2
2,3567542,0,16.554683,4.799914,W,9.714745,6.165418,5.010635,visa,4.836282,...,,,,,,,,,3,2
3,3567543,0,16.554686,5.141664,W,8.969669,6.202536,5.010635,visa,5.420535,...,,,,,,,,,3,2
4,3567544,0,16.554687,4.681668,W,9.028219,6.276643,5.010635,mastercard,5.411646,...,,,,,,,,,3,2


In [138]:
# Numerical
numerical_features_train = [feature for feature in dataset.columns if dataset[feature].dtype != 'O']
numerical_features_test = [feature for feature in test.columns if test[feature].dtype != 'O']
print(len(numerical_features_train))
print(len(numerical_features_test))
print([feature for feature in numerical_features_train if feature not in numerical_features_test])
# #Discrete: Take the mode
# discrete_features_train = [feature for feature in numerical_features_train if (len(dataset[feature].unique()) < 20) & (feature != "isFraud") ]
# discrete_features_test = [feature for feature in numerical_features_test if (len(test[feature].unique()) < 20) & (feature != "isFraud") ]
# # Continuous: Take mean
# continuous_features_train = [feature for feature in numerical_features_train if (feature not in discrete_features_train) & (feature != 'TransactionID')]
# continuous_features_test = [feature for feature in numerical_features_test if (feature not in discrete_features_test) & (feature != 'TransactionID')]

#Cat: Mode
categorical_features_train = [feature for feature in dataset.columns if dataset[feature].dtype == 'O']
categorical_features_test = [feature for feature in test.columns if test[feature].dtype == 'O']




340
396
['isFraud', 'id_01', 'id_02', 'id_05', 'id_06', 'id_09', 'id_10', 'id_11', 'id_13', 'id_17', 'id_19', 'id_20']


In [139]:
for i in numerical_features_train:
    #Do the mean
    dataset[i].fillna(dataset[i].mean(),inplace=True)
for i in numerical_features_test:
    #Do the mean
    test[i].fillna(test[i].mean(),inplace=True)
    
for i in categorical_features_train:
    if len(dataset[i].value_counts()) < 3: # If its a boolean
        dataset[i].fillna('Unknown',inplace=True)
    else:
        dataset[i].fillna(dataset[i].mode()[0],inplace=True)
for i in categorical_features_test:
    if len(test[i].value_counts()) < 3: # If its a boolean
        test[i].fillna('Unknown',inplace=True)
    else:
        test[i].fillna(test[i].mode()[0],inplace=True)
    
dataset[numerical_features_train].isnull().sum()

TransactionID     0
isFraud           0
TransactionDT     0
TransactionAmt    0
card1             0
                 ..
id_17             0
id_19             0
id_20             0
day               0
hour              0
Length: 340, dtype: int64

In [140]:
dataset.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_29,id_31,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,day,hour
0,3567540,0,16.554679,3.663562,W,7.497762,5.081404,5.010635,mastercard,4.762174,...,Unknown,chrome 66.0,Unknown,Unknown,Unknown,Unknown,Unknown,Windows,3,2
1,3567541,0,16.554681,3.432373,W,9.439625,5.590987,5.010635,visa,5.111988,...,Unknown,chrome 66.0,Unknown,Unknown,Unknown,Unknown,Unknown,Windows,3,2
2,3567542,0,16.554683,4.799914,W,9.714745,6.165418,5.010635,visa,4.836282,...,Unknown,chrome 66.0,Unknown,Unknown,Unknown,Unknown,Unknown,Windows,3,2
3,3567543,0,16.554686,5.141664,W,8.969669,6.202536,5.010635,visa,5.420535,...,Unknown,chrome 66.0,Unknown,Unknown,Unknown,Unknown,Unknown,Windows,3,2
4,3567544,0,16.554687,4.681668,W,9.028219,6.276643,5.010635,mastercard,5.411646,...,Unknown,chrome 66.0,Unknown,Unknown,Unknown,Unknown,Unknown,Windows,3,2


In [141]:
# Wee have replaced all Null categorical variables with "Unknown"
# dataset['id_33'].value_counts()

## Label Encoder

In [142]:
# Transform all X_lables into integers for xgb model
# from sklearn import preprocessing

# le = preprocessing.LabelEncoder()
# for feature in categorical_features_train:
#     dataset[feature]=dataset[feature].astype(str)
#     le.fit(dataset[feature])
#     dataset[feature] = le.transform(dataset[feature])

# for feature in categorical_features_test:
#     test[feature]=test[feature].astype(str)
#     le.fit(test[feature])
#     test[feature] = le.transform(test[feature])

In [143]:
# Transformed them into integers, we will rank them later
# dataset['P_emaildomain'][0].dtype

In [144]:
dataset.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_29,id_31,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,day,hour
0,3567540,0,16.554679,3.663562,W,7.497762,5.081404,5.010635,mastercard,4.762174,...,Unknown,chrome 66.0,Unknown,Unknown,Unknown,Unknown,Unknown,Windows,3,2
1,3567541,0,16.554681,3.432373,W,9.439625,5.590987,5.010635,visa,5.111988,...,Unknown,chrome 66.0,Unknown,Unknown,Unknown,Unknown,Unknown,Windows,3,2
2,3567542,0,16.554683,4.799914,W,9.714745,6.165418,5.010635,visa,4.836282,...,Unknown,chrome 66.0,Unknown,Unknown,Unknown,Unknown,Unknown,Windows,3,2
3,3567543,0,16.554686,5.141664,W,8.969669,6.202536,5.010635,visa,5.420535,...,Unknown,chrome 66.0,Unknown,Unknown,Unknown,Unknown,Unknown,Windows,3,2
4,3567544,0,16.554687,4.681668,W,9.028219,6.276643,5.010635,mastercard,5.411646,...,Unknown,chrome 66.0,Unknown,Unknown,Unknown,Unknown,Unknown,Windows,3,2


In [145]:
# ## 
# print(len(categorical_features_train))
# print(len(categorical_features_test))

for feature in categorical_features_train:
    train_ = pd.DataFrame(dataset.groupby('isFraud')[feature].value_counts(normalize=True).rename('percentage')
                         .mul(100)
                         .reset_index()
                         .sort_values(feature))
    train_ = train_.set_index(train_[feature])
    print(train_)
    dict_= {}
    for label in train_[feature].unique(): # go thru eveery unique label and find diffreence % btwn isFraud
        #Within each category, find diff
        #Create dictionary where feature label is key and percentage diff is val
        dict_[label] = train_.loc[(train_['isFraud'] == 1) & (train_[feature] == label)]['percentage'] - train_.loc[(train_['isFraud'] == 0) & (train_[feature] == label)]['percentage']
    
    #After going through each label, order by largest diff: If most +, then it will be numbered last, if most - then numbered first
    #EX: 0 index = more legit than fraud, n index = more fraud than legit
    labels_ordered = sorted(dict_.items(), reverse=True)
    labels_ordered={k[0]:i for i,k in enumerate(labels_ordered,0)}
    print(labels_ordered)
    # Map by replacing the dic key with the value in each feature column
    dataset[feature]=dataset[feature].map(labels_ordered)
    if(feature in test.columns):
        test[feature] = test[feature].map(labels_ordered)
    

           isFraud ProductCD  percentage
ProductCD                               
C                0         C    7.384647
C                1         C   34.085213
H                0         H    2.770545
H                1         H    4.761905
R                0         R    4.332882
R                1         R    7.769424
S                0         S    2.093532
S                1         S    9.022556
W                0         W   83.418394
W                1         W   44.360902
{'W': 0, 'S': 1, 'R': 2, 'H': 3, 'C': 4}
                  isFraud             card4  percentage
card4                                                  
american express        0  american express    0.916571
american express        1  american express    0.751880
discover                0          discover    1.187376
discover                1          discover    2.255639
mastercard              0        mastercard   30.173940
mastercard              1        mastercard   28.070175
visa               

{'Unknown': 0, 'NotFound': 1, 'Found': 2}
         isFraud    id_15  percentage
id_15                                
Found          0    Found   92.125820
Found          1    Found   83.959900
New            0      New    6.499323
New            1      New   10.526316
Unknown        0  Unknown    1.374857
Unknown        1  Unknown    5.513784
{'Unknown': 0, 'New': 1, 'Found': 2}
          isFraud     id_16  percentage
id_16                                  
Found           0     Found    7.895011
Found           1     Found   36.340852
NotFound        0  NotFound    6.540985
NotFound        1  NotFound   10.526316
Unknown         0   Unknown   85.564004
Unknown         1   Unknown   53.132832
{'Unknown': 0, 'NotFound': 1, 'Found': 2}
         isFraud    id_28  percentage
id_28                                
Found          0    Found    8.884491
Found          1    Found   38.345865
New            0      New    6.915946
New            1      New   14.035088
Unknown        0  Unknown  

In [146]:
test.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id-33,id-34,id-35,id-36,id-37,id-38,DeviceType,DeviceInfo,day,hour
0,4160240,33940602,6.701,4,15885,545.0,185.0,0,138.0,0,...,1920x1080,Unknown,F,F,T,F,1,19.0,6,19
1,4160241,33940623,32.277,4,5812,408.0,185.0,1,224.0,0,...,1920x1080,Unknown,F,F,T,F,0,,6,19
2,4160242,33940630,609.88,0,9537,583.0,150.0,0,226.0,1,...,1920x1080,Unknown,Unknown,Unknown,Unknown,Unknown,2,19.0,6,19
3,4160243,33940634,100.0,3,6019,583.0,150.0,0,226.0,1,...,1920x1080,match_status:2,T,F,T,T,1,115.0,6,19
4,4160244,33940669,7.009,4,9633,130.0,185.0,0,138.0,0,...,1920x1080,Unknown,F,F,T,F,0,,6,19


## PCA

In [147]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# scaler.fit(dataset.drop('isFraud', axis=1)) # We fit the data
# scaled_data_train = scaler.transform(dataset.drop('isFraud', axis=1))
# scaler.fit(test) # We fit the data
# scaled_data_test = scaler.transform(test)

In [148]:
# from sklearn.decomposition import PCA
# pca = PCA(n_components=4)

# # Here we find ALL Principle Components using fit
# pca.fit(scaled_data_train)
# x_train_pca = pca.transform(scaled_data_train)

# pca.fit(scaled_data_test)
# x_test_pca = pca.transform(scaled_data_test)


In [149]:
# scaled_data_train.shape

In [150]:
# x_train_pca.shape

In [151]:
# x_train_pca = pd.DataFrame(x_train_pca)
# test_pca = pd.DataFrame(x_test_pca)

In [152]:
# dataset_pca = pd.concat([x_train_pca, dataset['isFraud']], axis=1)

## Oversampling

In [153]:
from sklearn.utils import resample
not_fraud = dataset[dataset['isFraud'] == 0]
fraud = dataset[dataset['isFraud'] == 1]

# Replicate the samples from fraud
fraud_upsampled = resample(fraud,
                          replace=True, # sample with replacement
                          n_samples=len(not_fraud), # match number in majority class
                          random_state=27) # reproducible results
# combine majority and upsampled minority
upsampled_train = pd.concat([not_fraud, fraud_upsampled])

# check new class counts
print(upsampled_train.isFraud.value_counts())

1    9601
0    9601
Name: isFraud, dtype: int64


In [154]:
upsampled_train.to_csv('train_feateng', index=False)
test.to_csv('test_feateng', index=False)