### Starter Notebook: https://www.kaggle.com/kickitlikeshika/fraud-detection-with-92-acc

### Load the Dataset from Google Drive and Move it to Colab Local Space

In [32]:
cp "drive/MyDrive/Colab Notebooks/Datasets/ieee-fraud-detection.zip" .

In [2]:
!unzip ieee-fraud-detection.zip

Archive:  ieee-fraud-detection.zip
  inflating: sample_submission.csv   
  inflating: test_identity.csv       
  inflating: test_transaction.csv    
  inflating: train_identity.csv      
  inflating: train_transaction.csv   


In [33]:
import pandas as pd
import lightgbm as lgb

### Load the Tables

In [34]:
# Read the four competition CSVs from the working directory, in the order
# train transaction, train identity, test transaction, test identity.
train_transaction, train_identity, test_transaction, test_identity = (
    pd.read_csv(csv_name)
    for csv_name in (
        "train_transaction.csv",
        "train_identity.csv",
        "test_transaction.csv",
        "test_identity.csv",
    )
)

In [47]:
# Show (rows, columns) for each raw table, one per line.
raw_tables = (train_transaction, train_identity, test_transaction, test_identity)
print(*(table.shape for table in raw_tables), sep="\n")

(590540, 394)
(144233, 41)
(506691, 393)
(141907, 41)


In [49]:
# Shapes of the merged train / test frames.
# NOTE(review): `train` and `test` are created by the merge cell that follows this one,
# so on a fresh kernel this cell must run after that merge (it was executed as In [49],
# after the merge's In [48]).
tuple(frame.shape for frame in (train, test))

((590540, 434), (506691, 433))

### Join the Tables for both Train and Test set.

In [48]:
# Left-join identity onto transaction so every transaction row is kept; transactions
# without a matching identity row get NaN in the identity columns.
train = train_transaction.merge(train_identity, on="TransactionID", how="left")
test = test_transaction.merge(test_identity, on="TransactionID", how="left")

In [54]:
# Rename the test identity columns "id-01" ... "id-38" to the underscore spelling
# "id_01" ... "id_38", so the column lists computed on train can be applied to test.
# Labels in the mapping that are not present in `test` are simply ignored by rename().
test = test.rename(
    columns={f"id-{num:02d}": f"id_{num:02d}" for num in range(1, 39)}
)

In [7]:
# Drop the references to the raw tables now that the merged frames exist.
# Not re-runnable: a second execution raises NameError because the names are gone.
del train_transaction, train_identity, test_transaction, test_identity

### Count the Number of Missing Values for each feature

In [50]:
def get_missing_values(data):
    """Summarise missing values per column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data``, sorted by missing count in descending
        order, with two columns:

        * ``total``   - number of null entries in the column
        * ``percent`` - fraction (0.0-1.0) of rows that are null
    """
    # Scan the frame for nulls once; the fraction is derived from the sorted counts
    # so both output columns share the same order without a second scan or sort.
    total = data.isnull().sum().sort_values(ascending=False)
    percent = total / len(data)
    missing_data = pd.concat([total, percent], axis=1, keys=['total', 'percent'])
    return missing_data

In [51]:
# Missing-value summary for the merged training frame; show the 100 columns with the
# most nulls, transposed so that each feature becomes a column of the display.
missing_values = get_missing_values(data=train)
missing_values.iloc[:100].T

Unnamed: 0,id_24,id_25,id_07,id_08,id_21,id_26,id_22,id_23,id_27,dist2,D7,id_18,D13,D14,D12,id_04,id_03,D6,id_33,id_10,D8,D9,id_09,id_30,id_32,id_34,id_14,V149,V153,V156,V154,V155,V157,V158,V161,V163,V147,V148,V162,V146,...,V336,V335,V334,V333,V332,V331,V330,V329,V328,V327,V326,V325,V324,V323,DeviceInfo,id_13,id_16,V237,V230,V267,V266,V265,V264,V263,V262,V261,V260,V228,V258,V257,V229,V254,V269,V253,V252,V231,V232,V249,V248,V247
total,585793.0,585408.0,585385.0,585385.0,585381.0,585377.0,585371.0,585371.0,585371.0,552913.0,551623.0,545427.0,528588.0,528353.0,525823.0,524216.0,524216.0,517353.0,517251.0,515614.0,515614.0,515614.0,515614.0,512975.0,512954.0,512735.0,510496.0,508595.0,508595.0,508595.0,508595.0,508595.0,508595.0,508595.0,508595.0,508595.0,508595.0,508595.0,508595.0,508595.0,...,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,471874.0,463220.0,461200.0,460110.0,460110.0,460110.0,460110.0,460110.0,460110.0,460110.0,460110.0,460110.0,460110.0,460110.0,460110.0,460110.0,460110.0,460110.0,460110.0,460110.0,460110.0,460110.0,460110.0,460110.0,460110.0,460110.0
percent,0.991962,0.99131,0.991271,0.991271,0.991264,0.991257,0.991247,0.991247,0.991247,0.936284,0.934099,0.923607,0.895093,0.894695,0.89041,0.887689,0.887689,0.876068,0.875895,0.873123,0.873123,0.873123,0.873123,0.868654,0.868619,0.868248,0.864456,0.861237,0.861237,0.861237,0.861237,0.861237,0.861237,0.861237,0.861237,0.861237,0.861237,0.861237,0.861237,0.861237,...,0.86055,0.86055,0.86055,0.86055,0.86055,0.86055,0.86055,0.86055,0.86055,0.86055,0.86055,0.86055,0.86055,0.86055,0.799055,0.784401,0.78098,0.779134,0.779134,0.779134,0.779134,0.779134,0.779134,0.779134,0.779134,0.779134,0.779134,0.779134,0.779134,0.779134,0.779134,0.779134,0.779134,0.779134,0.779134,0.779134,0.779134,0.779134,0.779134,0.779134


### Drop the features with more than 100,000 missing values.

In [52]:
# Names of the features whose null count in train exceeds 100000.
too_sparse = missing_values['total'].gt(100000)
cols_to_drop = missing_values.loc[too_sparse].index
cols_to_drop

Index(['id_24', 'id_25', 'id_07', 'id_08', 'id_21', 'id_26', 'id_22', 'id_23',
       'id_27', 'dist2',
       ...
       'V43', 'V49', 'V41', 'V40', 'V39', 'V38', 'V37', 'V36', 'V35', 'D4'],
      dtype='object', length=252)

In [55]:
# Remove the selected columns from both frames in place. The list was computed on train;
# drop() raises KeyError if test lacks any of these labels.
for frame in (train, test):
    frame.drop(columns=cols_to_drop, inplace=True)

In [56]:
# Keep the test TransactionID column under its own name.
test_transactionID = test["TransactionID"]

### Drop the column `TransactionID`, as it is not a relevant feature, and also drop features that have more than 15,000 missing values.

In [57]:
# Remove the identifier column from both frames in place.
train.drop(columns=['TransactionID'], inplace=True)
test.drop(columns=['TransactionID'], inplace=True)

In [58]:
# Recompute the missing-value summary on the reduced training frame.
missing_values = get_missing_values(data=train)

In [59]:
# Names of the remaining features whose null count in train exceeds 15000.
too_sparse = missing_values['total'].gt(15000)
cols_to_drop = missing_values.loc[too_sparse].index
cols_to_drop

Index(['P_emaildomain', 'V90', 'V75', 'V77', 'V78', 'V79', 'V80', 'V81', 'V82',
       'V83', 'V84', 'V85', 'V86', 'V87', 'V88', 'V89', 'V91', 'V92', 'V93',
       'V76', 'V94', 'D15', 'V69', 'V70', 'V53', 'V54', 'V55', 'V56', 'V57',
       'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67',
       'V68', 'V74', 'V73', 'V72', 'V71', 'V31', 'V32', 'V34', 'V33', 'V16',
       'V14', 'V29', 'V28', 'V27', 'V26', 'V25', 'V24', 'V23', 'V22', 'V21',
       'V20', 'V19', 'V18', 'V17', 'V15', 'V30', 'V13', 'V12', 'D10', 'addr2',
       'addr1'],
      dtype='object')

In [60]:
# Second pass: drop the columns selected by the 15000 threshold from both frames in place.
for frame in (train, test):
    frame.drop(columns=cols_to_drop, inplace=True)

In [61]:
# Shapes after both rounds of column dropping.
tuple(frame.shape for frame in (train, test))

((590540, 111), (506691, 110))

In [17]:
# Distinct column dtypes present in each frame.
tuple(set(frame.dtypes) for frame in (train, test))

({dtype('int64'), dtype('float64'), dtype('O')},
 {dtype('int64'), dtype('float64'), dtype('O')})

### Fill the rest of the Missing Values with Mean for Numerical Features and Mode for Categorical Features

In [18]:
# Impute the remaining nulls in train, column by column:
# object (string) columns get their most frequent value, all other columns their mean.
train_columns = train.columns
for col in train_columns:
    is_object_column = train[col].dtype == 'object'
    fill_value = train[col].mode()[0] if is_object_column else train[col].mean()
    train[col] = train[col].fillna(fill_value)

In [19]:
# Impute the remaining nulls in test using statistics learned from TRAIN rather than
# from test itself, so both frames receive the same fill value for a given feature
# and the model sees test data preprocessed exactly like its training data.
# Filling train's nulls with its own mean / mode (done in the train imputation cell)
# leaves that mean / mode unchanged, so reading them from the filled train is fine.
test_columns = test.columns
for col in test_columns:
    if train[col].dtype == 'object':
        # Categorical feature: most frequent training value.
        fill_value = train[col].mode()[0]
    else:
        # Numerical feature: training mean.
        fill_value = train[col].mean()
    test[col] = test[col].fillna(fill_value)

### Convert the Categorical values to numeric and prepare the train data to pass that into the model

In [20]:
# Names of the object-dtype (string) columns that still need numeric encoding.
cat_columns = train.select_dtypes(include='object').columns
cat_columns

Index(['ProductCD', 'card4', 'card6'], dtype='object')

In [21]:
# Integer-encode the categorical columns with ONE shared mapping per column.
# The category -> code mapping is learned on train and reused for test; encoding each
# frame independently would assign different integers to the same label whenever the
# two frames do not contain exactly the same set of values.
# Test values never seen in train become code -1 (pandas' code for "not a category").
for col in cat_columns:
    train_as_category = train[col].astype("category")
    train[col] = train_as_category.cat.codes
    test[col] = test[col].astype(train_as_category.dtype).cat.codes

In [22]:
# Split the label off the feature frame: pop() returns the `isFraud` column and removes
# it from `train` in place, leaving only the model inputs.
target = train.pop('isFraud')
print(train.shape)

(590540, 110)


### Define the Parameters of the Model and start training the Model

In [23]:
# Wrap the feature frame and labels in LightGBM's Dataset container for training.
x_train = lgb.Dataset(data=train, label=target)

In [27]:
# LightGBM training parameters: binary objective with DART boosting; AUC and binary
# log-loss are the configured metrics, and training is limited to 2 threads.
lgbm_params = dict(
    learning_rate=0.05,
    boosting_type='dart',
    objective='binary',
    metric=['auc', 'binary_logloss'],
    num_threads=2,
)

In [28]:
# Train the booster for 2000 boosting rounds on the full training set.
# NOTE(review): the recorded output shows CPU / wall times (about 46 minutes wall), which
# suggests a %%time magic was present when this ran — TODO confirm.
lgbm_classifier = lgb.train(
    params=lgbm_params,
    train_set=x_train,
    num_boost_round=2000,
)

CPU times: user 1h 30min 53s, sys: 5.16 s, total: 1h 30min 58s
Wall time: 46min 11s


### Predict the output of the Test Set.

In [29]:
# Score the preprocessed test frame with the trained booster.
y_pred_lgb = lgbm_classifier.predict(data=test)

In [31]:
# Turn the predictions into a single-column 2-D array of shape (n_rows, 1).
n_rows = len(y_pred_lgb)
y_pred_lgb = y_pred_lgb.reshape((n_rows, 1))

In [30]:
# Build the submission table (TransactionID + predicted isFraud) and write it to CSV
# without the index column.
sub = pd.DataFrame({'TransactionID': test_transactionID})
sub['isFraud'] = y_pred_lgb
sub.to_csv('test_results.csv', index=False)

### Save the trained model for future Inference.

In [39]:
# Persist the trained booster to disk for later inference.
# NOTE(review): Booster.save_model writes LightGBM's own text model format rather than
# JSON, so the ".json" extension may mislead — confirm before renaming the file.
lgbm_classifier.save_model(filename="model.json")

<lightgbm.basic.Booster at 0x7f270d2c00d0>