# Fraud Detection System for Online Transactions
## Project 4: Machine Learning Integration

In [1]:
# Import dependencies

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Define where the raw CSV files live (nothing is loaded in this cell).
# DATA_DIR is the single place to edit if the data is moved; the four
# resulting paths are identical to the previously hard-coded ones.

DATA_DIR = Path("Resources")

file_path1 = DATA_DIR / "test_identity.csv"
file_path2 = DATA_DIR / "test_transaction.csv"
file_path3 = DATA_DIR / "train_identity.csv"
file_path4 = DATA_DIR / "train_transaction.csv"

In [3]:
# Load each CSV into its own DataFrame, in the order:
# test identity, test transaction, train identity, train transaction.
# NOTE(review): per the dtypes output further down, test_identity columns are
# named "id-01", "id-02", ... while train_identity uses "id_01", "id_02", ...
# -- these will need aligning before a model fitted on train is applied to test.

test_identity, test_transaction, train_identity, train_transaction = (
    pd.read_csv(csv_path)
    for csv_path in (file_path1, file_path2, file_path3, file_path4)
)

## Examine the characteristics of each dataset

### Check data types

In [5]:
# Show the dtype pandas inferred for every column of test_identity.
test_identity.dtypes 

TransactionID      int64
id-01            float64
id-02            float64
id-03            float64
id-04            float64
id-05            float64
id-06            float64
id-07            float64
id-08            float64
id-09            float64
id-10            float64
id-11            float64
id-12             object
id-13            float64
id-14            float64
id-15             object
id-16             object
id-17            float64
id-18            float64
id-19            float64
id-20            float64
id-21            float64
id-22            float64
id-23             object
id-24            float64
id-25            float64
id-26            float64
id-27             object
id-28             object
id-29             object
id-30             object
id-31             object
id-32            float64
id-33             object
id-34             object
id-35             object
id-36             object
id-37             object
id-38             object
DeviceType        object


In [6]:
# Show the dtype pandas inferred for every column of test_transaction.
test_transaction.dtypes

TransactionID       int64
TransactionDT       int64
TransactionAmt    float64
ProductCD          object
card1               int64
                   ...   
V335              float64
V336              float64
V337              float64
V338              float64
V339              float64
Length: 393, dtype: object

In [7]:
# Show the dtype pandas inferred for every column of train_identity.
train_identity.dtypes

TransactionID      int64
id_01            float64
id_02            float64
id_03            float64
id_04            float64
id_05            float64
id_06            float64
id_07            float64
id_08            float64
id_09            float64
id_10            float64
id_11            float64
id_12             object
id_13            float64
id_14            float64
id_15             object
id_16             object
id_17            float64
id_18            float64
id_19            float64
id_20            float64
id_21            float64
id_22            float64
id_23             object
id_24            float64
id_25            float64
id_26            float64
id_27             object
id_28             object
id_29             object
id_30             object
id_31             object
id_32            float64
id_33             object
id_34             object
id_35             object
id_36             object
id_37             object
id_38             object
DeviceType        object


In [8]:
# Show the dtype pandas inferred for every column of train_transaction
# (the only table here that carries the isFraud target column).
train_transaction.dtypes

TransactionID       int64
isFraud             int64
TransactionDT       int64
TransactionAmt    float64
ProductCD          object
                   ...   
V335              float64
V336              float64
V337              float64
V338              float64
V339              float64
Length: 394, dtype: object

### Check for missing values

In [9]:
# Missing-value check for test_identity:
# non-null count per column, followed by one flag that is True if any cell is NaN.
print(
    test_identity.count(),
    test_identity.isna().to_numpy().any(),
    sep="\n",
)

TransactionID    141907
id-01            141907
id-02            136976
id-03             66481
id-04             66481
id-05            134750
id-06            134750
id-07              5059
id-08              5059
id-09             74338
id-10             74338
id-11            136778
id-12            141907
id-13            130286
id-14             71357
id-15            136977
id-16            125747
id-17            135966
id-18             50875
id-19            135906
id-20            135633
id-21              5059
id-22              5062
id-23              5062
id-24              4740
id-25              5039
id-26              5047
id-27              5062
id-28            136778
id-29            136778
id-30             70659
id-31            136625
id-32             70671
id-33             70671
id-34             72175
id-35            136977
id-36            136977
id-37            136977
id-38            136977
DeviceType       136931
DeviceInfo       115057
dtype: int64
True

In [10]:
# Missing-value check for test_transaction:
# non-null count per column, followed by one flag that is True if any cell is NaN.
print(
    test_transaction.count(),
    test_transaction.isna().to_numpy().any(),
    sep="\n",
)

TransactionID     506691
TransactionDT     506691
TransactionAmt    506691
ProductCD         506691
card1             506691
                   ...  
V335               76431
V336               76431
V337               76431
V338               76431
V339               76431
Length: 393, dtype: int64
True


In [11]:
# Missing-value check for train_identity:
# non-null count per column, followed by one flag that is True if any cell is NaN.
print(
    train_identity.count(),
    train_identity.isna().to_numpy().any(),
    sep="\n",
)

TransactionID    144233
id_01            144233
id_02            140872
id_03             66324
id_04             66324
id_05            136865
id_06            136865
id_07              5155
id_08              5155
id_09             74926
id_10             74926
id_11            140978
id_12            144233
id_13            127320
id_14             80044
id_15            140985
id_16            129340
id_17            139369
id_18             45113
id_19            139318
id_20            139261
id_21              5159
id_22              5169
id_23              5169
id_24              4747
id_25              5132
id_26              5163
id_27              5169
id_28            140978
id_29            140978
id_30             77565
id_31            140282
id_32             77586
id_33             73289
id_34             77805
id_35            140985
id_36            140985
id_37            140985
id_38            140985
DeviceType       140810
DeviceInfo       118666
dtype: int64
True

In [12]:
# Missing-value check for train_transaction:
# non-null count per column, followed by one flag that is True if any cell is NaN.
print(
    train_transaction.count(),
    train_transaction.isna().to_numpy().any(),
    sep="\n",
)

TransactionID     590540
isFraud           590540
TransactionDT     590540
TransactionAmt    590540
ProductCD         590540
                   ...  
V335               82351
V336               82351
V337               82351
V338               82351
V339               82351
Length: 394, dtype: int64
True


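Any missing data? Yes — all four datasets contain missing values: each `isnull().values.any()` check returns `True`, and many columns have far fewer non-null entries than there are rows.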

### Check for duplicate entries

In [13]:
# Count rows of test_identity that exactly repeat an earlier row (all columns equal).
test_identity.duplicated().sum()

0

In [14]:
# Count rows of test_transaction that exactly repeat an earlier row (all columns equal).
test_transaction.duplicated().sum()

0

In [15]:
# Count rows of train_identity that exactly repeat an earlier row (all columns equal).
train_identity.duplicated().sum()

0

In [16]:
# Count rows of train_transaction that exactly repeat an earlier row (all columns equal).
train_transaction.duplicated().sum()

0

### Handle categorical data
Convert the categorical (object-dtype) columns into numerical values, e.g. via one-hot encoding.

In [None]:
# TODO: one-hot encode the categorical columns, e.g.
# pd.get_dummies(X['<categorical variable>'])

### Standardise data

Fit pre-processor to training data

In [None]:
# TODO: standardise the features, e.g.
# scaler = StandardScaler()
# <scaled dataset> = scaler.fit_transform(<df>)

### Identify and remove outliers

### Perform feature engineering/dimension reduction

In [None]:
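# TODO: the datasets currently have a large number of variables (over 390 columns
# in each transaction table); feature engineering / dimension reduction is needed
# to reduce that number.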