In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

## Data Summary

In [2]:
#since the data is large, I have to chunk it into bits to make it easier for loading

data = pd.read_csv('Fraud Detection Dataset.csv', chunksize=100000)

In [4]:
data

<pandas.io.parsers.readers.TextFileReader at 0x147863c71f0>

In [5]:
data = pd.read_csv('Fraud Detection Dataset.csv', chunksize=100000)
t_len = 0
for chunk in data:
    t_len += len(chunk)
print(t_len)

6000000


In [3]:
%%time

data = pd.read_csv('Fraud Detection Dataset.csv', chunksize=100000)

df = pd.concat(data, ignore_index=True)
df.head(10)

CPU times: total: 1min 37s
Wall time: 10min 27s


Unnamed: 0,Transaction ID,User ID,Transaction Amount,Transaction Date and Time,Merchant ID,Payment Method,Country Code,Transaction Type,Device Type,IP Address,...,User's Transaction History,Merchant's Reputation Score,User's Device Location,Transaction Currency,Transaction Purpose,User's Credit Score,User's Email Domain,Merchant's Business Age,Transaction Authentication Method,Fraudulent Flag
0,51595306,9822,163.08,2023-01-02 07:47:54,4044,ACH Transfer,KOR,Charity,GPS Device,42.23.223.120,...,26,2.71,United Kingdom,NOK,Consultation Fee,343,cox.co.uk,3,Bluetooth Authentication,0
1,85052974,4698,430.74,2021-09-12 15:15:41,4576,2Checkout,VNM,Cashback,Medical Device,39.52.212.120,...,60,3.95,Mexico,EGP,Cashback Reward,688,gmail.com,13,NFC Tag,1
2,23954324,8666,415.74,2023-01-12 17:25:58,4629,Google Wallet,MEX,Reward,Vehicle Infotainment System,243.180.236.29,...,81,3.81,Qatar,MXN,Acquisition,371,rocketmail.com,7,Token,1
3,44108303,9012,565.89,2021-02-27 11:31:00,3322,Check,SGP,Purchase,Kiosk,212.186.227.14,...,18,2.67,Spain,CLP,Loan Repayment,687,roadrunner.co.uk,15,Time-Based OTP,1
4,66622683,5185,955.49,2022-09-24 04:06:38,7609,Worldpay,HKG,Acquisition,Smart Mirror,166.113.10.199,...,98,3.19,Israel,RUB,Dividend Reinvestment,605,protonmail.co.uk,17,Password,1
5,29002618,7770,635.62,2023-07-28 23:33:27,5285,Prepaid Card,THA,Auction,Smart Doorbell,105.127.92.148,...,39,4.76,Mexico,BRL,Service Charge,342,verizon.co.uk,19,Transaction Confirmation Number,0
6,63317849,4617,274.04,2022-10-06 06:41:12,1607,Credit Card,CHE,Admission,Smart Speaker,142.183.64.81,...,95,1.26,South Africa,AUD,Rent,703,yahoo.co.uk,6,CAPTCHA,1
7,79673247,9707,980.06,2023-03-10 22:48:10,5602,Contactless Payment,CAN,Dividend,Desktop,187.28.75.91,...,38,3.15,Canada,HKD,Acquisition,564,verizon.net,7,Security Question,1
8,26746847,8256,828.3,2021-03-01 03:36:02,7476,Masterpass,TAI,Recharge,Home Security System,84.82.147.62,...,26,4.84,Australia,HKD,Retail Purchase,323,yahoo.co.uk,3,Hardware Token,0
9,97928727,1238,791.35,2021-09-14 12:27:44,7832,Check,QAT,Reward,Smart Speaker,2.60.179.94,...,88,3.93,Singapore,THB,Donation to Nonprofit,434,gmail.co.uk,11,Signature Verification,0


In [13]:
#viewing the column ID for easy use if needs be

df.columns

Index(['Transaction ID', 'User ID', 'Transaction Amount',
       'Transaction Date and Time', 'Merchant ID', 'Payment Method',
       'Country Code', 'Transaction Type', 'Device Type', 'IP Address',
       'Browser Type', 'Operating System', 'Merchant Category', 'User Age',
       'User Occupation', 'User Income', 'User Gender', 'User Account Status',
       'Transaction Status', 'Location Distance', 'Time Taken for Transaction',
       'Transaction Time of Day', 'User's Transaction History',
       'Merchant's Reputation Score', 'User's Device Location',
       'Transaction Currency', 'Transaction Purpose', 'User's Credit Score',
       'User's Email Domain', 'Merchant's Business Age',
       'Transaction Authentication Method', 'Fraudulent Flag'],
      dtype='object')

In [14]:
df['Fraudulent Flag'].value_counts()

0    3000094
1    2999906
Name: Fraudulent Flag, dtype: int64

In [88]:
print(df["User's Device Location"].unique())
print(len(df["User's Device Location"].unique()))
print(df["User's Device Location"].value_counts())

['United Kingdom' 'Mexico' 'Qatar' 'Spain' 'Israel' 'South Africa'
 'Canada' 'Australia' 'Singapore' 'China' 'Switzerland' 'Sweden' 'Germany'
 'Nigeria' 'Norway' 'New Zealand' 'Belgium' 'Indonesia' 'South Korea'
 'Japan' 'Turkey' 'Taiwan' 'Argentina' 'United States' 'Russia'
 'Netherlands' 'Hong Kong' 'United Arab Emirates' 'Thailand' 'Egypt'
 'Saudi Arabia' 'Vietnam' 'Austria' 'Malaysia' 'Italy' 'Brazil' 'Poland'
 'Greece' 'France' 'India']
40
United States           150845
Italy                   150786
Germany                 150659
Malaysia                150600
Switzerland             150525
South Korea             150507
South Africa            150496
Nigeria                 150442
Norway                  150425
Spain                   150406
Netherlands             150399
Turkey                  150225
Greece                  150175
Canada                  150165
Poland                  150136
China                   150100
Indonesia               150096
Sweden                  

## Data cleaning

In [15]:
df.describe()

Unnamed: 0,Transaction ID,User ID,Transaction Amount,Merchant ID,User Age,User Income,Location Distance,Time Taken for Transaction,User's Transaction History,Merchant's Reputation Score,User's Credit Score,Merchant's Business Age,Fraudulent Flag
count,6000000.0,6000000.0,6000000.0,6000000.0,6000000.0,6000000.0,6000000.0,6000000.0,6000000.0,6000000.0,6000000.0,6000000.0,6000000.0
mean,55019060.0,5499.571,500.5515,5500.817,49.00159,50493.28,50.506,30.49492,50.48393,2.999382,574.9533,10.49685,0.4999843
std,25980670.0,2598.744,288.4166,2598.35,18.18538,28582.23,28.57562,17.03567,28.86705,1.154479,159.0197,5.764884,0.5
min,10000040.0,1000.0,1.0,1000.0,18.0,1000.02,1.0,1.0,1.0,1.0,300.0,1.0,0.0
25%,32516660.0,3248.0,250.79,3251.0,33.0,25734.46,25.76,15.74,25.0,2.0,437.0,5.0,0.0
50%,55029870.0,5499.0,500.51,5501.0,49.0,50502.94,50.51,30.5,50.0,3.0,575.0,10.0,0.0
75%,77524870.0,7751.0,750.44,7752.0,65.0,75234.54,75.25,45.26,75.0,4.0,713.0,15.0,1.0
max,99999880.0,9999.0,1000.0,9999.0,80.0,100000.0,100.0,60.0,100.0,5.0,850.0,20.0,1.0


In [5]:
df2 = df.drop(['User Gender', 'User Age', 'Browser Type', "Merchant's Business Age", 'User Occupation',
         'Transaction Date and Time', "User's Email Domain", 'User ID', 'Merchant ID', 'Country Code'], axis='columns')
df2.head()

Unnamed: 0,Transaction ID,Transaction Amount,Payment Method,Transaction Type,Device Type,IP Address,Operating System,Merchant Category,User Income,User Account Status,...,Time Taken for Transaction,Transaction Time of Day,User's Transaction History,Merchant's Reputation Score,User's Device Location,Transaction Currency,Transaction Purpose,User's Credit Score,Transaction Authentication Method,Fraudulent Flag
0,51595306,163.08,ACH Transfer,Charity,GPS Device,42.23.223.120,Tizen,Industrial & Scientific,66826.21,Pro,...,24.22,Daytime,26,2.71,United Kingdom,NOK,Consultation Fee,343,Bluetooth Authentication,0
1,85052974,430.74,2Checkout,Cashback,Medical Device,39.52.212.120,Windows Server,Beauty & Cosmetics,89356.71,Pending Approval,...,55.11,Daytime,60,3.95,Mexico,EGP,Cashback Reward,688,NFC Tag,1
2,23954324,415.74,Google Wallet,Reward,Vehicle Infotainment System,243.180.236.29,Android,Real Estate,58438.63,Pro,...,53.84,Daytime,81,3.81,Qatar,MXN,Acquisition,371,Token,1
3,44108303,565.89,Check,Purchase,Kiosk,212.186.227.14,CentOS,Appliances,3426.92,Premium,...,21.62,Daytime,18,2.67,Spain,CLP,Loan Repayment,687,Time-Based OTP,1
4,66622683,955.49,Worldpay,Acquisition,Smart Mirror,166.113.10.199,Ubuntu,Jewelry,53080.12,Free,...,53.71,Daytime,98,3.19,Israel,RUB,Dividend Reinvestment,605,Password,1


In [20]:
df2.columns

Index(['Transaction ID', 'Transaction Amount', 'Payment Method',
       'Transaction Type', 'Device Type', 'IP Address', 'Operating System',
       'Merchant Category', 'User Income', 'User Account Status',
       'Transaction Status', 'Location Distance', 'Time Taken for Transaction',
       'Transaction Time of Day', 'User's Transaction History',
       'Merchant's Reputation Score', 'User's Device Location',
       'Transaction Currency', 'Transaction Purpose', 'User's Credit Score',
       'Transaction Authentication Method', 'Fraudulent Flag'],
      dtype='object')

In [21]:
df2.isnull().sum()

Transaction ID                       0
Transaction Amount                   0
Payment Method                       0
Transaction Type                     0
Device Type                          0
IP Address                           0
Operating System                     0
Merchant Category                    0
User Income                          0
User Account Status                  0
Transaction Status                   0
Location Distance                    0
Time Taken for Transaction           0
Transaction Time of Day              0
User's Transaction History           0
Merchant's Reputation Score          0
User's Device Location               0
Transaction Currency                 0
Transaction Purpose                  0
User's Credit Score                  0
Transaction Authentication Method    0
Fraudulent Flag                      0
dtype: int64

In [22]:
df2.dtypes

Transaction ID                         int64
Transaction Amount                   float64
Payment Method                        object
Transaction Type                      object
Device Type                           object
IP Address                            object
Operating System                      object
Merchant Category                     object
User Income                          float64
User Account Status                   object
Transaction Status                    object
Location Distance                    float64
Time Taken for Transaction           float64
Transaction Time of Day               object
User's Transaction History             int64
Merchant's Reputation Score          float64
User's Device Location                object
Transaction Currency                  object
Transaction Purpose                   object
User's Credit Score                    int64
Transaction Authentication Method     object
Fraudulent Flag                        int64
dtype: obj

In [6]:
#*since location distance compare to user phone location are both for location, so I'd 
##prefer taking one of the two
#**I'd also prefer to use device type over the operating system with device types having 
##higher ranking than operating system

df2.drop(["User's Device Location", 'Operating System'], axis='columns', inplace=True)

print(df2.columns)
print(df.dtypes)

Index(['Transaction ID', 'Transaction Amount', 'Payment Method',
       'Transaction Type', 'Device Type', 'IP Address', 'Merchant Category',
       'User Income', 'User Account Status', 'Transaction Status',
       'Location Distance', 'Time Taken for Transaction',
       'Transaction Time of Day', 'User's Transaction History',
       'Merchant's Reputation Score', 'Transaction Currency',
       'Transaction Purpose', 'User's Credit Score',
       'Transaction Authentication Method', 'Fraudulent Flag'],
      dtype='object')
Transaction ID                         int64
User ID                                int64
Transaction Amount                   float64
Transaction Date and Time             object
Merchant ID                            int64
Payment Method                        object
Country Code                          object
Transaction Type                      object
Device Type                           object
IP Address                            object
Browser Type      

## Featuring Engineering

In [24]:
df2.head(5)

Unnamed: 0,Transaction ID,Transaction Amount,Payment Method,Transaction Type,Device Type,IP Address,Merchant Category,User Income,User Account Status,Transaction Status,Location Distance,Time Taken for Transaction,Transaction Time of Day,User's Transaction History,Merchant's Reputation Score,Transaction Currency,Transaction Purpose,User's Credit Score,Transaction Authentication Method,Fraudulent Flag
0,51595306,163.08,ACH Transfer,Charity,GPS Device,42.23.223.120,Industrial & Scientific,66826.21,Pro,In Transit,9.34,24.22,Daytime,26,2.71,NOK,Consultation Fee,343,Bluetooth Authentication,0
1,85052974,430.74,2Checkout,Cashback,Medical Device,39.52.212.120,Beauty & Cosmetics,89356.71,Pending Approval,Resolved,65.28,55.11,Daytime,60,3.95,EGP,Cashback Reward,688,NFC Tag,1
2,23954324,415.74,Google Wallet,Reward,Vehicle Infotainment System,243.180.236.29,Real Estate,58438.63,Pro,Posted,44.05,53.84,Daytime,81,3.81,MXN,Acquisition,371,Token,1
3,44108303,565.89,Check,Purchase,Kiosk,212.186.227.14,Appliances,3426.92,Premium,Closed,21.7,21.62,Daytime,18,2.67,CLP,Loan Repayment,687,Time-Based OTP,1
4,66622683,955.49,Worldpay,Acquisition,Smart Mirror,166.113.10.199,Jewelry,53080.12,Free,Refunded,56.63,53.71,Daytime,98,3.19,RUB,Dividend Reinvestment,605,Password,1


In [7]:
#*Since your credit score is a prediction of your credit behavior, 
#such as how likely you are to pay a loan back on time, based on information from your credit reports while
#**Reputation score is a number calculated based on lots of factors connected to the image of a brand 
#(how is perceived by the audience) and its reputation, we will do feature engineering by dividing

#Credit Score/Merchant reputation score

df3 = df2.copy()
df3['Credit per Reputation score'] = df3["User's Credit Score"]/df3["Merchant's Reputation Score"]
df3.head(5)

Unnamed: 0,Transaction ID,Transaction Amount,Payment Method,Transaction Type,Device Type,IP Address,Merchant Category,User Income,User Account Status,Transaction Status,...,Time Taken for Transaction,Transaction Time of Day,User's Transaction History,Merchant's Reputation Score,Transaction Currency,Transaction Purpose,User's Credit Score,Transaction Authentication Method,Fraudulent Flag,Credit per Reputation score
0,51595306,163.08,ACH Transfer,Charity,GPS Device,42.23.223.120,Industrial & Scientific,66826.21,Pro,In Transit,...,24.22,Daytime,26,2.71,NOK,Consultation Fee,343,Bluetooth Authentication,0,126.568266
1,85052974,430.74,2Checkout,Cashback,Medical Device,39.52.212.120,Beauty & Cosmetics,89356.71,Pending Approval,Resolved,...,55.11,Daytime,60,3.95,EGP,Cashback Reward,688,NFC Tag,1,174.177215
2,23954324,415.74,Google Wallet,Reward,Vehicle Infotainment System,243.180.236.29,Real Estate,58438.63,Pro,Posted,...,53.84,Daytime,81,3.81,MXN,Acquisition,371,Token,1,97.375328
3,44108303,565.89,Check,Purchase,Kiosk,212.186.227.14,Appliances,3426.92,Premium,Closed,...,21.62,Daytime,18,2.67,CLP,Loan Repayment,687,Time-Based OTP,1,257.303371
4,66622683,955.49,Worldpay,Acquisition,Smart Mirror,166.113.10.199,Jewelry,53080.12,Free,Refunded,...,53.71,Daytime,98,3.19,RUB,Dividend Reinvestment,605,Password,1,189.655172


In [31]:
print(df3['Transaction Currency'].unique())
print(len(df3['Transaction Currency'].unique()))
print(df3['Transaction Currency'].value_counts())

['NOK' 'EGP' 'MXN' 'CLP' 'RUB' 'BRL' 'AUD' 'HKD' 'THB' 'KRW' 'RON' 'CAD'
 'USD' 'QAR' 'GBP' 'PLN' 'CNY' 'COP' 'TRY' 'SEK' 'SAR' 'ILS' 'PEN' 'DKK'
 'NZD' 'VND' 'EUR' 'MYR' 'AED' 'CZK' 'CHF' 'ARS' 'HUF' 'PHP' 'IDN' 'ZAR'
 'SGD' 'KES' 'INR' 'JPY']
40
ZAR    150722
MXN    150710
SEK    150660
MYR    150622
CLP    150574
EGP    150525
CZK    150520
USD    150497
HKD    150431
QAR    150383
AUD    150353
DKK    150244
TRY    150242
COP    150241
INR    150147
SAR    150079
ILS    150066
ARS    150036
RUB    150012
CNY    149996
KRW    149967
CHF    149885
EUR    149830
THB    149823
AED    149820
NOK    149746
NZD    149714
VND    149700
PEN    149635
GBP    149623
HUF    149616
PHP    149606
RON    149602
IDN    149590
PLN    149577
CAD    149548
JPY    149476
KES    149443
SGD    149416
BRL    149323
Name: Transaction Currency, dtype: int64


In [34]:
print(df3['Transaction Authentication Method'].unique())
print(len(df3['Transaction Authentication Method'].unique()))
print(df3['Transaction Authentication Method'].value_counts())

['Bluetooth Authentication' 'NFC Tag' 'Token' 'Time-Based OTP' 'Password'
 'Transaction Confirmation Number' 'CAPTCHA' 'Security Question'
 'Hardware Token' 'Signature Verification' 'Iris Scan'
 'Push Notification Confirmation' 'Certificate-based Authentication'
 'Pattern Lock' 'Smart Card' 'SMS Code' 'Behavioral Biometrics'
 'Biometric Scan' 'Knowledge-Based Authentication' 'Voiceprint'
 'Mobile Phone Verification' 'QR Code' 'Face ID' 'PIN'
 'Two-Factor Authentication' 'Biometric Card'
 'Radio-Frequency Identification (RFID)' 'Social Media Login'
 'Behavioral Analytics' 'Handwriting Recognition' 'Palm Vein Scan'
 'Mobile App Notification' 'Authentication App' 'Geolocation Verification'
 'USB Security Key' 'Fingerprint' 'Email Verification' 'Voice Recognition'
 'Retina Scan']
39
Geolocation Verification                 154728
Behavioral Analytics                     154416
SMS Code                                 154397
Hardware Token                           154250
Behavioral Biometr

In [36]:
print(df3['Transaction Time of Day'].unique())
print(df3['Transaction Time of Day'].value_counts())

['Daytime' 'Evening' 'Nighttime']
Nighttime    2001180
Evening      2000225
Daytime      1998595
Name: Transaction Time of Day, dtype: int64


In [39]:
print(df3['Transaction Status'].unique())
print(len(df3['Transaction Status'].unique()))
print(df3['Transaction Status'].value_counts())

['In Transit' 'Resolved' 'Posted' 'Closed' 'Refunded' 'Approved'
 'Cancelled' 'Partially Declined' 'Void' 'Sent' 'Cleared' 'Processing'
 'Disputed' 'Transferred' 'In Progress' 'Debit' 'Rejected' 'Unverified'
 'Held for Security' 'Pending Review' 'Authorized' 'Executed' 'Error'
 'Awaiting Verification' 'Pending' 'Pending Payment' 'Blocked' 'Settled'
 'Pending Confirmation' 'Failed' 'Credited' 'Received'
 'Partially Approved' 'Delivered' 'Processed' 'Hold' 'Verified' 'Declined'
 'Under Review' 'Completed']
40
In Progress              151049
Sent                     150663
Transferred              150611
Pending Confirmation     150538
Void                     150532
Debit                    150425
Disputed                 150394
Partially Declined       150365
Hold                     150359
Received                 150358
Under Review             150232
Refunded                 150124
Posted                   150088
Cancelled                150075
Closed                   150070
Cleared

In [40]:
print(df3['Payment Method'].unique())
print(len(df3['Payment Method'].unique()))
print(df3['Payment Method'].value_counts())

['ACH Transfer' '2Checkout' 'Google Wallet' 'Check' 'Worldpay'
 'Prepaid Card' 'Credit Card' 'Contactless Payment' 'Masterpass' 'Venmo'
 'E-check' 'NFC Payment' 'Direct Debit' 'Square Cash' 'Klarna' 'Afterpay'
 'Apple Pay' 'Visa Checkout' 'Neteller' 'Wire Transfer' 'Western Union'
 'Money Order' 'Stripe' 'Bank Transfer' 'Skrill' 'PayPal' 'Bitcoin'
 'Mobile Wallet' 'Discover' 'WeChat Pay' 'Diners Club' 'Alipay' 'Ethereum'
 'Cryptocurrency Wallet' 'Debit Card' 'Cash' 'JCB' 'Amazon Pay'
 'American Express' 'Payoneer']
40
Amazon Pay               150781
Cryptocurrency Wallet    150646
Afterpay                 150570
E-check                  150521
Masterpass               150467
Debit Card               150370
Alipay                   150366
Mobile Wallet            150349
Ethereum                 150332
Visa Checkout            150297
Diners Club              150291
Stripe                   150159
Apple Pay                150143
2Checkout                150115
Contactless Payment      150

In [41]:
print(df3['Transaction Type'].unique())
print(len(df3['Transaction Type'].unique()))
print(df3['Transaction Type'].value_counts())

['Charity' 'Cashback' 'Reward' 'Purchase' 'Acquisition' 'Auction'
 'Admission' 'Dividend' 'Recharge' 'Rental' 'Donation' 'Royalty' 'Expense'
 'Tax' 'Payout' 'Registration Fee' 'Buyback' 'Rent' 'Interest'
 'Withdrawal' 'Gift' 'Settlement' 'Subscription' 'Payment' 'Bonus'
 'Transfer' 'Fine' 'Compensation' 'Contribution' 'Reimbursement' 'Invoice'
 'Refund' 'Investment' 'Service Charge' 'Scholarship' 'Loan' 'Membership'
 'Deposit']
38
Refund              300214
Donation            299814
Scholarship         150891
Purchase            150791
Withdrawal          150640
Buyback             150595
Dividend            150549
Service Charge      150500
Deposit             150464
Acquisition         150357
Investment          150244
Cashback            150237
Settlement          150182
Expense             150109
Bonus               150106
Auction             150105
Loan                150094
Reward              150091
Rental              149998
Fine                149995
Tax                 14999

In [42]:
print(df3['Device Type'].unique())
print(len(df3['Device Type'].unique()))
print(df3['Device Type'].value_counts())

['GPS Device' 'Medical Device' 'Vehicle Infotainment System' 'Kiosk'
 'Smart Mirror' 'Smart Doorbell' 'Smart Speaker' 'Desktop'
 'Home Security System' 'POS Terminal' 'Cash Register' 'Tablet'
 'Wearable Device' 'Home Automation Hub' 'Industrial Controller' 'ATM'
 'Smart Thermostat' 'Barcode Scanner' 'Server' 'Virtual Reality Headset'
 'E-Reader' 'Smartphone' 'Smartwatch' 'Smart TV' 'Mobile' 'Digital Camera'
 'Robot' 'Laptop' 'Fitness Tracker' 'Smart Lock' 'Gaming Console'
 'Embedded System' 'IoT Device' 'Vending Machine' 'Self-Checkout Kiosk'
 'Drone' 'Smart Appliance' 'Augmented Reality Glasses']
38
Smart TV                       158697
Fitness Tracker                158605
IoT Device                     158552
Cash Register                  158464
Kiosk                          158234
Smart Speaker                  158207
Drone                          158170
POS Terminal                   158170
GPS Device                     158153
Mobile                         158151
Smart Applia

In [43]:
print(df3['Merchant Category'].unique())
print(len(df3['Merchant Category'].unique()))
print(df3['Merchant Category'].value_counts())

['Industrial & Scientific' 'Beauty & Cosmetics' 'Real Estate' 'Appliances'
 'Jewelry' 'Luxury Brands' 'Baby & Maternity' 'Art & Collectibles'
 'Home & Garden' 'Sports & Outdoors' 'Specialty Services' 'Automotive'
 'Farm & Agriculture' 'Home Improvement' 'Charity & Nonprofit' 'Clothing'
 'Education' 'Vintage & Antique' 'Financial Services' 'Wedding & Bridal'
 'Office Supplies' 'Sporting Goods' 'Subscription Services'
 'Entertainment' 'Fitness & Nutrition' 'Health & Wellness'
 'Hobby & Crafts' 'Food & Beverage' 'Music & Instruments' 'Electronics'
 'Toys & Games' 'Pets & Animals' 'Gifts & Souvenirs' 'Online Marketplace'
 'Books & Literature' 'Wholesale' 'Electronics Repair' 'Technology'
 'Travel & Tourism' 'Furniture']
40
Electronics                150636
Subscription Services      150594
Home Improvement           150592
Jewelry                    150585
Entertainment              150517
Books & Literature         150483
Education                  150458
Charity & Nonprofit        150434

In [44]:
print(df3['Transaction Purpose'].unique())
print(len(df3['Transaction Purpose'].unique()))
print(df3['Transaction Purpose'].value_counts())

['Consultation Fee' 'Cashback Reward' 'Acquisition' 'Loan Repayment'
 'Dividend Reinvestment' 'Service Charge' 'Rent' 'Retail Purchase'
 'Donation to Nonprofit' 'Buyback' 'Charity Donation' 'Investment'
 'Ticket Purchase' 'Utility Payment' 'Bonus' 'Scholarship'
 'Subscription Renewal' 'Rental Payment' 'Membership' 'Tax Payment'
 'Deposit' 'Bill Payment' 'Admission' 'Registration Fee' 'Fine Settlement'
 'Auction Bid' 'Transfer to Family' 'Settlement' 'Gift Purchase'
 'Product Purchase' 'Insurance Premium' 'Royalty' 'Payout' 'Compensation'
 'Interest Payment' 'Recharge' 'Invoice Payment' 'Expense Reimbursement']
38
Invoice Payment          158411
Dividend Reinvestment    158387
Donation to Nonprofit    158387
Compensation             158362
Service Charge           158297
Buyback                  158288
Insurance Premium        158278
Product Purchase         158260
Recharge                 158231
Fine Settlement          158159
Interest Payment         158117
Subscription Renewal     15

In [45]:
print(df3['Transaction Authentication Method'].unique())
print(len(df3['Transaction Authentication Method'].unique()))
print(df3['Transaction Authentication Method'].value_counts())

['Bluetooth Authentication' 'NFC Tag' 'Token' 'Time-Based OTP' 'Password'
 'Transaction Confirmation Number' 'CAPTCHA' 'Security Question'
 'Hardware Token' 'Signature Verification' 'Iris Scan'
 'Push Notification Confirmation' 'Certificate-based Authentication'
 'Pattern Lock' 'Smart Card' 'SMS Code' 'Behavioral Biometrics'
 'Biometric Scan' 'Knowledge-Based Authentication' 'Voiceprint'
 'Mobile Phone Verification' 'QR Code' 'Face ID' 'PIN'
 'Two-Factor Authentication' 'Biometric Card'
 'Radio-Frequency Identification (RFID)' 'Social Media Login'
 'Behavioral Analytics' 'Handwriting Recognition' 'Palm Vein Scan'
 'Mobile App Notification' 'Authentication App' 'Geolocation Verification'
 'USB Security Key' 'Fingerprint' 'Email Verification' 'Voice Recognition'
 'Retina Scan']
39
Geolocation Verification                 154728
Behavioral Analytics                     154416
SMS Code                                 154397
Hardware Token                           154250
Behavioral Biometr

### Use one hot encoding for Transaction Time of Day

In [8]:
#Lets do One hot encoding for the 'Transaction Time of Day' column

df4 = pd.get_dummies(data=df3, columns=['Transaction Time of Day'])
df4.head()

Unnamed: 0,Transaction ID,Transaction Amount,Payment Method,Transaction Type,Device Type,IP Address,Merchant Category,User Income,User Account Status,Transaction Status,...,Merchant's Reputation Score,Transaction Currency,Transaction Purpose,User's Credit Score,Transaction Authentication Method,Fraudulent Flag,Credit per Reputation score,Transaction Time of Day_Daytime,Transaction Time of Day_Evening,Transaction Time of Day_Nighttime
0,51595306,163.08,ACH Transfer,Charity,GPS Device,42.23.223.120,Industrial & Scientific,66826.21,Pro,In Transit,...,2.71,NOK,Consultation Fee,343,Bluetooth Authentication,0,126.568266,1,0,0
1,85052974,430.74,2Checkout,Cashback,Medical Device,39.52.212.120,Beauty & Cosmetics,89356.71,Pending Approval,Resolved,...,3.95,EGP,Cashback Reward,688,NFC Tag,1,174.177215,1,0,0
2,23954324,415.74,Google Wallet,Reward,Vehicle Infotainment System,243.180.236.29,Real Estate,58438.63,Pro,Posted,...,3.81,MXN,Acquisition,371,Token,1,97.375328,1,0,0
3,44108303,565.89,Check,Purchase,Kiosk,212.186.227.14,Appliances,3426.92,Premium,Closed,...,2.67,CLP,Loan Repayment,687,Time-Based OTP,1,257.303371,1,0,0
4,66622683,955.49,Worldpay,Acquisition,Smart Mirror,166.113.10.199,Jewelry,53080.12,Free,Refunded,...,3.19,RUB,Dividend Reinvestment,605,Password,1,189.655172,1,0,0


### Use frequency encoding for the Categorical data 'Payment Method', 'Transaction Type', 'Device Type', 'Merchant Category', 'User Account Status', 'Transaction Status', 'Transaction Currency', 'Transaction Purpose', 'Transaction Authentication Method'

In [9]:
#Lets use the frequency encoding for the categorical data 'Payment Method'

Payment_method = df4.groupby('Payment Method').size()
# mapping values to dataframe
df4.loc[:, "{}_freq_encode".format('Payment Method')] = df4['Payment Method'].map(Payment_method)
# drop original column.
df4 = df4.drop(['Payment Method'], axis=1)
df4.head(10)

Unnamed: 0,Transaction ID,Transaction Amount,Transaction Type,Device Type,IP Address,Merchant Category,User Income,User Account Status,Transaction Status,Location Distance,...,Transaction Currency,Transaction Purpose,User's Credit Score,Transaction Authentication Method,Fraudulent Flag,Credit per Reputation score,Transaction Time of Day_Daytime,Transaction Time of Day_Evening,Transaction Time of Day_Nighttime,Payment Method_freq_encode
0,51595306,163.08,Charity,GPS Device,42.23.223.120,Industrial & Scientific,66826.21,Pro,In Transit,9.34,...,NOK,Consultation Fee,343,Bluetooth Authentication,0,126.568266,1,0,0,149586
1,85052974,430.74,Cashback,Medical Device,39.52.212.120,Beauty & Cosmetics,89356.71,Pending Approval,Resolved,65.28,...,EGP,Cashback Reward,688,NFC Tag,1,174.177215,1,0,0,150115
2,23954324,415.74,Reward,Vehicle Infotainment System,243.180.236.29,Real Estate,58438.63,Pro,Posted,44.05,...,MXN,Acquisition,371,Token,1,97.375328,1,0,0,149905
3,44108303,565.89,Purchase,Kiosk,212.186.227.14,Appliances,3426.92,Premium,Closed,21.7,...,CLP,Loan Repayment,687,Time-Based OTP,1,257.303371,1,0,0,150086
4,66622683,955.49,Acquisition,Smart Mirror,166.113.10.199,Jewelry,53080.12,Free,Refunded,56.63,...,RUB,Dividend Reinvestment,605,Password,1,189.655172,1,0,0,149760
5,29002618,635.62,Auction,Smart Doorbell,105.127.92.148,Luxury Brands,20745.14,Free,Approved,87.06,...,BRL,Service Charge,342,Transaction Confirmation Number,0,71.848739,0,1,0,149727
6,63317849,274.04,Admission,Smart Speaker,142.183.64.81,Baby & Maternity,15874.22,Guest,Cancelled,36.01,...,AUD,Rent,703,CAPTCHA,1,557.936508,0,0,1,150043
7,79673247,980.06,Dividend,Desktop,187.28.75.91,Art & Collectibles,90782.29,Guest,Partially Declined,92.24,...,HKD,Acquisition,564,Security Question,1,179.047619,1,0,0,150088
8,26746847,828.3,Recharge,Home Security System,84.82.147.62,Home & Garden,43149.43,Pending Approval,Void,33.47,...,HKD,Retail Purchase,323,Hardware Token,0,66.735537,0,1,0,150467
9,97928727,791.35,Reward,Smart Speaker,2.60.179.94,Sports & Outdoors,64792.31,Unverified,Sent,94.59,...,THB,Donation to Nonprofit,434,Signature Verification,0,110.43257,0,1,0,150086


In [10]:
#Lets use the frequency encoding for the categorical data 'Transaction Type'
transaction_type = df4.groupby('Transaction Type').size()
# mapping values to dataframe
df4.loc[:, "{}_freq_encode".format('Transaction Type')] = df4['Transaction Type'].map(transaction_type)
# drop original column.
df4 = df4.drop(['Transaction Type'], axis=1)

#Lets use the frequency encoding for the categorical data 'Device Type'
device_type = df4.groupby('Device Type').size()
# mapping values to dataframe
df4.loc[:, "{}_freq_encode".format('Device Type')] = df4['Device Type'].map(device_type)
# drop original column.
df4 = df4.drop(['Device Type'], axis=1)

#Lets use the frequency encoding for the categorical data 'Merchant Category'
merchant_category = df4.groupby('Merchant Category').size()
# mapping values to dataframe
df4.loc[:, "{}_freq_encode".format('Merchant Category')] = df4['Merchant Category'].map(merchant_category)
# drop original column.
df4 = df4.drop(['Merchant Category'], axis=1)

#Lets use the frequency encoding for the categorical data 'User Account Status'
User_Account_Status = df4.groupby('User Account Status').size()
# mapping values to dataframe
df4.loc[:, "{}_freq_encode".format('User Account Status')] = df4['User Account Status'].map(User_Account_Status)
# drop original column.
df4 = df4.drop(['User Account Status'], axis=1)

#Lets use the frequency encoding for the categorical data 'Transaction Status'
Transaction_Status = df4.groupby('Transaction Status').size()
# mapping values to dataframe
df4.loc[:, "{}_freq_encode".format('Transaction Status')] = df4['Transaction Status'].map(Transaction_Status)
# drop original column.
df4 = df4.drop(['Transaction Status'], axis=1)

#Lets use the frequency encoding for the categorical data 'Transaction Currency'
Transaction_Currency = df4.groupby('Transaction Currency').size()
# mapping values to dataframe
df4.loc[:, "{}_freq_encode".format('Transaction Currency')] = df4['Transaction Currency'].map(Transaction_Currency)
# drop original column.
df4 = df4.drop(['Transaction Currency'], axis=1)

#Lets use the frequency encoding for the categorical data 'Transaction Purpose'
transaction_purpose = df4.groupby('Transaction Purpose').size()
# mapping values to dataframe
df4.loc[:, "{}_freq_encode".format('Transaction Purpose')] = df4['Transaction Purpose'].map(transaction_purpose)
# drop original column.
df4 = df4.drop(['Transaction Purpose'], axis=1)

#Lets use the frequency encoding for the categorical data 'Transaction Authentication Method'
transaction_auth = df4.groupby('Transaction Authentication Method').size()
# mapping values to dataframe
df4.loc[:, "{}_freq_encode".format('Transaction Authentication Method')] = df4['Transaction Authentication Method'].map(transaction_auth)
# drop original column.
df4 = df4.drop(['Transaction Authentication Method'], axis=1)

df4.head(5)

Unnamed: 0,Transaction ID,Transaction Amount,IP Address,User Income,Location Distance,Time Taken for Transaction,User's Transaction History,Merchant's Reputation Score,User's Credit Score,Fraudulent Flag,...,Transaction Time of Day_Nighttime,Payment Method_freq_encode,Transaction Type_freq_encode,Device Type_freq_encode,Merchant Category_freq_encode,User Account Status_freq_encode,Transaction Status_freq_encode,Transaction Currency_freq_encode,Transaction Purpose_freq_encode,Transaction Authentication Method_freq_encode
0,51595306,163.08,42.23.223.120,66826.21,9.34,24.22,26,2.71,343,0,...,0,149586,149373,158153,149629,332291,149840,149746,157922,153100
1,85052974,430.74,39.52.212.120,89356.71,65.28,55.11,60,3.95,688,1,...,0,150115,150237,157724,149627,333832,149414,150525,157709,153989
2,23954324,415.74,243.180.236.29,58438.63,44.05,53.84,81,3.81,371,1,...,0,149905,150091,157753,150374,332291,150088,150710,157524,153836
3,44108303,565.89,212.186.227.14,3426.92,21.7,21.62,18,2.67,687,1,...,0,150086,150791,158234,149878,332834,150070,150574,157376,153635
4,66622683,955.49,166.113.10.199,53080.12,56.63,53.71,98,3.19,605,1,...,0,149760,150357,157692,150585,332730,150124,150012,158387,153626


In [59]:
df4.dtypes

Transaction ID                                     int64
Transaction Amount                               float64
IP Address                                        object
User Income                                      float64
Location Distance                                float64
Time Taken for Transaction                       float64
User's Transaction History                         int64
Merchant's Reputation Score                      float64
User's Credit Score                                int64
Fraudulent Flag                                    int64
Credit per Reputation score                      float64
Transaction Time of Day_Daytime                    uint8
Transaction Time of Day_Evening                    uint8
Transaction Time of Day_Nighttime                  uint8
Payment Method_freq_encode                       float64
Transaction Type_freq_encode                     float64
Device Type_freq_encode                          float64
Merchant Category_freq_encode  

In [68]:
IP_address = df2['IP Address']
IP_address.head(5)

0     42.23.223.120
1     39.52.212.120
2    243.180.236.29
3    212.186.227.14
4    166.113.10.199
Name: IP Address, dtype: object

In [13]:
##need to convert the 'IP Address' to numeric

df4.drop('IP Address', axis='columns', inplace=True)

df4.head(5)

Unnamed: 0,Transaction ID,Transaction Amount,User Income,Location Distance,Time Taken for Transaction,User's Transaction History,Merchant's Reputation Score,User's Credit Score,Fraudulent Flag,Credit per Reputation score,...,Transaction Time of Day_Nighttime,Payment Method_freq_encode,Transaction Type_freq_encode,Device Type_freq_encode,Merchant Category_freq_encode,User Account Status_freq_encode,Transaction Status_freq_encode,Transaction Currency_freq_encode,Transaction Purpose_freq_encode,Transaction Authentication Method_freq_encode
0,51595306,163.08,66826.21,9.34,24.22,26,2.71,343,0,126.568266,...,0,1.49586,0.149373,0.158153,1.49629,3.32291,0.14984,1.49746,0.157922,1.531
1,85052974,430.74,89356.71,65.28,55.11,60,3.95,688,1,174.177215,...,0,1.50115,0.150237,0.157724,1.49627,3.33832,0.149414,1.50525,0.157709,1.53989
2,23954324,415.74,58438.63,44.05,53.84,81,3.81,371,1,97.375328,...,0,1.49905,0.150091,0.157753,1.50374,3.32291,0.150088,1.5071,0.157524,1.53836
3,44108303,565.89,3426.92,21.7,21.62,18,2.67,687,1,257.303371,...,0,1.50086,0.150791,0.158234,1.49878,3.32834,0.15007,1.50574,0.157376,1.53635
4,66622683,955.49,53080.12,56.63,53.71,98,3.19,605,1,189.655172,...,0,1.4976,0.150357,0.157692,1.50585,3.3273,0.150124,1.50012,0.158387,1.53626


In [70]:
df5 = df4.copy()
df5.head()

Unnamed: 0,Transaction ID,Transaction Amount,User Income,Location Distance,Time Taken for Transaction,User's Transaction History,Merchant's Reputation Score,User's Credit Score,Fraudulent Flag,Credit per Reputation score,...,Transaction Time of Day_Nighttime,Payment Method_freq_encode,Transaction Type_freq_encode,Device Type_freq_encode,Merchant Category_freq_encode,User Account Status_freq_encode,Transaction Status_freq_encode,Transaction Currency_freq_encode,Transaction Purpose_freq_encode,Transaction Authentication Method_freq_encode
0,51595306,163.08,66826.21,9.34,24.22,26,2.71,343,0,126.568266,...,0,0.024931,0.149373,0.158153,1.49629,3.32291,0.14984,1.49746,0.157922,1.531
1,85052974,430.74,89356.71,65.28,55.11,60,3.95,688,1,174.177215,...,0,0.025019,0.150237,0.157724,1.49627,3.33832,0.149414,1.50525,0.157709,1.53989
2,23954324,415.74,58438.63,44.05,53.84,81,3.81,371,1,97.375328,...,0,0.024984,0.150091,0.157753,1.50374,3.32291,0.150088,1.5071,0.157524,1.53836
3,44108303,565.89,3426.92,21.7,21.62,18,2.67,687,1,257.303371,...,0,0.025014,0.150791,0.158234,1.49878,3.32834,0.15007,1.50574,0.157376,1.53635
4,66622683,955.49,53080.12,56.63,53.71,98,3.19,605,1,189.655172,...,0,0.02496,0.150357,0.157692,1.50585,3.3273,0.150124,1.50012,0.158387,1.53626


In [71]:
df5['IP Address'] = IP_address

df5.head(5)

Unnamed: 0,Transaction ID,Transaction Amount,User Income,Location Distance,Time Taken for Transaction,User's Transaction History,Merchant's Reputation Score,User's Credit Score,Fraudulent Flag,Credit per Reputation score,...,Payment Method_freq_encode,Transaction Type_freq_encode,Device Type_freq_encode,Merchant Category_freq_encode,User Account Status_freq_encode,Transaction Status_freq_encode,Transaction Currency_freq_encode,Transaction Purpose_freq_encode,Transaction Authentication Method_freq_encode,IP Address
0,51595306,163.08,66826.21,9.34,24.22,26,2.71,343,0,126.568266,...,0.024931,0.149373,0.158153,1.49629,3.32291,0.14984,1.49746,0.157922,1.531,42.23.223.120
1,85052974,430.74,89356.71,65.28,55.11,60,3.95,688,1,174.177215,...,0.025019,0.150237,0.157724,1.49627,3.33832,0.149414,1.50525,0.157709,1.53989,39.52.212.120
2,23954324,415.74,58438.63,44.05,53.84,81,3.81,371,1,97.375328,...,0.024984,0.150091,0.157753,1.50374,3.32291,0.150088,1.5071,0.157524,1.53836,243.180.236.29
3,44108303,565.89,3426.92,21.7,21.62,18,2.67,687,1,257.303371,...,0.025014,0.150791,0.158234,1.49878,3.32834,0.15007,1.50574,0.157376,1.53635,212.186.227.14
4,66622683,955.49,53080.12,56.63,53.71,98,3.19,605,1,189.655172,...,0.02496,0.150357,0.157692,1.50585,3.3273,0.150124,1.50012,0.158387,1.53626,166.113.10.199


In [72]:
df5.dtypes

Transaction ID                                     int64
Transaction Amount                               float64
User Income                                      float64
Location Distance                                float64
Time Taken for Transaction                       float64
User's Transaction History                         int64
Merchant's Reputation Score                      float64
User's Credit Score                                int64
Fraudulent Flag                                    int64
Credit per Reputation score                      float64
Transaction Time of Day_Daytime                    uint8
Transaction Time of Day_Evening                    uint8
Transaction Time of Day_Nighttime                  uint8
Payment Method_freq_encode                       float64
Transaction Type_freq_encode                     float64
Device Type_freq_encode                          float64
Merchant Category_freq_encode                    float64
User Account Status_freq_encode

In [74]:
df4.head(5)

Unnamed: 0,Transaction ID,Transaction Amount,User Income,Location Distance,Time Taken for Transaction,User's Transaction History,Merchant's Reputation Score,User's Credit Score,Fraudulent Flag,Credit per Reputation score,...,Transaction Time of Day_Nighttime,Payment Method_freq_encode,Transaction Type_freq_encode,Device Type_freq_encode,Merchant Category_freq_encode,User Account Status_freq_encode,Transaction Status_freq_encode,Transaction Currency_freq_encode,Transaction Purpose_freq_encode,Transaction Authentication Method_freq_encode
0,51595306,163.08,66826.21,9.34,24.22,26,2.71,343,0,126.568266,...,0,0.024931,0.149373,0.158153,1.49629,3.32291,0.14984,1.49746,0.157922,1.531
1,85052974,430.74,89356.71,65.28,55.11,60,3.95,688,1,174.177215,...,0,0.025019,0.150237,0.157724,1.49627,3.33832,0.149414,1.50525,0.157709,1.53989
2,23954324,415.74,58438.63,44.05,53.84,81,3.81,371,1,97.375328,...,0,0.024984,0.150091,0.157753,1.50374,3.32291,0.150088,1.5071,0.157524,1.53836
3,44108303,565.89,3426.92,21.7,21.62,18,2.67,687,1,257.303371,...,0,0.025014,0.150791,0.158234,1.49878,3.32834,0.15007,1.50574,0.157376,1.53635
4,66622683,955.49,53080.12,56.63,53.71,98,3.19,605,1,189.655172,...,0,0.02496,0.150357,0.157692,1.50585,3.3273,0.150124,1.50012,0.158387,1.53626


In [11]:
df6 = df4.copy()

df6.head()

Unnamed: 0,Transaction ID,Transaction Amount,IP Address,User Income,Location Distance,Time Taken for Transaction,User's Transaction History,Merchant's Reputation Score,User's Credit Score,Fraudulent Flag,...,Transaction Time of Day_Nighttime,Payment Method_freq_encode,Transaction Type_freq_encode,Device Type_freq_encode,Merchant Category_freq_encode,User Account Status_freq_encode,Transaction Status_freq_encode,Transaction Currency_freq_encode,Transaction Purpose_freq_encode,Transaction Authentication Method_freq_encode
0,51595306,163.08,42.23.223.120,66826.21,9.34,24.22,26,2.71,343,0,...,0,149586,149373,158153,149629,332291,149840,149746,157922,153100
1,85052974,430.74,39.52.212.120,89356.71,65.28,55.11,60,3.95,688,1,...,0,150115,150237,157724,149627,333832,149414,150525,157709,153989
2,23954324,415.74,243.180.236.29,58438.63,44.05,53.84,81,3.81,371,1,...,0,149905,150091,157753,150374,332291,150088,150710,157524,153836
3,44108303,565.89,212.186.227.14,3426.92,21.7,21.62,18,2.67,687,1,...,0,150086,150791,158234,149878,332834,150070,150574,157376,153635
4,66622683,955.49,166.113.10.199,53080.12,56.63,53.71,98,3.19,605,1,...,0,149760,150357,157692,150585,332730,150124,150012,158387,153626


In [13]:
#IP_address = df2['IP Address']

Browser_Type = df['Browser Type']
Merchant_age = df["Merchant's Business Age"] 
User_age = df['User Age']
Users_email = df["User's Email Domain"] 
User_occupation = df['User Occupation']
User_device_occupation = df["User's Device Location"]

#df5['IP Address'] = IP_address


df6['Browser Type'] = Browser_Type
df6["Merchant's Business Age"] = Merchant_age
df6['User Age'] = User_age
df6["User's Email Domain"] = Users_email
df6['User Occupation'] = User_occupation
df6["User's Device Location"] = User_device_occupation 

df6.head()

Unnamed: 0,Transaction ID,Transaction Amount,IP Address,User Income,Location Distance,Time Taken for Transaction,User's Transaction History,Merchant's Reputation Score,User's Credit Score,Fraudulent Flag,...,Transaction Status_freq_encode,Transaction Currency_freq_encode,Transaction Purpose_freq_encode,Transaction Authentication Method_freq_encode,Browser Type,Merchant's Business Age,User Age,User's Email Domain,User Occupation,User's Device Location
0,51595306,163.08,42.23.223.120,66826.21,9.34,24.22,26,2.71,343,0,...,149840,149746,157922,153100,Links,3,68,cox.co.uk,Doctor,United Kingdom
1,85052974,430.74,39.52.212.120,89356.71,65.28,55.11,60,3.95,688,1,...,149414,150525,157709,153989,Beaker,13,22,gmail.com,Chemist,Mexico
2,23954324,415.74,243.180.236.29,58438.63,44.05,53.84,81,3.81,371,1,...,150088,150710,157524,153836,Opera,7,71,rocketmail.com,Nurse,Qatar
3,44108303,565.89,212.186.227.14,3426.92,21.7,21.62,18,2.67,687,1,...,150070,150574,157376,153635,Konqueror,15,78,roadrunner.co.uk,Nurse,Spain
4,66622683,955.49,166.113.10.199,53080.12,56.63,53.71,98,3.19,605,1,...,150124,150012,158387,153626,Basilisk,17,31,protonmail.co.uk,Physicist,Israel


In [14]:
df6.dtypes

Transaction ID                                     int64
Transaction Amount                               float64
IP Address                                        object
User Income                                      float64
Location Distance                                float64
Time Taken for Transaction                       float64
User's Transaction History                         int64
Merchant's Reputation Score                      float64
User's Credit Score                                int64
Fraudulent Flag                                    int64
Credit per Reputation score                      float64
Transaction Time of Day_Daytime                    uint8
Transaction Time of Day_Evening                    uint8
Transaction Time of Day_Nighttime                  uint8
Payment Method_freq_encode                         int64
Transaction Type_freq_encode                       int64
Device Type_freq_encode                            int64
Merchant Category_freq_encode  

In [15]:
#Lets use the frequency encoding for the categorical data 'Browser Type'
browser_type = df6.groupby('Browser Type').size()
# mapping values to dataframe
df6.loc[:, "{}_freq_encode".format('Browser Type')] = df6['Browser Type'].map(browser_type)
# drop original column.
df6 = df6.drop(['Browser Type'], axis=1)

#Lets use the frequency encoding for the categorical data "User's Email Domain"
User_email_d = df6.groupby("User's Email Domain").size()
# mapping values to dataframe
df6.loc[:, "{}_freq_encode".format("User's Email Domain")] = df6["User's Email Domain"].map(User_email_d)
# drop original column.
df6 = df6.drop(["User's Email Domain"], axis=1)

#Lets use the frequency encoding for the categorical data 'User Occupation'
User_Occupation = df6.groupby('User Occupation').size()
# mapping values to dataframe
df6.loc[:, "{}_freq_encode".format('User Occupation')] = df6['User Occupation'].map(User_Occupation)
# drop original column.
df6 = df6.drop(['User Occupation'], axis=1)

#Lets use the frequency encoding for the categorical data "User's Device Location"
User_device_occ = df6.groupby("User's Device Location").size()
# mapping values to dataframe
df6.loc[:, "{}_freq_encode".format("User's Device Location")] = df6["User's Device Location"].map(User_device_occ)
# drop original column.
df6 = df6.drop(["User's Device Location"], axis=1)


df6.head()

Unnamed: 0,Transaction ID,Transaction Amount,IP Address,User Income,Location Distance,Time Taken for Transaction,User's Transaction History,Merchant's Reputation Score,User's Credit Score,Fraudulent Flag,...,Transaction Status_freq_encode,Transaction Currency_freq_encode,Transaction Purpose_freq_encode,Transaction Authentication Method_freq_encode,Merchant's Business Age,User Age,Browser Type_freq_encode,User's Email Domain_freq_encode,User Occupation_freq_encode,User's Device Location_freq_encode
0,51595306,163.08,42.23.223.120,66826.21,9.34,24.22,26,2.71,343,0,...,149840,149746,157922,153100,3,68,150415,150016,230117,149252
1,85052974,430.74,39.52.212.120,89356.71,65.28,55.11,60,3.95,688,1,...,149414,150525,157709,153989,13,22,149649,149838,230711,149968
2,23954324,415.74,243.180.236.29,58438.63,44.05,53.84,81,3.81,371,1,...,150088,150710,157524,153836,7,71,150149,150219,230991,149985
3,44108303,565.89,212.186.227.14,3426.92,21.7,21.62,18,2.67,687,1,...,150070,150574,157376,153635,15,78,150467,150776,230991,150406
4,66622683,955.49,166.113.10.199,53080.12,56.63,53.71,98,3.19,605,1,...,150124,150012,158387,153626,17,31,150103,150176,230298,149246


## Model Evaluation

In [22]:
#Now lets define X and y

X = df6.drop(['Fraudulent Flag','IP Address'], axis='columns')
X.head()

Unnamed: 0,Transaction ID,Transaction Amount,User Income,Location Distance,Time Taken for Transaction,User's Transaction History,Merchant's Reputation Score,User's Credit Score,Credit per Reputation score,Transaction Time of Day_Daytime,...,Transaction Status_freq_encode,Transaction Currency_freq_encode,Transaction Purpose_freq_encode,Transaction Authentication Method_freq_encode,Merchant's Business Age,User Age,Browser Type_freq_encode,User's Email Domain_freq_encode,User Occupation_freq_encode,User's Device Location_freq_encode
0,51595306,163.08,66826.21,9.34,24.22,26,2.71,343,126.568266,1,...,149840,149746,157922,153100,3,68,150415,150016,230117,149252
1,85052974,430.74,89356.71,65.28,55.11,60,3.95,688,174.177215,1,...,149414,150525,157709,153989,13,22,149649,149838,230711,149968
2,23954324,415.74,58438.63,44.05,53.84,81,3.81,371,97.375328,1,...,150088,150710,157524,153836,7,71,150149,150219,230991,149985
3,44108303,565.89,3426.92,21.7,21.62,18,2.67,687,257.303371,1,...,150070,150574,157376,153635,15,78,150467,150776,230991,150406
4,66622683,955.49,53080.12,56.63,53.71,98,3.19,605,189.655172,1,...,150124,150012,158387,153626,17,31,150103,150176,230298,149246


In [17]:
y = df6['Fraudulent Flag']

y.head()

0    0
1    1
2    1
3    1
4    1
Name: Fraudulent Flag, dtype: int64

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15, stratify=y)

In [19]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
print(y_train.value_counts())

(4800000, 28)
(1200000, 28)
(4800000,)
(1200000,)
0    2400075
1    2399925
Name: Fraudulent Flag, dtype: int64


#### Lets use Logistic Regresion

In [24]:
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
logistic_model.score(X_train, y_train)

0.500015625

In [24]:
logistic_model.score(X_test, y_test)

0.5000158333333333

#### Using K Fold cross validation to measure accuracy of our LogisticRegression model

### Hyper-parameter Tunning

In [59]:
x = np.asarray(X)
Y = np.asarray(y)

In [55]:
x.astype(np.uint8)

array([[ 42, 163,  10, ...,   0, 229,   4],
       [ 46, 174,  12, ...,  78,  55, 208],
       [148, 159,  70, ..., 203,  79, 225],
       ...,
       [100, 197,   2, ...,  22,  79, 198],
       [229, 102, 240, ...,  25, 140,  84],
       [226, 143, 131, ...,  22,  95, 136]], dtype=uint8)

In [53]:
## I change the float64 to unint8 to reduce the memory size

x['Transaction Amount','User Income', 'Location Distance', 
  'Time Taken for Transaction',
  "Merchant's Reputation Score", 'Credit per Reputation score'] = x['Transaction Amount','User Income',
                                                                    'Location Distance', 'Time Taken for Transaction',
                                                                    "Merchant's Reputation Score",
                                                                    'Credit per Reputation score'].astype(np.uint8)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [36]:
#list of models

models = [LogisticRegression(max_iter=1000), SVC(kernel='linear'), 
          KNeighborsClassifier(), RandomForestClassifier(random_state=0)]

In [61]:
from sklearn.model_selection import cross_val_score

def compare_models_cross_validation():

    for model in models:
        cv_score = cross_val_score(model, x, Y, cv=5)
        mean_accuracy = sum(cv_score)/len(cv_score)
        mean_accuracy = mean_accuracy*100
        mean_accuracy = round(mean_accuracy, 2)

        print('Cross Validation accuracies for the',model,'=', cv_score)
        print('Acccuracy score of the ',model,'=',mean_accuracy,'%')
        print('---------------------------------------------------------------')

In [None]:
compare_models_cross_validation()

### Comparing the models with different Hyperparameter values using GridSearchCV

In [39]:
# list of models
models_list = [LogisticRegression(max_iter=10000), SVC(), 
               KNeighborsClassifier(), RandomForestClassifier(random_state=0)]

In [40]:
# creating a dictionary that contains hyperparameter values for the above mentioned models


model_hyperparameters = {
    

    'log_reg_hyperparameters': {
        
        'C' : [1,5,10,20]
    },

    'svc_hyperparameters': {
        
        'kernel' : ['linear','poly','rbf','sigmoid'],
        'C' : [1,5,10,20]
    },


    'KNN_hyperparameters' : {
        
        'n_neighbors' : [3,5,10]
    },


    'random_forest_hyperparameters' : {
        
        'n_estimators' : [10, 20, 50, 100]
    }
}

In [41]:
print(model_hyperparameters.keys())

dict_keys(['log_reg_hyperparameters', 'svc_hyperparameters', 'KNN_hyperparameters', 'random_forest_hyperparameters'])


In [43]:
model_hyperparameters['log_reg_hyperparameters']

{'C': [1, 5, 10, 20]}

In [44]:
model_keys = list(model_hyperparameters.keys())
print(model_keys)

['log_reg_hyperparameters', 'svc_hyperparameters', 'KNN_hyperparameters', 'random_forest_hyperparameters']


In [45]:
model_keys[0]

'log_reg_hyperparameters'

In [46]:
model_hyperparameters[model_keys[0]]

{'C': [1, 5, 10, 20]}

In [49]:
def ModelSelection(list_of_models, hyperparameters_dictionary):
    
    result = []

    i = 0

    for model in list_of_models:
        
        key = model_keys[i]

        params = hyperparameters_dictionary[key]

        i += 1

        print(model)
        print(params)
        print('---------------------------------')


        classifier = GridSearchCV(model, params, cv=5)

        # fitting the data to classifier
        classifier.fit(X,y)

        result.append({
            'model used' : model,
            'highest score' : classifier.best_score_,
            'best hyperparameters' : classifier.best_params_
        })

    result_dataframe = pd.DataFrame(result, columns = ['model used','highest score','best hyperparameters'])

    return result_dataframe

In [50]:
ModelSelection(models_list, model_hyperparameters)

LogisticRegression(max_iter=10000)
{'C': [1, 5, 10, 20]}
---------------------------------


ValueError: 
All the 20 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1196, in fit
    X, y = self._validate_data(
  File "C:\ProgramData\anaconda3\lib\site-packages\sklearn\base.py", line 565, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\ProgramData\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 1106, in check_X_y
    X = check_array(
  File "C:\ProgramData\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 879, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "C:\ProgramData\anaconda3\lib\site-packages\sklearn\utils\_array_api.py", line 185, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 989. MiB for an array with shape (4800000, 27) and data type float64

--------------------------------------------------------------------------------
11 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1196, in fit
    X, y = self._validate_data(
  File "C:\ProgramData\anaconda3\lib\site-packages\sklearn\base.py", line 565, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\ProgramData\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 1106, in check_X_y
    X = check_array(
  File "C:\ProgramData\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 879, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "C:\ProgramData\anaconda3\lib\site-packages\sklearn\utils\_array_api.py", line 185, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "C:\ProgramData\anaconda3\lib\site-packages\pandas\core\generic.py", line 2070, in __array__
    return np.asarray(self._values, dtype=dtype)
  File "C:\ProgramData\anaconda3\lib\site-packages\pandas\core\frame.py", line 971, in _values
    return self.values
  File "C:\ProgramData\anaconda3\lib\site-packages\pandas\core\frame.py", line 11739, in values
    return self._mgr.as_array()
  File "C:\ProgramData\anaconda3\lib\site-packages\pandas\core\internals\managers.py", line 1770, in as_array
    arr = self._interleave(dtype=dtype, na_value=na_value)
  File "C:\ProgramData\anaconda3\lib\site-packages\pandas\core\internals\managers.py", line 1809, in _interleave
    result = np.empty(self.shape, dtype=dtype)
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 989. MiB for an array with shape (27, 4800000) and data type float64


### Presentation