In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import  KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [2]:
dataset_train = pd.read_csv('fraudTrain.csv')
dataset_train.head(3)

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0


In [4]:
dataset_test = pd.read_csv('fraudTest.csv')


***Pre-Processing***

In [None]:
dataset_train.isnull().sum()

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [None]:
dataset_train.drop(columns=["Unnamed: 0", "dob", "trans_num", "street"], inplace= True)
dataset_train.head(3)

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,city,state,zip,lat,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,Moravian Falls,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,Orient,WA,99160,48.8878,-118.2105,149,Special educational needs teacher,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,Malad City,ID,83252,42.1808,-112.262,4154,Nature conservation officer,1325376051,43.150704,-112.154481,0


In [None]:
data = dataset_train.head(30000)
data.is_fraud.value_counts()

is_fraud
0    29712
1      288
Name: count, dtype: int64

In [None]:
dataset_train_processed = pd.get_dummies(data=data, sparse= True)
dataset_train_processed

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud,...,job_Video editor,job_Visual merchandiser,job_Volunteer coordinator,job_Warden/ranger,job_Waste management officer,job_Water engineer,job_Water quality scientist,job_Web designer,job_Wellsite geologist,job_Writer
0,2703186189652095,4.97,28654,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315,0,...,False,False,False,False,False,False,False,False,False,False
1,630423337322,107.23,99160,48.8878,-118.2105,149,1325376044,49.159047,-118.186462,0,...,False,False,False,False,False,False,False,False,False,False
2,38859492057661,220.11,83252,42.1808,-112.2620,4154,1325376051,43.150704,-112.154481,0,...,False,False,False,False,False,False,False,False,False,False
3,3534093764340240,45.00,59632,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071,0,...,False,False,False,False,False,False,False,False,False,False
4,375534208663984,41.96,24433,38.4207,-79.4629,99,1325376186,38.674999,-78.632459,0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,3525590521269779,4.80,27258,36.0424,-79.3242,6006,1326915862,36.440097,-79.589525,0,...,False,False,False,False,False,False,False,False,False,False
29996,376445266762684,2.01,69165,41.1558,-101.1360,1789,1326915881,40.312965,-101.664355,0,...,False,False,False,False,False,False,False,False,False,False
29997,630424987505,36.52,26292,39.1505,-79.5030,836,1326915972,39.313899,-80.038231,0,...,False,False,False,False,False,False,False,False,False,False
29998,3567527758368741,118.50,76951,31.8351,-101.0017,1143,1326916227,31.497897,-101.595871,0,...,False,False,False,False,False,False,False,False,False,False


In [None]:
x_train=dataset_train_processed.drop(columns='is_fraud', axis = 1)
y_train=dataset_train_processed['is_fraud']

In [None]:
dataset_test.drop(columns=["Unnamed: 0", "dob", "trans_num", "street"], inplace= True)
dataset_test.head(3)

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,city,state,zip,lat,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud
0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,Columbia,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,1371816865,33.986391,-81.200714,0
1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,Altonah,UT,84002,40.3207,-110.436,302,"Sales professional, IT",1371816873,39.450498,-109.960431,0
2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,Bellmore,NY,11710,40.6729,-73.5365,34496,"Librarian, public",1371816893,40.49581,-74.196111,0


In [None]:
data_test = dataset_train.head(30000)
data_test.is_fraud.value_counts()

is_fraud
0    29712
1      288
Name: count, dtype: int64

In [None]:
dataset_test_processed = pd.get_dummies(data=data_test, sparse= True)
dataset_test_processed

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud,...,job_Video editor,job_Visual merchandiser,job_Volunteer coordinator,job_Warden/ranger,job_Waste management officer,job_Water engineer,job_Water quality scientist,job_Web designer,job_Wellsite geologist,job_Writer
0,2703186189652095,4.97,28654,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315,0,...,False,False,False,False,False,False,False,False,False,False
1,630423337322,107.23,99160,48.8878,-118.2105,149,1325376044,49.159047,-118.186462,0,...,False,False,False,False,False,False,False,False,False,False
2,38859492057661,220.11,83252,42.1808,-112.2620,4154,1325376051,43.150704,-112.154481,0,...,False,False,False,False,False,False,False,False,False,False
3,3534093764340240,45.00,59632,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071,0,...,False,False,False,False,False,False,False,False,False,False
4,375534208663984,41.96,24433,38.4207,-79.4629,99,1325376186,38.674999,-78.632459,0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,3525590521269779,4.80,27258,36.0424,-79.3242,6006,1326915862,36.440097,-79.589525,0,...,False,False,False,False,False,False,False,False,False,False
29996,376445266762684,2.01,69165,41.1558,-101.1360,1789,1326915881,40.312965,-101.664355,0,...,False,False,False,False,False,False,False,False,False,False
29997,630424987505,36.52,26292,39.1505,-79.5030,836,1326915972,39.313899,-80.038231,0,...,False,False,False,False,False,False,False,False,False,False
29998,3567527758368741,118.50,76951,31.8351,-101.0017,1143,1326916227,31.497897,-101.595871,0,...,False,False,False,False,False,False,False,False,False,False


In [None]:
x_test=dataset_test_processed.drop(columns='is_fraud', axis = 1)
y_test=dataset_test_processed['is_fraud']

**Logistic Regression**

In [None]:
classifier = LogisticRegression(solver= 'liblinear', random_state= 0)
classifier.fit(x_train, y_train)



In [None]:
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
ac = accuracy_score(y_test, y_pred)
print(ac)



[[29712     0]
 [  288     0]]
0.9904


**Decision Tree**

In [None]:
classifier = DecisionTreeClassifier(criterion= 'entropy', random_state= 37)
classifier.fit(x_train, y_train)



In [None]:
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
ac = accuracy_score(y_test, y_pred)
print(ac)



[[29712     0]
 [    0   288]]
1.0


**Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators= 10,criterion= 'entropy', random_state= 37)
classifier.fit(x_train, y_train)



In [None]:
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
ac = accuracy_score(y_test, y_pred)
print(ac)



[[29712     0]
 [   11   277]]
0.9996333333333334


**XG BOOST**

In [None]:
from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(x_train, y_train)

  arr: np.ndarray = transformed.values


In [None]:
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

  arr: np.ndarray = transformed.values


[[29712     0]
 [    0   288]]


1.0