In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.svm import OneClassSVM
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.model_selection import RandomizedSearchCV



In [2]:
# Load the train/test feature tables from CSV files in the working directory.
X_train_selected, X_test_selected = (
    pd.read_csv('X_train_selected.csv'),
    pd.read_csv('X_test_selected.csv'),
)
# Load the matching label tables for the same two splits.
y_train, y_test = (
    pd.read_csv('y_train.csv'),
    pd.read_csv('y_test.csv'),
)

<h3>Extracting the non-fraud data</h3>

In [3]:
# Put the labels next to the features (column-wise) so rows can be filtered by 'Class'.
x_train = pd.concat(objs=[X_train_selected, y_train], axis='columns')
x_train

Unnamed: 0,V2,V3,V4,V7,V10,V11,V12,V14,V16,V17,Class
0,0.258267,-0.350112,0.805699,-0.081985,-0.244261,0.585588,-0.583373,-0.708119,-0.296001,-0.368420,1
1,0.885542,-1.101482,1.184015,-0.628918,-0.891281,1.085265,-1.197058,-1.116212,-0.969169,-0.846299,1
2,0.176296,-0.477150,-0.006199,-0.226751,-0.557557,0.914718,-0.976213,-1.181056,-0.956306,-0.952772,1
3,-0.470973,2.492266,-0.415108,0.211097,0.471432,-1.631645,1.259970,0.123190,0.046376,0.682463,0
4,-0.574234,-0.252418,-1.009704,0.279503,0.638240,-0.603385,0.902407,1.010024,0.366220,0.327848,0
...,...,...,...,...,...,...,...,...,...,...,...
454899,-0.817010,0.940208,-1.305942,0.241876,0.226563,0.071581,1.883148,0.736602,0.322040,0.367059,0
454900,-0.395999,0.073640,-1.321759,0.282927,0.351620,-0.889150,0.866645,0.933930,0.500894,0.301394,0
454901,-0.240832,-0.806786,0.966037,-1.072630,-1.064006,1.430046,-1.289472,-1.297845,-1.455102,-1.541265,1
454902,-0.130149,1.505652,-0.413064,0.864421,0.511333,-0.860830,0.630806,0.691261,0.380930,0.280276,0


In [4]:
# Keep only the rows labelled 0 (the non-fraud class) for one-class training.
is_non_fraud = x_train['Class'].eq(0)
non_fraud_train = x_train.loc[is_non_fraud]
non_fraud_train

Unnamed: 0,V2,V3,V4,V7,V10,V11,V12,V14,V16,V17,Class
3,-0.470973,2.492266,-0.415108,0.211097,0.471432,-1.631645,1.259970,0.123190,0.046376,0.682463,0
4,-0.574234,-0.252418,-1.009704,0.279503,0.638240,-0.603385,0.902407,1.010024,0.366220,0.327848,0
5,-0.690995,0.620787,-0.699791,0.212291,0.879648,0.214704,2.001494,0.742794,-0.599816,0.956339,0
6,-0.201481,0.651818,-0.981374,0.758744,0.503662,-0.184964,1.098574,0.956897,0.491022,0.279300,0
9,-1.510886,0.473512,-1.420373,0.494731,-0.018134,-0.850026,1.336018,0.721606,0.082701,0.448652,0
...,...,...,...,...,...,...,...,...,...,...,...
454898,-0.456169,2.494238,-1.744935,0.333747,0.570851,0.102682,0.443045,0.555446,1.770121,0.426710,0
454899,-0.817010,0.940208,-1.305942,0.241876,0.226563,0.071581,1.883148,0.736602,0.322040,0.367059,0
454900,-0.395999,0.073640,-1.321759,0.282927,0.351620,-0.889150,0.866645,0.933930,0.500894,0.301394,0
454902,-0.130149,1.505652,-0.413064,0.864421,0.511333,-0.860830,0.630806,0.691261,0.380930,0.280276,0


In [5]:
# Remove the label by name rather than by position: slicing off "the last column"
# would silently drop a feature (and keep the label) if the column order ever changed.
x_non_fraud_train = non_fraud_train.drop(columns='Class')

In [6]:
# Display the feature-only frame to confirm the 'Class' column is no longer present.
x_non_fraud_train

Unnamed: 0,V2,V3,V4,V7,V10,V11,V12,V14,V16,V17
3,-0.470973,2.492266,-0.415108,0.211097,0.471432,-1.631645,1.259970,0.123190,0.046376,0.682463
4,-0.574234,-0.252418,-1.009704,0.279503,0.638240,-0.603385,0.902407,1.010024,0.366220,0.327848
5,-0.690995,0.620787,-0.699791,0.212291,0.879648,0.214704,2.001494,0.742794,-0.599816,0.956339
6,-0.201481,0.651818,-0.981374,0.758744,0.503662,-0.184964,1.098574,0.956897,0.491022,0.279300
9,-1.510886,0.473512,-1.420373,0.494731,-0.018134,-0.850026,1.336018,0.721606,0.082701,0.448652
...,...,...,...,...,...,...,...,...,...,...
454898,-0.456169,2.494238,-1.744935,0.333747,0.570851,0.102682,0.443045,0.555446,1.770121,0.426710
454899,-0.817010,0.940208,-1.305942,0.241876,0.226563,0.071581,1.883148,0.736602,0.322040,0.367059
454900,-0.395999,0.073640,-1.321759,0.282927,0.351620,-0.889150,0.866645,0.933930,0.500894,0.301394
454902,-0.130149,1.505652,-0.413064,0.864421,0.511333,-0.860830,0.630806,0.691261,0.380930,0.280276


<h3>Isolation Forest Model</h3>

In [7]:
# Train Isolation Forest on the non-fraud rows only, so that fraud rows are
# expected to be flagged as outliers at prediction time.
# random_state is pinned because the forest draws its sub-samples and split
# points at random; without a seed the fitted model, and therefore every metric
# reported for it, changes from one run of the notebook to the next.
isolation_forest = IsolationForest(contamination='auto', random_state=42)
isolation_forest.fit(x_non_fraud_train)


IsolationForest()

In [8]:
# Label the test rows with the fitted forest; predict() returns +1 for inliers and -1 for outliers.
y_test_predict = isolation_forest.predict(X_test_selected)

In [9]:
# Sanity check: list the distinct labels the model produced.
set(y_test_predict)

{-1, 1}

In [10]:
# The model marks outliers with -1; recode them as class 1 and everything else as class 0
# so the predictions use the same 0/1 coding as the 'Class' label.
is_outlier = y_test_predict == -1
y_test_predict = np.where(is_outlier, 1, 0)

In [11]:
# Display the recoded 0/1 predictions.
y_test_predict

array([1, 1, 0, ..., 1, 1, 0])

In [12]:
# Wrap the predictions in a one-column DataFrame labelled 'Class'.
y_test_predict = pd.DataFrame(y_test_predict, columns=['Class'])

In [15]:
# Helper that prints the evaluation metrics used for every model in this notebook.
from sklearn.metrics import f1_score,precision_score,confusion_matrix,accuracy_score
def get_metrics(y_test,y_pred):
    """Print accuracy, precision, F1 score and the confusion matrix.

    Parameters
    ----------
    y_test : true labels, passed as the first argument to each scikit-learn metric.
    y_pred : predicted labels, passed as the second argument to each metric.

    Returns
    -------
    None. The four values are written to stdout, one per line, in the order
    accuracy, precision, F1, confusion matrix.
    """
    # (label, metric function) pairs, evaluated and printed in this order.
    report = (
        ("Test Accuracy: ", accuracy_score),
        ("Precision: ", precision_score),
        ("F1 Score: ", f1_score),
        ("Confusion Matrix: ", confusion_matrix),
    )
    for label, metric in report:
        print(label, metric(y_test, y_pred))

In [16]:
# Evaluate the Isolation Forest predictions against the test labels.
get_metrics(y_test,y_test_predict)

Test Accuracy:  0.9150765875877108
Precision:  0.9301636363636364
F1 Score:  0.9137493748660427
Confusion Matrix:  [[52909  3841]
 [ 5817 51159]]


<h3>One-Class SVM Model</h3>

In [17]:
# Fit a One-Class SVM on the non-fraud rows only.
# nu is an upper bound on the fraction of training errors and a lower bound on
# the fraction of support vectors; 0.05 is a starting value to tune for the dataset.
svm_nu = 0.05
one_class_svm = OneClassSVM(nu=svm_nu)
one_class_svm.fit(x_non_fraud_train)

OneClassSVM(nu=0.05)

In [18]:
# Label the test rows with the One-Class SVM; predict() returns +1 for inliers and -1 for outliers.
# NOTE: this rebinds y_test_predict, replacing the Isolation Forest predictions held under that name.
y_test_predict = one_class_svm.predict(X_test_selected)

In [19]:
# Sanity check: list the distinct labels the SVM produced.
set(y_test_predict)

{-1, 1}

In [20]:
# The SVM marks outliers with -1; recode them as class 1 and everything else as class 0
# so the predictions use the same 0/1 coding as the 'Class' label.
is_outlier = y_test_predict == -1
y_test_predict = np.where(is_outlier, 1, 0)

In [21]:
# Wrap the predictions in a one-column DataFrame labelled 'Class'.
y_test_predict = pd.DataFrame(y_test_predict, columns=['Class'])

In [22]:
# Evaluate the One-Class SVM predictions against the test labels.
get_metrics(y_test,y_test_predict)

Test Accuracy:  0.9190246733376712
Precision:  0.9463121110758134
F1 Score:  0.916652336431681
Confusion Matrix:  [[53877  2873]
 [ 6336 50640]]
