In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns 
import os
import matplotlib.pyplot as plt 

In [2]:
from keras.layers import * 
from keras.models import * 
from keras.callbacks import ModelCheckpoint

In [3]:
# Read the raw transactions table from the Kaggle input directory and print
# its schema summary (columns, dtypes, non-null counts).
csv_path = '/kaggle/input/fraudulent-transaction-dataset/Fraud.csv'
data = pd.read_csv(csv_path)
data.info()

In [4]:
# Engineered feature: relative difference between the transaction amount and
# the sender's starting balance, (amount - balance) / (amount + balance).
# A 0/0 division (both values zero) produces NaN for that row.
amount_minus_balance = data['amount'] - data['oldbalanceOrg']
amount_plus_balance = data['amount'] + data['oldbalanceOrg']
data['propofamount'] = amount_minus_balance / amount_plus_balance

In [5]:
# Number of missing values in each column.
data.isna().sum()

In [6]:
# How many rows are labelled as fraud.
fraud_rows = data[data['isFraud'] == 1]
len(fraud_rows)

In [7]:
# Display the rows for which the engineered ratio is missing.
ratio_missing = data['propofamount'].isna()
data.loc[ratio_missing]

The null values occur only for some of the fraudulent transactions.

In [8]:
# Impute the missing ratios with the mean ratio of the fraudulent rows.
# NOTE(review): the fill value is derived from the target label and from the
# full dataset (before any train/test split) -- confirm this leakage is
# acceptable for the intended evaluation.
fraud_mask = data['isFraud'] == 1
meanfraud = data.loc[fraud_mask, 'propofamount'].mean()

# Assign the filled column back instead of calling fillna(inplace=True) on
# data['propofamount']: the in-place call acts on an intermediate selection,
# is deprecated in recent pandas, and is not guaranteed to modify `data`.
data['propofamount'] = data['propofamount'].fillna(meanfraud)

In [9]:
# Re-check the per-column missing-value counts after imputation.
data.isna().sum()

No null values remain.

In [10]:
# Binary flag: 1 when the destination name contains the letter 'M', else 0.
# Then preview the first ten rows with the new column.
dest_has_m = data['nameDest'].str.contains('M')
data['Merchant'] = np.where(dest_has_m, 1, 0)
data.head(10)

In [11]:
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

# One-hot encode the transaction type: map each label to an integer code,
# then expand the codes into one indicator column per category.
le = LabelEncoder()
oe = OneHotEncoder()
trans = le.fit_transform(data['type'])
type_onehot = oe.fit_transform(trans.reshape(-1, 1)).toarray()

# Name the indicator columns from the fitted encoder instead of a hard-coded
# list. le.classes_ is in the same (sorted) order as the integer codes, so
# column i always matches category i, and the assignment no longer depends on
# the data containing exactly the five expected types in alphabetical order.
type_columns = list(le.classes_)
data[type_columns] = type_onehot

# The raw type and the two account-name columns are no longer needed.
data = data.drop(['type', 'nameOrig', 'nameDest'], axis=1)

data.info()

In [12]:
# Heatmap of the pairwise correlations between all columns of `data`.
figsiz = (12, 8)
fig, ax = plt.subplots(figsize=figsiz)
corr_matrix = data.corr()
sns.heatmap(corr_matrix, ax=ax)
plt.show()

In [13]:
# Bar chart of the number of rows per isFraud value.
ax = sns.countplot(data=data, x='isFraud')

In [15]:
# Z-score the continuous columns in place: subtract each column's mean and
# divide by its standard deviation, both computed over the whole table.
normcol = [
    'step',
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
]
col_mean = data[normcol].mean()
col_std = data[normcol].std()
data[normcol] = (data[normcol] - col_mean) / col_std

In [14]:
# Feature columns fed to the random forest. 'step' is deliberately left out.
cols = [
    'amount',
    'oldbalanceOrg',
    'oldbalanceDest',
    'Merchant',
    'CASH_IN',
    'CASH_OUT',
    'DEBIT',
    'TRANSFER',
    'newbalanceOrig',
    'newbalanceDest',
    'PAYMENT',
    'isFlaggedFraud',
    'propofamount',
]

# Kept as a one-element list so that data[target] is a single-column frame.
target = ['isFraud']

In [16]:
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier

# Hold out 30% of the rows for testing; the seed fixes the partition.
features = data[cols].values
labels = data[target].values
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=42,
)

In [17]:
# Random forest of 50 entropy-criterion trees with out-of-bag scoring enabled.
# random_state is fixed so that the bootstrap samples and per-split feature
# subsets -- and therefore every metric reported below -- are reproducible
# across kernel restarts.
randf = RandomForestClassifier(
    n_estimators=50,
    criterion='entropy',
    max_features='sqrt',
    oob_score=True,
    random_state=42,
)

# fit() returns the estimator itself, so `score` is the same object as `randf`.
# The labels are flattened because y_train is a column vector.
score = randf.fit(X_train, y_train.flatten())

# Accuracy estimated on the out-of-bag samples.
print(score.oob_score_)

In [18]:
from sklearn.metrics import classification_report,confusion_matrix

# Random-forest predictions on the training split.
yptrain = randf.predict(X_train)

# Confusion matrix as floats, printed with each row normalised by the number
# of true samples of that class.
mat = confusion_matrix(y_train, yptrain).astype(np.float32)
print(mat / np.sum(mat, axis=1, keepdims=True))
print('\n')

# sklearn's layout is rows = true label, columns = predicted label, labels in
# ascending order, i.e. [[TN, FP], [FN, TP]] for the 0/1 fraud target.
# The previous code read mat[0][0] as TP, which is actually TN, so the printed
# precision/recall did not describe the fraud class.
TN, FP, FN, TP = mat.ravel()

print('Precision:', TP / (TP + FP))
print('Recall:', TP / (TP + FN))

print('\n')

print(classification_report(y_train, yptrain))

In [19]:
# Random-forest predictions on the held-out test split.
yptest = randf.predict(X_test)

# Confusion matrix as floats, printed with each row normalised by the number
# of true samples of that class.
mat = confusion_matrix(y_test, yptest).astype(np.float32)
print(mat / np.sum(mat, axis=1, keepdims=True))
print('\n')

# sklearn's layout is rows = true label, columns = predicted label, labels in
# ascending order, i.e. [[TN, FP], [FN, TP]] for the 0/1 fraud target.
# The previous code read mat[0][0] as TP, which is actually TN, so the printed
# precision/recall did not describe the fraud class.
TN, FP, FN, TP = mat.ravel()

print('Precision:', TP / (TP + FP))
print('Recall:', TP / (TP + FN))

print('\n')

print(classification_report(y_test, yptest))

In [20]:
## Random forest evaluated on the complete dataset.
# Note: this includes the rows the forest was trained on, so these numbers are
# not an out-of-sample estimate.

X = data[cols].values
Y = data[target].values

yptest = randf.predict(X)
mat = confusion_matrix(Y, yptest)
print(mat)
print('\n')

# sklearn's layout is rows = true label, columns = predicted label, labels in
# ascending order, i.e. [[TN, FP], [FN, TP]] for the 0/1 fraud target.
# The previous code read mat[0][0] as TP, which is actually TN, so the printed
# precision/recall did not describe the fraud class.
TN, FP, FN, TP = mat.ravel()

print('Precision:', TP / (TP + FP))
print('Recall:', TP / (TP + FN))
print('\n')

print(classification_report(Y, yptest))

Next, try a neural network.

In [80]:
# Feature set used by the neural network ('step', 'PAYMENT' and 'CASH_IN' are
# excluded). It is defined here, before the model is built, because the input
# layer width is taken from len(cols): on a fresh top-to-bottom run `cols`
# would otherwise still hold the 13-column random-forest list, and fitting on
# the 11-column network features would fail with a shape mismatch.
cols = ['amount', 'oldbalanceOrg', 'oldbalanceDest',
        'Merchant', 'CASH_OUT', 'DEBIT',
        'TRANSFER', 'newbalanceOrig', 'newbalanceDest', 'isFlaggedFraud',
        'propofamount']

# Fully connected binary classifier: 64 -> 16 -> 8 ReLU units, followed by a
# single sigmoid output unit.
mod = Sequential()
mod.add(Dense(64, input_shape=(len(cols),), activation='relu'))
mod.add(Dense(16, activation='relu'))
mod.add(Dense(8, activation='relu'))
mod.add(Dense(1, activation='sigmoid'))

mod.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
mod.summary()

In [81]:
# Train the network on the full (imbalanced) dataset using class weights.

cols = ['amount', 'oldbalanceOrg', 'oldbalanceDest',
        'Merchant', 'CASH_OUT', 'DEBIT',
        'TRANSFER', 'newbalanceOrig', 'newbalanceDest', 'isFlaggedFraud',
        'propofamount']

# Removed 'step', 'PAYMENT', CASH_IN

target = ['isFraud']

X_train, X_test, y_train, y_test = train_test_split(
    data[cols].values, data[target].values,
    test_size=0.33, random_state=42)

checkpoint_filepath = 'mod8.h5'

# Keep only the weights of the epoch with the lowest validation loss.
# ModelCheckpoint comes from keras.callbacks (imported at the top of the
# notebook); the previous `tf.keras.callbacks...` reference raised NameError
# because `tf` is never imported.
# NOTE(review): newer Keras releases may require the filepath to end in
# '.weights.h5' when save_weights_only=True -- confirm against the installed
# version.
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

# `callbacks` is documented as a list of callback instances, so wrap the
# checkpoint in one. The class weights make each fraud sample count 770 times
# as much as a non-fraud sample in the loss.
hist = mod.fit(
    X_train, y_train,
    batch_size=1028,
    epochs=30,
    validation_data=(X_test, y_test),
    callbacks=[model_checkpoint_callback],
    class_weight={0: 1, 1: 770})

In [82]:
# Restore the weights written to the checkpoint file during training.
mod.load_weights(checkpoint_filepath)

In [83]:
# Training-split evaluation: threshold the sigmoid outputs at 0.5 to obtain
# hard class predictions, then print the confusion matrix and report.
train_scores = mod.predict(X_train)
yptrain = train_scores >= 0.5

mat = confusion_matrix(y_train, yptrain)
print(mat)

print(classification_report(y_train, yptrain))

In [84]:
# Test-split evaluation: threshold the sigmoid outputs at 0.5 to obtain hard
# class predictions, then print the confusion matrix and report.
test_scores = mod.predict(X_test)
yptest = test_scores >= 0.5

mat = confusion_matrix(y_test, yptest)
print(mat)

print(classification_report(y_test, yptest))

In [87]:
## Neural-network results on the complete dataset
cols = ['amount', 'oldbalanceOrg', 'oldbalanceDest',
        'Merchant', 'CASH_OUT', 'DEBIT',
        'TRANSFER', 'newbalanceOrig', 'newbalanceDest', 'isFlaggedFraud',
        'propofamount']

# Removed 'step', 'PAYMENT', CASH_IN

target = ['isFraud']

X = data[cols].values
Y = data[target].values

# Hard predictions: sigmoid output thresholded at 0.5.
yptest = mod.predict(X)
yptest = yptest >= 0.5
mat = confusion_matrix(Y, yptest)
print(mat)
print('\n')

# sklearn's layout is rows = true label, columns = predicted label, labels in
# ascending order, i.e. [[TN, FP], [FN, TP]] for the 0/1 fraud target.
# The previous code read mat[0][0] as TP, which is actually TN, so the printed
# precision/recall did not describe the fraud class.
TN, FP, FN, TP = mat.ravel()

print('Precision:', TP / (TP + FP))
print('Recall:', TP / (TP + FN))
print('\n')

print(classification_report(Y, yptest))


In [74]:
# Overall accuracy: fraction of predictions equal to the true label.
matches = (yptest == Y)
matches.sum() / len(Y)

In [88]:
# Balance the classes by under-sampling the non-fraud rows.

cols = ['amount', 'oldbalanceOrg', 'oldbalanceDest',
        'Merchant', 'CASH_OUT', 'DEBIT',
        'TRANSFER', 'newbalanceOrig', 'newbalanceDest', 'isFlaggedFraud',
        'propofamount']

# Removed 'step', 'PAYMENT', CASH_IN

target = ['isFraud']

class_z = data.loc[data['isFraud'] == 0]
class_o = data.loc[data['isFraud'] == 1]

number_fraud = len(class_o)
number_non_fraud = len(class_z)

print('number fraud:', number_fraud)
print('number non fraud:', number_non_fraud)

# Draw as many non-fraud rows as there are fraud rows, then combine the two.
under_sampled_class_z = class_z.sample(number_fraud, random_state=26)
under_sampled_data = pd.concat([under_sampled_class_z, class_o], axis=0)

# Shuffle the combined rows. The shuffle is seeded (it previously was not) so
# that the row order, and with it the later train/test split, is the same on
# every run.
under_sampled_data = (
    under_sampled_data
    .sample(frac=1, random_state=26)
    .reset_index(drop=True)
)

In [89]:
# Fresh, untrained copy of the classifier for the under-sampled experiment:
# 64 -> 16 -> 8 ReLU units followed by a single sigmoid output unit.
n_features = len(cols)
mod = Sequential([
    Dense(64, input_shape=(n_features,), activation='relu'),
    Dense(16, activation='relu'),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])

mod.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
mod.summary()

In [90]:
# Hold out 20% of the balanced sample for validation/testing.
balanced_features = under_sampled_data[cols].values
balanced_labels = under_sampled_data[target].values
X_train, X_test, y_train, y_test = train_test_split(
    balanced_features,
    balanced_labels,
    test_size=0.2,
    random_state=42,
)

In [91]:
checkpoint_filepath = 'mod10.h5'

# Keep only the weights of the epoch with the lowest validation loss.
# ModelCheckpoint comes from keras.callbacks (imported at the top of the
# notebook); the previous `tf.keras.callbacks...` reference raised NameError
# because `tf` is never imported.
# NOTE(review): newer Keras releases may require the filepath to end in
# '.weights.h5' when save_weights_only=True -- confirm against the installed
# version.
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

# `callbacks` is documented as a list of callback instances, so wrap the
# checkpoint in one.
hist = mod.fit(
    X_train, y_train,
    batch_size=512,
    epochs=50,
    validation_data=(X_test, y_test),
    callbacks=[model_checkpoint_callback])

In [92]:
# Restore the weights written to the checkpoint file during training.
mod.load_weights(checkpoint_filepath)

In [93]:
# Training-split evaluation on the balanced sample: threshold the sigmoid
# outputs at 0.5, then print the confusion matrix and report.
train_scores = mod.predict(X_train)
yptrain = train_scores >= 0.5

mat = confusion_matrix(y_train, yptrain)
print(mat)

print(classification_report(y_train, yptrain))

In [94]:
# Test-split evaluation on the balanced sample: threshold the sigmoid outputs
# at 0.5, then print the confusion matrix and report.
test_scores = mod.predict(X_test)
yptest = test_scores >= 0.5

mat = confusion_matrix(y_test, yptest)
print(mat)

print(classification_report(y_test, yptest))

In [95]:
## Results of the under-sampled neural network on the complete dataset
# Note: the complete dataset contains the rows the balanced training sample
# was drawn from, so this is not a purely out-of-sample evaluation.
cols = ['amount', 'oldbalanceOrg', 'oldbalanceDest',
        'Merchant', 'CASH_OUT', 'DEBIT',
        'TRANSFER', 'newbalanceOrig', 'newbalanceDest', 'isFlaggedFraud',
        'propofamount']

# Removed 'step', 'PAYMENT', CASH_IN

target = ['isFraud']

X = data[cols].values
Y = data[target].values

# Hard predictions: sigmoid output thresholded at 0.5.
yptest = mod.predict(X)
yptest = yptest >= 0.5
mat = confusion_matrix(Y, yptest)
print(mat)
print('\n')

# sklearn's layout is rows = true label, columns = predicted label, labels in
# ascending order, i.e. [[TN, FP], [FN, TP]] for the 0/1 fraud target.
# The previous code read mat[0][0] as TP, which is actually TN, so the printed
# precision/recall did not describe the fraud class.
TN, FP, FN, TP = mat.ravel()

print('Precision:', TP / (TP + FP))
print('Recall:', TP / (TP + FN))
print('\n')

print(classification_report(Y, yptest))


In [None]:
# Conclusion: the random forest classifier gives the best results of the models tried.