In [17]:
pip install pandas


Note: you may need to restart the kernel to use updated packages.


In [18]:
import pandas as pd

In [19]:
# Load the fraud dataset; the path is relative to the notebook's working directory.
csv_path = 'dataset/fraud_data.csv'
data = pd.read_csv(csv_path)

In [20]:
# Preview the first five rows to check the column names and value formats.
data.head(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [21]:
import numpy as np

In [22]:
# Count the missing values in each column.
missing_per_column = data.isna().sum()
print(missing_per_column)

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64


In [23]:
# How many transactions of each type does the dataset contain?
type_frequency = data['type'].value_counts()
print(type_frequency)

CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: type, dtype: int64


In [24]:
# Donut chart showing the share of each transaction type counted above.
import plotly.express as px

# Named `type_counts` rather than `type` so the built-in type() is not
# shadowed for every later cell run in this kernel.
type_counts = data['type'].value_counts()
transactions = type_counts.index   # transaction type labels
quantity = type_counts.values      # number of rows per label

figure = px.pie(data, values = quantity, names = transactions, hole = 0.5, title = 'Distribution Of Transaction types')
figure.show()

In [25]:
# Select any rows whose 'isFraud' label is missing
missing_label_mask = data['isFraud'].isnull()
rows_with_nan = data.loc[missing_label_mask]

# Show them (an empty frame means every row has a label)
print(rows_with_nan)

Empty DataFrame
Columns: [step, type, amount, nameOrig, oldbalanceOrg, newbalanceOrig, nameDest, oldbalanceDest, newbalanceDest, isFraud, isFlaggedFraud]
Index: []


In [26]:
# Drop every row that has at least one missing value; `data` is modified in place
data.dropna(axis=0, how='any', inplace=True)


In [27]:
# Repeat the missing-label check on 'isFraud' after the dropna step
unlabeled_mask = data['isFraud'].isnull()
rows_with_nan = data[unlabeled_mask]

# An empty DataFrame here means no unlabeled rows remain
print(rows_with_nan)

Empty DataFrame
Columns: [step, type, amount, nameOrig, oldbalanceOrg, newbalanceOrig, nameDest, oldbalanceDest, newbalanceDest, isFraud, isFlaggedFraud]
Index: []


In [28]:
# Correlation of every numeric column with the fraud label, strongest first.
# Only numeric columns are passed to corr(): at this point the frame still
# holds string columns (type, nameOrig, nameDest), and pandas >= 2.0 raises on
# non-numeric data instead of silently dropping it.
numeric_data = data.select_dtypes(include='number')
correlation_data = numeric_data.corr()
print(correlation_data['isFraud'].sort_values(ascending = False))





isFraud           1.000000
amount            0.076688
isFlaggedFraud    0.044109
step              0.031578
oldbalanceOrg     0.010154
newbalanceDest    0.000535
oldbalanceDest   -0.005885
newbalanceOrig   -0.008148
Name: isFraud, dtype: float64


In [29]:
# Encode the transaction type as an integer code and turn the 0/1 fraud flag
# into a readable label.
type_codes = {'CASH_OUT' : 1, 'PAYMENT' : 2, 'CASH_IN' : 3, 'TRANSFER' : 4, 'DEBIT' : 5}
fraud_labels = {0:'No Fraud', 1:'Fraud'}

# Map a column only while it still holds raw values. Series.map() turns every
# value that is not a dict key into NaN, so re-running this cell unguarded
# would wipe out both columns after the first run.
if data['type'].isin(list(type_codes)).any():
    data['type'] = data['type'].map(type_codes)
if data['isFraud'].isin(list(fraud_labels)).any():
    data['isFraud'] = data['isFraud'].map(fraud_labels)
print(data.head())

   step  type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1     2   9839.64  C1231006815       170136.0       160296.36   
1     1     2   1864.28  C1666544295        21249.0        19384.72   
2     1     4    181.00  C1305486145          181.0            0.00   
3     1     1    181.00   C840083671          181.0            0.00   
4     1     2  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest   isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0  No Fraud               0  
1  M2044282225             0.0             0.0  No Fraud               0  
2   C553264065             0.0             0.0     Fraud               0  
3    C38997010         21182.0             0.0     Fraud               0  
4  M1230701703             0.0             0.0  No Fraud               0  


In [30]:
# Build the feature matrix and the target array for the classifier
from sklearn.model_selection import train_test_split

feature_columns = ['type', 'amount', 'oldbalanceOrg', 'newbalanceOrig']
target_columns = ['isFraud']

# x: one row per transaction, one column per selected feature
x = np.array(data[feature_columns])

# y: selected with a list of column names, so it is a single-column 2-D array
y = np.array(data[target_columns])



In [31]:
# Train a decision tree and report its accuracy on a held-out split
from sklearn.tree import DecisionTreeClassifier

# Hold out 10% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, x_test, Y_train, y_test = train_test_split(x, y, test_size= 0.10, random_state = 42)

# Seed the tree as well: scikit-learn permutes the candidate features at each
# split, so an unseeded tree (and the score below) can differ between runs.
model = DecisionTreeClassifier(random_state = 42)
model.fit(X_train, Y_train)

# score() is plain accuracy on the held-out rows.
# NOTE(review): if fraud rows are rare, accuracy alone can look high while
# fraud recall is poor; consider also reporting precision/recall.
print(model.score(x_test, y_test))

0.9997359578286932


In [32]:
# Predict the label for one hand-written transaction.
# Feature order must match training: [type, amount, oldbalanceOrg, newbalanceOrig]
# `type` takes the integer codes assigned when the column was encoded:
#   1=CASH_OUT, 2=PAYMENT, 3=CASH_IN, 4=TRANSFER, 5=DEBIT
# 0 is not part of that encoding, so a real code (1 = CASH_OUT) is used here.
# NOTE(review): a tree fitted on codes 1-5 should route 0 and 1 the same way
# (its split thresholds fall between observed values), so the printed label is
# expected to be unchanged; confirm on re-run.
features = np.array([[1, 9000.00, 12000.00, 3000.0]])
print(model.predict(features))

['No Fraud']
