In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# load the raw transactions CSV into the working DataFrame `df`
df = pd.read_csv(
    filepath_or_buffer="../input/fraudulent-transactions-prediction/Fraud.csv",
)

In [None]:
# checking the data type pandas inferred for every column
# (object columns are the ones that need encoding or dropping before modelling)
df.dtypes

In [None]:
# checking the shape of the data: (number of rows, number of columns)
df.shape

In [None]:
# preview the first five rows of the raw data
df.head(n=5)

In [None]:
# printing the DataFrame summary produced by pandas `info()`
# (index range, per-column dtype and memory usage)
df.info()

In [None]:
# summary statistics of the numeric columns, transposed so that
# each variable is a row and each statistic is a column
df.describe().T

In [None]:
# number of missing values in each column
df.isna().sum()

In [None]:
# checking the correlation of every numeric variable with the target variable
# NOTE: the correlation is computed on the numeric columns only. The frame still
# holds string columns at this point (they are dropped / encoded further down),
# and pandas >= 2.0 raises on them because `numeric_only` now defaults to False.
# Selecting the numeric dtypes explicitly works on old and new pandas alike.
df.select_dtypes(include="number").corr()["isFraud"].sort_values()

In [None]:
# heatmap of the pairwise correlations between the numeric variables
# (the target `isFraud` is one of them)
# NOTE: string columns are excluded explicitly; calling `df.corr()` on the full
# frame raises on pandas >= 2.0, where non-numeric columns are no longer
# silently dropped.
plt.figure(figsize=(12,8))
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True)

In [None]:
# absolute number of rows per value of the target `isFraud`
df["isFraud"].value_counts()
# as the counts show, the dataset is unbalanced

In [None]:
# same counts expressed as a share of all rows (normalize=True)
df["isFraud"].value_counts(normalize=True)
# the non-fraud class makes up the vast majority of the data
# (original note: more than 90 % -- read the exact share from the output)

In [None]:
# checking how many transactions there are of each payment type
df["type"].value_counts()

In [None]:
# counting the number of transactions per type, split by the fraud label
plt.figure(figsize=(12,8))
sns.countplot(x="type", data=df, hue="isFraud", palette="Set2")
# On a linear axis we are unable to analyse the fraud transactions, because the
# non-fraud transactions outnumber them so heavily that the fraud bars vanish.
# A logarithmic y axis keeps both classes readable in the same chart.
plt.yscale("log")

In [None]:
# work on an independent (deep) copy so the raw `df` stays untouched;
# the unneeded variables are dropped from this copy in later cells
df_copy = df.copy(deep=True)
# list the available columns
df_copy.columns

In [None]:
# checking the step variable
# About the step : maps a unit of time in the real world
# In this case 1 step is 1 hour of time. Total steps 744, i.e. 744 / 24 = 31 days
# of simulation (the "30 days" often quoted does not match 744 hourly steps).
# `value_counts()` already returns the counts sorted in descending order, so the
# five busiest steps are simply the first five entries; no extra sort is needed.
df["step"].value_counts().head()

In [None]:
# names of the columns stored with the `object` dtype
df_copy.select_dtypes(include="object").columns

In [None]:
# dropping nameOrig and nameDest
# The drop is written as a re-assignment with errors="ignore" so the cell can be
# executed more than once: the original in-place drop raised a KeyError on the
# second run because the columns were already gone.
df_copy = df_copy.drop(columns=["nameOrig", "nameDest"], errors="ignore")

In [None]:
# preview the working copy after removing the name columns
df_copy.head(n=5)

In [None]:
# One-hot encoding the type variable
# `type` is a nominal category. LabelEncoder (which scikit-learn documents as a
# tool for target labels, not input features) would map the categories to
# arbitrary integers, and the linear model fitted later would read those integers
# as an ordered magnitude. One indicator column per category avoids that.
# dtype=int keeps the frame purely numeric, so the later `.values` call yields a
# numeric array rather than an object array (newer pandas defaults to bool).
# The guard makes the cell safe to re-run once `type` has been replaced.
if "type" in df_copy.columns:
    df_copy = pd.get_dummies(df_copy, columns=["type"], prefix="type", dtype=int)

In [None]:
# preview the working copy after encoding the type variable
df_copy.head(n=5)

In [None]:
# dropping the isFlaggedFraud variable as we don't need it
# Re-assignment with errors="ignore" keeps the cell re-runnable: the original
# in-place drop raised a KeyError when executed a second time.
df_copy = df_copy.drop(columns=["isFlaggedFraud"], errors="ignore")

In [None]:
# preview the working copy after removing isFlaggedFraud
df_copy.head(n=5)

In [None]:
# preparing the data for the model
# we will use Logistic Regression as a simple baseline classifier
# (the class imbalance has to be handled separately; the model choice alone
# does not address it)
# First we need to create a training and test set
from sklearn.model_selection import train_test_split
X = df_copy.drop("isFraud",axis=1).values
y = df_copy["isFraud"].values
# stratify=y keeps the fraud / non-fraud ratio identical in both splits; with a
# rare positive class a purely random split can leave the test set with a
# noticeably different share of fraud cases than the training set.
X_train,X_test,y_train,y_test = train_test_split(
    X,y,test_size=0.2,random_state=42,stratify=y
)

In [None]:
# standardise the features: the scaler learns its statistics from the training
# split only and the same fitted transformation is then applied to both splits
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
# class_weight="balanced" re-weights the samples inversely to the class
# frequencies, so the rare fraud class is not drowned out by the majority class
# during fitting. Expect this to raise recall on fraud at the cost of precision.
# max_iter is raised from the default of 100 to give the solver room to converge.
log_reg = LogisticRegression(class_weight="balanced", max_iter=1000)
log_reg.fit(X_train,y_train)
# hard class predictions for the held-out test set
pred = log_reg.predict(X_test)

In [None]:
# evaluating the test-set predictions: the confusion matrix first
# (rows = true class, columns = predicted class), then the per-class
# precision / recall / f1 report
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_true=y_test, y_pred=pred))
print(classification_report(y_true=y_test, y_pred=pred))