Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Mount Google Drive at /content/drive (google.colab is only importable on Colab);
# the dataset CSV is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# read the CSV stored on the mounted Drive into a pandas DataFrame
dataset_path = '/content/drive/MyDrive/ML_PRJ/anomaly_data.csv'
credit_card_data = pd.read_csv(dataset_path)

In [None]:
# preview the top of the dataset (first 5 rows)
credit_card_data.head(n=5)

In [None]:
# preview the bottom of the dataset (last 5 rows)
credit_card_data.tail(n=5)

In [None]:
# dataset summary: column names, non-null counts, dtypes and memory usage
credit_card_data.info()

In [None]:
# count the missing values per column (isna is the same check as isnull)
credit_card_data.isna().sum()

In [None]:
# how many legit vs. fraudulent transactions the dataset contains
credit_card_data.Class.value_counts()

This dataset is highly imbalanced.

0 --> Normal Transaction

1 --> fraudulent transaction

In [None]:
import matplotlib.pyplot as plt

# Order the counts explicitly by class id (0 = normal, 1 = fraudulent) so each
# wedge is paired with the right label. value_counts() alone orders by
# frequency, which only lines up with the label list while class 0 happens to
# be the majority class.
class_counts = credit_card_data['Class'].value_counts().reindex([0, 1], fill_value=0)

plt.figure(figsize=(5, 6))
plt.pie(class_counts, labels=['Normal', 'Fraudulent'], autopct='%1.1f%%', startangle=90)
plt.title('Distribution of Transactions')
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

In [None]:
# split the transactions by label: Class 0 -> legit, Class 1 -> fraud
legit_mask = credit_card_data['Class'] == 0
fraud_mask = credit_card_data['Class'] == 1
legit = credit_card_data.loc[legit_mask]
fraud = credit_card_data.loc[fraud_mask]

In [None]:
# (rows, columns) of the legit subset, then of the fraud subset, one per line
print(legit.shape, fraud.shape, sep='\n')

In [None]:
# summary statistics of the Amount column for legit transactions
legit['Amount'].describe()

In [None]:
# summary statistics of the Amount column for fraudulent transactions
fraud['Amount'].describe()

In [None]:
# per-class mean of every column, to compare legit and fraudulent transactions
credit_card_data.groupby(by='Class').mean()

Under-Sampling

Build a sample dataset containing a similar number of normal and fraudulent transactions.

Number of Fraudulent Transactions --> 496


In [None]:
# Under-sample the legit class down to the number of fraud rows so the two
# classes end up balanced; a hard-coded n=100 does not track the fraud count.
# min() guards against asking for more rows than legit actually has.
n_legit_sample = min(len(legit), len(fraud))
# Fixed seed so the sampled rows (and every metric derived from them) are
# reproducible across Restart & Run All.
legit_sample = legit.sample(n=n_legit_sample, random_state=2)

Concatenating two DataFrames

In [None]:
# stack the sampled legit rows on top of all the fraud rows (row-wise concat)
frames_to_stack = [legit_sample, fraud]
new_dataset = pd.concat(frames_to_stack, axis=0)

In [None]:
# first 5 rows of the under-sampled dataset
new_dataset.head(n=5)

In [None]:
# last 5 rows of the under-sampled dataset
new_dataset.tail(n=5)

In [None]:
# class counts after under-sampling
new_dataset.Class.value_counts()

In [None]:
# per-class column means of the under-sampled dataset
new_dataset.groupby(by='Class').mean()

Splitting the data into Features & Targets

In [None]:
# features: every column except the label; target: the label column itself
target_column = 'Class'
X = new_dataset.drop(columns=target_column)
Y = new_dataset[target_column]

In [None]:
# Leave the frame as the cell's last expression so the notebook renders it
# with the rich (HTML) DataFrame display instead of print()'s plain-text dump.
X

In [None]:
# Leave the Series as the cell's last expression so the notebook displays it
# as cell output rather than routing it through print().
Y

Split the data into Training data & Testing Data

In [None]:
# hold out 20% of the rows for testing; stratify on the label so both parts
# keep the same class ratio, with a fixed seed for a repeatable split
split_parts = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)
X_train, X_test, Y_train, Y_test = split_parts

In [None]:
# shapes of the full, training and test feature frames, on one line
print(*(frame.shape for frame in (X, X_train, X_test)))

Model Training

Logistic Regression

In [None]:
# The features are fed to the solver unscaled, and the default cap of 100
# iterations can stop the solver before it converges (scikit-learn then emits
# a ConvergenceWarning). Raise the cap so the fit can run to convergence.
model = LogisticRegression(max_iter=1000)

In [None]:
# fit the logistic regression model on the training split
model.fit(X=X_train, y=Y_train)

Model Evaluation

Accuracy Score

In [None]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
# accuracy_score is documented as (y_true, y_pred): true labels first, then
# the predictions. The value is the same either way, but this order matches
# the API and stays correct if the metric is later swapped for an asymmetric one.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# report the training-set accuracy computed in the previous cell
training_report_prefix = 'Accuracy on Training data : '
print(training_report_prefix, training_data_accuracy)

In [None]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): true labels first, then
# the predictions. The value is the same either way, but this order matches
# the API and stays correct if the metric is later swapped for an asymmetric one.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# report the held-out test accuracy computed in the previous cell
test_report_prefix = 'Accuracy score on Test Data : '
print(test_report_prefix, test_data_accuracy)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Overlay the Amount histograms (50 bins, with KDE curves) of the legit and
# fraud subsets on a single axes: legit in blue, fraud in red.
fig, ax = plt.subplots(figsize=(10, 6))
for frame, series_label, series_color in ((legit, 'Legit', 'blue'), (fraud, 'Fraud', 'red')):
    sns.histplot(frame['Amount'], bins=50, label=series_label, color=series_color, kde=True, ax=ax)
ax.set_title('Distribution of Transaction Amounts')
ax.set_xlabel('Amount')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# pairwise correlations between all columns of the under-sampled dataset
corr_matrix = new_dataset.corr()

# annotated heatmap of those correlations on an explicitly created axes
fig, ax = plt.subplots(figsize=(20, 12))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
 import matplotlib.pyplot as plt

# Get feature importances from the Logistic Regression model
feature_importances = model.coef_[0]
feature_names = X.columns

# Create a bar plot of feature importances
plt.figure(figsize=(10, 6))
plt.bar(feature_names, feature_importances)
plt.title('Feature Importances')
plt.xlabel('Features')
plt.ylabel('Importance')
plt.xticks(rotation=90)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Class counts of the under-sampled dataset, ordered explicitly by class id
# (0 = legitimate, 1 = fraudulent). value_counts() orders by frequency, so
# pairing its output positionally with the labels below would swap the wedge
# labels whenever fraud rows outnumber legit rows.
class_counts = new_dataset['Class'].value_counts().reindex([0, 1], fill_value=0)
labels = ['Legitimate', 'Fraudulent']
sizes = class_counts.values

plt.figure(figsize=(6, 6))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
plt.title('Distribution of Legitimate and Fraudulent Transactions')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.show()

In [None]:
# Re-compute and report the training accuracy (repeats the evaluation done
# right after model training).
X_train_prediction = model.predict(X_train)
# accuracy_score is documented as (y_true, y_pred): true labels first, then
# the predictions.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy on Training data : ', training_data_accuracy)