<a href="https://colab.research.google.com/github/rodoshi16/Fraud-transaction-detection-system-/blob/main/Data_prep_Fraud_detection_system_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import pandas as pd
import os
import gdown
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

file_id = "1XWydcU314PzUnerMD-aznUHrDcwSKuqM"
output = 'creditcard.csv'
url = f"https://drive.google.com/uc?id={file_id}"
gdown.download(url, output, quiet=False)

if not os.path.exists('data'):
    os.makedirs('data')

# reading the dataset
df = pd.read_csv('creditcard.csv')


print('Missing values per column:')
print(df.isnull().sum())

print("Zero values per column:")
print((df == 0).sum())

# Dropping duplicate rows if any
df = df.drop_duplicates()

# In the dataset, time and Amount have different scales. ML models work
# best when they have a similar scale. Therefore, we normalize it.

scaler = StandardScaler()
df[['Amount', 'Time']] = scaler.fit_transform(df[['Amount', 'Time']])


# We need to separate input from output. The values stored in class represent if
# it's a fraud transaction or not. We need to store this as y for testing.

# we want all the features except for class to use to predict fraud
X = df.drop(columns=['Class'])
y = df['Class']

# we need to break the data into training and testing sets
# 70% of the data is used to train the model through X_train and y_train

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Second split: 50% validation, 50% test from the temp set (because 50% of 30% = 15% each)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)


# My dataset is highly imbalanced :(
# When I parsed out the number of fraud cases vs not: its 492/ 284315 (normal data)
# Now there should be equal representation of the data

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("\nOriginal class distribution:")
print(y.value_counts())

print("\nResampled class distribution:")
print(pd.Series(y_train_resampled).value_counts())

print(f"x_train_vec shape: {x_train_vec.shape}, y_train shape: {y_train.shape}")
print(f"x_val_vec shape: {X_val_vec.shape}, y_val shape: {y_val.shape}")
print(f"x_test_vec shape: {X_test_vec.shape}, y_test shape: {y_test.shape}")

# Cleaned dataset - yay!!!

# X_train_resampled.to_csv('X_train_cleaned.csv', index=False)
# y_train_resampled.to_csv('y_train_cleaned.csv', index=False)
# X_test.to_csv('X_test_cleaned.csv', index=False)
# y_test.to_csv('y_test_cleaned.csv', index=False)

print('My dataset is clean and ready to go!')







Downloading...
From (original): https://drive.google.com/uc?id=1XWydcU314PzUnerMD-aznUHrDcwSKuqM
From (redirected): https://drive.google.com/uc?id=1XWydcU314PzUnerMD-aznUHrDcwSKuqM&confirm=t&uuid=cb6dce09-10dc-4bab-bf84-b01d3c002548
To: /content/creditcard.csv
100%|██████████| 151M/151M [00:00<00:00, 195MB/s]


Missing values per column:
Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64
Zero values per column:
Time           2
V1             0
V2             0
V3             0
V4             0
V5             0
V6             0
V7             0
V8             0
V9             0
V10            0
V11            0
V12            0
V13            0
V14            0
V15            0
V16            0
V17            0
V18            0
V19            0
V20            0
V21            0
V22            0
V23            0
V24            0
V25            0
V26            0
V27            0
V28            0
Amount      1825
Class     284315
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[['Amount', 'Time']] = scaler.fit_transform(df[['Amount', 'Time']])



Original class distribution:
Class
0    283253
1       473
Name: count, dtype: int64

Resampled class distribution:
Class
0    198269
1    198269
Name: count, dtype: int64
x_train_vec shape: (30, 30), y_train shape: (198608,)
x_val_vec shape: (30, 30), y_val shape: (42559,)
x_test_vec shape: (30, 30), y_test shape: (42559,)
My dataset is clean and ready to go!


In [14]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)
print(f"Validation Accuracy: {accuracy}")
print(f"Confusion Matrix: \n{conf_matrix}")

Validation Accuracy: 0.975093399750934
Confusion Matrix: 
[[41441  1052]
 [    8    58]]
