In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
data = pd.read_csv('Fraud.csv')

In [3]:
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [4]:
data.columns = [column.lower() for column in data.columns]
data.columns

Index(['step', 'type', 'amount', 'nameorig', 'oldbalanceorg', 'newbalanceorig',
       'namedest', 'oldbalancedest', 'newbalancedest', 'isfraud',
       'isflaggedfraud'],
      dtype='object')

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameorig        object 
 4   oldbalanceorg   float64
 5   newbalanceorig  float64
 6   namedest        object 
 7   oldbalancedest  float64
 8   newbalancedest  float64
 9   isfraud         int64  
 10  isflaggedfraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [6]:
data.drop(['step', 'nameorig', 'namedest', 'isflaggedfraud'], axis=1, inplace=True)

In [7]:
data.head()

Unnamed: 0,type,amount,oldbalanceorg,newbalanceorig,oldbalancedest,newbalancedest,isfraud
0,PAYMENT,9839.64,170136.0,160296.36,0.0,0.0,0
1,PAYMENT,1864.28,21249.0,19384.72,0.0,0.0,0
2,TRANSFER,181.0,181.0,0.0,0.0,0.0,1
3,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1
4,PAYMENT,11668.14,41554.0,29885.86,0.0,0.0,0


In [8]:
# Define a mapping for type attributes
type_mapping = {
    'CASH-IN': 1,
    'CASH-OUT': 2,
    'DEBIT': 3,
    'PAYMENT': 4,
    'TRANSFER': 5
}

# Map the type attributes to numeric values
data['type'] = data['type'].map(type_mapping)



In [9]:
unique_values_dataset = data.nunique()
print(unique_values_dataset)

type                    3
amount            5316900
oldbalanceorg     1845844
newbalanceorig    2682586
oldbalancedest    3614697
newbalancedest    3555499
isfraud                 2
dtype: int64


In [10]:
# You can save the modified dataset to a new CSV file if needed
data.to_csv('modified_dataset.csv', index=False)

In [11]:
dt = pd.read_csv('modified_dataset.csv')

In [12]:
# Check the data types of all columns
data_types = dt.dtypes

# Print the data types of all columns
print(data_types)

type              float64
amount            float64
oldbalanceorg     float64
newbalanceorig    float64
oldbalancedest    float64
newbalancedest    float64
isfraud             int64
dtype: object


In [13]:
dt = data.drop_duplicates()
print(dt.head())

   type    amount  oldbalanceorg  newbalanceorig  oldbalancedest  \
0   4.0   9839.64       170136.0       160296.36             0.0   
1   4.0   1864.28        21249.0        19384.72             0.0   
2   5.0    181.00          181.0            0.00             0.0   
3   NaN    181.00          181.0            0.00         21182.0   
4   4.0  11668.14        41554.0        29885.86             0.0   

   newbalancedest  isfraud  
0             0.0        0  
1             0.0        0  
2             0.0        1  
3             0.0        1  
4             0.0        0  


In [14]:
dt.to_csv("preprocessed_Fraud_data.csv", index=False)

In [15]:
dt = pd.read_csv("preprocessed_Fraud_data.csv")

In [16]:
dt.head()

Unnamed: 0,type,amount,oldbalanceorg,newbalanceorig,oldbalancedest,newbalancedest,isfraud
0,4.0,9839.64,170136.0,160296.36,0.0,0.0,0
1,4.0,1864.28,21249.0,19384.72,0.0,0.0,0
2,5.0,181.0,181.0,0.0,0.0,0.0,1
3,,181.0,181.0,0.0,21182.0,0.0,1
4,4.0,11668.14,41554.0,29885.86,0.0,0.0,0


In [17]:
# Handling missing values (if any)
data.fillna(0, inplace=True)  # Replace missing values with 0 

In [18]:
X = dt.drop('isfraud', axis=1)
y = dt['isfraud']

In [19]:
# Encode categorical variables (if any)
X = pd.get_dummies(X, columns=['type'], drop_first=True)

In [20]:
# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [22]:
# Train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

In [23]:
# Predict on the test set
y_pred = model.predict(X_test)

In [24]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)


In [25]:
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)

Accuracy: 1.00
Confusion Matrix:
[[1251222      67]
 [    994     665]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1251289
           1       0.91      0.40      0.56      1659

    accuracy                           1.00   1252948
   macro avg       0.95      0.70      0.78   1252948
weighted avg       1.00      1.00      1.00   1252948



In [27]:
# the model has a high overall accuracy, correctly classifying most of the non-fraudulent transactions. 