In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

In [2]:
# Load the CSV file
data = pd.read_csv('creditcard.csv.zip')
# Display the first few rows of the dataset
print(data.head())
# Check for any missing values
missing_values = data.isnull().sum()
print(missing_values)

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [3]:
# Separate features and target variable
X = data.drop(columns=['Class'])
y = data['Class']

# Normalize the 'Amount' and 'Time' columns
scaler = StandardScaler()
X[['Amount', 'Time']] = scaler.fit_transform(X[['Amount', 'Time']])

# Handle class imbalance using SMOTE (Synthetic Minority Over-sampling Technique)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [4]:
# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model's performance
report = classification_report(y_test, y_pred, target_names=['Genuine', 'Fraudulent'])
conf_matrix = confusion_matrix(y_test, y_pred)
0
report, conf_matrix

('              precision    recall  f1-score   support\n\n     Genuine       1.00      1.00      1.00     56750\n  Fraudulent       1.00      1.00      1.00     56976\n\n    accuracy                           1.00    113726\n   macro avg       1.00      1.00      1.00    113726\nweighted avg       1.00      1.00      1.00    113726\n',
 array([[56739,    11],
        [    0, 56976]], dtype=int64))

In [5]:
from sklearn.utils import resample

# Combine the features and target into one dataframe for easier manipulation
df = pd.concat([X, y], axis=1)

# Separate the majority and minority classes
df_majority = df[df.Class == 0]
df_minority = df[df.Class == 1]

# Undersample the majority class
df_majority_downsampled = resample(df_majority, 
                                   replace=False,  # sample without replacement
                                   n_samples=len(df_minority),  # match minority class
                                   random_state=42)  # for reproducibility

# Combine the minority class with the downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Separate features and target variable again
X_downsampled = df_downsampled.drop(columns=['Class'])
y_downsampled = df_downsampled['Class']

# Split the downsampled dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_downsampled, y_downsampled, test_size=0.2, random_state=42)

# Train the Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model's performance
report = classification_report(y_test, y_pred, target_names=['Genuine', 'Fraudulent'])
conf_matrix = confusion_matrix(y_test, y_pred)

report, conf_matrix

('              precision    recall  f1-score   support\n\n     Genuine       0.90      0.96      0.93        99\n  Fraudulent       0.96      0.89      0.92        98\n\n    accuracy                           0.92       197\n   macro avg       0.93      0.92      0.92       197\nweighted avg       0.93      0.92      0.92       197\n',
 array([[95,  4],
        [11, 87]], dtype=int64))