In [3]:
import pandas as pd
import numpy as np
import random as rn



In [4]:
# Load the dataset
df = pd.read_csv('creditcard.csv')



In [5]:
# Manual parameters
RANDOM_SEED = 42
TRAINING_SAMPLE = 200000
VALIDATE_SIZE = 0.2
RATIO_TO_FRAUD = 15


In [6]:

# Setting random seeds for reproducibility
np.random.seed(RANDOM_SEED)
rn.seed(RANDOM_SEED)

# Convert column names to lower case and rename the 'Class' column
df.columns = map(str.lower, df.columns)
df.rename(columns={'class': 'label'}, inplace=True)



In [7]:
# Print first 5 rows to get an initial impression of the data
print(df.head())

# Add a negligible amount to avoid taking the log of 0
df['log10_amount'] = np.log10(df['amount'] + 0.00001)

# Dropping redundant columns
df = df.drop(['time', 'amount'], axis=1)

# Splitting by class
fraud = df[df.label == 1]
clean = df[df.label == 0]



   time        v1        v2        v3        v4        v5        v6        v7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         v8        v9  ...       v21       v22       v23       v24       v25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        v26       v27       v28 

In [8]:
# Undersample clean transactions
clean_undersampled = clean.sample(int(len(fraud) * RATIO_TO_FRAUD), random_state=RANDOM_SEED)

# Concatenate with fraud transactions into a single dataframe
visualisation_initial = pd.concat([fraud, clean_undersampled])



In [9]:

# Isolate features from labels
features = visualisation_initial.drop('label', axis=1).values
labels = visualisation_initial['label'].values

# Keep the label field at the back
df = df[[col for col in df.columns if col not in ['label', 'log10_amount']] + ['log10_amount', 'label']]

In [10]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    features, labels, 
    test_size=VALIDATE_SIZE, 
    random_state=RANDOM_SEED
)


In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)


In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Initialize the model
model = RandomForestClassifier(random_state=RANDOM_SEED)

# Train the model
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_val_scaled)

# Evaluate the model
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))


[[1465    1]
 [  20   89]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1466
           1       0.99      0.82      0.89       109

    accuracy                           0.99      1575
   macro avg       0.99      0.91      0.94      1575
weighted avg       0.99      0.99      0.99      1575

