### Load the dataset

In [1]:
import pandas as pd

In [2]:
# Load the dataset
data  = "creditcard.csv"
data = pd.read_csv(data)

In [3]:
# Display basic information about the dataset
print(data.info())

# Display the first few rows of the dataset
print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

### Data Preprocessing

In [4]:
from sklearn.preprocessing import StandardScaler

# Standardize the 'Amount' column
scaler = StandardScaler()
data['Amount'] = scaler.fit_transform(data['Amount'].values.reshape(-1, 1))

# Drop unnecessary columns (if any)
# For example, if 'Time' is not relevant, you can drop it
data = data.drop(['Time'], axis=1)


In [5]:
pip install imbalanced-learn


Note: you may need to restart the kernel to use updated packages.


### Handle Class Imbalance

In [6]:
# Count the number of fraudulent and genuine transactions
fraud_count = data['Class'].sum()
genuine_count = len(data) - fraud_count

# Display class distribution
print(f"Fraudulent transactions: {fraud_count}")
print(f"Genuine transactions: {genuine_count}")


Fraudulent transactions: 492
Genuine transactions: 284315


### Split the dataset

In [7]:
from sklearn.model_selection import train_test_split

# Separate features (X) and target variable (y)
X = data.drop('Class', axis=1)
y = data['Class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
from sklearn.utils import resample

# Combine X_train and y_train for resampling
combined_data = pd.concat([X_train, y_train], axis=1)

# Separate majority and minority classes
majority_class = combined_data[combined_data['Class'] == 0]
minority_class = combined_data[combined_data['Class'] == 1]

# Upsample the minority class
minority_upsampled = resample(minority_class, replace=True, n_samples=len(majority_class), random_state=42)

# Combine the upsampled minority class with the majority class
upsampled_data = pd.concat([majority_class, minority_upsampled])

# Separate features (X_resampled) and target variable (y_resampled)
X_resampled = upsampled_data.drop('Class', axis=1)
y_resampled = upsampled_data['Class']

# Check the class distribution after upsampling
print("Class distribution after upsampling:")
print(y_resampled.value_counts())


Class distribution after upsampling:
0    227451
1    227451
Name: Class, dtype: int64


### Build a Random Forest Classifier

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_resampled, y_resampled)


### Evaluate the model

In [10]:
from sklearn.metrics import classification_report, confusion_matrix

# Make predictions on the test set
y_pred = model.predict(X_test)

# Display classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.99      0.80      0.88        98

    accuracy                           1.00     56962
   macro avg       0.99      0.90      0.94     56962
weighted avg       1.00      1.00      1.00     56962

Confusion Matrix:
[[56863     1]
 [   20    78]]
