In [None]:
import pandas as pd

# Load the dataset (example dataset used here)
url = 'https://raw.githubusercontent.com/nsethi31/Kaggle-Data-Credit-Card-Fraud-Detection/master/creditcard.csv'
credit_card_data = pd.read_csv(url)

# Display the first few rows and check for missing values
print(credit_card_data.head())
print(credit_card_data.isnull().sum())

# Check the class distribution (fraudulent vs non-fraudulent)
print(credit_card_data['Class'].value_counts())


   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [None]:
from sklearn.preprocessing import StandardScaler

# Drop any missing values if present (not necessary if dataset is clean)
credit_card_data.dropna(inplace=True)

# Normalize the 'Amount' column
scaler = StandardScaler()
credit_card_data['Amount'] = scaler.fit_transform(credit_card_data['Amount'].values.reshape(-1, 1))


In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Separate input features and target variable
X = credit_card_data.drop('Class', axis=1)
y = credit_card_data['Class']

# Using SMOTE to handle class imbalance
over_sampler = SMOTE(sampling_strategy=0.1, random_state=42)
under_sampler = RandomUnderSampler(sampling_strategy=0.5, random_state=42)

# Combine over-sampling and under-sampling in a pipeline
sampling_pipeline = Pipeline([
    ('over', over_sampler),
    ('under', under_sampler)
])

# Apply the sampling pipeline
X_resampled, y_resampled = sampling_pipeline.fit_resample(X, y)

# Check the new class distribution
print(pd.Series(y_resampled).value_counts())


Class
0    56862
1    28431
Name: count, dtype: int64


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the model
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate performance
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11321
           1       1.00      1.00      1.00      5738

    accuracy                           1.00     17059
   macro avg       1.00      1.00      1.00     17059
weighted avg       1.00      1.00      1.00     17059

[[11313     8]
 [   24  5714]]
