In [8]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Load dataset
url = "https://www.kaggleusercontent.com/mlgulb/creditcardfraud"
df = pd.read_csv('/content/creditcard.csv')

# Preprocessing
df.drop_duplicates(inplace=True)

# Handle missing values
df.fillna(df.median(), inplace=True)  # Fill NaN with median values

X = df.drop(columns=['Class'])
y = df['Class']

# Ensure there are no NaN values in y
y = y.fillna(y.mode()[0])  # Fill NaN in target with the most frequent class

# Handle class imbalance using SMOTE
smote = SMOTE(sampling_strategy=0.1, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train Isolation Forest model
model = IsolationForest(n_estimators=100, contamination=0.02, random_state=42)
model.fit(X_train)

# Predictions
y_pred = model.predict(X_test)
y_pred = np.where(y_pred == -1, 1, 0)  # Convert anomaly (-1) to fraud (1)

# Evaluation
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, AUC-ROC: {auc:.4f}")


Precision: 0.9895, Recall: 0.2027, AUC-ROC: 0.6013
