In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix

# --- 1. Data Loading and Exploration ---
# Load the transactions table; the rest of the script relies on the
# 'Class', 'Amount' and 'Time' columns being present.
data = pd.read_csv("creditcard.csv")

print(data.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
data.info()
print(data.isnull().sum())

# Bar chart of how many rows fall into each 'Class' value.
sns.countplot(x='Class', data=data)
plt.show()

print(data[['Amount', 'Time']].describe())

# --- 2. Data Preprocessing ---
# Split FIRST. Every fitted step below (scaling, SMOTE, the models) must only
# ever see the training portion; otherwise statistics of the held-out rows
# leak into training and the reported metrics are optimistic.
X = data.drop('Class', axis=1)
y = data['Class']

# stratify=y keeps the class ratio identical in both portions, which matters
# when one class is rare.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Explicit copies so the column assignments below never write into `data`.
X_train = X_train.copy()
X_test = X_test.copy()

# One scaler for both columns (StandardScaler standardises each column
# independently), fit on the training rows only and then applied unchanged
# to the test rows. `scaler` therefore stays usable for both columns later.
scale_cols = ['Amount', 'Time']
scaler = StandardScaler()
X_train[scale_cols] = scaler.fit_transform(X_train[scale_cols])
X_test[scale_cols] = scaler.transform(X_test[scale_cols])

# --- 3. Handling Class Imbalance ---
# Oversample the minority class in the TRAINING data only. The test set keeps
# its original class distribution so the evaluation reflects real conditions
# and contains no synthetic rows.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# --- 4. Model Training ---
# Logistic Regression: trained on the balanced (resampled) training data.
# max_iter is raised from the default so the solver has room to converge.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_resampled, y_resampled)

y_pred_lr = lr_model.predict(X_test)

print("Logistic Regression Results:")
print(classification_report(y_test, y_pred_lr))
sns.heatmap(confusion_matrix(y_test, y_pred_lr), annot=True, fmt='d')
plt.show()

# Isolation Forest: an unsupervised anomaly detector, so it is fit on the
# original (non-oversampled) training rows -- on SMOTE-balanced data the
# "anomalies" would make up half of the sample and no longer be anomalous.
# NOTE(review): assumes 'Class' is coded 0/1 with 1 marking the rare class,
# so its mean is the expected anomaly share -- confirm against the dataset.
fraud_rate = float(y_train.mean())
# contamination must lie in (0, 0.5]; fall back to 'auto' outside that range.
contamination = fraud_rate if 0 < fraud_rate <= 0.5 else 'auto'
iso_forest = IsolationForest(contamination=contamination, random_state=42)
iso_forest.fit(X_train)

# predict() returns -1 for outliers and 1 for inliers; map that to the
# 1 (flagged) / 0 (normal) coding used by y_test in a single step.
y_pred_iso = (iso_forest.predict(X_test) == -1).astype(int)

print("\nIsolation Forest Results:")
print(classification_report(y_test, y_pred_iso))
sns.heatmap(confusion_matrix(y_test, y_pred_iso), annot=True, fmt='d')
plt.show()

# --- 5. Interpretation and Insights (Logistic Regression) ---
# Pair each input column with the weight the fitted logistic regression
# assigned to it, ordered from the largest (most positive) coefficient down
# to the smallest (most negative).
feature_names = X.columns
coefficients = lr_model.coef_[0]  # first (only) row of the coefficient matrix
feature_importance = pd.DataFrame(
    {
        'Feature': feature_names,
        'Coefficient': coefficients,
    }
).sort_values(by='Coefficient', ascending=False)

print("\nFeature Importance (Logistic Regression):\n", feature_importance)