In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [None]:
# Read the transactions CSV into the working frame used by the later cells.
df = pd.read_csv('credit_card_fraud.csv')

# Preview the leading rows to sanity-check the columns that were parsed.
preview_rows = df.head()
print(preview_rows)

# Per-column count of null entries.
null_counts = df.isnull().sum()
print("Missing Values:\n", null_counts)

# Row count per distinct value of the 'fraudulent' label column.
label_counts = df['fraudulent'].value_counts()
print("Class Distribution:\n", label_counts)


In [None]:
# Separate the label BEFORE encoding so that pd.get_dummies only ever sees
# feature columns. Encoding the whole frame would, for a non-numeric label
# (e.g. "yes"/"no"), replace 'fraudulent' with a dummy column such as
# 'fraudulent_yes', and the lookups of 'fraudulent' below would raise KeyError.
# For a numeric/boolean label the resulting X and y are the same as before.
# NOTE(review): `df` is intentionally no longer overwritten with the encoded
# frame, so the raw data stays available; no later cell here reads `df`.
y = df['fraudulent']

# One-hot encode the categorical feature columns; drop_first=True drops the
# first level of each encoded column to avoid a redundant indicator.
X = pd.get_dummies(df.drop(columns='fraudulent'), drop_first=True)

# Hold out 20% of the rows for testing. stratify=y keeps the label proportions
# the same in both splits; random_state pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# Baseline: a random forest with default hyperparameters, scored by
# 5-fold cross-validation on the training split only.
# NOTE(review): accuracy can look deceptively high if the 'fraudulent' classes
# are heavily imbalanced -- confirm against the class distribution printed earlier.
model = RandomForestClassifier(random_state=42)
scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)

print("Cross-validation Scores:", scores)
print("Mean Accuracy:", np.mean(scores))


In [None]:
# Exhaustive search space: 3 x 3 x 2 = 18 hyperparameter combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'criterion': ['gini', 'entropy']
}

# With cv=5 this is 90 forest fits (plus the final refit). n_jobs=-1 spreads
# those fits across all available cores; the grid, the scoring metric and the
# random seed are unchanged, so only the scheduling of the fits differs.
# NOTE(review): selection is by accuracy, which may be misleading if the label
# is heavily imbalanced -- confirm this is the intended selection metric.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best Parameters (Grid Search):", grid_search.best_params_)
print("Best Score (Grid Search):", grid_search.best_score_)


In [None]:
# Candidate values that the randomized search samples from.
param_dist = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    criterion=['gini', 'entropy'],
)

# Try 5 sampled combinations, each scored by 5-fold CV accuracy.
# random_state is fixed on both the forest and the sampler for repeatability.
search_estimator = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(
    estimator=search_estimator,
    param_distributions=param_dist,
    n_iter=5,
    cv=5,
    scoring='accuracy',
    random_state=42,
)
random_search.fit(X_train, y_train)

print("Best Parameters (Randomized Search):", random_search.best_params_)
print("Best Score (Randomized Search):", random_search.best_score_)


In [None]:
# Fit a 100-tree forest on the training split to read its feature importances.
# Note: this rebinds `model`, replacing the baseline estimator defined earlier.
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Label each importance with its column name and rank from highest to lowest.
feature_importance = pd.Series(
    data=model.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)

print("Feature Importance:\n", feature_importance)

# Horizontal bar chart of the ranked importances, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=feature_importance, y=feature_importance.index, ax=ax)
ax.set_xlabel('Importance Score')
ax.set_ylabel('Features')
ax.set_title('Feature Importance Analysis')
fig.tight_layout()
plt.show()


In [None]:
# Take the estimator that the grid search refit with its best parameter set.
best_model = grid_search.best_estimator_

# Predict labels for the held-out test split, which was not used for tuning.
y_pred = best_model.predict(X_test)

# Overall accuracy first, then the per-class precision/recall/F1 breakdown.
test_accuracy = accuracy_score(y_test, y_pred)
print("Final Model Accuracy:", test_accuracy)

test_report = classification_report(y_test, y_pred)
print("Classification Report:\n", test_report)
