In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [None]:
# Load the dataset and take a first look at its structure.
df = pd.read_csv('fraud_detection_data.csv')  # Replace with your file path

print(df.head())
# DataFrame.info() writes its summary itself and returns None, so it is called
# bare; wrapping it in print() would add a stray "None" line to the output.
df.info()
print(df.describe())
print(df['fraudulent'].value_counts())  # Assuming 'fraudulent' is target column


In [None]:
# Impute missing values, then one-hot encode the categorical columns.
#
# Each column is re-assigned rather than filled with `inplace=True` on
# `df[col]`: the chained in-place form acts on an intermediate object, which
# triggers a FutureWarning on pandas 2.x and leaves `df` unchanged once
# Copy-on-Write is active.

# Numerical columns: fill with the column median.
for col in df.select_dtypes(include=np.number).columns:
    df[col] = df[col].fillna(df[col].median())

# Categorical (object) columns: fill with the most frequent value.
for col in df.select_dtypes(include='object').columns:
    modes = df[col].mode()
    # mode() is empty when the column has no non-null values; there is nothing
    # to impute with in that case, so the column is left as-is.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

# Convert categorical variables to indicator columns; drop_first=True drops
# one level per variable.
df = pd.get_dummies(df, drop_first=True)

# Separate the target column from the feature matrix.
y = df['fraudulent']
X = df.drop(columns='fraudulent')

# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardise features for scale-sensitive models (e.g. Logistic Regression).
# The scaler learns its statistics from the training split only and then
# applies the same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Baseline: a random forest with default hyper-parameters, scored by 5-fold
# cross-validated accuracy on the training split.
model = RandomForestClassifier(random_state=42)
scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
mean_cv_accuracy = scores.mean()
print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean CV accuracy: {mean_cv_accuracy:.4f}")


In [None]:
# Exhaustive search over 3 x 3 x 2 = 18 random-forest configurations, each
# scored by 5-fold cross-validated accuracy on the training split.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 20],
    criterion=['gini', 'entropy'],
)

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

best_grid_params = grid_search.best_params_
best_grid_score = grid_search.best_score_
print(f"Best parameters from Grid Search: {best_grid_params}")
print(f"Best CV accuracy from Grid Search: {best_grid_score:.4f}")


In [None]:
# Randomized search: sample 10 configurations from the space below and score
# each by 5-fold cross-validated accuracy on the training split.
param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [5, 10, 20, 30, None],
    'criterion': ['gini', 'entropy'],
    # 'auto' is intentionally not listed: it was removed from max_features in
    # scikit-learn 1.3 (candidates using it fail to fit), and for classifiers
    # it was only an alias of 'sqrt'.
    'max_features': ['sqrt', 'log2']
}

random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_dist,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    random_state=42,  # fixes which configurations get sampled
)
random_search.fit(X_train, y_train)

print(f"Best parameters from Randomized Search: {random_search.best_params_}")
print(f"Best CV accuracy from Randomized Search: {random_search.best_score_:.4f}")


In [None]:
# Take the winning estimator from the grid search. GridSearchCV (default
# refit=True) has already refit it on the full training split, so it is used
# directly instead of being fitted a second time.
best_model = grid_search.best_estimator_

# Rank features by the forest's importance scores, highest first.
feature_importances = pd.Series(
    best_model.feature_importances_,
    index=X_train.columns,
).sort_values(ascending=False)

top_features = feature_importances.head(10)
print("Top 10 Important Features:")
print(top_features)

# Plot the ten most important features as a horizontal bar chart.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_features, y=top_features.index, ax=ax)
ax.set_title('Feature Importance')
plt.show()


In [None]:
# L2-regularised logistic regression, trained and evaluated on the
# standardised feature matrices.
log_reg = LogisticRegression(solver='liblinear', penalty='l2', random_state=42)
log_reg.fit(X_train_scaled, y_train)

# Predict on the held-out split and summarise per-class metrics.
y_pred = log_reg.predict(X_test_scaled)
log_reg_report = classification_report(y_test, y_pred)
print("Logistic Regression with L2 Regularization")
print(log_reg_report)


In [None]:
# Score the selected random forest on the held-out test split.
y_pred_rf = best_model.predict(X_test)
rf_report = classification_report(y_test, y_pred_rf)
rf_accuracy = accuracy_score(y_test, y_pred_rf)

print("Random Forest Classification Report:")
print(rf_report)

print("Accuracy:", rf_accuracy)
