# Baseline Feature Selection for E-commerce Fraud Detection

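This notebook performs baseline feature selection: it scores every feature against the `is_fraud` target with a univariate ANOVA F-test (`f_classif`), computed on the training split only, and ranks the features by score to identify those most relevant to the fraud detection model.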

In [None]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split

# Load the cleaned transaction data (path is relative to the notebook's
# working directory).
data = pd.read_csv('../data/processed/transactions_cleaned.csv')

# Define features and target variable: everything except `is_fraud` is a
# candidate feature.
# NOTE(review): f_classif expects numeric, NaN-free inputs -- this assumes the
# cleaned file has no string/ID/timestamp columns or missing values left;
# confirm against the cleaning step.
X = data.drop('is_fraud', axis=1)
y = data['is_fraud']

# Split the data into training (80%) and testing (20%) sets.
# Stratify on the target so both splits keep the same fraud rate; with a
# (presumably) heavily imbalanced label, a purely random split can leave the
# splits with different fraud rates, which makes the F-scores less reliable.
# random_state pins the shuffle so the ranking is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Perform feature selection with a univariate ANOVA F-test.
# k='all' keeps every feature: the selector is used here only to obtain a
# score per feature, not to drop any. It is fitted on the training split only
# so the held-out test data does not influence the ranking.
selector = SelectKBest(score_func=f_classif, k='all')
selector.fit(X_train, y_train)

# Get the scores and feature names (scores_ is in the same order as the
# columns of X).
scores = selector.scores_
feature_names = X.columns

# Create a DataFrame to display the scores, highest F-score first.
feature_scores = pd.DataFrame({'Feature': feature_names, 'Score': scores})
feature_scores = feature_scores.sort_values(by='Score', ascending=False)

# Display the feature scores (last expression in the cell is rendered by the
# notebook).
feature_scores