In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE, RandomOverSampler  # Various sampling techniques
from sklearn.linear_model import LogisticRegression  # Example classifier
from sklearn.ensemble import RandomForestClassifier  # Example classifier
from xgboost import XGBClassifier  # Another powerful classifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Load credit card transaction data
df = pd.read_csv('/content/drive/MyDrive/Encryptix/creditcard.csv')

In [5]:
# Preprocess the data
# Separate features (X) and target variable (y)
X = df.drop('Class', axis=1)  # Assuming 'Class' is the fraud label
y = df['Class']

# Encode categorical features (if any)
le = LabelEncoder()
for col in X.select_dtypes(include=['object']):
    X[col] = le.fit_transform(X[col])

# Normalize numerical features (consider normalization or standardization based on data)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [6]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Handle class imbalance if fraud is rare
if y_train.value_counts().iloc[1] / y_train.shape[0] < 0.1:
    # Adjust threshold based on data
    # Tried different oversampling techniques and evaluate their impact
    # Here we demonstrate SMOTE and RandomOverSampler
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    ros = RandomOverSampler(random_state=42)
    X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

    # Train models with different oversampled datasets and compare performance

# Train the classification models (consider multiple models and hyperparameter tuning)
model_lr = LogisticRegression(solver='liblinear')  # Example with tuned hyperparameters (replace with tuning code)
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)  # Example with tuned hyperparameters
model_xgb = XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=100)  # Example with tuned hyperparameters

model_lr.fit(X_train, y_train)
model_rf.fit(X_train, y_train)
model_xgb.fit(X_train, y_train)

# Evaluate the model performance on the testing set (consider various metrics)
models = {'Logistic Regression': model_lr, 'Random Forest': model_rf, 'XGBoost': model_xgb}
for name, model in models.items():
    y_pred = model.predict(X_test)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    print(f'Model: {name}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 Score: {f1}')
    print(f'ROC AUC: {roc_auc}')
    print('-------------------')

Model: Logistic Regression
Precision: 0.8636363636363636
Recall: 0.5816326530612245
F1 Score: 0.6951219512195121
ROC AUC: 0.7907371903460314
-------------------
Model: Random Forest
Precision: 0.974025974025974
Recall: 0.7653061224489796
F1 Score: 0.8571428571428571
ROC AUC: 0.8826354754056941
-------------------
Model: XGBoost
Precision: 0.9746835443037974
Recall: 0.7857142857142857
F1 Score: 0.8700564971751412
ROC AUC: 0.892839557038347
-------------------
