In [1]:
import pandas as pd
import glob
import numpy as np
from datetime import timedelta


In [2]:
data_path = "D:/unified_metor_content/fraud_detection/fraud_detection/" 

file_list = sorted(glob.glob(data_path + "*.pkl"))

# Read and concatenate all daily files
df_all = pd.concat([pd.read_pickle(file) for file in file_list], ignore_index=True)

In [3]:
#df_all.head()

In [4]:
df_all['datetime'] = pd.to_datetime(df_all['TX_DATETIME'])

In [5]:
#df_all['TX_HOUR'] = df_all['TX_DATETIME'].dt.hour
#df_all['TX_DAY'] = df_all['TX_DATETIME'].dt.day
#df_all['TX_DAYOFWEEK'] = df_all['TX_DATETIME'].dt.dayofweek  # 0=Monday, 6=Sunday
#df_all['TX_IS_WEEKEND'] = df_all['TX_DAYOFWEEK'].isin([5, 6]).astype(int)
#df_all['TX_IS_NIGHT'] = df_all['TX_HOUR'].between(0, 6).astype(int)

In [6]:
fraud_counts= df_all[df_all['TX_FRAUD'] == 1].groupby('TERMINAL_ID').size()
df_all['fraud_count'] = df_all['TERMINAL_ID'].map(fraud_counts).fillna(0).astype(int)



In [7]:
df_all["fraud_amt"] = (df_all["TX_AMOUNT"] > 220).astype(int)

In [8]:
df_all.head()

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO,datetime,fraud_count,fraud_amt
0,0,2018-04-01 00:00:31,596,3156,57.16,31,0,0,0,2018-04-01 00:00:31,17,0
1,1,2018-04-01 00:02:10,4961,3412,81.51,130,0,0,0,2018-04-01 00:02:10,1,0
2,2,2018-04-01 00:07:56,2,1365,146.0,476,0,0,0,2018-04-01 00:07:56,1,0
3,3,2018-04-01 00:09:29,4128,8737,64.49,569,0,0,0,2018-04-01 00:09:29,0,0
4,4,2018-04-01 00:10:34,927,9906,50.99,634,0,0,0,2018-04-01 00:10:34,0,0


In [9]:
#pip install xgboost

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier

In [11]:
#from sklearn.utils import resample

#df_majority = df_all[df_all.TX_FRAUD == 0]
#df_minority = df_all[df_all.TX_FRAUD == 1]

#df_majority_downsampled = resample(df_majority,
                                   #replace=False,
                                   #n_samples=len(df_minority)*5,
                                   #random_state=42)

#df_balanced = pd.concat([df_majority_downsampled, df_minority])


In [17]:
# Set seed for reproducibility
np.random.seed(42)
df_all = df_all.sort_values('TX_DATETIME')

# Add a date-only column
#df_all['date'] = df_all['datetime'].dt.date
df_all['date'] = df_all['datetime'].dt.floor('d')

# Get all unique transaction dates
unique_dates = df_all['date'].unique()

# Simulate fraud for each day except the last 14 (so we have a full 14-day window)
for day in unique_dates[:-14]:
    #current_day = pd.to_datetime(day)
    current_day = day  # Already a Timestamp
    window_end = current_day + timedelta(days=14)

    # Get customers active on this day
    active_customers = df_all[df_all['date'] == day]['CUSTOMER_ID'].unique()

    if len(active_customers) < 3:
        continue  # skip if fewer than 3 customers

    # Select 3 customers at random
    selected_customers = np.random.choice(active_customers, size=3, replace=False)

    for cust_id in selected_customers:
        # Filter transactions in the next 14 days
        mask = (
            (df_all['CUSTOMER_ID'] == cust_id) &
            (df_all['datetime'] > current_day) &
            (df_all['datetime'] <= window_end)
        )
        cust_txns = df_all[mask]

        if len(cust_txns) == 0:
            continue

        # Pick 1/3 of them (or at least 1) to make fraudulent
        num_fraud = max(1, len(cust_txns) // 3)
        fraud_indices = cust_txns.sample(n=num_fraud, random_state=42).index

        # Apply fraud changes
        #df_all.loc[fraud_indices, 'TX_AMOUNT'] *= 5
        # Safely apply multiplier only to numeric rows
        df_all.loc[fraud_indices, 'TX_AMOUNT'] = df_all.loc[fraud_indices, 'TX_AMOUNT'].astype(float) * 5

        df_all.loc[fraud_indices, 'TX_FRAUD'] = 1
        df_all.loc[fraud_indices, 'TX_FRAUD_SCENARIO'] = 4 

In [13]:
# Sort by time
#df = df_balanced.sort_values('TX_DATETIME')
df = df_all.drop(columns = ['TX_DATETIME','datetime','TX_FRAUD_SCENARIO'])
df['TX_TIME_SECONDS'] = pd.to_numeric(df['TX_TIME_SECONDS'], errors='coerce')
df['TX_TIME_DAYS'] = pd.to_numeric(df['TX_TIME_DAYS'], errors='coerce')
df['CUSTOMER_ID'] = pd.to_numeric(df['CUSTOMER_ID'], errors='coerce')
df['TERMINAL_ID'] = pd.to_numeric(df['TERMINAL_ID'], errors='coerce')
# Define your split point (e.g., 70% of the data for training)
split_index = int(0.7 * len(df))

X = df.drop(columns = ['TX_FRAUD'])
y = df['TX_FRAUD']

X_train, X_test = X.iloc[:split_index],X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index],y.iloc[split_index:]


In [14]:
numeric_features = ['TX_AMOUNT', 'TX_TIME_SECONDS', 'TX_TIME_DAYS']
preprocessor = ColumnTransformer(transformers = [('num',StandardScaler(),numeric_features)],
                                remainder = 'passthrough',
                                force_int_remainder_cols=False)


In [15]:
#steps=[
    #('preprocessor', preprocessor),
    #('classifier', RandomForestClassifier())
#]
pipeline1 = Pipeline([('preprocessor',preprocessor),('clf',LogisticRegression(max_iter=1000))])

#pipeline2 = Pipeline([('xg',XGBClassifier(scale_pos_weight=110,  # Use ratio of non-fraud/fraud
    
    #eval_metric='logloss'))])

In [16]:
pipeline1.fit(X_train, y_train)

TypeError: float() argument must be a string or a real number, not 'Timestamp'

In [None]:
#pipeline1.fit(X_train, y_train)

In [None]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
    roc_auc_score
)

def log_to_text(model_name, y_test, y_pred, y_proba, filename='results_log.txt'):
    
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    auc = roc_auc_score(y_test, y_proba)
    cm = confusion_matrix(y_test, y_pred)
    
    with open(filename, 'a') as f:
        f.write(f"\nModel: {model_name}\n")
        f.write(f"Accuracy: {acc}\n")
        f.write(f"Precision: {prec}\n")
        f.write(f"Recall: {rec}\n")
        f.write(f"F1 Score: {f1}\n")
        f.write(f"ROC AUC: {auc}\n")
        f.write(f"Confusion Matrix:\n{cm}\n")
        f.write("-" * 40 + "\n")


In [None]:
y_pred = pipeline1.predict(X_test)
y_proba = pipeline1.predict_proba(X_test)[:, 1] 
log_to_text('logistic regression after converting fraud simulants',y_test,y_pred,y_proba)

In [None]:
df_comparision = pd.DataFrame({
    'y_test' : y_test.values,
    'y_pred' : y_pred
})

In [None]:
df_comparision

In [None]:

# Basic metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Classification report (prints everything)
report = classification_report(y_test, y_pred)

# If you want AUC, use predicted probabilities
y_proba = pipeline1.predict_proba(X_test)[:, 1]  # Prob of class 1
auc = roc_auc_score(y_test, y_proba)

# Print everything
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC:", auc)
print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", report)


In [None]:
#print(X.dtypes)