# Lab10 (Instructor) — Solutions & Checks

Notebook นี้มีแนวคำตอบและจุดตรวจ (sanity checks) สำหรับผู้สอน


In [1]:
import pandas as pd
from pathlib import Path
from sklearn.metrics import classification_report

# Cell purpose: load the six CSV tables, left-join them onto the transaction
# records, derive the model features, and expose X (features) / y (label).

# Check current directory and data path
import os
print("Current directory:", os.getcwd())
# The data directory is resolved relative to the current working directory.
DATA = Path('..') / 'data'
print("Data path:", DATA)
print("Data path exists:", DATA.exists())
print()

# Load data
df_tx = pd.read_csv(DATA / 'Transaction Data' / 'transaction_records.csv')
df_meta = pd.read_csv(DATA / 'Transaction Data' / 'transaction_metadata.csv')
df_cust = pd.read_csv(DATA / 'Customer Profiles' / 'customer_data.csv')
df_act = pd.read_csv(DATA / 'Customer Profiles' / 'account_activity.csv')
df_merch = pd.read_csv(DATA / 'Merchant Information' / 'merchant_data.csv')
df_fraud = pd.read_csv(DATA / 'Fraudulent Patterns' / 'fraud_indicators.csv')

print("Loaded data shapes:")
print(f"df_tx: {df_tx.shape}, columns: {list(df_tx.columns)}")
print(f"df_meta: {df_meta.shape}, columns: {list(df_meta.columns)}")
print(f"df_cust: {df_cust.shape}, columns: {list(df_cust.columns)}")
print(f"df_act: {df_act.shape}, columns: {list(df_act.columns)}")
print(f"df_fraud: {df_fraud.shape}, columns: {list(df_fraud.columns)}")
print()

# Merge all data.
# Every join is a left join starting from df_tx, so no transaction row is
# dropped; the shape printed after each step is the sanity check that the
# row count did not grow (which would indicate duplicate keys on the right).
df = df_tx.merge(df_meta, on='TransactionID', how='left')
print(f"After merge with meta: {df.shape}, columns: {list(df.columns)}")

df = df.merge(df_cust, on='CustomerID', how='left')
print(f"After merge with cust: {df.shape}, columns: {list(df.columns)}")

df = df.merge(df_act, on='CustomerID', how='left')
print(f"After merge with act: {df.shape}, columns: {list(df.columns)}")

df = df.merge(df_merch, on='MerchantID', how='left')
print(f"After merge with merch: {df.shape}")

# Only the label column is taken from the fraud table.
df = df.merge(df_fraud[['TransactionID','FraudIndicator']], on='TransactionID', how='left')
print(f"After merge with fraud: {df.shape}")
print(f"All columns: {list(df.columns)}")
print()

# Create features
# Transactions without a fraud record are treated as non-fraud (0).
df['IsFraud'] = df['FraudIndicator'].fillna(0).astype(int)
# Unparseable dates become NaT instead of raising.
df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
df['LastLogin'] = pd.to_datetime(df['LastLogin'], errors='coerce')

print("Creating time-based features...")
df['Hour'] = df['Timestamp'].dt.hour.fillna(0).astype(int)
# Lowercase 'h' is the hour alias; uppercase 'H' is deprecated in pandas >= 2.2.
df['ts_hour'] = df['Timestamp'].dt.floor('h')
# NOTE(review): the recorded sample output shows negative values for this
# feature, i.e. LastLogin is later than Timestamp on those rows — confirm
# whether that is expected for this dataset.
df['DaysSinceLastLogin'] = (df['Timestamp'] - df['LastLogin']).dt.days.fillna(0).astype(int)
print(f"DaysSinceLastLogin created: {('DaysSinceLastLogin' in df.columns)}")

print("Creating transaction count feature...")
# Number of transactions by the same customer within the same clock hour.
df['TxCount_1h'] = df.groupby(['CustomerID','ts_hour'])['TransactionID'].transform('count')

print("Creating amount-based features...")
cust_avg = df.groupby('CustomerID')['Amount'].transform('mean')
# A customer mean of 0 makes both ratios divide by zero (+/-inf, or NaN for
# 0/0). Both are mapped to 0 so no non-finite value reaches the model.
df['AmountToCustomerAvg'] = (
    (df['Amount'] / cust_avg)
    .replace([float('inf'), float('-inf')], 0)
    .fillna(0)
)
# abs() turns -inf into +inf, so replacing +inf alone is sufficient here.
df['AnomalyScore'] = (
    ((df['Amount'] - cust_avg) / cust_avg)
    .abs()
    .replace([float('inf')], 0)
    .fillna(0)
)
print(f"AnomalyScore created: {('AnomalyScore' in df.columns)}")
print()

print("All columns after feature creation:")
print(list(df.columns))
print()

features = ['Amount','AnomalyScore','Age','AccountBalance','Hour','DaysSinceLastLogin','TxCount_1h','AmountToCustomerAvg']
print(f"Features to use: {features}")
print(f"Missing features: {[f for f in features if f not in df.columns]}")

# Remaining NaNs (e.g. from unmatched left joins) are filled with 0.
X = df[features].fillna(0)
y = df['IsFraud']

print("\nClass balance:")
print(y.value_counts(normalize=True))
print(f"\nX shape: {X.shape}")
print(f"y shape: {y.shape}")
X.head()

Current directory: /home/jovyan/work/notebooks
Data path: ../data
Data path exists: True

Loaded data shapes:
df_tx: (1000, 3), columns: ['TransactionID', 'Amount', 'CustomerID']
df_meta: (1000, 3), columns: ['TransactionID', 'Timestamp', 'MerchantID']
df_cust: (1000, 4), columns: ['CustomerID', 'Name', 'Age', 'Address']
df_act: (1000, 3), columns: ['CustomerID', 'AccountBalance', 'LastLogin']
df_fraud: (1000, 2), columns: ['TransactionID', 'FraudIndicator']

After merge with meta: (1000, 5), columns: ['TransactionID', 'Amount', 'CustomerID', 'Timestamp', 'MerchantID']
After merge with cust: (1000, 8), columns: ['TransactionID', 'Amount', 'CustomerID', 'Timestamp', 'MerchantID', 'Name', 'Age', 'Address']
After merge with act: (1000, 10), columns: ['TransactionID', 'Amount', 'CustomerID', 'Timestamp', 'MerchantID', 'Name', 'Age', 'Address', 'AccountBalance', 'LastLogin']
After merge with merch: (1000, 12)
After merge with fraud: (1000, 13)
All columns: ['TransactionID', 'Amount', 'Custo

Unnamed: 0,Amount,AnomalyScore,Age,AccountBalance,Hour,DaysSinceLastLogin,TxCount_1h,AmountToCustomerAvg
0,55.530334,0.016907,50,2869.689912,0,-951,1,1.016907
1,12.88118,0.0,46,9527.947107,1,-26,1,1.0
2,50.176322,0.027284,34,9288.355525,2,-954,1,0.972716
3,41.634001,0.164801,33,5588.049942,3,-795,1,0.835199
4,78.122853,0.0,18,7324.785332,4,-945,1,1.0


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Baseline: stratified 75/25 split, then a plain logistic regression.
# Stratifying on y keeps the fraud rate the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
# NOTE: in the recorded run this baseline never predicts class 1, so precision
# for that class is undefined. zero_division=0 reports it as 0.0 (the same
# number as before) without emitting the undefined-metric warning for every
# averaged row of the report.
print(classification_report(y_test, pred, digits=3, zero_division=0))

              precision    recall  f1-score   support

           0      0.956     1.000     0.978       239
           1      0.000     0.000     0.000        11

    accuracy                          0.956       250
   macro avg      0.478     0.500     0.489       250
weighted avg      0.914     0.956     0.934       250



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [3]:
# API smoke test
import requests
import os

# Use 'api' service name when running in Docker, 'localhost' otherwise
API = 'http://api:8000' if os.path.exists('/.dockerenv') else 'http://localhost:8000'
print(f"Using API: {API}")

# requests has no default timeout, so an unreachable or stuck API would block
# this cell forever. Training may be slow, hence the generous limit (seconds).
response = requests.post(f'{API}/fraud/train', timeout=300)
# Fail loudly on a 4xx/5xx instead of trying to read an error page as JSON.
response.raise_for_status()
response.json()

Using API: http://api:8000


{'status': 'trained',
 'n_rows': 1000,
 'label_fraud_rate': 0.045,
 'balanced_fraud_rate': 0.5,
 'train_samples': 216,
 'cv_folds': 5,
 'cv_confusion_matrix': [[493, 271], [21, 15]],
 'cv_precision': 0.05221049009105215,
 'cv_recall': 0.42142857142857143,
 'cv_f1': 0.09273604635711372,
 'cv_roc_auc': 0.5508807250970563,
 'test_confusion_matrix': [[118, 73], [4, 5]],
 'test_precision': 0.0641025641025641,
 'test_recall': 0.5555555555555556,
 'test_f1': 0.11494252873563218,
 'notes': 'IsolationForest (unsupervised). Ensemble (RF+GB) with hybrid balancing (undersample majority + SMOTE oversample minority). Using 5-Fold Stratified CV with threshold=0.30 for balanced evaluation.'}

In [4]:
# Smoke-test the read endpoints: fetch the top items, then score and explain
# the first one. Each call has a timeout (seconds) and a status check so a
# broken or unreachable API fails with a clear error instead of hanging or
# raising a confusing KeyError/JSON error further down.
top_response = requests.get(f'{API}/fraud/top', params={'limit':5}, timeout=30)
top_response.raise_for_status()
top = top_response.json()['items']
if not top:
    # Without this guard an empty list would surface as a bare IndexError.
    raise RuntimeError(f"{API}/fraud/top returned no items; nothing to score")
tx_id = int(top[0]['TransactionID'])

score_response = requests.post(f'{API}/fraud/score', json={'transaction_id': tx_id}, timeout=30)
score_response.raise_for_status()
score = score_response.json()

explain_response = requests.post(f'{API}/fraud/explain', json={'transaction_id': tx_id, 'top_k': 5}, timeout=30)
explain_response.raise_for_status()
explain = explain_response.json()
score, explain

({'iforest_decision_function': -0.0015144684414128973,
  'iforest_risk_score': 0.0015144684414128973,
  'shadow_model_pred': 1,
  'shadow_model_proba': 0.2739542477340603},
 {'method': 'value_magnitude',
  'top_factors': [{'feature': 'DaysSinceLastLogin',
    'contribution': 1.5949386622572366},
   {'feature': 'DayOfWeek', 'contribution': -1.476168840844461},
   {'feature': 'Age', 'contribution': -1.4272816261966899},
   {'feature': 'Hour', 'contribution': -1.3737346025734039},
   {'feature': 'AnomalyScore', 'contribution': -1.0815420780362677}],
  'note': 'Basic feature value magnitude ranking.'})