In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Mount Google Drive so files under /content/drive are readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Both frames are read from the same CSV; each call returns its own DataFrame object.
df = pd.read_csv('/content/drive/My Drive/transactions_fixed.csv') # upload transactions_fixed.csv to your personal Drive first
df1 = pd.read_csv('/content/drive/My Drive/transactions_fixed.csv')


Mounted at /content/drive


## **Old Code** (do not use)

In [None]:

# =============================================================================
# 1. TRYBE DISCREPANCY DETECTOR
# =============================================================================

class TRYBEDiscrepancyDetector:
    """
    Rule-based detector for floating cash transactions, i.e. transactions
    where debits occur but credits are delayed/missing.
    Revised for transactions_fixed.csv dataset.

    A single row is judged by :meth:`is_floating_transaction`;
    :meth:`detect_discrepancies` applies that check to a whole DataFrame.
    """

    # Status substrings that mark a transaction as failed or stuck (Rule 2).
    FAILED_KEYWORDS = ('failed', 'timeout', 'error', 'stuck', 'pending')
    # Status substrings that are suspicious on large transactions (Rule 5).
    SUSPICIOUS_STATUSES = ('processing', 'pending', 'review')
    # Latency above this value counts as high (Rule 4).
    HIGH_LATENCY_THRESHOLD = 1000
    # Amount above this value counts as large (Rule 5). Adjust as needed.
    LARGE_AMOUNT_THRESHOLD = 5000

    def __init__(self):
        self.detection_rules = []
        # Column mapping for the actual dataset - REVISED
        self.column_mapping = {
            'transaction_id': 'transaction_id',
            'user_id': 'user_id',
            'amount': 'amount',
            'transaction_type': 'transaction_type',
            'recipient_type': 'recipient_type',
            'status_4': 'status_4',  # Latest status
            'is_floating_cash': 'is_floating_cash',  # Ground truth
            'floating_duration_minutes': 'floating_duration_minutes',
            'manual_escalation_needed': 'manual_escalation_needed',
            'is_fraudulent_attempt': 'is_fraudulent_attempt',
            'network_latency': 'simulated_network_latency',
            'recipient_bank': 'recipient_bank_name_or_ewallet',  # REVISED: correct column name
            'timestamp': 'timestamp_initiated',
            'is_cancellation': 'is_cancellation'
        }

    def load_transaction_data(self, file_path_or_df):
        """
        Load the transaction dataset and print a short overview.

        Args:
            file_path_or_df: Path to a CSV file, or a DataFrame (which is copied).

        Returns:
            The loaded DataFrame.

        Raises:
            KeyError: If 'timestamp_initiated', 'amount', 'transaction_type'
                or 'status_4' is missing, because the overview reads them.
        """
        if isinstance(file_path_or_df, str):
            df = pd.read_csv(file_path_or_df)
        else:
            df = file_path_or_df.copy()

        print(f"Loaded {len(df)} transactions")
        print(f"Columns: {list(df.columns)}")

        # Basic data validation
        print(f"\nDataset Overview:")
        print(f"- Date range: {df['timestamp_initiated'].min()} to {df['timestamp_initiated'].max()}")
        print(f"- Amount range: ${df['amount'].min():.2f} to ${df['amount'].max():.2f}")
        print(f"- Transaction types: {df['transaction_type'].unique()}")
        print(f"- Final statuses: {df['status_4'].value_counts().head()}")

        return df

    @staticmethod
    def _present(row, name):
        """Return ``row.<name>``, or None when the field is absent or NaN/None."""
        value = getattr(row, name, None)
        if value is None or pd.isna(value):
            return None
        return value

    def is_floating_transaction(self, row):
        """
        Decide whether one transaction row looks like floating cash.

        Args:
            row: A DataFrame row (or any object exposing the columns as
                attributes). Missing fields and NaN values are tolerated.

        Returns:
            True if any rule fires, otherwise False.
        """
        has_status = hasattr(row, 'status_4')
        status = self._present(row, 'status_4')
        # A NaN status becomes '' so the substring checks below simply do not match.
        status_lower = str(status).lower() if status is not None else ''

        # Rule 2: Failed transactions are likely floating
        if status is not None and any(k in status_lower for k in self.FAILED_KEYWORDS):
            return True

        # Rule 3: Manual escalation needed indicates floating cash.
        # NaN is truthy in Python, so a missing flag must be excluded explicitly.
        escalation = self._present(row, 'manual_escalation_needed')
        if escalation is not None and bool(escalation):
            return True

        # Rule 4: High network latency + non-completed status
        latency = self._present(row, 'simulated_network_latency')
        if has_status and latency is not None and latency > self.HIGH_LATENCY_THRESHOLD:
            if 'completed' not in status_lower:
                return True

        # Rule 5: Large amount transactions with suspicious patterns
        amount = self._present(row, 'amount')
        if has_status and amount is not None and amount > self.LARGE_AMOUNT_THRESHOLD:
            if any(s in status_lower for s in self.SUSPICIOUS_STATUSES):
                return True

        # The former "risky transaction type + failed status" rule was dropped:
        # Rule 2 already returns True for every status containing 'failed',
        # so that rule could never change the outcome.

        # TODO: Add more custom rules based on your business logic
        # Rule 7: Time-based rules (e.g., transactions pending for too long)
        # Rule 8: Recipient-specific rules
        # Rule 9: Device/location-based anomalies

        return False

    def detect_discrepancies(self, df):
        """
        Apply discrepancy detection to the entire dataset.

        Adds a boolean 'detected_discrepancy' column to ``df`` in place and
        prints summary statistics. Precision and recall are printed as well
        when an 'is_floating_cash' ground-truth column is present.

        Args:
            df: Transactions to analyse; an empty frame is accepted.

        Returns:
            The same DataFrame with the 'detected_discrepancy' column added.
        """
        print("Running discrepancy detection...")

        if df.empty:
            # apply(axis=1) on an empty frame does not yield a boolean Series,
            # so build the (empty) flag column directly.
            df['detected_discrepancy'] = pd.Series(False, index=df.index, dtype=bool)
        else:
            df['detected_discrepancy'] = df.apply(self.is_floating_transaction, axis=1)

        # Summary statistics
        total_transactions = len(df)
        flagged_transactions = df['detected_discrepancy'].sum()
        if total_transactions:
            flagged_percentage = (flagged_transactions / total_transactions) * 100
        else:
            flagged_percentage = 0.0

        print(f"\nDetection Results:")
        print(f"- Total transactions analyzed: {total_transactions}")
        print(f"- Flagged as discrepancies: {flagged_transactions}")
        print(f"- Percentage flagged: {flagged_percentage:.2f}%")

        # Validation against ground truth (if available)
        if 'is_floating_cash' in df.columns:
            actual_floating = df['is_floating_cash'].sum()
            print(f"- Actual floating cash (ground truth): {actual_floating}")

            # Precision/recall are only meaningful when there is at least one positive
            if actual_floating > 0:
                true_positives = ((df['detected_discrepancy'] == True) &
                                (df['is_floating_cash'] == True)).sum()
                precision = true_positives / flagged_transactions if flagged_transactions > 0 else 0
                recall = true_positives / actual_floating

                print(f"- Detection Precision: {precision:.3f}")
                print(f"- Detection Recall: {recall:.3f}")

        return df

    def get_flagged_transactions(self, df):
        """
        Return only the transactions flagged as discrepancies with key details.

        Args:
            df: A DataFrame that already has the 'detected_discrepancy' column.

        Returns:
            A copy of the flagged rows, limited to the review columns that exist.
        """
        flagged = df[df['detected_discrepancy'] == True].copy()

        # Select most relevant columns for review
        key_columns = ['transaction_id', 'user_id', 'amount', 'transaction_type',
                      'status_4', 'manual_escalation_needed', 'floating_duration_minutes']

        # Only include columns that exist in the dataset
        available_columns = [col for col in key_columns if col in flagged.columns]

        return flagged[available_columns]

# =============================================================================
# 2. TRYBE RISK PREDICTOR
# =============================================================================

class TRYBERiskPredictor:
    """
    Predicts probability that a transaction will get stuck or become floating cash.
    Revised for transactions_fixed.csv dataset features.

    Typical flow: ``load_data`` -> ``preprocess_data`` -> ``train_model`` ->
    ``predict_risk``. Statistics learned in ``preprocess_data`` (label
    encoders, the high-amount cut-off, numeric fill values) are reused by
    ``predict_risk`` so new rows are transformed the same way as training rows.
    """

    def __init__(self, model_type='random_forest'):
        self.model_type = model_type
        self.model = None
        self.scaler = StandardScaler()
        self.label_encoders = {}
        self.feature_columns = None
        self.target_column = None
        # Learned in preprocess_data() and reused at prediction time.
        self._high_amount_threshold = None
        self._numeric_fill_values = {}

    def load_data(self, file_path_or_df):
        """
        Load dataset for risk prediction.

        Args:
            file_path_or_df: Path to a CSV file, or a DataFrame (which is copied).

        Returns:
            The loaded DataFrame.
        """
        if isinstance(file_path_or_df, str):
            df = pd.read_csv(file_path_or_df)
        else:
            df = file_path_or_df.copy()

        print(f"Loaded {len(df)} records for risk prediction")
        print(f"Target distribution:")
        if 'is_floating_cash' in df.columns:
            print(df['is_floating_cash'].value_counts())

        return df

    @staticmethod
    def _is_text_like(series):
        """Return True for object, categorical or string columns (filled with 'unknown')."""
        return (
            pd.api.types.is_object_dtype(series)
            or isinstance(series.dtype, pd.CategoricalDtype)
            or pd.api.types.is_string_dtype(series)
        )

    def preprocess_data(self, df, feature_columns=None, target_column=None):
        """
        Preprocess the data for machine learning using the actual dataset structure.

        This is the fitting step: it learns the high-amount cut-off, the
        numeric fill values and the label encoders from ``df``.

        Args:
            df: Raw transactions; not modified (a copy is processed).
            feature_columns: Columns to use; defaults to the dataset's known features.
            target_column: Label column; defaults to 'is_floating_cash'.

        Returns:
            A processed copy of ``df`` with engineered features added, missing
            values filled and categorical features label-encoded.
        """
        df_processed = df.copy()

        # Set feature columns based on actual dataset - REVISED
        # NOTE(review): 'floating_duration_minutes' and 'manual_escalation_needed'
        # look like they may only be known after the outcome - confirm they are
        # available at prediction time, otherwise they leak the target.
        if feature_columns is None:
            feature_columns = [
                'amount',                           # Transaction amount
                'simulated_network_latency',        # Network conditions
                'transaction_type',                 # Type of transaction
                'recipient_type',                   # Type of recipient
                'recipient_bank_name_or_ewallet',   # Bank/e-wallet - REVISED: correct column name
                'floating_duration_minutes',        # Historical floating duration
                'is_fraudulent_attempt',            # Fraud indicator
                'is_cancellation',                  # Cancellation flag
                'manual_escalation_needed'          # Escalation needed
            ]

        # Set target column
        if target_column is None:
            target_column = 'is_floating_cash'  # Your ground truth column

        # Filter to only available columns
        available_features = [col for col in feature_columns if col in df_processed.columns]

        print(f"Original features requested: {feature_columns}")
        print(f"Available features in data: {available_features}")
        print(f"Target variable: {target_column}")

        # Create engineered features BEFORE setting self.feature_columns;
        # fit=True records the high-amount cut-off for later predictions.
        df_processed = self._create_engineered_features(df_processed, fit=True)

        # Now add engineered features to available_features if they exist
        engineered_features = ['amount_log', 'is_high_amount', 'is_high_latency',
                              'hour_of_day', 'day_of_week', 'is_weekend', 'high_risk_combo']

        for feature in engineered_features:
            if feature in df_processed.columns:
                available_features.append(feature)

        # Set final feature columns
        self.feature_columns = available_features
        self.target_column = target_column

        print(f"Final features being used: {self.feature_columns}")

        # Handle missing values, remembering each numeric median for predict_risk()
        self._numeric_fill_values = {}
        for col in self.feature_columns:
            if self._is_text_like(df_processed[col]):
                df_processed[col] = df_processed[col].fillna('unknown')
            else:
                fill_value = df_processed[col].median()
                self._numeric_fill_values[col] = fill_value
                df_processed[col] = df_processed[col].fillna(fill_value)

        # Encode categorical variables - REVISED: updated column name
        categorical_features = ['transaction_type', 'recipient_type', 'recipient_bank_name_or_ewallet']
        for col in categorical_features:
            if col in df_processed.columns and col in self.feature_columns:
                le = LabelEncoder()
                df_processed[col] = le.fit_transform(df_processed[col].astype(str))
                self.label_encoders[col] = le

        return df_processed

    def _create_engineered_features(self, df, fit=False):
        """
        Create additional features from the dataset.

        Adds the new columns to ``df`` in place and returns it; does not
        modify ``self.feature_columns``.

        Args:
            df: Frame to extend.
            fit: When True, the 90th-percentile amount of ``df`` is stored as
                the high-amount cut-off. When False, the stored cut-off is
                reused; if none has been stored yet it is computed from ``df``.

        Returns:
            ``df`` with the engineered columns added.
        """
        # Amount-based features
        if 'amount' in df.columns:
            df['amount_log'] = np.log1p(df['amount'])  # Log transform for skewed amounts
            threshold = getattr(self, '_high_amount_threshold', None)
            if fit or threshold is None:
                threshold = df['amount'].quantile(0.9)
                if fit:
                    # A quantile taken from a prediction batch (often one row)
                    # is meaningless, so remember the training value.
                    self._high_amount_threshold = threshold
            df['is_high_amount'] = (df['amount'] > threshold).astype(int)

        # Network latency features
        if 'simulated_network_latency' in df.columns:
            df['is_high_latency'] = (df['simulated_network_latency'] > 1000).astype(int)

        # Time-based features (if timestamps are available)
        if 'timestamp_initiated' in df.columns:
            try:
                df['timestamp_initiated'] = pd.to_datetime(df['timestamp_initiated'])
                df['hour_of_day'] = df['timestamp_initiated'].dt.hour
                df['day_of_week'] = df['timestamp_initiated'].dt.dayofweek
                df['is_weekend'] = (df['day_of_week'].isin([5, 6])).astype(int)
            except Exception as e:
                # Best effort: time features are skipped when parsing fails
                print(f"Warning: Could not process timestamp_initiated: {e}")

        # Risk combination features
        if 'is_fraudulent_attempt' in df.columns and 'manual_escalation_needed' in df.columns:
            df['high_risk_combo'] = (df['is_fraudulent_attempt'] | df['manual_escalation_needed']).astype(int)

        return df

    def get_model(self):
        """
        Initialize the ML model. Easily swappable model types.

        Returns:
            An unfitted classifier matching ``self.model_type``.

        Raises:
            ValueError: If ``self.model_type`` is not a known model type.
        """
        if self.model_type == 'random_forest':
            return RandomForestClassifier(
                n_estimators=100,
                max_depth=10,
                min_samples_split=10,
                random_state=42,
                class_weight='balanced'  # Handle imbalanced data
            )
        elif self.model_type == 'logistic_regression':
            return LogisticRegression(
                random_state=42,
                class_weight='balanced',
                max_iter=1000
            )
        else:
            raise ValueError(f"Unknown model type: {self.model_type}")

    def train_model(self, df_processed):
        """
        Train the risk prediction model.

        Splits the data 80/20 (stratified on the target), fits the scaler on
        the training part only, trains the model and prints its evaluation.

        Args:
            df_processed: Output of :meth:`preprocess_data`.

        Returns:
            The fitted model (also stored on ``self.model``).
        """
        print(f"Training {self.model_type} model...")

        # Double-check that all feature columns exist in the processed dataframe
        missing_features = [col for col in self.feature_columns if col not in df_processed.columns]
        if missing_features:
            print(f"Warning: Missing features {missing_features}, removing from feature list")
            self.feature_columns = [col for col in self.feature_columns if col in df_processed.columns]

        # Prepare features and target
        X = df_processed[self.feature_columns]
        y = df_processed[self.target_column]

        print(f"Training set size: {len(X)} samples, {len(self.feature_columns)} features")
        print(f"Features used: {self.feature_columns}")
        print(f"Class distribution: {dict(y.value_counts())}")

        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Scale features
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        # Train the model
        self.model = self.get_model()
        self.model.fit(X_train_scaled, y_train)

        # Evaluate the model
        self.evaluate_model(X_test_scaled, y_test)

        return self.model

    def evaluate_model(self, X_test, y_test):
        """
        Evaluate model performance with detailed metrics.

        Prints accuracy, AUC-ROC, the classification report and, when the
        model exposes ``feature_importances_``, the ten most important features.

        Args:
            X_test: Scaled hold-out features.
            y_test: Hold-out labels.
        """
        # Make predictions
        y_pred = self.model.predict(X_test)
        y_pred_proba = self.model.predict_proba(X_test)[:, 1]

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_pred_proba)

        print(f"\n🎯 Model Evaluation Results:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"AUC-ROC: {auc:.4f}")
        print(f"\nDetailed Classification Report:")
        print(classification_report(y_test, y_pred))

        # Feature importance (for tree-based models)
        if hasattr(self.model, 'feature_importances_'):
            feature_importance = pd.DataFrame({
                'feature': self.feature_columns,
                'importance': self.model.feature_importances_
            }).sort_values('importance', ascending=False)

            print(f"\n📊 Top 10 Most Important Features:")
            print(feature_importance.head(10))

    def predict_risk(self, transaction_data):
        """
        Predict risk probability for new transaction(s).

        Args:
            transaction_data: One transaction as a dict, or a DataFrame of
                transactions. The caller's DataFrame is not modified.

        Returns:
            A single probability when one row is scored, otherwise an array
            of probabilities (one per row).

        Raises:
            ValueError: If the model has not been trained yet.
        """
        if self.model is None:
            raise ValueError("Model not trained yet. Call train_model() first.")

        # Handle single transaction (dict); copy frames so the caller's data stays untouched
        if isinstance(transaction_data, dict):
            transaction_data = pd.DataFrame([transaction_data])
        else:
            transaction_data = transaction_data.copy()

        # Create engineered features for new data (reuses the training cut-off)
        transaction_data = self._create_engineered_features(transaction_data)

        # Ensure we only use features that exist in both training and prediction data
        available_features = [col for col in self.feature_columns if col in transaction_data.columns]

        if len(available_features) != len(self.feature_columns):
            missing = set(self.feature_columns) - set(available_features)
            print(f"Warning: Missing features for prediction: {missing}")
            print(f"Using available features: {available_features}")

        fill_values = getattr(self, '_numeric_fill_values', {})
        for col in available_features:
            column = transaction_data[col]
            if col in self.label_encoders:
                le = self.label_encoders[col]
                # LabelEncoder codes are the positions in classes_, so one
                # dict lookup replaces a transform() call per row.
                codes = {label: code for code, label in enumerate(le.classes_)}
                # Missing values become 'unknown', exactly as during training
                as_text = column.astype(str).where(column.notna(), 'unknown')
                # Labels never seen in training are encoded as -1
                transaction_data[col] = as_text.map(codes).fillna(-1).astype(int)
            elif self._is_text_like(column):
                transaction_data[col] = column.fillna('unknown')
            else:
                # Use the training median when known, otherwise fall back to 0
                transaction_data[col] = column.fillna(fill_values.get(col, 0))

        # Select the features in training order; features missing from the
        # input are padded with zeros.
        X = transaction_data[available_features].reindex(columns=self.feature_columns, fill_value=0)

        X_scaled = self.scaler.transform(X)

        # Predict probabilities
        risk_probabilities = self.model.predict_proba(X_scaled)[:, 1]

        return risk_probabilities[0] if len(risk_probabilities) == 1 else risk_probabilities

# =============================================================================
# 3. MAIN INTEGRATION & DEMO FUNCTIONS
# =============================================================================

def demo_discrepancy_detection(df_transactions):
    """
    Run the discrepancy detector end to end on the given transactions.

    Prints a banner, the detector's own output and the first ten flagged rows.

    Returns:
        Tuple ``(detector, flagged_frame)`` where ``flagged_frame`` is the
        frame returned by ``detect_discrepancies``.
    """
    rule = "=" * 60
    for banner_line in (rule, "TRYBE DISCREPANCY DETECTOR - REAL DATA", rule):
        print(banner_line)

    # Build the detector, show the dataset overview, then flag a private copy
    engine = TRYBEDiscrepancyDetector()
    engine.load_transaction_data(df_transactions)
    annotated = engine.detect_discrepancies(df_transactions.copy())

    print(f"\n🔍 Flagged Transactions Sample:")
    print(engine.get_flagged_transactions(annotated).head(10))

    return engine, annotated

def demo_risk_prediction(df_transactions):
    """
    Train the risk predictor on the given transactions and score one sample.

    The first row of ``df_transactions`` is used as the sample; a failure
    while scoring or printing it is reported but does not abort the demo.

    Returns:
        Tuple ``(predictor, trained_model)``.
    """
    rule = "=" * 60
    for banner_line in (rule, "TRYBE RISK PREDICTOR - REAL DATA", rule):
        print(banner_line)

    # Build the predictor, then load -> preprocess -> train
    risk_engine = TRYBERiskPredictor(model_type='random_forest')
    loaded = risk_engine.load_data(df_transactions)
    prepared = risk_engine.preprocess_data(loaded)
    fitted = risk_engine.train_model(prepared)

    # Score the first transaction as a worked example
    sample = df_transactions.iloc[0].to_dict()

    try:
        probability = risk_engine.predict_risk(sample)

        print(f"\n🎯 Risk Prediction Example:")
        print(f"Transaction ID: {sample.get('transaction_id', 'N/A')}")
        print(f"Amount: ${sample.get('amount', 0):.2f}")
        print(f"Type: {sample.get('transaction_type', 'N/A')}")
        print(f"Predicted Risk Probability: {probability:.4f}")
        if sample.get('is_floating_cash', False):
            outcome = 'Floating'
        else:
            outcome = 'Normal'
        print(f"Actual Outcome: {outcome}")
    except Exception as e:
        print(f"Error in prediction example: {e}")
        print("Model trained successfully but prediction example failed")

    return risk_engine, fitted

def main(transaction_df=None):
    """
    Run both TRYBE components (detector, then predictor) on the given data.

    Returns:
        Tuple ``(detector, predictor, model)``, or ``(None, None, None)``
        when no transaction data is supplied.
    """
    divider = "=" * 50
    print("🚀 TRYBE Fintech AI System - Production Ready")
    print(divider)

    # Nothing to do without data
    if transaction_df is None:
        print("❌ No transaction data provided. Please load transactions_fixed.csv")
        return None, None, None

    # Stage 1: discrepancy detection (the flagged frame is not needed here)
    detector, _flagged = demo_discrepancy_detection(transaction_df)

    print("\n" + divider)

    # Stage 2: risk prediction
    predictor, model = demo_risk_prediction(transaction_df)

    summary = (
        f"\n✅ TRYBE System Ready for Production!",
        f"📊 Dataset: {len(transaction_df)} transactions processed",
        f"🔍 Discrepancy Detector: Active",
        f"🎯 Risk Predictor: {predictor.model_type} trained",
    )
    for summary_line in summary:
        print(summary_line)

    return detector, predictor, model

# =============================================================================
# 4. HACKATHON UTILITIES
# =============================================================================

def quick_setup(data_path="transactions_fixed.csv", use_gdrive=False, file_id=None):
    """
    Ultra-fast setup for hackathon use with actual dataset.

    Args:
        data_path: CSV file to load when nothing is downloaded.
        use_gdrive: When True (and ``file_id`` is given), download the file
            from Google Drive to ``transactions_fixed.csv`` and load that.
        file_id: Google Drive file id used to build the download URL.

    Returns:
        Whatever ``main`` returns for the loaded DataFrame.
    """
    print("⚡ TRYBE Quick Setup - Hackathon Mode")
    print("=" * 40)

    if use_gdrive and not file_id:
        # Previously this combination was ignored without any hint
        print(f"Warning: use_gdrive=True but no file_id given; loading {data_path} instead")

    # Load data
    if use_gdrive and file_id:
        try:
            import gdown
        except ImportError:
            import subprocess
            import sys

            # Install through the running interpreter so the package lands in
            # this environment rather than whichever `pip` is first on PATH.
            subprocess.check_call([sys.executable, "-m", "pip", "install", "gdown"])
            import gdown

        url = f"https://drive.google.com/uc?id={file_id}"
        print(f"📥 Downloading from Google Drive...")
        gdown.download(url, "transactions_fixed.csv", quiet=False)
        data_path = "transactions_fixed.csv"

    # Load the dataset
    print(f"📊 Loading {data_path}...")
    df = pd.read_csv(data_path)

    # Run the full system
    return main(df)

if __name__ == "__main__":
    # Read the dataset from the mounted Drive path (filename revised to transactions_fixed.csv)
    df_transactions = pd.read_csv("/content/drive/My Drive/transactions_fixed.csv")

    # Run detector + predictor; the three results are bound as module-level names
    detector, predictor, model = main(df_transactions)

    # Alternative: let quick_setup() download the file from Google Drive by id:
    # detector, predictor, model = quick_setup(use_gdrive=True, file_id="YOUR_FILE_ID")

🚀 TRYBE Fintech AI System - Production Ready
TRYBE DISCREPANCY DETECTOR - REAL DATA
Loaded 10000 transactions
Columns: ['transaction_id', 'user_id', 'timestamp_initiated', 'amount', 'transaction_type', 'recipient_type', 'recipient_account_id', 'recipient_bank_name_or_ewallet', 'device_id', 'location_coordinates', 'simulated_network_latency', 'status_timestamp_1', 'status_1', 'status_timestamp_2', 'status_2', 'status_timestamp_3', 'status_3', 'status_timestamp_4', 'status_4', 'expected_completion_time', 'is_floating_cash', 'floating_duration_minutes', 'is_fraudulent_attempt', 'is_cancellation', 'is_retry_successful', 'manual_escalation_needed', 'transaction_types', 'recipient_bank_name/e-wallet_name']

Dataset Overview:
- Date range: 2023-03-24 15:02:45 to 2024-07-31 16:03:55
- Amount range: $10.00 to $25000.00
- Transaction types: ['Bank to e-Wallet (Maya)' 'Auto-Reversal Processed'
 'Internal Vybe App Transfer' 'Bills Payment (via Vybe Wallet)'
 'Bank to Bank (InstaPay)' 'Cash-In via 

In [None]:
# SAMPLE DEMO 2 (Discrepancy Detector)
# Standalone walk-through of the detector. Note that it rebinds the
# module-level names `detector` and `df`.

# Initialize detector
detector = TRYBEDiscrepancyDetector()

# Load the CSV from the mounted Drive, then flag rows on the loaded frame
df = detector.load_transaction_data("/content/drive/My Drive/transactions_fixed.csv")
df_with_flags = detector.detect_discrepancies(df)

# Keep only the flagged rows and report how many there are
flagged_transactions = detector.get_flagged_transactions(df_with_flags)
print(f"Found {len(flagged_transactions)} suspicious transactions")


Loaded 10000 transactions
Columns: ['transaction_id', 'user_id', 'timestamp_initiated', 'amount', 'transaction_type', 'recipient_type', 'recipient_account_id', 'recipient_bank_name_or_ewallet', 'device_id', 'location_coordinates', 'simulated_network_latency', 'status_timestamp_1', 'status_1', 'status_timestamp_2', 'status_2', 'status_timestamp_3', 'status_3', 'status_timestamp_4', 'status_4', 'expected_completion_time', 'is_floating_cash', 'floating_duration_minutes', 'is_fraudulent_attempt', 'is_cancellation', 'is_retry_successful', 'manual_escalation_needed', 'transaction_types', 'recipient_bank_name/e-wallet_name']

Dataset Overview:
- Date range: 2023-03-24 15:02:45 to 2024-07-31 16:03:55
- Amount range: $10.00 to $25000.00
- Transaction types: ['Bank to e-Wallet (Maya)' 'Auto-Reversal Processed'
 'Internal Vybe App Transfer' 'Bills Payment (via Vybe Wallet)'
 'Bank to Bank (InstaPay)' 'Cash-In via Partner Outlet'
 'QR Payment (Merchant)' 'QR Payment (P2P)' 'Cash-Out via ATM or OTC

In [None]:
# SAMPLE DEMO 2 (Risk Predictor)
# Standalone walk-through: train on the full CSV, then score one
# hand-written transaction. Rebinds the module-level names `predictor`,
# `df` and `model`.

# Initialize predictor
predictor = TRYBERiskPredictor(model_type='random_forest')

# Load and train
df = predictor.load_data("/content/drive/My Drive/transactions_fixed.csv")
df_processed = predictor.preprocess_data(df)
model = predictor.train_model(df_processed)

# Predict risk for new transaction (complete sample with every dataset
# column except the target 'is_floating_cash')
# NOTE(review): the dataset overview printed earlier lists labels such as
# 'Bank to e-Wallet (Maya)'; the shorter 'Bank to e-Wallet' used here is
# presumably not a training label, so it would be encoded as unseen - confirm.
new_transaction = {
    'transaction_id': 'TXN_2025_001234',
    'user_id': 'USER_789012',
    'timestamp_initiated': '2025-01-15 14:30:22',
    'amount': 1500.0,
    'transaction_type': 'Bank to e-Wallet',
    'recipient_type': 'Individual',
    'recipient_account_id': 'ACC_567890',
    'recipient_bank_name_or_ewallet': 'GCash',
    'device_id': 'DEVICE_345678',
    'location_coordinates': '14.5995,120.9842',  # Manila coordinates
    'simulated_network_latency': 800,
    'status_timestamp_1': '2025-01-15 14:30:23',
    'status_1': 'initiated',
    'status_timestamp_2': '2025-01-15 14:30:25',
    'status_2': 'processing',
    'status_timestamp_3': '2025-01-15 14:30:28',
    'status_3': 'verifying',
    'status_timestamp_4': '2025-01-15 14:30:35',
    'status_4': 'completed',
    'expected_completion_time': '2025-01-15 14:31:00',
    'floating_duration_minutes': 0,
    'is_fraudulent_attempt': False,
    'is_cancellation': False,
    'is_retry_successful': True,
    'manual_escalation_needed': False,
    'transaction_types': 'Bank to e-Wallet',  # Alternative column name
    'recipient_bank_name/e-wallet_name': 'GCash'  # Alternative column name
}

# A dict input yields a single probability, formatted to four decimals
risk_probability = predictor.predict_risk(new_transaction)
print(f"Risk probability: {risk_probability:.4f}")

Loaded 10000 records for risk prediction
Target distribution:
is_floating_cash
False    9499
True      501
Name: count, dtype: int64
Original features requested: ['amount', 'simulated_network_latency', 'transaction_type', 'recipient_type', 'recipient_bank_name_or_ewallet', 'floating_duration_minutes', 'is_fraudulent_attempt', 'is_cancellation', 'manual_escalation_needed']
Available features in data: ['amount', 'simulated_network_latency', 'transaction_type', 'recipient_type', 'recipient_bank_name_or_ewallet', 'floating_duration_minutes', 'is_fraudulent_attempt', 'is_cancellation', 'manual_escalation_needed']
Target variable: is_floating_cash
Final features being used: ['amount', 'simulated_network_latency', 'transaction_type', 'recipient_type', 'recipient_bank_name_or_ewallet', 'floating_duration_minutes', 'is_fraudulent_attempt', 'is_cancellation', 'manual_escalation_needed', 'amount_log', 'is_high_amount', 'is_high_latency', 'hour_of_day', 'day_of_week', 'is_weekend', 'high_risk_comb

## **Aug. 11 Experiment** (USE THIS!!)

In [None]:
# =============================================================
# TRYBE Components – **Revised**
# -------------------------------------------------------------
# This module updates your discrepancy‑detector and risk‑predictor
# so they align 1‑to‑1 with **transactions_fixed.csv** while
# remaining drop‑in replacements for your existing notebook.
#
# Key tweaks
# ----------
# • **Schema auto‑alignment** – a lightweight `DataSchemaAligner`
#   maps alternative column names to the canonical ones we use in
#   code, so future header changes won’t break logic.
# • **Cleaner rule logic** in `TRYBEDiscrepancyDetector` that now
#   references the aligned columns directly.
# • **Pre‑processing hardening** in `TRYBERiskPredictor`: engineered
#   features are created *after* schema alignment, categorical
#   encodings reuse stored `LabelEncoder`s, and prediction gracefully
#   handles unseen classes.
# -------------------------------------------------------------

from __future__ import annotations

import pandas as pd
import numpy as np
from typing import List, Dict, Optional

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    classification_report,
)

# ------------------------------------------------------------------
# 0. DATA‑SCHEMA ALIGNER
# ------------------------------------------------------------------

class DataSchemaAligner:
    """Map any alternative/legacy column names to our canonical names.

    For every canonical name that is missing from the frame, the first alias
    found among the columns is renamed to it. A canonical column that already
    exists is left alone, and so are its aliases.
    """

    _NAME_MAP: Dict[str, List[str]] = {
        # canonical                       aliases in raw CSVs
        "transaction_id":                ["txn_id", "id"],
        "user_id":                      ["uid"],
        "amount":                       ["txn_amount"],
        "transaction_type":             ["transaction_types", "txn_type"],
        "recipient_type":               [],
        "status_4":                     ["final_status", "status_final"],
        "is_floating_cash":             ["ground_truth_floating"],
        "floating_duration_minutes":    ["float_minutes"],
        "manual_escalation_needed":     ["escalate"],
        "is_fraudulent_attempt":        ["fraud_flag"],
        "simulated_network_latency":    ["network_latency", "latency_ms"],
        # "recipient_bank_name/e-wallet_name" is the header that actually
        # appears in transactions_fixed.csv; the shorter spelling is kept
        # for backward compatibility.
        "recipient_bank_name_or_ewallet": [
            "recipient_bank",
            "recipient_bank_name/e-wallet_name",
            "recipient_bank/e-wallet_name",
        ],
        "timestamp_initiated":          ["timestamp", "initiated_at"],
        "is_cancellation":             ["cancel_flag"],
    }

    def __init__(self, df: pd.DataFrame):
        """Keep an untouched copy of ``df`` and build the aligned frame from another copy."""
        self.original = df.copy()
        self.aligned = self._align(df.copy())

    # --------------------------------------------------
    def _align(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return ``df`` with alias columns renamed to their canonical names."""
        rename_map: Dict[str, str] = {}
        for canonical, aliases in self._NAME_MAP.items():
            if canonical in df.columns:
                continue  # already good
            # Only the first alias present is used; later ones keep their names
            match = next((alt for alt in aliases if alt in df.columns), None)
            if match is not None:
                rename_map[match] = canonical
        if rename_map:
            df = df.rename(columns=rename_map)
        return df

    # --------------------------------------------------
    @property
    def frame(self) -> pd.DataFrame:
        """The aligned DataFrame."""
        return self.aligned

# ------------------------------------------------------------------
# TRYBE DISCREPANCY DETECTOR • v6
# ------------------------------------------------------------------
class TRYBEDiscrepancyDetector:
    """
    Data-backed detector: flag any transaction whose
    `floating_duration_minutes` exceeds `_THRESHOLD_MIN` (10 min).

    Figures observed on the 10 000-row sample run in this notebook:

    • Precision  ≈ 0.34
    • Recall     ≈ 0.96
    • Alerts     ≈ 14 % of rows (1 400 / 10 000 in your sample)
    """

    _THRESHOLD_MIN = 10   # <- tune here if business needs change

    # --------------------------------------------------------------
    def load_transaction_data(self, src):
        """Load transactions, align column names and cache them on ``self.df``.

        Args:
            src: Path to a CSV file (``str`` or ``os.PathLike``) or an
                already-loaded DataFrame.  A DataFrame is copied first.

        Returns:
            The aligned DataFrame (also stored as ``self.df``).
        """
        import os
        import pandas as pd
        # Path-like objects (e.g. pathlib.Path) are read from disk just like
        # plain strings; anything else is assumed to be a DataFrame.
        is_path = isinstance(src, (str, os.PathLike))
        raw = pd.read_csv(src) if is_path else src.copy()
        self.df = DataSchemaAligner(raw).frame
        print(f"Loaded {len(self.df):,} transactions")
        return self.df

    # --------------------------------------------------------------
    def _is_floating(self, row) -> bool:
        """Row-wise form of the rule: duration above the threshold.

        A row without `floating_duration_minutes` is treated as duration 0.
        """
        return row.get("floating_duration_minutes", 0) > self._THRESHOLD_MIN

    # --------------------------------------------------------------
    def detect_discrepancies(self, df=None):
        """Add a boolean `detected_discrepancy` column and print a summary.

        Args:
            df: Frame to analyse.  When omitted, the frame cached by
                ``load_transaction_data`` is flagged in place.  When given,
                it is aligned (and thereby copied) first, so the caller's
                frame is left untouched and ``self.df`` is NOT updated.

        Returns:
            The frame carrying the new `detected_discrepancy` column.

        Raises:
            ValueError: No frame was passed and none has been loaded.
            KeyError: The frame has no `floating_duration_minutes` column.
        """
        if df is None:
            df = getattr(self, "df", None)
            if df is None:
                raise ValueError("Run load_transaction_data() first.")
        else:
            df = DataSchemaAligner(df).frame

        df["detected_discrepancy"] = df["floating_duration_minutes"] > self._THRESHOLD_MIN
        total = len(df)
        flagged = int(df["detected_discrepancy"].sum())
        # Guard the share computation: an empty frame must not crash the run.
        share = flagged / total if total else 0.0
        print(f"Flagged {flagged:,}/{total:,} – {share:.2%}")

        # Optional ground-truth metrics
        if "is_floating_cash" in df.columns:
            tp = ((df["detected_discrepancy"]) &  df["is_floating_cash"]).sum()
            fp = flagged - tp
            fn = ((~df["detected_discrepancy"]) & df["is_floating_cash"]).sum()
            precision = tp / (tp + fp) if tp+fp else 0
            recall    = tp / (tp + fn) if tp+fn else 0
            print(f"Precision: {precision:.3f} | Recall: {recall:.3f}")

        return df

    # --------------------------------------------------------------
    def get_flagged_transactions(self, df=None):
        """Return a copy of the flagged rows, restricted to summary columns.

        Args:
            df: Frame previously returned by ``detect_discrepancies``.
                Defaults to the cached ``self.df``.

        Returns:
            Copy of the rows where `detected_discrepancy` is true, limited
            to those summary columns that exist in the frame.

        Raises:
            ValueError: No frame is available, or the frame has not been
                through ``detect_discrepancies`` yet.
        """
        if df is None:
            df = getattr(self, "df", None)
        # A frame without the flag column is as unusable as no frame at all;
        # report it with the actionable message instead of a bare KeyError.
        if df is None or "detected_discrepancy" not in df.columns:
            raise ValueError("Run detect_discrepancies() first.")

        cols = [
            "transaction_id", "user_id", "amount", "transaction_type",
            "status_4", "floating_duration_minutes"
        ]
        return df.loc[df["detected_discrepancy"], [c for c in cols if c in df.columns]].copy()


# ------------------------------------------------------------------
# 2. TRYBE RISK PREDICTOR (Revised)
# ------------------------------------------------------------------

class TRYBERiskPredictor:
    """Predict probability a transaction will float, with robust preprocessing.

    Preprocessing runs in two modes:

    * fit mode (training): the feature list, imputation values, the
      high-amount cutoff and the label encoders are learned from the frame
      and stored on the instance;
    * transform mode (inference): the stored values are replayed, so a
      single transaction is encoded exactly like the training data.

    NOTE(review): `floating_duration_minutes` and `manual_escalation_needed`
    are used as features; they look like information that is only known once
    a transaction has already floated, which would leak the target —
    confirm with the data owner before relying on the reported metrics.
    """

    # Columns that are label-encoded before modelling.
    _CATEGORICAL_COLS = (
        "transaction_type", "recipient_type", "recipient_bank_name_or_ewallet",
    )
    # Code given at inference time to a category that was not seen in fit mode.
    _UNSEEN_CODE = -1

    def __init__(self, model_type: str = "random_forest"):
        """Create an untrained predictor.

        Args:
            model_type: ``"random_forest"`` or ``"logistic_regression"``.
        """
        self.model_type = model_type
        self.scaler = StandardScaler()
        self.label_encoders: Dict[str, LabelEncoder] = {}
        self.model = None  # will be set in train_model
        self.feature_cols: List[str] = []
        self.target_col: str = "is_floating_cash"
        # Statistics learned in fit mode and replayed in transform mode.
        self.fill_values_: Dict[str, object] = {}
        self.high_amount_cutoff_ = None

    # ----------------------------------------------
    def load_data(self, file_or_df):
        """Load and align a dataset and print the target prevalence.

        Args:
            file_or_df: CSV path (``str`` or ``os.PathLike``) or a DataFrame
                (copied before alignment).

        Returns:
            The aligned DataFrame.
        """
        import os
        is_path = isinstance(file_or_df, (str, os.PathLike))
        raw = pd.read_csv(file_or_df) if is_path else file_or_df.copy()
        df = DataSchemaAligner(raw).frame
        print(f"Loaded {len(df):,} records | Floating‑cash prevalence: {df[self.target_col].mean():.2%}")
        return df

    # ----------------------------------------------
    def preprocess(self, df: pd.DataFrame, fit=None) -> pd.DataFrame:
        """Align, engineer, impute and encode ``df``.

        Args:
            df: Raw or aligned transaction frame; it is not modified.
            fit: ``True`` learns and stores the feature list, imputation
                values, high-amount cutoff and label encoders from ``df``.
                ``False`` reuses what was stored.  ``None`` (default) means
                fit mode while no model has been trained, transform mode
                afterwards.

        Returns:
            A processed copy of ``df``.  In transform mode
            ``self.feature_cols`` is left untouched so the training feature
            list survives inference calls.
        """
        if fit is None:
            fit = getattr(self, "model", None) is None
        df = DataSchemaAligner(df).frame.copy()

        base_features = [
            "amount", "simulated_network_latency", "transaction_type", "recipient_type",
            "recipient_bank_name_or_ewallet", "floating_duration_minutes", "is_fraudulent_attempt",
            "is_cancellation", "manual_escalation_needed",
        ]

        # The "high amount" cutoff is a training statistic: recomputing it on
        # a one-row inference frame would make the flag always 0.
        if fit:
            cutoff = df["amount"].quantile(0.9) if "amount" in df.columns else None
            self.high_amount_cutoff_ = cutoff
        else:
            cutoff = getattr(self, "high_amount_cutoff_", None)

        engineered = self._add_engineered_features(df, high_amount_cutoff=cutoff)
        present = [f for f in base_features + engineered if f in df.columns]

        if fit:
            self.feature_cols = present
            fill_values: Dict[str, object] = {}
        else:
            # getattr keeps predictors unpickled from an older version usable.
            fill_values = getattr(self, "fill_values_", {})

        # missing value handling
        for col in present:
            is_categorical = isinstance(df[col].dtype, pd.CategoricalDtype)
            if not fit and col in fill_values:
                fill = fill_values[col]          # replay the training value
            elif df[col].dtype == object or is_categorical:
                fill = "unknown"
            else:
                fill = df[col].median()
            if fit:
                fill_values[col] = fill
            if is_categorical:
                # fillna() rejects values outside the declared categories.
                df[col] = df[col].astype(object)
            df[col] = df[col].fillna(fill)

        if fit:
            self.fill_values_ = fill_values

        # encode categoricals
        for col in self._CATEGORICAL_COLS:
            if col not in df.columns:
                continue
            as_text = df[col].astype(str)
            encoder = None if fit else self.label_encoders.get(col)
            if encoder is None:
                # Fit mode, or a column that has no stored encoder yet.
                encoder = LabelEncoder()
                df[col] = encoder.fit_transform(as_text)
                self.label_encoders[col] = encoder
            else:
                # LabelEncoder.transform raises on unseen labels; map by hand
                # and send unknown categories to a sentinel code instead.
                lookup = {label: code for code, label in enumerate(encoder.classes_)}
                df[col] = as_text.map(lookup).fillna(self._UNSEEN_CODE).astype(int)

        return df

    # ----------------------------------------------
    def _add_engineered_features(self, df: pd.DataFrame, high_amount_cutoff=None) -> List[str]:
        """Add derived columns to ``df`` in place.

        Args:
            df: Frame to extend (mutated).
            high_amount_cutoff: Amount above which `is_high_amount` is 1.
                ``None`` uses the 0.9 quantile of ``df["amount"]``.

        Returns:
            Names of the columns that were added.
        """
        new_cols: List[str] = []
        if "amount" in df.columns:
            df["amount_log"] = np.log1p(df["amount"])
            if high_amount_cutoff is None:
                high_amount_cutoff = df["amount"].quantile(0.9)
            df["is_high_amount"] = (df["amount"] > high_amount_cutoff).astype(int)
            new_cols += ["amount_log", "is_high_amount"]

        if "simulated_network_latency" in df.columns:
            df["is_high_latency"] = (df["simulated_network_latency"] > 1_000).astype(int)
            new_cols.append("is_high_latency")

        if "timestamp_initiated" in df.columns:
            # Unparseable timestamps become NaT and are imputed later.
            ts = pd.to_datetime(df["timestamp_initiated"], errors="coerce")
            df["hour_of_day"] = ts.dt.hour
            df["day_of_week"] = ts.dt.dayofweek
            df["is_weekend"] = ts.dt.dayofweek.isin([5, 6]).astype(int)
            new_cols += ["hour_of_day", "day_of_week", "is_weekend"]

        if {"is_fraudulent_attempt", "manual_escalation_needed"}.issubset(df.columns):
            df["high_risk_combo"] = (df["is_fraudulent_attempt"] | df["manual_escalation_needed"]).astype(int)
            new_cols.append("high_risk_combo")

        return new_cols

    # ----------------------------------------------
    def _init_model(self):
        """Build the estimator selected by ``self.model_type``.

        Raises:
            ValueError: ``self.model_type`` is not a supported name.
        """
        if self.model_type == "random_forest":
            return RandomForestClassifier(
                n_estimators=150, max_depth=15, min_samples_split=10,
                class_weight="balanced", random_state=42
            )
        if self.model_type == "logistic_regression":
            return LogisticRegression(max_iter=1500, class_weight="balanced", random_state=42)
        raise ValueError(f"Unknown model_type: {self.model_type}")

    # ----------------------------------------------
    def train_model(self, df: pd.DataFrame):
        """Fit preprocessing, scaler and model on ``df`` and print metrics.

        An 80/20 stratified split is used; the scaler is fitted on the
        training part only.

        Args:
            df: Transactions including the target column.

        Returns:
            The fitted model (also stored as ``self.model``).

        Raises:
            KeyError: The target column is missing from ``df``.
        """
        # Always refit the preprocessing so retraining on new data picks up
        # new categories instead of failing on stale encoders.
        df_prep = self.preprocess(df, fit=True)
        if self.target_col not in df_prep.columns:
            raise KeyError(f"Target column '{self.target_col}' not found in training data.")

        X, y = df_prep[self.feature_cols], df_prep[self.target_col]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, stratify=y, random_state=42
        )

        self.scaler.fit(X_train)
        X_train_s = self.scaler.transform(X_train)
        X_test_s  = self.scaler.transform(X_test)

        self.model = self._init_model()
        self.model.fit(X_train_s, y_train)

        self._evaluate(X_test_s, y_test)
        return self.model

    # ----------------------------------------------
    def _evaluate(self, X_test, y_test):
        """Print accuracy, ROC AUC and the classification report for the hold-out set."""
        preds      = self.model.predict(X_test)
        pred_proba = self.model.predict_proba(X_test)[:, 1]
        acc = accuracy_score(y_test, preds)
        auc = roc_auc_score(y_test, pred_proba)
        print("\nModel evaluation → Accuracy:", f"{acc:.4f}", "| AUC:", f"{auc:.4f}")
        print(classification_report(y_test, preds))

    # ----------------------------------------------
    def predict_risk(self, txn):
        """Return the predicted floating-cash probability.

        Args:
            txn: One transaction as a ``dict`` or ``pd.Series``, or a
                DataFrame of transactions.

        Returns:
            A single probability when exactly one row was scored, otherwise
            an array with one probability per row.

        Raises:
            RuntimeError: The model has not been trained yet.
        """
        if self.model is None:
            raise RuntimeError("Model not trained. Call train_model() first.")

        if isinstance(txn, dict):
            tx_df = pd.DataFrame([txn])
        elif isinstance(txn, pd.Series):
            tx_df = pd.DataFrame([txn.to_dict()])
        else:
            tx_df = txn.copy()
        # Transform mode: reuse stored encoders/statistics and keep the
        # training feature list intact.
        tx_df = self.preprocess(tx_df, fit=False)

        # ensure same feature order / width as training; features absent
        # from the input are filled with 0
        X = tx_df.reindex(columns=self.feature_cols, fill_value=0)
        X_s = self.scaler.transform(X)
        proba = self.model.predict_proba(X_s)[:, 1]
        return proba[0] if len(proba) == 1 else proba

# --------------------------------------------------
# OPTIONAL • Self-register this notebook cell as a module
#           so later `import trybe_components_revised` works
#           even though no file was written.
# `sys.modules` is the interpreter's import cache: aliasing the currently
# executing module there makes `from trybe_components_revised import ...`
# resolve to the names defined in this module.
import sys as _sys
_sys.modules['trybe_components_revised'] = _sys.modules[__name__]


In [None]:
# --------------------------------------------------
# 1️⃣  Discrepancy detection
# --------------------------------------------------
from trybe_components_revised import TRYBEDiscrepancyDetector

detector = TRYBEDiscrepancyDetector()

# Load + align the CSV once.
# `csv_path` is the file location; load_transaction_data() accepts either a
# path or a DataFrame and reads the file itself when given a path.
csv_path = '/content/drive/My Drive/transactions_fixed.csv'
df_txn   = detector.load_transaction_data(csv_path)

# Run the detector (works on an aligned copy, so df_txn stays unflagged)
df_flagged = detector.detect_discrepancies(df_txn)

# Quick look at the first few discrepancies
flagged_preview = detector.get_flagged_transactions(df_flagged)
print("\n🔍 Flagged transactions (top rows):")
display(flagged_preview.head(10))      # use 'display' if running in Jupyter

# --------------------------------------------------
# 2️⃣  Risk prediction
# --------------------------------------------------
from trybe_components_revised import TRYBERiskPredictor

predictor = TRYBERiskPredictor(model_type="random_forest")

# Re-use the same aligned dataframe (saves I/O) …
predictor.train_model(df_txn)

# …or you could reload from disk:
# df_data = predictor.load_data(csv_path)
# predictor.train_model(df_data)

# --------------------------------------------------
# 3️⃣  Single-transaction inference
# --------------------------------------------------
# Use the *first* transaction as a demo input
sample_txn = df_txn.iloc[0].to_dict()

risk_prob = predictor.predict_risk(sample_txn)
print(f"\n🎯 Predicted floating-cash risk: {risk_prob:.2%}")



Loaded 10,000 transactions
Flagged 1,413/10,000 – 14.13%
Precision: 0.342 | Recall: 0.964

🔍 Flagged transactions (top rows):


Unnamed: 0,transaction_id,user_id,amount,transaction_type,status_4,floating_duration_minutes
3,b9c8b5f7-e9g0-5d31-0b2d-4g5d6e7f8g9h,user_8,7497.0,Bills Payment (via Vybe Wallet),Credit Confirmed (Recipient),44
9,d16a8277-2f54-4fb9-a03a-0e9e95261d76,user_8112,3374.88,Cash-In via Partner Outlet,Credit Confirmed (Recipient),1165
14,781ee456-11b3-40e1-b4f0-46654e528b17,user_9037,4393.18,Vybe Wallet to ShopeePay,Failed (Network Error),34
18,51bf7284-90cd-493e-af38-9e5c7cf59795,user_22,7422.37,Bank to Bank (PESONet),Credit Confirmed (Recipient),53
19,53e37257-2e23-4482-aa0d-b4b9679f187a,user_132205,1545.96,Vybe Wallet to GCash,Credit Confirmed (Recipient),44
21,8a54c62c-8822-4824-a212-eb7e891dd706,user_819448,2540.2,Bank to e-Wallet (Maya),Credit Confirmed (Recipient),46
30,25b96788-b4b0-4c8d-b7fc-4e4054238b7d,user_80562,2883.33,Bank to e-Wallet (ShopeePay),Credit Confirmed (Recipient),49
31,76472df3-2ed5-48b4-938a-3603b573685e,user_3,8129.43,QR Payment (Merchant),Credit Confirmed (Recipient),18
35,71239c09-ef48-4796-a979-4d6f6e578c74,user_53523,465.34,Bank to e-Wallet (Maya),Credit Confirmed (Recipient),5386
37,5c0e1286-d249-4114-8f4f-d7486f03080e,user_5,465.91,Vybe Wallet to Bank (BPI),Credit Confirmed (Recipient),19



Model evaluation → Accuracy: 0.9245 | AUC: 0.9532
              precision    recall  f1-score   support

       False       0.99      0.93      0.96      1900
        True       0.37      0.73      0.49       100

    accuracy                           0.92      2000
   macro avg       0.68      0.83      0.73      2000
weighted avg       0.95      0.92      0.94      2000


🎯 Predicted floating-cash risk: 0.00%


In [None]:
# PKL

# --------------------------------------------------
# 1️⃣  Discrepancy detection
# --------------------------------------------------
from trybe_components_revised import TRYBEDiscrepancyDetector
import pickle
import pandas as pd

detector = TRYBEDiscrepancyDetector()

# Load + align the CSV once.
# `csv_path` is the file location; load_transaction_data() accepts either a
# path or a DataFrame and reads the file itself when given a path.
csv_path = '/content/drive/My Drive/transactions_fixed.csv'
df_txn   = detector.load_transaction_data(csv_path)

# Run the detector (works on an aligned copy, so df_txn stays unflagged)
df_flagged = detector.detect_discrepancies(df_txn)

# Quick look at the first few discrepancies
flagged_preview = detector.get_flagged_transactions(df_flagged)
print("\n🔍 Flagged transactions (top rows):")
display(flagged_preview.head(10))      # use 'display' if running in Jupyter

# Save detector as .pkl
# NOTE: the detector keeps the loaded frame on `detector.df`, so the pickle
# contains the transaction data as well as the object itself.
with open("trybe_discrepancy_detector.pkl", "wb") as f:
    pickle.dump(detector, f)

# Reload detector
# NOTE(review): pickle stores classes by module path, so loading these files
# in another session presumably requires the class definitions to be
# importable there first — confirm.  Only unpickle files you created.
with open("trybe_discrepancy_detector.pkl", "rb") as f:
    detector_loaded = pickle.load(f)

# --------------------------------------------------
# 2️⃣  Risk prediction
# --------------------------------------------------
from trybe_components_revised import TRYBERiskPredictor

predictor = TRYBERiskPredictor(model_type="random_forest")

# Train the model on the aligned dataframe
predictor.train_model(df_txn)

# Save predictor as .pkl
with open("trybe_risk_predictor.pkl", "wb") as f:
    pickle.dump(predictor, f)

# Reload predictor
with open("trybe_risk_predictor.pkl", "rb") as f:
    predictor_loaded = pickle.load(f)

# --------------------------------------------------
# 3️⃣  Single-transaction inference (with reloaded predictor)
# --------------------------------------------------
# Use the *first* transaction as a demo input
sample_txn = df_txn.iloc[0].to_dict()

risk_prob = predictor_loaded.predict_risk(sample_txn)
print(f"\n🎯 Predicted floating-cash risk: {risk_prob:.2%}")


Loaded 10,000 transactions
Flagged 1,413/10,000 – 14.13%
Precision: 0.342 | Recall: 0.964

🔍 Flagged transactions (top rows):


Unnamed: 0,transaction_id,user_id,amount,transaction_type,status_4,floating_duration_minutes
3,b9c8b5f7-e9g0-5d31-0b2d-4g5d6e7f8g9h,user_8,7497.0,Bills Payment (via Vybe Wallet),Credit Confirmed (Recipient),44
9,d16a8277-2f54-4fb9-a03a-0e9e95261d76,user_8112,3374.88,Cash-In via Partner Outlet,Credit Confirmed (Recipient),1165
14,781ee456-11b3-40e1-b4f0-46654e528b17,user_9037,4393.18,Vybe Wallet to ShopeePay,Failed (Network Error),34
18,51bf7284-90cd-493e-af38-9e5c7cf59795,user_22,7422.37,Bank to Bank (PESONet),Credit Confirmed (Recipient),53
19,53e37257-2e23-4482-aa0d-b4b9679f187a,user_132205,1545.96,Vybe Wallet to GCash,Credit Confirmed (Recipient),44
21,8a54c62c-8822-4824-a212-eb7e891dd706,user_819448,2540.2,Bank to e-Wallet (Maya),Credit Confirmed (Recipient),46
30,25b96788-b4b0-4c8d-b7fc-4e4054238b7d,user_80562,2883.33,Bank to e-Wallet (ShopeePay),Credit Confirmed (Recipient),49
31,76472df3-2ed5-48b4-938a-3603b573685e,user_3,8129.43,QR Payment (Merchant),Credit Confirmed (Recipient),18
35,71239c09-ef48-4796-a979-4d6f6e578c74,user_53523,465.34,Bank to e-Wallet (Maya),Credit Confirmed (Recipient),5386
37,5c0e1286-d249-4114-8f4f-d7486f03080e,user_5,465.91,Vybe Wallet to Bank (BPI),Credit Confirmed (Recipient),19



Model evaluation → Accuracy: 0.9245 | AUC: 0.9532
              precision    recall  f1-score   support

       False       0.99      0.93      0.96      1900
        True       0.37      0.73      0.49       100

    accuracy                           0.92      2000
   macro avg       0.68      0.83      0.73      2000
weighted avg       0.95      0.92      0.94      2000


🎯 Predicted floating-cash risk: 0.00%
