In [1]:
# Mount Google Drive into the Colab filesystem. The input and output CSV paths
# configured later in this notebook live under /content/drive/MyDrive, so this
# cell has to run (and be authorised) before any data is read or written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pandas scikit-learn



In [3]:
!pip install cudf-cu12 cuml-cu12 cupy-cuda12x
!nvidia-smi

Collecting cuml-cu12
  Downloading cuml_cu12-24.12.0.tar.gz (2.5 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting cudf-cu12
  Downloading cudf_cu12-24.12.0-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (6.2 kB)
Collecting cuvs-cu12==24.12.* (from cuml-cu12)
  Downloading cuvs_cu12-24.12.0.tar.gz (1.0 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting dask-cuda==24.12.* (from cuml-cu12)
  Downloading dask_cuda-24.12.0-py3-none-any.whl.metadata (2.6 kB)
Collecting dask-cudf-cu12==24.12.* (from cuml-cu12)
  Downloading dask_cudf_cu12-24.12.0-py3-none-any.whl.metadata (4.5 kB)
Collecting pylibraft-cu12==24.12.* (from cuml-cu12)
  Downloading pylibraft_cu12-24.12.0.tar.gz (5.6 kB)
  Installing build d

In [4]:
import cudf
import cupy as cp
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import logging
import os
from google.colab import drive
import numpy as np
import pandas as pd

In [5]:
# Default CSV that detect_anomalies() reads the transactions from.
INPUT_PATH = '/content/drive/MyDrive/processed_txn_data.csv'
# Default CSV that detect_anomalies() writes the flagged (anomalous) rows to.
OUTPUT_PATH = '/content/drive/MyDrive/anomalous_transactions.csv'

In [6]:
def create_features(df, night_hours=(0, 1, 2, 3, 4, 23)):
    """Add per-transaction behavioural features used for anomaly detection.

    Parameters
    ----------
    df : cudf.DataFrame
        Transactions with at least the columns 'Date/Time', 'From_Account_id',
        'To_Account_id' and 'amount'.
    night_hours : iterable of int, optional
        Hours of the day (0-23) flagged as night-time. The default covers
        23:00 through 04:59.

    Returns
    -------
    cudf.DataFrame
        A frame holding the input columns plus 'daily_tx_count_sender',
        'daily_tx_volume_sender', 'avg_amount_sender', 'amount_to_avg_ratio',
        'receiver_diversity', 'hour' and 'is_night'.

    Notes
    -----
    Columns are assigned on ``df`` itself before the final ``drop``, so the
    caller's frame is modified as well ('Date/Time' is converted to datetime and
    the feature columns, including the temporary 'date_key', are added to it).
    """
    # Parse the timestamp column once; every time-based feature reads from it.
    df['Date/Time'] = cudf.to_datetime(df['Date/Time'])
    timestamps = df['Date/Time'].dt

    # Integer YYYYMMDD key used only to group a sender's transactions by day.
    df['date_key'] = timestamps.year * 10000 + timestamps.month * 100 + timestamps.day

    # Per-sender, per-day activity: number of transactions and total amount.
    daily_amounts = df.groupby(['From_Account_id', 'date_key'])['amount']
    df['daily_tx_count_sender'] = daily_amounts.transform('count')
    df['daily_tx_volume_sender'] = daily_amounts.transform('sum')

    # Per-sender lifetime statistics.
    sender_groups = df.groupby('From_Account_id')
    df['avg_amount_sender'] = sender_groups['amount'].transform('mean')
    # NOTE(review): this is inf/NaN for a sender whose mean amount is 0; confirm
    # the input guarantees non-zero amounts or sanitise before model fitting.
    df['amount_to_avg_ratio'] = df['amount'] / df['avg_amount_sender']

    # Number of distinct receivers each sender has paid.
    df['receiver_diversity'] = sender_groups['To_Account_id'].transform('nunique')

    # Hour of day plus a 0/1 night-time flag. The hour set is a tiny host-side
    # list, so there is no need to build it on the GPU and copy it back.
    df['hour'] = timestamps.hour
    df['is_night'] = df['hour'].isin(list(night_hours)).astype('int8')

    # The day key was only needed for grouping; do not expose it to callers.
    return df.drop('date_key', axis=1)

In [7]:
def get_risk_factors(df_gpu, amount_ratio_threshold=3, quantile=0.95):
    """Label each transaction with the human-readable risk rules it triggers.

    Parameters
    ----------
    df_gpu : DataFrame
        Must contain 'amount_to_avg_ratio', 'daily_tx_count_sender', 'is_night'
        and 'receiver_diversity'.
    amount_ratio_threshold : float, optional
        A transaction is "unusually large" when its amount exceeds the sender's
        average by more than this factor. Defaults to 3.
    quantile : float, optional
        Quantile (0-1) of 'daily_tx_count_sender' and 'receiver_diversity' above
        which a row counts as high-frequency / high-diversity. Defaults to 0.95.

    Returns
    -------
    DataFrame
        The same ``df_gpu`` object, modified in place, with a new string column
        'risk_factors': triggered labels joined by ' | ', or 'Unknown' when no
        rule fired.
    """
    # Data-driven cut-offs, pulled out as plain Python floats.
    daily_tx_threshold = float(df_gpu['daily_tx_count_sender'].quantile(quantile))
    receiver_div_threshold = float(df_gpu['receiver_diversity'].quantile(quantile))

    # (mask, label) pairs in output order. Each mask is evaluated exactly once.
    rules = [
        (df_gpu['amount_to_avg_ratio'] > amount_ratio_threshold,
         'Unusually large transaction amount'),
        (df_gpu['daily_tx_count_sender'] > daily_tx_threshold,
         'High daily transaction frequency'),
        (df_gpu['is_night'] == 1,
         'Transaction during unusual hours'),
        (df_gpu['receiver_diversity'] > receiver_div_threshold,
         'Unusually high number of receivers'),
    ]

    df_gpu['risk_factors'] = ''
    for mask, label in rules:
        # Skip the masked string update entirely when no row matches.
        if mask.any():
            df_gpu.loc[mask, 'risk_factors'] += label + ' | '

    # rstrip removes trailing characters from the set {' ', '|'}, not the literal
    # suffix; that is safe here because no label ends in a space or a pipe.
    df_gpu['risk_factors'] = df_gpu['risk_factors'].str.rstrip(' | ')
    df_gpu.loc[df_gpu['risk_factors'] == '', 'risk_factors'] = 'Unknown'

    return df_gpu

In [8]:
def detect_anomalies(input_csv=INPUT_PATH, output_csv=OUTPUT_PATH,
                     contamination=0.1, random_state=42):
    """Hybrid GPU-CPU anomaly detection.

    Features are engineered on the GPU with cuDF, then copied to host memory
    where scikit-learn scales them and fits an Isolation Forest.

    Parameters
    ----------
    input_csv : str
        CSV of transactions to load.
    output_csv : str
        Destination CSV for the transactions flagged as anomalous.
    contamination : float or 'auto', optional
        Passed to ``IsolationForest``. With a float, that fraction of the rows
        is flagged by construction. Defaults to 0.1.
    random_state : int, optional
        Seed for the Isolation Forest. Defaults to 42.

    Returns
    -------
    tuple
        ``(all_transactions_df, anomalous_df)``: every transaction with
        'is_anomaly', 'anomaly_score' and 'risk_factors' added, sorted by
        ascending anomaly score (most anomalous first), and the subset of it
        where 'is_anomaly' is true.
    """
    print("Loading transaction data to GPU...")
    df_gpu = cudf.read_csv(input_csv)

    print("Creating features for anomaly detection...")
    df_features = create_features(df_gpu)

    feature_columns = [
        'amount',
        'daily_tx_count_sender',
        'daily_tx_volume_sender',
        'avg_amount_sender',
        'amount_to_avg_ratio',
        'receiver_diversity',
        'hour',
        'is_night'
    ]

    # scikit-learn runs on the CPU, so move the feature matrix to host memory.
    X = df_features[feature_columns].to_pandas().values

    print("Scaling features...")
    X_scaled = StandardScaler().fit_transform(X)

    print("Training Isolation Forest...")
    iso_forest = IsolationForest(
        contamination=contamination,
        random_state=random_state,
        n_estimators=100,
        max_samples='auto',
        n_jobs=-1  # Use all CPU cores
    )
    iso_forest.fit(X_scaled)

    # Score every row once and derive the label from that score.
    # predict() flags a row when score_samples(X) - offset_ < 0, so applying the
    # same rule here gives identical labels without a second scoring pass.
    anomaly_scores = iso_forest.score_samples(X_scaled)
    df_features['is_anomaly'] = (anomaly_scores - iso_forest.offset_) < 0
    df_features['anomaly_score'] = anomaly_scores

    # df_features is local and not reused below, so annotate it in place rather
    # than duplicating the whole frame in GPU memory first.
    all_transactions_df = get_risk_factors(df_features)

    # Lower score means more anomalous, so ascending order puts the worst first.
    all_transactions_df = all_transactions_df.sort_values('anomaly_score', ascending=True)

    anomalous_df = all_transactions_df[all_transactions_df['is_anomaly']].copy()

    print(f"Found {len(anomalous_df)} anomalous transactions out of {len(all_transactions_df)} total transactions")

    # Only the identifying columns and the verdict are written out; the
    # engineered feature columns stay in the returned frames.
    columns_to_save = [
        'Date/Time', 'From_Account_id', 'To_Account_id', 'amount',
        'anomaly_score', 'risk_factors', 'is_anomaly'
    ]
    anomalous_df[columns_to_save].to_pandas().to_csv(output_csv, index=False)

    return all_transactions_df, anomalous_df

# Run the full pipeline with the default input/output paths. This also writes
# the anomalous rows to OUTPUT_PATH as a side effect of detect_anomalies().
all_transactions, anomalous_transactions = detect_anomalies()
print("\nDistribution of transactions:")
# value_counts() is computed on the GPU frame; to_pandas() is only for printing.
print(all_transactions['is_anomaly'].value_counts().to_pandas())

# Access normal transactions if needed:
normal_transactions = all_transactions[~all_transactions['is_anomaly']]

Loading transaction data to GPU...
Creating features for anomaly detection...
Scaling features...
Training Isolation Forest...
Found 366898 anomalous transactions out of 3668976 total transactions

Distribution of transactions:
is_anomaly
False    3302078
True      366898
Name: count, dtype: int64


In [9]:
# Path reported by the following cell's message. Note this is a different file
# from OUTPUT_PATH, which is where detect_anomalies() writes its CSV.
output_file = '/content/drive/MyDrive/anomalous_df.csv'

In [10]:
# Actually write the file before reporting it as saved: detect_anomalies() only
# writes to OUTPUT_PATH, so without this line nothing is ever stored at
# `output_file` and the message below would be false.
anomalous_transactions.to_pandas().to_csv(output_file, index=False)
print(f"Saved anomalous transactions to {output_file}")

Saved anomalous transactions to /content/drive/MyDrive/anomalous_df.csv


In [11]:
output_file = '/content/drive/MyDrive/all_transactionsdf.csv'