In [19]:
import json
from kafka import KafkaConsumer
from river import (
    compose, 
    linear_model, 
    preprocessing, 
    metrics, 
    anomaly
)
import datetime
import pickle
import os
import pandas as pd

In [21]:
# Configuration
KAFKA_TOPIC = 'transactions'
KAFKA_BROKERS = 'kafka-producer:29092'  # Adjust as needed
MODEL_PATH = 'river_model.pkl'
DATA_PATH = 'river_data.pkl'

In [22]:
#Data processing functions
def extract_device_info(x):
    x_ = x['device_info']
    return {
        'os': x_['os'],
        'browser': x_['browser'],
    }

In [23]:
def load_or_create_model():
    """Load existing model or create a new one"""
    # Create a new model pipeline
    pipe1 = compose.Select(
        "amount",
        "account_age_days",
        "cvv_provided",
        "billing_address_match"
    )
    pipe2 = compose.Select(
        "currency",
        "merchant_id",
        "payment_method",
        "product_category",
        "transaction_type",
        "user_agent"
    )
    pipe2 |= preprocessing.OrdinalEncoder()
    pipe3 = compose.Select(
        "device_info"
    )
    pipe3 |= compose.FuncTransformer(
        extract_device_info,
    )
    pipe3 |= preprocessing.OrdinalEncoder()
    pipe = pipe1 + pipe2 + pipe3
    model = pipe | linear_model.LogisticRegression()
    #Save the model to future use
    with open(MODEL_PATH, 'wb') as f:
        pickle.dump(model, f)
    return model

In [24]:
model = load_or_create_model()

In [26]:
model[-1].weights

{}

In [6]:
from river import compat
sk_model = compat.river_to_sklearn.convert_river_to_sklearn(model)

In [7]:
type(sk_model)

sklearn.pipeline.Pipeline

In [8]:
import pandas as pd
data = pd.read_pickle("river_data.pkl") 
data.head()

Unnamed: 0,transaction_id,user_id,timestamp,amount,currency,merchant_id,product_category,transaction_type,payment_method,location,ip_address,device_info,user_agent,account_age_days,cvv_provided,billing_address_match,is_fraud
0,04fa166a-4a6a-4f48-8b99-f1548c23fca3,7ae265d5-a5fc-4cb1-b192-b90d9ec5e82a,2025-04-19T21:04:11.015632+00:00,153.81,AUD,merchant_110,travel,withdrawal,credit_card,"{'lat': 84.908082, 'lon': -96.085004}",73.111.188.232,"{'os': 'Linux', 'browser': 'Safari'}",Mozilla/5.0 (iPad; CPU iPad OS 3_1_3 like Mac ...,520,True,True,0
1,b424f5e2-1a6c-4419-bc28-3823b2026695,58f89265-da86-4029-b8ca-240b74229732,2025-04-19T21:04:19.087116+00:00,162.6,USD,merchant_36,travel,deposit,paypal,"{'lat': 70.9651415, 'lon': 121.973331}",170.235.47.19,"{'os': 'Android', 'browser': 'Opera'}",Mozilla/5.0 (iPhone; CPU iPhone OS 11_4_1 like...,217,True,True,0
2,c09a9ea0-e291-41eb-af62-668ec7464e58,1edd1475-8473-4e84-8692-c515cf8995d8,2025-04-19T21:04:19.351069+00:00,71.96,CAD,merchant_136,clothing,purchase,crypto,"{'lat': 62.664663, 'lon': 85.763089}",72.232.136.117,"{'os': 'macOS', 'browser': 'Chrome'}",Mozilla/5.0 (X11; Linux i686; rv:1.9.6.20) Gec...,1189,True,True,0
3,e00887dd-dab6-4530-a0cf-47a4fac8cb82,b6746475-4c5c-413a-b869-03daeb38e847,2025-04-19T21:04:19.408843+00:00,442.23,BRL,merchant_73,digital_goods,transfer,debit_card,"{'lat': -35.8875735, 'lon': -137.023863}",59.38.226.222,"{'os': 'Linux', 'browser': 'Opera'}",Opera/8.11.(X11; Linux i686; gu-IN) Presto/2.9...,912,True,True,0
4,aa03dc24-df20-40f9-a328-4e26bede4d3e,6c3f7309-bbd4-4b2c-ba4d-55054ae95cec,2025-04-19T21:04:19.884811+00:00,350.1,BRL,merchant_94,luxury_items,purchase,paypal,"{'lat': 75.4541425, 'lon': -69.048147}",6.90.190.247,"{'os': 'Linux', 'browser': 'Other'}",Mozilla/5.0 (iPod; U; CPU iPhone OS 3_3 like M...,1256,True,True,0


In [9]:
X = data.drop(columns=["is_fraud"])
y = data["is_fraud"]

In [13]:
X.iloc[0].to_dict()

{'transaction_id': '04fa166a-4a6a-4f48-8b99-f1548c23fca3',
 'user_id': '7ae265d5-a5fc-4cb1-b192-b90d9ec5e82a',
 'timestamp': '2025-04-19T21:04:11.015632+00:00',
 'amount': 153.81,
 'currency': 'AUD',
 'merchant_id': 'merchant_110',
 'product_category': 'travel',
 'transaction_type': 'withdrawal',
 'payment_method': 'credit_card',
 'location': {'lat': 84.908082, 'lon': -96.085004},
 'ip_address': '73.111.188.232',
 'device_info': {'os': 'Linux', 'browser': 'Safari'},
 'user_agent': 'Mozilla/5.0 (iPad; CPU iPad OS 3_1_3 like Mac OS X) AppleWebKit/533.2 (KHTML, like Gecko) FxiOS/16.0m8939.0 Mobile/65W735 Safari/533.2',
 'account_age_days': 520,
 'cvv_provided': True,
 'billing_address_match': True}

In [14]:
feature_columns = [
    "amount",
    "account_age_days",
    "cvv_provided",
    "billing_address_match",
    "currency",
    "merchant_id",
    "payment_method",
    "product_category",
    "transaction_type",
    "user_agent",
    "device_info" # Keep the nested dict column for now
    # Omit: 'transaction_id', 'user_id', 'timestamp', 'location', 'ip_address'
]

In [18]:
sk_model.fit(X[feature_columns], y)

ValueError: could not convert string to float: 'AUD'

In [17]:
sk_model.predict(X[feature_columns])

NotFittedError: This River2SKLTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [28]:
from river import ensemble

dir(ensemble)

['ADWINBaggingClassifier',
 'ADWINBoostingClassifier',
 'AdaBoostClassifier',
 'BOLEClassifier',
 'BaggingClassifier',
 'BaggingRegressor',
 'EWARegressor',
 'LeveragingBaggingClassifier',
 'SRPClassifier',
 'SRPRegressor',
 'StackingClassifier',
 'VotingClassifier',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'annotations',
 'bagging',
 'boosting',
 'ewa',
 'stacking',
 'streaming_random_patches',
 'voting']

In [29]:
from river import ensemble

help(ensemble.AdaptiveRandomForestClassifier)

AttributeError: module 'river.ensemble' has no attribute 'AdaptiveRandomForestClassifier'

In [30]:
dir(river.tree.splitter)

['DynamicQuantizer',
 'EBSTSplitter',
 'ExhaustiveSplitter',
 'GaussianSplitter',
 'HistogramSplitter',
 'QOSplitter',
 'Quantizer',
 'Splitter',
 'StaticQuantizer',
 'TEBSTSplitter',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'annotations',
 'base',
 'ebst_splitter',
 'exhaustive_splitter',
 'gaussian_splitter',
 'histogram_splitter',
 'nominal_splitter_classif',
 'nominal_splitter_reg',
 'qo_splitter',
 'sgt_quantizer',
 'tebst_splitter']