# KAFKA - Classicação dos dados dos clientes,
**Configuração e carregamento do modelo**

Inicialização das bibliotecas necessárias, criação da sessão Spark e carregamento do modelo Gradient Boosting pré-treinado.

In [5]:
# Imports básicos
import os
import json
import time
import datetime
import pandas as pd
from dotenv import load_dotenv
from kafka import KafkaProducer, KafkaConsumer

# PySpark imports
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassificationModel

# Carregar variáveis de ambiente
load_dotenv('.env')

# Criar SparkSession
spark = SparkSession.builder.appName("KafkaIntegration").getOrCreate()
base_path = os.getenv('BASE_PATH')

# Carregar modelo treinado
modelo_carregado = GBTClassificationModel.load("modelos_segunda_melhoria/GradientBoosting/bestModel")


**Producer Kafka**

Configuração do producer Kafka, para enviar os dados do cliente para o tópico de dados do cliente. Realiza a leitura do customer_features_improved.csv, guarda-o como parquet, maximizando a eficiência.

In [18]:
# Configurações do Kafka
topic = 'customer-data'
bootstrap_servers = 'localhost:9092'

# Inicializar o produtor Kafka
producer = KafkaProducer(
    bootstrap_servers=bootstrap_servers,
    value_serializer=lambda v: json.dumps(v).encode('utf-8')
)

# Carregar dados de clientes
base_path = os.getenv('BASE_PATH ')
df_customer_features = spark.read.csv(f"data/improved/customer_features_improved.csv", header=True, inferSchema=True)

df_customer_features.write.mode("Overwrite").parquet("/home/jovyan/code/data/parquet/customer_features_improved.parquet")
df_customer_parquet = spark.read.parquet("/home/jovyan/code/data/parquet/customer_features_improved.parquet")
# df_customer_features = df_customer_features.limit(10)  # Limitar para exemplo
df_customer_features = df_customer_parquet.limit(30)

# Enviar dados para o tópico Kafka
for row in df_customer_features.collect():
    data = row.asDict()

    # Corrigir campos que são datas
    for k, v in data.items():
        if isinstance(v, datetime.date):
            data[k] = v.isoformat()

    producer.send(topic, value=data)
    print(f"Enviado: {data}")
    # time.sleep(0.5)  # Simular streaming

producer.flush()
producer.close()


Enviado: {'id': 4640504730, 'total_transactions': 1347, 'total_spent': 6132.2, 'avg_spent': 4.55, 'first_purchase': '2012-03-11', 'last_purchase': '2013-07-20', 'unique_categories': 252, 'unique_products': 249, 'offer_purchase_count': 0, 'offer_total_spent': 0.0, 'offer_avg_spent': 0.0, 'days_since_last_offer_purchase': 0, 'avg_days_between_purchases': 140.0, 'offervalue': 0.0, 'offer_value_per_transaction': 0.0, 'unique_chains': 1}
Enviado: {'id': 4640651168, 'total_transactions': 1625, 'total_spent': 6669.94, 'avg_spent': 4.1, 'first_purchase': '2012-03-04', 'last_purchase': '2013-04-20', 'unique_categories': 311, 'unique_products': 336, 'offer_purchase_count': 0, 'offer_total_spent': 0.0, 'offer_avg_spent': 0.0, 'days_since_last_offer_purchase': 0, 'avg_days_between_purchases': 366.0, 'offervalue': 1.0, 'offer_value_per_transaction': 0.0, 'unique_chains': 1}
Enviado: {'id': 4640866580, 'total_transactions': 774, 'total_spent': 3317.65, 'avg_spent': 4.29, 'first_purchase': '2012-03-0

**Consumer Kafka**

Configuração de um consumer kafka, de forma a ler as mensagens do tópico de dados do cliente, uma de cada vez. Define um schema para os dados recebidos, cria um DataFrame para cada mensagem e utiliza o modelo carregado para prever a classificação do cliente. Produz previsões para cada mensagem.

In [19]:

from kafka import KafkaConsumer
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, DateType, LongType
from pyspark.sql import Row
from pyspark.ml.feature import VectorAssembler
import json

# Esquema esperado para criar o DataFrame Spark
schema = StructType([
    StructField("id", LongType(), True),
    StructField("total_transactions", IntegerType(), True),
    StructField("total_spent", DoubleType(), True),
    StructField("avg_spent", DoubleType(), True),
    StructField("first_purchase", DateType(), True),
    StructField("last_purchase", DateType(), True),
    StructField("unique_categories", IntegerType(), True),
    StructField("unique_products", IntegerType(), True),
    StructField("offer_purchase_count", IntegerType(), True),
    StructField("offer_total_spent", DoubleType(), True),
    StructField("offer_avg_spent", DoubleType(), True),
    StructField("days_since_last_offer_purchase", IntegerType(), True),
    StructField("avg_days_between_purchases", DoubleType(), True),
    StructField("offervalue", DoubleType(), True),
    StructField("offer_value_per_transaction", DoubleType(), True),
    StructField("unique_chains", IntegerType(), True)
])

# VectorAssembler - as mesmas features usadas no treino
assembler = VectorAssembler(
    inputCols=[
        "id",
        "total_transactions",
        "total_spent",
        "avg_spent",
        "unique_categories",
        "unique_products",
        "offer_purchase_count",
        "offer_total_spent",
        "offer_avg_spent",
        "days_since_last_offer_purchase",
        "avg_days_between_purchases",
        "offervalue",
        "offer_value_per_transaction",
        "unique_chains"
    ],
    outputCol="features"
)

# Inicializar consumidor Kafka
consumer = KafkaConsumer(
    'customer-data',
    bootstrap_servers='localhost:9092',
    value_deserializer=lambda m: json.loads(m.decode('utf-8')),
    auto_offset_reset='earliest',
    enable_auto_commit=True,
    group_id='spark-consumer-group'
)

# Funções auxiliares para conversão segura
def safe_float(value, default=0.0):
    try:
        return float(value)
    except (ValueError, TypeError):
        return default

def safe_int(value, default=0):
    try:
        return int(value)
    except (ValueError, TypeError):
        return default
    
def safe_date(value, default=None):
    try:
        if isinstance(value, str):
            return datetime.datetime.strptime(value, "%Y-%m-%d").date()
        return value
    except (ValueError, TypeError):
        return default
    
def safe_long(value, default=0):
    try:
        return int(value)
    except (ValueError, TypeError):
        return default

def safe_double(value, default=0.0):
    try:
        return float(value)
    except (ValueError, TypeError):
        return default

print("Iniciando consumo das mensagens...")

try:
    for message in consumer:
        raw_data = message.value
        print(f"Recebido: {raw_data}")

        try:
            # Garantir tipos corretos e preencher valores default
            data = {
                "id": safe_long(raw_data.get("id")),
                "total_transactions": safe_int(raw_data.get("total_transactions")),
                "total_spent": safe_double(raw_data.get("total_spent")),
                "avg_spent": safe_double(raw_data.get("avg_spent")),
                "first_purchase": safe_date(raw_data.get("first_purchase")),
                "last_purchase": safe_date(raw_data.get("last_purchase")),
                "unique_categories": safe_int(raw_data.get("unique_categories")),
                "unique_products": safe_int(raw_data.get("unique_products")),
                "offer_purchase_count": safe_int(raw_data.get("offer_purchase_count")),
                "offer_total_spent": safe_double(raw_data.get("offer_total_spent")),
                "offer_avg_spent": safe_double(raw_data.get("offer_avg_spent")),
                "days_since_last_offer_purchase": safe_int(raw_data.get("days_since_last_offer_purchase")),
                "avg_days_between_purchases": safe_double(raw_data.get("avg_days_between_purchases")),
                "offervalue": safe_double(raw_data.get("offervalue")),
                "offer_value_per_transaction": safe_double(raw_data.get("offer_value_per_transaction")),
                "unique_chains": safe_int(raw_data.get("unique_chains"))
            }

            # Criar DataFrame Spark com o esquema definido
            df = spark.createDataFrame([Row(**data)], schema=schema)

            # Criar vetor features para o modelo
            df_features = assembler.transform(df)

            # Aplicar o modelo carregado (GBTClassificationModel)
            pred = modelo_carregado.transform(df_features)

            # Mostrar a predição
            pred.select("id", "prediction", "probability").show()

        except Exception as e:
            print(f"Erro ao processar mensagem: {e}")
            print(f"Mensagem com problema: {raw_data}")
            continue

except StopIteration:
    print("Consumo finalizado.")    

finally:
    print("Fechando consumidor Kafka...")
    consumer.close()


Iniciando consumo das mensagens...
Recebido: {'id': 4640504730, 'total_transactions': 1347, 'total_spent': 6132.2, 'avg_spent': 4.55, 'first_purchase': '2012-03-11', 'last_purchase': '2013-07-20', 'unique_categories': 252, 'unique_products': 249, 'offer_purchase_count': 0, 'offer_total_spent': 0.0, 'offer_avg_spent': 0.0, 'days_since_last_offer_purchase': 0, 'avg_days_between_purchases': 140.0, 'offervalue': 0.0, 'offer_value_per_transaction': 0.0, 'unique_chains': 1}
+----------+----------+--------------------+
|        id|prediction|         probability|
+----------+----------+--------------------+
|4640504730|       1.0|[0.48136003031576...|
+----------+----------+--------------------+

Recebido: {'id': 4640651168, 'total_transactions': 1625, 'total_spent': 6669.94, 'avg_spent': 4.1, 'first_purchase': '2012-03-04', 'last_purchase': '2013-04-20', 'unique_categories': 311, 'unique_products': 336, 'offer_purchase_count': 0, 'offer_total_spent': 0.0, 'offer_avg_spent': 0.0, 'days_since_

KeyboardInterrupt: 