# 1. Data Validation

In [41]:
# Imports básicos
import os
import json
import time
import datetime
import pandas as pd
from dotenv import load_dotenv
from kafka import KafkaProducer, KafkaConsumer

# PySpark imports
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassificationModel

# Carregar variáveis de ambiente
load_dotenv('.env')

# Criar SparkSession
spark = SparkSession.builder.appName("KafkaIntegration").getOrCreate()
base_path = os.getenv('BASE_PATH')

# Carregar modelo treinado
modelo_carregado = GBTClassificationModel.load("modelos/gradient_boosting_model")


In [42]:
# Configurações do Kafka
topic = 'customer-data'
bootstrap_servers = 'localhost:9092'

# Inicializar o produtor Kafka
producer = KafkaProducer(
    bootstrap_servers=bootstrap_servers,
    value_serializer=lambda v: json.dumps(v).encode('utf-8')
)

# Carregar dados de clientes
base_path = os.getenv('BASE_PATH ')
df_customer_features = spark.read.csv(f"data/customer_features.csv", header=True, inferSchema=True)

df_customer_features.write.mode("Overwrite").parquet("/home/jovyan/code/data-ml")
df_customer_parquet = spark.read.parquet("/home/jovyan/code/data-ml")
# df_customer_features = df_customer_features.limit(10)  # Limitar para exemplo
df_customer_features = df_customer_parquet.limit(10)

# Enviar dados para o tópico Kafka
for row in df_customer_features.collect():
    data = row.asDict()

    # Corrigir campos que são datas
    for k, v in data.items():
        if isinstance(v, datetime.date):
            data[k] = v.isoformat()

    producer.send(topic, value=data)
    print(f"Enviado: {data}")
    time.sleep(0.5)  # Simular streaming

producer.flush()
producer.close()


Enviado: {'id': 2669945285, 'total_transactions': 1523, 'total_spent': 6320.52, 'avg_spent': 4.15, 'first_purchase': '2012-03-03', 'last_purchase': '2013-04-16', 'unique_categories': 248, 'unique_products': 312}
Enviado: {'id': 2669952782, 'total_transactions': 345, 'total_spent': 2229.0, 'avg_spent': 6.46, 'first_purchase': '2012-03-12', 'last_purchase': '2013-06-16', 'unique_categories': 108, 'unique_products': 145}
Enviado: {'id': 266996275, 'total_transactions': 1258, 'total_spent': 7447.03, 'avg_spent': 5.92, 'first_purchase': '2012-03-02', 'last_purchase': '2013-04-02', 'unique_categories': 191, 'unique_products': 212}
Enviado: {'id': 2670041982, 'total_transactions': 656, 'total_spent': 5417.1, 'avg_spent': 8.26, 'first_purchase': '2012-03-02', 'last_purchase': '2013-06-18', 'unique_categories': 160, 'unique_products': 279}
Enviado: {'id': 267008595, 'total_transactions': 777, 'total_spent': 2728.3, 'avg_spent': 3.51, 'first_purchase': '2012-03-09', 'last_purchase': '2013-06-19'

In [44]:
from kafka import KafkaConsumer
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType
from pyspark.sql import Row
from pyspark.ml.feature import VectorAssembler
import json

# Esquema esperado para criar o DataFrame Spark
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("total_transactions", DoubleType(), True),
    StructField("total_spent", DoubleType(), True),
    StructField("avg_spent", DoubleType(), True),
    StructField("first_purchase", StringType(), True),
    StructField("last_purchase", StringType(), True),
    StructField("unique_categories", DoubleType(), True),
    StructField("unique_products", DoubleType(), True)
])

# VectorAssembler - as mesmas features usadas no treino
assembler = VectorAssembler(
    inputCols=["total_transactions", "total_spent", "avg_spent", "unique_categories", "unique_products"],
    outputCol="features"
)

# Inicializar consumidor Kafka
consumer = KafkaConsumer(
    'customer-data',
    bootstrap_servers='localhost:9092',
    value_deserializer=lambda m: json.loads(m.decode('utf-8')),
    auto_offset_reset='earliest',
    enable_auto_commit=True,
    group_id='spark-consumer-group'
)

# Funções auxiliares para conversão segura
def safe_float(value, default=0.0):
    try:
        return float(value)
    except (ValueError, TypeError):
        return default

def safe_int(value, default=0):
    try:
        return int(value)
    except (ValueError, TypeError):
        return default

print("Iniciando consumo das mensagens...")

for message in consumer:
    raw_data = message.value
    print(f"Recebido: {raw_data}")

    try:
        # Garantir tipos corretos e preencher valores default
        data = {
            "id": safe_int(raw_data.get("id")),
            "total_transactions": safe_float(raw_data.get("total_transactions")),
            "total_spent": safe_float(raw_data.get("total_spent")),
            "avg_spent": safe_float(raw_data.get("avg_spent")),
            "first_purchase": raw_data.get("first_purchase", ""),
            "last_purchase": raw_data.get("last_purchase", ""),
            "unique_categories": safe_float(raw_data.get("unique_categories")),
            "unique_products": safe_float(raw_data.get("unique_products"))
        }

        # Criar DataFrame Spark com o esquema definido
        df = spark.createDataFrame([Row(**data)], schema=schema)

        # Criar vetor features para o modelo
        df_features = assembler.transform(df)

        # Aplicar o modelo carregado (GBTClassificationModel)
        pred = modelo_carregado.transform(df_features)

        # Mostrar a predição
        pred.select("id", "prediction", "probability").show()

    except Exception as e:
        print(f"Erro ao processar mensagem: {e}")
        print(f"Mensagem com problema: {raw_data}")
        continue

    # Se quiser testar só 1 mensagem, descomente o break abaixo
    # break


Iniciando consumo das mensagens...
Recebido: {'id': 267008595, 'total_transactions': 777, 'total_spent': 2728.3, 'avg_spent': 3.51, 'first_purchase': '2012-03-09', 'last_purchase': '2013-06-19', 'unique_categories': 110, 'unique_products': 116}
+---------+----------+--------------------+
|       id|prediction|         probability|
+---------+----------+--------------------+
|267008595|       0.0|[0.72774931623968...|
+---------+----------+--------------------+

Recebido: {'id': 2670267527, 'total_transactions': 431, 'total_spent': 3430.56, 'avg_spent': 7.96, 'first_purchase': '2012-03-02', 'last_purchase': '2013-04-04', 'unique_categories': 141, 'unique_products': 204}
Erro ao processar mensagem: [VALUE_OUT_OF_BOUND] Value for `obj` must be greater than 2147483647 or less than -2147483648, got 2670267527
Mensagem com problema: {'id': 2670267527, 'total_transactions': 431, 'total_spent': 3430.56, 'avg_spent': 7.96, 'first_purchase': '2012-03-02', 'last_purchase': '2013-04-04', 'unique_c

KeyboardInterrupt: 

# KAFKA - Classicação dos dados dos clientes

**Configuração e carregamento do modelo**


Inicialização das bibliotecas necessárias, criação da sessão Spark e carregamento do modelo Gradient Boosting pré-treinado.

In [52]:
# Imports básicos
import os
import json
import time
import datetime
import pandas as pd
from dotenv import load_dotenv
from kafka import KafkaProducer, KafkaConsumer

# PySpark imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassificationModel

# Carregar variáveis de ambiente
load_dotenv('/home/jovyan/code/.env')

# Criação da sessão Spark
spark = SparkSession.builder.appName("KafkaIntegration").getOrCreate()
base_path = os.getenv('BASE_PATH', '/home/jovyan/code/data')

# Caminho do modelo
model_path = "modelos/gradient_boosting_model"
if not os.path.exists(f"{model_path}/metadata"):
    raise Exception(f"Model not found at {model_path}/metadata")

# Carregar modelo já treinado
modelo_carregado = GBTClassificationModel.load(model_path)
print(f"Model loaded from {model_path}")

Model loaded from modelos/gradient_boosting_model


**Kafka Producer**

Configuração do producer Kafka, para enviar os dados do cliente para o tópico de dados do cliente. Realiza a leitura do customer_features.csv, guarda-o como parquet, maximizando a eficiência.

In [None]:
# Configurações do Kafka
topic = 'customer-data'
bootstrap_servers = 'localhost:9092'

# Iniciar o produtor Kafka
producer = KafkaProducer(
    bootstrap_servers=bootstrap_servers,
    value_serializer=lambda v: json.dumps(v).encode('utf-8')
)

# Carregar dados de clientes
df_customer_features = spark.read.csv(f"{base_path}/customer_features.csv", header=True, inferSchema=True)

# Salvar como Parquet
df_customer_features.write.mode("overwrite").parquet(f"{base_path}-ml/customer_features.parquet")
df_customer_parquet = spark.read.parquet(f"{base_path}-ml/customer_features.parquet")
df_customer_features = df_customer_parquet.limit(10)  # Limitar para exemplo

# Enviar dados para o tópico Kafka
for row in df_customer_features.collect():
    data = row.asDict()

    # Corrigir campos que são datas
    for k, v in data.items():
        if isinstance(v, datetime.date):
            data[k] = v.isoformat()

    producer.send(topic, value=data)
    print(f"Enviado: {data}")
    time.sleep(0.5)  # Simular streaming

producer.flush()
producer.close()

Enviado: {'id': 2669945285, 'total_transactions': 1523, 'total_spent': 6320.52, 'avg_spent': 4.15, 'first_purchase': '2012-03-03', 'last_purchase': '2013-04-16', 'unique_categories': 248, 'unique_products': 312}
Enviado: {'id': 2669952782, 'total_transactions': 345, 'total_spent': 2229.0, 'avg_spent': 6.46, 'first_purchase': '2012-03-12', 'last_purchase': '2013-06-16', 'unique_categories': 108, 'unique_products': 145}
Enviado: {'id': 266996275, 'total_transactions': 1258, 'total_spent': 7447.03, 'avg_spent': 5.92, 'first_purchase': '2012-03-02', 'last_purchase': '2013-04-02', 'unique_categories': 191, 'unique_products': 212}
Enviado: {'id': 2670041982, 'total_transactions': 656, 'total_spent': 5417.1, 'avg_spent': 8.26, 'first_purchase': '2012-03-02', 'last_purchase': '2013-06-18', 'unique_categories': 160, 'unique_products': 279}
Enviado: {'id': 267008595, 'total_transactions': 777, 'total_spent': 2728.3, 'avg_spent': 3.51, 'first_purchase': '2012-03-09', 'last_purchase': '2013-06-19'

**Consumidor Kafka (Processamento de Mensagem Única)**

Configuração de um consumer kafka, de forma a ler as mensagens do tópico de dados do cliente, uma de cada vez. Define um schema para os dados recebidos, cria um DataFrame para cada mensagem e utiliza o modelo carregado para prever a classificação do cliente. Produz previsões para cada mensagem.

In [None]:
from kafka import KafkaConsumer
from pyspark.sql.types import StructType, StructField, LongType, DoubleType, StringType
from pyspark.sql import Row
from pyspark.ml.feature import VectorAssembler
import json

# Schema esperado para criar o DataFrame Spark
schema = StructType([
    StructField("id", LongType(), True),
    StructField("total_transactions", DoubleType(), True),
    StructField("total_spent", DoubleType(), True),
    StructField("avg_spent", DoubleType(), True),
    StructField("first_purchase", StringType(), True),
    StructField("last_purchase", StringType(), True),
    StructField("unique_categories", DoubleType(), True),
    StructField("unique_products", DoubleType(), True)
])

# VectorAssembler - usa as mesmas features usadas no treino
assembler = VectorAssembler(
    inputCols=["total_transactions", "total_spent", "avg_spent", "unique_categories", "unique_products"],
    outputCol="features"
)

# Iniciar consumer Kafka
consumer = KafkaConsumer(
    'customer-data',
    bootstrap_servers='localhost:9092',
    value_deserializer=lambda m: json.loads(m.decode('utf-8')),
    auto_offset_reset='earliest',
    enable_auto_commit=True,
    group_id='spark-consumer-group'
)

# Funções auxiliares para conversão
def safe_float(value, default=0.0):
    try:
        return float(value)
    except (ValueError, TypeError):
        return default

def safe_long(value, default=0):
    try:
        return int(value)
    except (ValueError, TypeError):
        return default

print("Iniciando consumo das mensagens...")

for message in consumer:
    raw_data = message.value
    print(f"Recebido: {raw_data}")

    try:
        # Garantir tipos corretos
        data = {
            "id": safe_long(raw_data.get("id")),
            "total_transactions": safe_float(raw_data.get("total_transactions")),
            "total_spent": safe_float(raw_data.get("total_spent")),
            "avg_spent": safe_float(raw_data.get("avg_spent")),
            "first_purchase": raw_data.get("first_purchase", ""),
            "last_purchase": raw_data.get("last_purchase", ""),
            "unique_categories": safe_float(raw_data.get("unique_categories")),
            "unique_products": safe_float(raw_data.get("unique_products"))
        }

        # Criar DataFrame Spark com o esquema definido
        df = spark.createDataFrame([Row(**data)], schema=schema)

        # Criar vetor features para o modelo
        df_features = assembler.transform(df)

        # Aplicar o modelo carregado -> GBTClassificationModel
        pred = modelo_carregado.transform(df_features)

        # Mostrar a previsão
        pred.select("id", "prediction", "probability").show()

    except Exception as e:
        print(f"Erro ao processar mensagem: {e}")
        print(f"Mensagem com problema: {raw_data}")
        continue


Iniciando consumo das mensagens...


**Consumidor Kafka (Processamento em Batch)**

In [51]:
from kafka import KafkaConsumer
from pyspark.sql.types import StructType, StructField, LongType, DoubleType, StringType
from pyspark.sql import Row
from pyspark.ml.feature import VectorAssembler
import json

# Esquema
schema = StructType([
    StructField("id", LongType(), True),
    StructField("total_transactions", DoubleType(), True),
    StructField("total_spent", DoubleType(), True),
    StructField("avg_spent", DoubleType(), True),
    StructField("first_purchase", StringType(), True),
    StructField("last_purchase", StringType(), True),
    StructField("unique_categories", DoubleType(), True),
    StructField("unique_products", DoubleType(), True)
])

# VectorAssembler
assembler = VectorAssembler(
    inputCols=["total_transactions", "total_spent", "avg_spent", "unique_categories", "unique_products"],
    outputCol="features"
)

# Consumidor Kafka
consumer = KafkaConsumer(
    'customer-data',
    bootstrap_servers='localhost:9092',
    value_deserializer=lambda m: json.loads(m.decode('utf-8')),
    auto_offset_reset='earliest',
    enable_auto_commit=True,
    group_id='spark-consumer-group'
)

# Funções auxiliares
def safe_float(value, default=0.0):
    try:
        return float(value)
    except (ValueError, TypeError):
        return default

def safe_long(value, default=0):
    try:
        return int(value)
    except (ValueError, TypeError):
        return default

# Processar em lotes
batch_size = 100  # Process 100 messages at a time
max_messages = 1000  # Stop after 1000 messages
batch_data = []
message_count = 0

print("Iniciando consumo das mensagens...")

for message in consumer:
    raw_data = message.value
    print(f"Recebido: {raw_data}")

    try:
        data = {
            "id": safe_long(raw_data.get("id")),
            "total_transactions": safe_float(raw_data.get("total_transactions")),
            "total_spent": safe_float(raw_data.get("total_spent")),
            "avg_spent": safe_float(raw_data.get("avg_spent")),
            "first_purchase": raw_data.get("first_purchase", ""),
            "last_purchase": raw_data.get("last_purchase", ""),
            "unique_categories": safe_float(raw_data.get("unique_categories")),
            "unique_products": safe_float(raw_data.get("unique_products"))
        }
        batch_data.append(Row(**data))
        message_count += 1

        # Processar lote quando atingir batch_size ou max_messages
        if len(batch_data) >= batch_size or message_count >= max_messages:
            df = spark.createDataFrame(batch_data, schema=schema)
            df_features = assembler.transform(df)
            pred = modelo_carregado.transform(df_features)
            pred.select("id", "prediction", "probability").show()
            # Salvar lote em CSV
            pred.select("id", "prediction", "probability").write.mode("append").csv(
                f"{base_path}-ml/submission_batch", header=True
            )
            batch_data = []  # Limpar lote

        if message_count >= max_messages:
            break

    except Exception as e:
        print(f"Erro ao processar mensagem: {e}")
        print(f"Mensagem com problema: {raw_data}")
        continue

# Processar quaisquer mensagens restantes
if batch_data:
    df = spark.createDataFrame(batch_data, schema=schema)
    df_features = assembler.transform(df)
    pred = modelo_carregado.transform(df_features)
    pred.select("id", "prediction", "probability").show()
    pred.select("id", "prediction", "probability").write.mode("append").csv(
        f"{base_path}-ml/submission_batch", header=True
    )

consumer.close()
print("Consumer finished.")

Iniciando consumo das mensagens...
Recebido: {'id': 2669945285, 'total_transactions': 1523, 'total_spent': 6320.52, 'avg_spent': 4.15, 'first_purchase': '2012-03-03', 'last_purchase': '2013-04-16', 'unique_categories': 248, 'unique_products': 312}
Recebido: {'id': 2669952782, 'total_transactions': 345, 'total_spent': 2229.0, 'avg_spent': 6.46, 'first_purchase': '2012-03-12', 'last_purchase': '2013-06-16', 'unique_categories': 108, 'unique_products': 145}
Recebido: {'id': 266996275, 'total_transactions': 1258, 'total_spent': 7447.03, 'avg_spent': 5.92, 'first_purchase': '2012-03-02', 'last_purchase': '2013-04-02', 'unique_categories': 191, 'unique_products': 212}
Recebido: {'id': 2670041982, 'total_transactions': 656, 'total_spent': 5417.1, 'avg_spent': 8.26, 'first_purchase': '2012-03-02', 'last_purchase': '2013-06-18', 'unique_categories': 160, 'unique_products': 279}
Recebido: {'id': 267008595, 'total_transactions': 777, 'total_spent': 2728.3, 'avg_spent': 3.51, 'first_purchase': '20

KeyboardInterrupt: 