# Modelo 3

Este modelo tem uma analise mais aprofundada das features 

In [10]:
# Basic imports


# Basic imports

import json
from pathlib import Path
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from dotenv import load_dotenv
load_dotenv('.env')
import os
import plotly.graph_objects as go

from dotenv import load_dotenv
import os
import csv
import os
import datetime
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.functions import col, when, lit, to_date, datediff
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, LinearSVC, DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler
import plotly.express as px
import plotly.express as px
import pandas as pd

In [11]:
# Build SparkSession
spark = SparkSession.builder.appName("DataPreparation").getOrCreate()
base_path = os.getenv('BASE_PATH')

In [12]:
# Reading data - trainHistory.csv
df_trainHistory = spark.read.csv(
        f"{base_path}/trainHistory.csv.gz", 
        header=True, sep=',', inferSchema=True
    )

In [13]:
print("Info for customer_features_improved.csv:")
df_customer_features = spark.read.csv(f"{base_path}/improved/customer_features_improved.csv",
                                      header=True, inferSchema=True)
df_customer_features.show(5)
df_customer_features.printSchema()


Info for customer_features_improved.csv:
+----------+------------------+-----------+---------+--------------+-------------+-----------------+---------------+--------------------+-----------------+---------------+------------------------------+--------------------------+----------+---------------------------+-------------+
|        id|total_transactions|total_spent|avg_spent|first_purchase|last_purchase|unique_categories|unique_products|offer_purchase_count|offer_total_spent|offer_avg_spent|days_since_last_offer_purchase|avg_days_between_purchases|offervalue|offer_value_per_transaction|unique_chains|
+----------+------------------+-----------+---------+--------------+-------------+-----------------+---------------+--------------------+-----------------+---------------+------------------------------+--------------------------+----------+---------------------------+-------------+
|4640504730|              1347|     6132.2|     4.55|    2012-03-11|   2013-07-20|              252|          

# Analise das correlações entre features

In [14]:
cols_corr = [
    "total_transactions", "total_spent", "avg_spent",
    "unique_categories", "unique_products",
    "offer_purchase_count", "offer_total_spent", "offer_avg_spent",
    "days_since_last_offer_purchase",
    "offer_purchase_count", "offer_total_spent", "offer_avg_spent",
    "days_since_last_offer_purchase",
    "unique_chains", "offervalue", "offer_value_per_transaction",
    "avg_days_between_purchases"
]

col_features = "features"
sample_df = df_customer_features.sample(fraction=0.1, seed=42)

assembler = VectorAssembler(inputCols=cols_corr, outputCol=col_features)
df_features = assembler.transform(sample_df).select(col_features)

corr_matrix = Correlation.corr(df_features, col_features).collect()[0][0].toArray().tolist()

print(f'Computed correlations among {cols_corr}:')
fig = px.imshow(corr_matrix, title='Correlations',
                x = cols_corr, y = cols_corr,
                color_continuous_scale='Sunsetdark',  # Sunsetdark, RdBu_r
                text_auto='.2f',
                zmin=-1, zmax=1)
fig.update_layout(
    xaxis_title="Funcionalidades",
    yaxis_title="Funcionalidades",
    width=950,
    height=800
)
fig.show()

Computed correlations among ['total_transactions', 'total_spent', 'avg_spent', 'unique_categories', 'unique_products', 'offer_purchase_count', 'offer_total_spent', 'offer_avg_spent', 'days_since_last_offer_purchase', 'offer_purchase_count', 'offer_total_spent', 'offer_avg_spent', 'days_since_last_offer_purchase', 'unique_chains', 'offervalue', 'offer_value_per_transaction', 'avg_days_between_purchases']:


# Preparação e Divisão dos Dados de Treino e Validação

In [15]:
train_df = df_customer_features.join(
    df_trainHistory.select("id", "repeater"),
    "id",
    "inner"
).withColumn("target", when(col("repeater") == "t", 1).otherwise(0))

train_split, val_split = train_df.randomSplit([0.7, 0.3], seed=42)

# Cache para eficiência
train_split.cache()
val_split.cache()

# Verificar tamanhos
print(f"Treino: {train_split.count()} linhas")
print(f"Validação: {val_split.count()} linhas")

# 2. Liberar memória do train_df
train_df.unpersist()

Treino: 112284 linhas
Validação: 47773 linhas


DataFrame[id: bigint, total_transactions: int, total_spent: double, avg_spent: double, first_purchase: date, last_purchase: date, unique_categories: int, unique_products: int, offer_purchase_count: int, offer_total_spent: double, offer_avg_spent: double, days_since_last_offer_purchase: int, avg_days_between_purchases: double, offervalue: double, offer_value_per_transaction: double, unique_chains: int, repeater: string, target: int]

# Preparação de Dados e Configuração Inicial

In [16]:
# Combinar features em um vetor

feature_cols = [
    "total_transactions",           # Alta importância (0.3538)
    "avg_spent",                    # Alta importância (0.2218)
    "unique_products",              # Moderada importância (0.1271)
    "offer_purchase_count",         # Comportamento na oferta
    "days_since_last_offer_purchase", # Recência
    "offer_purchase_count",     # Comportamento na categoria
    "offer_total_spent",        # Moderada importância (0.0999)
    "days_since_last_offer_purchase", # Recência na categoria
    "unique_chains",                # Diversidade de lojas
    "offervalue",                   # Valor da oferta
    "offer_value_per_transaction",  # Incentivo por transação
    "avg_days_between_purchases"    # Frequência
]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
train_assembled = assembler.transform(train_split)
validation_assembled = assembler.transform(val_split)

modelos = {
    "LogisticRegression": LogisticRegression(labelCol="target", featuresCol="features"),
    "RandomForest":  RandomForestClassifier(labelCol="target", featuresCol="features"),
    "GradientBoosting": GBTClassifier(labelCol="target", featuresCol="features"),
    "LinearSVC": LinearSVC(labelCol="target", featuresCol="features"),
    "DecisionTree": DecisionTreeClassifier(labelCol="target", featuresCol="features")
}

evaluator = BinaryClassificationEvaluator(labelCol="target", metricName="areaUnderROC")
results = {}
detailed_results = {}

def calculate_metrics(predictions):
    """Calculate detailed classification metrics from predictions"""
    # Get confusion matrix components
    confusion_matrix = predictions.groupBy('target', 'prediction').count()
    
    tp = confusion_matrix.filter((confusion_matrix.target == 1) & (confusion_matrix.prediction == 1)).select('count').first()
    tn = confusion_matrix.filter((confusion_matrix.target == 0) & (confusion_matrix.prediction == 0)).select('count').first()
    fp = confusion_matrix.filter((confusion_matrix.target == 0) & (confusion_matrix.prediction == 1)).select('count').first()
    fn = confusion_matrix.filter((confusion_matrix.target == 1) & (confusion_matrix.prediction == 0)).select('count').first()

    tp_val = tp['count'] if tp else 0
    tn_val = tn['count'] if tn else 0
    fp_val = fp['count'] if fp else 0
    fn_val = fn['count'] if fn else 0
    
    # Calculate metrics
    accuracy = (tp_val + tn_val) / (tp_val + tn_val + fp_val + fn_val) if (tp_val + tn_val + fp_val + fn_val) > 0 else 0
    precision = tp_val / (tp_val + fp_val) if (tp_val + fp_val) > 0 else 0
    recall = tp_val / (tp_val + fn_val) if (tp_val + fn_val) > 0 else 0
    specificity = tn_val / (tn_val + fp_val) if (tn_val + fp_val) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'specificity': specificity,
        'f1_score': f1_score,
        'confusion_matrix': {'TP': tp_val, 'TN': tn_val, 'FP': fp_val, 'FN': fn_val}
    }

## Treino e Avaliação de Modelos de Classificação com Validação Cruzada


In [17]:
for nome, modelo in modelos.items():
    print(f"\n{'='*50}")
    print(f"Training {nome}...")
    print(f"{'='*50}")
    
    # Configurar validação cruzada
    paramGrid = ParamGridBuilder().build()
    if nome == "RandomForest":
        paramGrid = ParamGridBuilder() \
            .addGrid(modelo.numTrees, [50]) \
            .addGrid(modelo.maxDepth, [10]) \
            .addGrid(modelo.seed, [42]) \
            .build()
    elif nome == "GradientBoosting":
        paramGrid = ParamGridBuilder() \
            .addGrid(modelo.maxIter, [50]) \
            .addGrid(modelo.maxDepth, [5]) \
            .addGrid(modelo.seed, [42]) \
            .build()
    elif nome == "LinearSVC":
        paramGrid = ParamGridBuilder() \
            .addGrid(modelo.maxIter, [100]) \
            .addGrid(modelo.regParam, [0.1]) \
            .build()
    elif nome == "DecisionTree":
        paramGrid = ParamGridBuilder() \
            .addGrid(modelo.maxDepth, [10]) \
            .addGrid(modelo.seed, [42]) \
            .build()
    elif nome == "LogisticRegression":
        paramGrid = ParamGridBuilder() \
            .addGrid(modelo.maxIter, [100]) \
            .addGrid(modelo.regParam, [0.1]) \
            .build()

    crossval = CrossValidator(
        estimator=modelo,
        estimatorParamMaps=paramGrid,
        evaluator=evaluator,
        numFolds=3
    )
    
    # Treinar modelo
    cv_model = crossval.fit(train_assembled)
    cv_model.save(f"modelos_segunda_melhoria/{nome}")

    # Avaliar na validação
    predictions = cv_model.transform(validation_assembled)
    auc = evaluator.evaluate(predictions)
    results[nome] = auc
    
    metrics = calculate_metrics(predictions)
    detailed_results[nome] = {**metrics, 'auc_roc': auc}


    print(f"{nome} AUC-ROC: {auc:.4f}")
    print(f"{nome} Accuracy: {metrics['accuracy']:.4f}")
    print(f"{nome} Precision: {metrics['precision']:.4f}")
    print(f"{nome} Recall: {metrics['recall']:.4f}")
    print(f"{nome} Specificity: {metrics['specificity']:.4f}")
    print(f"{nome} F1 Score: {metrics['f1_score']:.4f}")

melhor_modelo_nome = max(results, key=results.get)
print(f"Melhor modelo: {melhor_modelo_nome} com AUC-ROC {results[melhor_modelo_nome]}")


Training LogisticRegression...
LogisticRegression AUC-ROC: 0.5415
LogisticRegression Accuracy: 0.7280
LogisticRegression Precision: 0.8667
LogisticRegression Recall: 0.0010
LogisticRegression Specificity: 0.9999
LogisticRegression F1 Score: 0.0020

Training RandomForest...
RandomForest AUC-ROC: 0.6533
RandomForest Accuracy: 0.7289
RandomForest Precision: 0.6190
RandomForest Recall: 0.0110
RandomForest Specificity: 0.9975
RandomForest F1 Score: 0.0216

Training GradientBoosting...
GradientBoosting AUC-ROC: 0.6568
GradientBoosting Accuracy: 0.7302
GradientBoosting Precision: 0.5538
GradientBoosting Recall: 0.0459
GradientBoosting Specificity: 0.9862
GradientBoosting F1 Score: 0.0848

Training LinearSVC...
LinearSVC AUC-ROC: 0.5537
LinearSVC Accuracy: 0.7279
LinearSVC Precision: 1.0000
LinearSVC Recall: 0.0004
LinearSVC Specificity: 1.0000
LinearSVC F1 Score: 0.0008

Training DecisionTree...
DecisionTree AUC-ROC: 0.4867
DecisionTree Accuracy: 0.7275
DecisionTree Precision: 0.4973
Decisio

In [18]:

results_data = [
    ("RandomForest", 0.6533, None),  
    ("LogisticRegression", 0.5415, None),  
    ("DecisionTree", 0.4867, None),  
    ("GradientBoostedTrees", 0.6568, None), 
    ("LinearSVC", 0.5537, None)  
]

# Create a DataFrame for Plotly
data = {
    "Model": [name for name, _, _ in results_data],
    "AUC-ROC": [auc for _, auc, _ in results_data]
}

df = pd.DataFrame(data)


fig = px.bar(
    df,
    x="Model",
    y="AUC-ROC",
    title="Comparação entre modelos (AUC-ROC)",
    labels={"AUC-ROC": "AUC-ROC Score", "Model": "Model Name"},
    color="Model",
    color_discrete_sequence=px.colors.qualitative.Plotly
)


fig.update_layout(
    xaxis_title="Modelo",
    yaxis_title="Pontuação AUC-ROC",
    yaxis_range=[0, 1],  
    showlegend=False,
    title_x=0.5
)


fig.show()

In [None]:
import plotly.express as px
import pandas as pd

# Dados
data = {
    'Modelo': ['LogisticRegression']*6 + ['RandomForest']*6 + ['GradientBoosting']*6 + ['LinearSVC']*6 + ['DecisionTree']*6,
    'Métrica': ['AUC-ROC', 'Accuracy', 'Precision', 'Recall', 'Specificity', 'F1 Score']*5,
    'Valor': [0.5415, 0.7280, 0.8667, 0.0010, 0.9999, 0.0020,
              0.6533, 0.7289, 0.6190, 0.0110, 0.9975, 0.0216,
              0.6568, 0.7302, 0.5538, 0.0459, 0.9862, 0.0848,
              0.5537, 0.7279, 1.0000, 0.0004, 1.0000, 0.0008,
              0.4867, 0.7275, 0.4973, 0.0778, 0.9706, 0.1346]
}
df = pd.DataFrame(data)

fig = px.bar(df, x='Modelo', y='Valor', color='Métrica', barmode='group',
             title='Desempenho dos Modelos por Métrica',
             labels={'Valor': 'Valor da Métrica', 'Modelo': 'Modelos'},
             color_discrete_sequence=px.colors.qualitative.Plotly)
fig.show()