In [0]:
%pip install -r ../requirements/requirements-dev.txt

In [0]:
from src.transformation.sklearn_preprocessing import PreProcessing
from src.model.sklearn_model import Classifier
from src.tracker.mlflow_tracker import MLFlowTracker
import os
import mlflow
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import make_scorer, f1_score
from pyspark.sql.functions import col
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt


In [0]:
# Enable IPython's autoreload extension in mode 2: modules are re-imported
# before each cell executes, so edits to the project's `src.*` helpers are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [0]:
# Load the processed customer/offer relationship dataset.
# The Databricks runtime exports DATABRICKS_RUNTIME_VERSION, which is used here
# to choose between the Unity Catalog volume path and the repo-relative path.
if "DATABRICKS_RUNTIME_VERSION" in os.environ:
    dataset_path = "/Volumes/workspace/ifood_case/marketing_recommender/data/processed/sv_customer_offer_relationship.json"
else:
    dataset_path = "../data/processed/sv_customer_offer_relationship.json"
    # `spark` is only injected automatically by Databricks notebooks; on a
    # fresh local kernel it is undefined, so create (or reuse) a session here.
    if "spark" not in globals():
        from pyspark.sql import SparkSession

        spark = SparkSession.builder.getOrCreate()

dataset = spark.read.json(dataset_path)

In [0]:
# Columns fed to the model, grouped by how they are preprocessed.
categorical_features = ["gender"]
numerical_features = [
    "total_customer_buy",
    "total_customer_interactions",
    "age",
    "credit_card_limit",
    "customer_total_spent",
    "avg_customer_spent",
    "registered_year",
]
# Label column the classifier learns to predict.
target = "best_offer_id"

# Keep only the customer key, the features and the label, then drop rows that
# are exact duplicates across all of those columns.
model_columns = ["customer_id", *categorical_features, *numerical_features, target]
model_dataset = dataset.select(model_columns).dropDuplicates()

In [0]:
# Sanity check: after de-duplication the modelling table must contain exactly
# as many rows as there are distinct customers in the source dataset.
distinct_customer_count = dataset.select("customer_id").distinct().count()
model_row_count = model_dataset.count()
assert distinct_customer_count == model_row_count

In [0]:
# Quick look at the modelling table: a five-row sample followed by the
# summary statistics Spark computes for its columns.
sample_rows = model_dataset.limit(5)
summary_stats = model_dataset.summary()
display(sample_rows)
display(summary_stats)

In [0]:
# Export the scratch location on the workspace volume under the
# SPARKML_TEMP_DFS_PATH environment variable.
# NOTE(review): presumably read by Spark ML; the training cell in this notebook
# uses scikit-learn, so confirm whether this setting is still required.
sparkml_temp_dfs_path = "/Volumes/workspace/ifood_case/marketing_recommender/sparkml_temp"
os.environ["SPARKML_TEMP_DFS_PATH"] = sparkml_temp_dfs_path

Tive que adaptar o pipeline para scikit-learn (sklearn) em vez de Spark ML devido a uma limitação do cluster serverless gratuito do Databricks. Erro obtido: `[JVM_ATTRIBUTE_NOT_SUPPORTED] Directly accessing the underlying Spark driver JVM using the attribute 'sparkContext' is not supported on serverless compute. If you require direct access to these fields, consider using a single-user cluster. For more details on compatibility and limitations, check: https://docs.databricks.com/release-notes/serverless.html#limitations`


In [0]:
# Train, tune, evaluate and register the offer classifier inside a single
# MLflow run named "classifier_xgboost".
#
# Outputs left in the notebook namespace for later cells:
#   test_results  - metrics returned by classifier.evaluate on the hold-out set
#   cm            - confusion matrix on the hold-out set
#   normalized_cm - same matrix requested with normalize="true"
#   le            - LabelEncoder fitted on the training labels
tracker = MLFlowTracker()  # not referenced below; presumably configures MLflow on construction - TODO confirm
classifier = Classifier()
preprocessing = PreProcessing()

with mlflow.start_run(run_name="classifier_xgboost"):

    # Build the preprocessing stages; missing "gender" values are imputed as "NI".
    preprocessing.build_categorical_transformation_pipeline(
        categorical_features=categorical_features,
        imputer_mapping={"gender": "NI"},
    )
    preprocessing.build_numerical_transformation_pipeline(numerical_features=numerical_features)
    # Reuse the `target` constant instead of repeating the column name literal.
    preprocessing.index_target(target_col=target)
    preprocessing.assemble_pipeline()
    print(len(preprocessing.stages))
    pipeline = classifier.build_pipeline(stages=preprocessing.stages, model=XGBClassifier)

    # The estimator is scikit-learn based, so collect the Spark DataFrame to pandas.
    pandas_df = model_dataset.toPandas()
    X = pandas_df.drop(columns=["customer_id", target])
    y = pandas_df[target]

    # Stratified 80/20 split keeps the class proportions of the target in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42,
        stratify=y
    )

    max_depth_choices = [5, 10, 15]
    num_trees_choices = [50, 100, 150]

    # Hyperparameter search space. The "clf__" prefix presumably targets the
    # classifier step created by Classifier.build_pipeline - verify there.
    paramGrid = {
        "clf__max_depth": max_depth_choices,
        "clf__n_estimators": num_trees_choices,
    }

    # The grid size is the number of parameter combinations, i.e. the product of
    # the option counts. len(paramGrid) would only count the parameter names.
    param_grid_size = 1
    for candidate_values in paramGrid.values():
        param_grid_size *= len(candidate_values)

    mlflow.log_param("param_grid_size", param_grid_size)
    mlflow.log_param("max_depth_values", max_depth_choices)
    mlflow.log_param("num_trees_values", num_trees_choices)

    # Macro-averaged F1 weights every class equally regardless of its frequency.
    evaluator = make_scorer(f1_score, average="macro")

    # Encode the offer ids as integers: fit on the training labels only, then
    # apply the same mapping to the test labels.
    le = LabelEncoder()
    y_train_encoded = le.fit_transform(y_train)
    y_test_encoded = le.transform(y_test)
    # Persist the index -> original label mapping next to the model so the
    # integer predictions of the registered model can be decoded later.
    mlflow.log_dict({"classes": le.classes_.tolist()}, "label_encoder_classes.json")

    cv_model = classifier.cross_validation_tuning(X_train, y_train_encoded, pipeline, paramGrid, evaluator)

    best_params = cv_model.best_params_
    mlflow.log_metric("best_f1_macro_cv", cv_model.best_score_)
    print(best_params)
    mlflow.log_params(best_params)

    # Rebuild the pipeline with the winning hyperparameters and fit it on the
    # full training split.
    # NOTE(review): best_params keys carry the "clf__" prefix; confirm that
    # Classifier.build_pipeline accepts them in that form.
    best_pipeline = classifier.build_pipeline(stages=preprocessing.stages, model=XGBClassifier, **best_params)
    classifier.train(X_train, y_train_encoded, best_pipeline)

    # Evaluate on the hold-out split.
    predictions = classifier.model.predict(X_test)
    test_results = classifier.evaluate(y_test_encoded, predictions)
    cm = classifier.get_confusion_matrix(y_test_encoded, predictions)
    normalized_cm = classifier.get_confusion_matrix(y_test_encoded, predictions, normalize="true")
    mlflow.log_metrics(test_results)

    # Log and register the fitted model; the input example documents the
    # expected feature columns.
    mlflow.sklearn.log_model(
        classifier.model,
        "model",
        registered_model_name="offer-predictions",
        input_example=X_train.head()
    )

97% de acerto na classe “sem oferta”, mas baixo acerto nas classes de oferta. O modelo ainda precisa melhorar.

In [0]:
test_results

In [0]:
# Confusion matrix with raw counts on the hold-out set. Drawn on an explicit
# Axes and titled so it can be told apart from the normalized version.
fig, ax = plt.subplots()
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(ax=ax)
ax.set_title("Confusion matrix - test set (counts)")
plt.show()

In [0]:
# Confusion matrix requested with normalize="true" (each row is expected to sum
# to 1, so the diagonal reads as per-class recall). Titled so it is not
# confused with the raw-count matrix.
fig, ax = plt.subplots()
disp = ConfusionMatrixDisplay(confusion_matrix=normalized_cm)
disp.plot(ax=ax)
ax.set_title("Confusion matrix - test set (normalized by true label)")
plt.show()