In [1]:
import os
import sys
from pathlib import Path

import datarobot as dr
from dotenv import load_dotenv

# The notebook should be executed from the project root directory.
# `_correct_path` acts as a sentinel so that re-running this cell in the same
# kernel does not step up another directory level.
if "_correct_path" not in locals():
    os.chdir("..")
    sys.path.append(".")
    # Fixed: the message previously ended with a stray ")" after the path.
    print(f"changed dir to {Path('.').resolve()}")
    _correct_path = True
# Load variables from a .env file (if any) before the client is created.
load_dotenv()
# NOTE(review): presumably picks up the DataRobot endpoint/token from the
# environment populated above — confirm.
client = dr.Client()

changed dir to /home/notebooks/storage)


In [2]:
import textwrap

from infra.settings_main import project_name
from nbo.custom_metrics import metrics_manager
from nbo.schema import (
    AppDataScienceSettings,
    AppInfraSettings,
    LLMModelSpec,
    OutcomeDetail,
    association_id,
)

# Baseline values for the custom metrics, obtained from the project's metrics
# manager; forwarded to AppDataScienceSettings in a later cell.
custom_metric_baselines = metrics_manager.get_baseline_values()

In [3]:
# Data-science facing settings for the app: page texts, the record identifier
# column, prompt templates, outcome labels, tone/verbosity choices and LLM pricing.
app_ds_settings = AppDataScienceSettings(
    page_title="Opportunity Judging Helper",
    # NOTE(review): the subtitle talks about "transaction alerts" while the prompts
    # below describe lead scoring — confirm the intended wording.
    page_subtitle=textwrap.dedent("""\
        Combine predictive and generative AI to help review customer transaction alerts.
        All you need to do is choose a transaction and hit submit!"""),
    record_identifier={
        "column_name": "社名",
        "display_name": "社名",
    },
    # NOTE(review): the recorded output of this cell is a pydantic ValidationError
    # ("Input should be a valid string") for text_explanation_feature=None. The schema
    # is not visible here — either pass a text column name or make the field optional
    # in nbo.schema; confirm which is intended.
    text_explanation_feature=None,  # Optional, include when a text variable is present and ngrams feature explanations are desired
    no_text_gen_label=None,  # Optional, include in predictions where an email is not desired
    default_number_of_explanations=3,
    target_probability_description="the likelihood of contract closure for lead",  # Preceded in prompt by "Feature is increasing/decreasing"
    # Placeholders in braces are left unformatted here; they are presumably filled
    # in by the application at request time — TODO confirm.
    email_prompt=textwrap.dedent("""\
        Draft an email to the sales team regarding the predicted likelihood of contract closure for lead {selected_record}.
        The predicted outcome is {prediction_label}.
        The email should contain a subject line and body.
        Keep the email tone {tone}. Keep the length {verbosity}. Try not to use many emojis.

        The following factors influenced this prediction:

        {rsp}"""),
    outcome_details=[  # Each item should be a target value with its corresponding label and optional description
        OutcomeDetail(
            prediction=0,
            label="Low likelihood of closing",
            description="Based on the analysis, this lead has a low probability of resulting in a closed contract and may require nurturing or disqualification.",
        ),
        OutcomeDetail(
            prediction=1,
            label="High likelihood of closing",
            description="Based on the analysis, this lead has a high probability of resulting in a closed contract and warrants proactive engagement.",
        ),
    ],
    custom_metric_baselines=custom_metric_baselines,
    association_id_column_name=association_id,
    tones=[
        "authoritative and expert",
        "educational and informative",
        "formal and elevated",
        "friendly and casual",
        "lighthearted and funny",
        "witty and playful",
    ],
    verbosity=["short and sweet", "normal", "long and detailed"],
    # Fixed prompt typos that were being sent to the LLM verbatim:
    # "in order to to decide", "Output by Japansese." and a trailing space.
    system_prompt=textwrap.dedent("""\
        You are a sales performance analyst named Ai and you work for SalesForward Inc., a sales consulting firm.
        Your job is to review lead customer data and predict the likelihood of contract closure.
        You have detailed information about each lead customer and their engagement behavior.
        You also have a list of important factors that explain why a particular lead may have a high or low likelihood of closing.
        The goal is to incorporate these factors into a report addressed to the sales team
        in order to decide on the appropriate sales strategy and level of engagement.
        Output in Japanese."""),
    model_spec=LLMModelSpec(
        input_price_per_1k_tokens=0.001,  # update these values based on the model's pricing
        output_price_per_1k_tokens=0.002,
    ),
)

ValidationError: 1 validation error for AppDataScienceSettings
text_explanation_feature
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.9/v/string_type

In [None]:
from datarobot_pulumi_utils.schema.training import (
    AdvancedOptionsArgs,
    AnalyzeAndModelArgs,
    AutopilotRunArgs,
)

# Display names of the DataRobot assets created below; each embeds the project name.
registered_model_name = f"NBO [{project_name}]"
dataset_name = f"NBO Opp Training Data [{project_name}]"
use_case_name = f"NBO Opp [{project_name}]"
# NOTE(review): this description reads like a leftover from another template —
# the training file below is lead-scoring data. Confirm before changing it.
use_case_description = "Suspicious Activity Monitoring"

# Training data file; the path is relative, presumably to the project root.
file_path = "assets/lead_scoring_demo_data_train.csv"

# Autopilot configuration: quick mode, LogLoss metric, fixed seed, and a
# target column whose positive class is 1.
autopilotrun_args = AutopilotRunArgs(
    advanced_options_config=AdvancedOptionsArgs(seed=42),
    analyze_and_model_config=AnalyzeAndModelArgs(
        target="契約成立",
        positive_class=1,
        metric="LogLoss",
        mode="quick",
    ),
    name=f"NBO Opp AutoPilot Run [{project_name}]",
)

In [None]:
from datarobotx.idp.autopilot import get_or_create_autopilot_run
from datarobotx.idp.datasets import get_or_create_dataset_from_file
from datarobotx.idp.registered_model_versions import (
    get_or_create_registered_leaderboard_model_version,
)
from datarobotx.idp.use_cases import get_or_create_use_case

In [None]:
print(f"Creating Use Case {use_case_name}")

# An id supplied through DATAROBOT_DEFAULT_USE_CASE takes precedence; the
# get-or-create helper is only called when that variable is not set.
use_case_id = os.environ.get("DATAROBOT_DEFAULT_USE_CASE")
if use_case_id is None:
    use_case_id = get_or_create_use_case(
        name=use_case_name,
        description=use_case_description,
        endpoint=client.endpoint,
        token=client.token,
    )

In [None]:
print(f"Creating Dataset {dataset_name}")

# Register the training file as a dataset attached to the use case and keep
# the returned id for the Autopilot run and the infra settings.
dataset_id = get_or_create_dataset_from_file(
    endpoint=client.endpoint,
    token=client.token,
    name=dataset_name,
    use_cases=use_case_id,
    file_path=file_path,
)

In [None]:
print(f"Creating Autopilot Run {autopilotrun_args.name}")

# The Autopilot arguments are dumped to plain JSON-compatible values and
# splatted into the helper alongside the connection and dataset details.
project_id = get_or_create_autopilot_run(
    endpoint=client.endpoint,
    token=client.token,
    use_case=use_case_id,
    dataset_id=dataset_id,
    **autopilotrun_args.model_dump(mode="json"),
)

In [None]:
# Id of the model DataRobot recommends for this project.
recommended_model_id = dr.ModelRecommendation.get(project_id).model_id  # type: ignore[union-attr,attr-defined]

# Fetch the model outside the try block: a later cell reads `model`, so a failed
# fetch must surface here instead of leaving `model` undefined (or stale from a
# previous kernel run) behind a silently swallowed exception.
model = dr.Model.get(project_id, recommended_model_id)  # type: ignore[attr-defined]

try:
    prediction_threshold = model.get_roc_curve(
        source="validation"
    ).get_best_f1_threshold()
except Exception as exc:
    # Best effort: fall back to no explicit threshold when the ROC curve cannot
    # be retrieved, but report why rather than failing silently.
    print(f"Could not derive a prediction threshold from the ROC curve ({exc!r}); using None")
    prediction_threshold = None

In [None]:
import pandas as pd

# Report the type of the recommended model, followed by a "Metrics:" heading.
print(f"Best model: {model.model_type}\n\nMetrics:")

# Bare trailing expression so the notebook renders the metrics as a table.
pd.DataFrame.from_records(model.metrics)

In [None]:
print("Creating Registered Model Version...")

# Register the recommended leaderboard model under the project's registered
# model name, carrying over the threshold computed earlier (may be None).
registered_model_version_id = get_or_create_registered_leaderboard_model_version(
    endpoint=client.endpoint,
    token=client.token,
    registered_model_name=registered_model_name,
    model_id=recommended_model_id,
    prediction_threshold=prediction_threshold,
)

In [None]:
# Collect the ids of every asset created above into one settings object.
app_infra_settings = AppInfraSettings(
    use_case_id=use_case_id,
    project_id=project_id,
    scoring_dataset_id=dataset_id,  # the training dataset is reused for scoring
    registered_model_name=registered_model_name,
    registered_model_version_id=registered_model_version_id,
)

In [None]:
import yaml

from infra.settings_main import (
    model_training_output_ds_settings,
    model_training_output_infra_settings,
)

# Persist both settings objects as YAML at the paths provided by infra.settings_main.
# The files are opened explicitly as UTF-8: the settings contain non-ASCII text and
# are dumped with allow_unicode=True, so relying on the platform's default encoding
# can raise UnicodeEncodeError (or produce files that do not read back as UTF-8).
for _settings_path, _settings in (
    (model_training_output_ds_settings, app_ds_settings),
    (model_training_output_infra_settings, app_infra_settings),
):
    with open(_settings_path, "w", encoding="utf-8") as f:
        yaml.safe_dump(_settings.model_dump(mode="json"), f, allow_unicode=True)