In [None]:
# Import python packages
import streamlit as st
import pandas as pd
import numpy as np

# `get_active_session` is not a builtin; without this import the cell fails
# with NameError on a fresh kernel.
from snowflake.snowpark.context import get_active_session

# Handle to the Snowflake session this notebook is attached to
session = get_active_session()

# Add a query tag to the session. This helps with debugging and performance monitoring.
session.query_tag = {
    "origin": "sf_sit-is",
    "name": "cr_notebooks_resolution",
    "version": {"major": 1, "minor": 0},
    "attributes": {"is_quickstart": 1, "source": "notebook"},
}

# Set session context (role first, then database, then schema)
session.use_role("EMBEDDING_MODEL_HOL_USER")
session.use_database("EMBEDDING_MODEL_HOL_DB")
session.use_schema("EMBEDDING_MODEL_HOL_SCHEMA")

# Print the current role, warehouse, and database/schema
print(f"role: {session.get_current_role()} | WH: {session.get_current_warehouse()} | DB.SCHEMA: {session.get_fully_qualified_current_schema()}")
     

In [None]:
# Install the Hugging Face Transformers and PyTorch packages imported by the model cells below
! pip install transformers --quiet
! pip install torch --quiet

In [None]:
# This cell generates random examples to use at the end of the notebook for testing

import random
import pandas as pd

# Canonical (clean) names of the 20 tech companies used as ground truth
clean_tech_companies = [
    "Google", "Microsoft", "Apple", "Amazon",
    "Meta", "Netflix", "Nvidia", "Intel",
    "Oracle", "Salesforce", "Adobe", "Dropbox",
    "Airbnb", "Uber", "Lyft", "Palantir",
    "Snowflake", "Stripe", "Zoom", "Spotify",
]

# Engineering job titles that get mixed into company strings as noise
engineering_titles = [
    "Software Engineer",
    "Backend Engineer",
    "Frontend Developer",
    "DevOps Engineer",
    "ML Engineer",
    "Data Engineer",
    "SRE",
    "Embedded Systems Engineer",
    "Security Engineer",
    "Principal Engineer",
    "Engineering Manager",
]

# College names that get mixed into company strings as noise
colleges = [
    "MIT",
    "Stanford",
    "Berkeley",
    "CMU",
    "Harvard",
    "Waterloo",
    "Georgia Tech",
]

def generate_messy_variants_tech(name):
    """Return five randomly sampled messy spellings of a company name.

    Twelve candidate variants are derived from ``name``: lower/upper case,
    legal and domain suffixes, a "Worked at" prefix, department/location
    decorations, and one variant carrying a random engineering job title.
    Five of them are then drawn without replacement.
    """
    # Drawn before sampling so the random stream is consumed in the same
    # order as when the title is picked while the candidate list is built.
    job_title = random.choice(engineering_titles)

    candidates = [name.lower(), name.upper()]
    candidates.extend(name + suffix for suffix in (" Inc.", " LLC", " Technologies", ".com"))
    candidates.append("Worked at " + name)
    candidates.extend(name + suffix for suffix in (" - Engineering", " (Remote)", " | USA", " Corp"))
    candidates.append(name + " - " + job_title)

    sample_size = min(5, len(candidates))
    return random.sample(candidates, k=sample_size)

def inject_tech_noise(text):
    """Decorate ``text`` with a random job title or college and return one variant.

    Five decorated forms are built (title suffix, "alum" prefix, college
    suffix, title after a pipe, and an "Ex-... engineer" wrapper) and one of
    them is picked at random.
    """
    # The four draws below happen in the same order as the decorated forms are
    # listed, so a seeded run consumes the random stream identically.
    dash_title = random.choice(engineering_titles)
    alum_college = random.choice(colleges)
    trailing_college = random.choice(colleges)
    pipe_title = random.choice(engineering_titles)

    decorated_forms = (
        "{} - {}".format(text, dash_title),
        "{} alum, {}".format(alum_college, text),
        "{}, {}".format(text, trailing_college),
        "{} | {}".format(text, pipe_title),
        "Ex-{} engineer".format(text),
    )
    return random.choice(decorated_forms)

def introduce_typos(text, max_typos=2):
    """Return ``text`` with between 1 and ``max_typos`` random character typos.

    Each typo is chosen at random from three kinds: delete a character, swap
    two adjacent characters, or replace a character with a random lowercase
    ASCII letter.

    Args:
        text: String to corrupt.
        max_typos: Upper bound on the number of typos to apply.

    Returns:
        The corrupted string. ``text`` is returned unchanged when
        ``max_typos`` is less than 1; an empty string stays empty.
    """
    # random.randint(1, max_typos) raises ValueError when max_typos < 1
    if max_typos < 1:
        return text

    chars = list(text)
    for _ in range(random.randint(1, max_typos)):
        if not chars:
            break  # nothing left to corrupt
        idx = random.randint(0, len(chars) - 1)
        typo_type = random.choice(["delete", "swap", "replace"])
        if typo_type == "delete":
            del chars[idx]
        elif typo_type == "swap":
            if len(chars) >= 2:
                # On the last character, swap with the left neighbour instead
                # of silently skipping the typo.
                neighbour = idx + 1 if idx < len(chars) - 1 else idx - 1
                chars[idx], chars[neighbour] = chars[neighbour], chars[idx]
        elif typo_type == "replace":
            chars[idx] = random.choice("abcdefghijklmnopqrstuvwxyz")
    return ''.join(chars)

# Seed Python's RNG so the randomly generated variants (and the noise/typos
# applied further down in this cell) are reproducible, in line with the fixed
# random_state already used for the pandas sampling below.
random.seed(123)

# Generate synthetic tech dataset: one record per (messy variant, canonical name) pair
synthetic_tech_data = [
    {"messy_company_name": variant, "canonical_company_name": company}
    for company in clean_tech_companies
    for variant in generate_messy_variants_tech(company)
]

# Sample 100 messy entries (with replacement, so individual rows may repeat)
synthetic_df = pd.DataFrame(synthetic_tech_data)
synthetic_df = synthetic_df.sample(n=100, replace=True, random_state=123).reset_index(drop=True)

# Add noise and typos.
# One random draw per row picks the treatment: below 0.25 -> job-title/college
# noise, 0.25 up to 0.6 -> typos, otherwise the name is left as generated.
# A single draw is range-checked here; comparing two separate draws against
# 0.25 and 0.6 would not test a range at all.
noisy_tech_names = []
for messy_name in synthetic_df["messy_company_name"]:
    roll = random.random()
    if roll < 0.25:
        noisy_tech_names.append(inject_tech_noise(messy_name))
    elif roll < 0.6:
        noisy_tech_names.append(introduce_typos(messy_name))
    else:
        noisy_tech_names.append(messy_name)

# Overwrite the messy column with the noised values (canonical names are untouched)
synthetic_df["messy_company_name"] = noisy_tech_names

In [None]:
# Upload the pandas frame as a Snowpark DataFrame ...
synthetic_snowpark_df = session.create_dataframe(synthetic_df)
# ... and persist it, replacing any existing "candidate_database" table
(
    synthetic_snowpark_df
    .write
    .mode("overwrite")
    .save_as_table("candidate_database")
)

In [None]:
# Display the Snowpark DataFrame that was saved to the "candidate_database" table
synthetic_snowpark_df

# Model Import - Run in notebook to test

In [None]:
# torch is otherwise first imported by the next cell; importing it here lets
# this cell run on a fresh kernel instead of failing with NameError.
import torch

# True when a CUDA GPU is visible to this notebook runtime
torch.cuda.is_available()

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the NLI checkpoint and its tokenizer from the Hugging Face hub
model_name = "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.to(device)
model.eval()  # inference mode (disables dropout)

# Premise/hypothesis pair used as a quick smoke test
premise = "Angela Merkel ist eine Politikerin in Deutschland und Vorsitzende der CDU"
hypothesis = "Emmanuel Macron is the President of France"

# NOTE: `input` shadows the Python builtin; the name is kept because the next
# cell displays it.
input = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt").to(device)

# Inference only: skip autograd bookkeeping, and pass the whole encoding
# (attention mask included) rather than just the input ids. The encoding is
# already on `device`, so no further .to(device) is needed.
with torch.no_grad():
    output = model(**input)

# Turn the logits of the single example into percentages per label
prediction = torch.softmax(output["logits"][0], -1).tolist()
label_names = ["entailment", "neutral", "contradiction"]
prediction = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
print(prediction)

In [None]:
# Inspect the tokenized premise/hypothesis encoding built in the previous cell
# (there `input` is rebound to the tokenizer output, shadowing the builtin)
input

In [None]:
import transformers
from snowflake.ml.registry import Registry

# Create a Model Registry handle in the session's current database and schema
reg = Registry(
    session=session,
    database_name=session.get_current_database(),
    schema_name=session.get_current_schema(),
)

# 1. Create the Hugging Face pipeline with softmax activation.
#    `top_k` controls how many class scores are returned; the deprecated
#    `return_all_scores` flag is superseded by it and is therefore not passed.
res_pipeline = transformers.pipeline(
    task='text-classification',
    model='MoritzLaurer/mDeBERTa-v3-base-mnli-xnli',
    tokenizer='MoritzLaurer/mDeBERTa-v3-base-mnli-xnli',
    function_to_apply='softmax',
    device="cuda:0",     # or -1 for CPU
    top_k=6,
    batch_size=3,
    torch_dtype='float16'
)

# Log the pipeline to the registry; `mv` is the resulting model version handle
# used by the deployment and inference cells below.
mv = reg.log_model(
    res_pipeline,
    model_name="MoritzLaurer_DeBERTa_nli_v1",
    version_name="v1",
    pip_requirements=["transformers", "torch", "pyarrow<19.0.0"],
)


In [None]:
# Show the models in the registry (the version logged above should be listed)
reg.show_models()

In [None]:
# 5. Deploy the model into Snowpark Container Services (SPCS)
#    - image repo is addressed as <current database>.<current schema>.MY_INFERENCE_IMAGES
#    - the external access integration allows access to pypi to build the image
mv.create_service(
    service_name='nli_bert_svc',
    service_compute_pool='GPU_NV_S_COMPUTE_POOL',
    image_repo="{}.{}.MY_INFERENCE_IMAGES".format(
        session.get_current_database(),
        session.get_current_schema(),
    ),
    build_external_access_integration="ALLOW_ALL_INTEGRATION",
    ingress_enabled=True,
    gpu_requests="1",
    max_instances=2,
)



In [None]:
-- SHOW SERVICES;
-- Suspend the NLI inference service.
-- NOTE(review): later cells still call this service through
-- mv.run(..., service_name='nli_bert_svc'); confirm it resumes automatically,
-- or run this cell last.
ALTER SERVICE NLI_BERT_SVC SUSPEND;

In [None]:
-- List the endpoints of the NLI inference service
SHOW ENDPOINTS IN SERVICE NLI_BERT_SVC;

In [None]:
from pprint import pprint

# Three hand-written examples; each packs two sentences into one
# comma-separated string.
sample_inputs = [
    "I love this game, it's amazing!, I am having a great time playing this game",
    "This is the best game ever, absolutely the worst game I ever played",
    "pizza is the best food ever, I can't stand this video game",
]

# Single-column frame named 'text', sent to the deployed service for scoring
test_df = pd.DataFrame(sample_inputs, columns=['text'])
results = mv.run(test_df, service_name='nli_bert_svc')

# Pretty-print the 'labels' column of the response, one entry per input row
pprint(results['labels'].tolist())

In [None]:
# Build the model input: "<messy name>, <canonical name>" for every row
synthetic_df['text'] = (
    synthetic_df['messy_company_name']
    .add(', ')
    .add(synthetic_df['canonical_company_name'])
)
print(synthetic_df.shape)
synthetic_df.head()

In [None]:
from pprint import pprint

# Score every synthetic "<messy>, <canonical>" string with the deployed service
test_df = synthetic_df.loc[:, ['text']]
results = mv.run(test_df, service_name='nli_bert_svc')

# Pretty-print the 'labels' column of the response, one entry per input row
pprint(results['labels'].tolist())