In [6]:
# %% [markdown]
# # Autodesk Customer Fit — MAML (Few-Shot) on Synthetic Data
# Tasks = segments by: industry|region|product_affinity
# Train MAML on many tasks; adapt to a new segment with k labeled examples.

# %% [code]
import numpy as np
import pandas as pd
import random
from datetime import datetime, timedelta
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score

np.random.seed(11); random.seed(11); torch.manual_seed(11)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [4]:
# %% [code]
def gen_synth(N=8000, today=datetime(2025,9,8)):
    regions = ["AMER", "EMEA", "APAC"]
    industries = ["AEC", "Manufacturing", "Media_Entertainment", "Education", "Government"]
    product_families = ["AutoCAD", "Revit", "Fusion", "Maya", "3dsMax"]
    lead_sources = ["Website", "Paid_Search", "Paid_Social", "Events", "Partner", "Email"]
    campaign_types = ["Trial", "Webinar", "Ebook", "Case_Study", "Promo", "Nurture"]
    seniority = ["Individual", "Manager", "Director", "VP", "C-Level"]
    titles = ["Design Engineer","Mechanical Engineer","Architect","BIM Manager","CAD Manager",
              "Product Designer","VFX Supervisor","Production Manager","IT Admin","Procurement Manager",
              "CIO","CTO","Head of Engineering"]
    company_sizes = ["1-10","11-50","51-200","201-1000","1001-5000","5001+"]

    def wchoice(items, w): return random.choices(items, weights=w, k=1)[0]
    w_region=[0.45,0.30,0.25]; w_ind=[0.35,0.30,0.20,0.10,0.05]
    w_prod=[0.30,0.25,0.20,0.15,0.10]; w_source=[0.35,0.20,0.10,0.15,0.10,0.10]
    w_camp=[0.25,0.20,0.15,0.10,0.15,0.15]; w_sen=[0.45,0.25,0.15,0.10,0.05]
    w_title=[0.10,0.10,0.10,0.08,0.08,0.10,0.06,0.06,0.10,0.10,0.05,0.04,0.03]
    w_size=[0.15,0.20,0.25,0.20,0.12,0.08]

    start = today - timedelta(days=365)
    lead_dates = [start + timedelta(days=int(np.random.beta(2,5)*365)) for _ in range(N)]
    rows = []
    for i in range(N):
        g = np.random.choice([1,2], p=[0.6,0.4])  # 60% cold-start
        region = wchoice(regions, w_region)
        industry = wchoice(industries, w_ind)
        product_interest = wchoice(product_families, w_prod)
        source = wchoice(lead_sources, w_source)
        campaign = wchoice(campaign_types, w_camp)
        senior = wchoice(seniority, w_sen)
        title = wchoice(titles, w_title)
        size = wchoice(company_sizes, w_size)
        created_at = lead_dates[i]

        base_eng = {"Trial":10,"Webinar":8,"Ebook":6,"Case_Study":5,"Promo":4,"Nurture":3}[campaign]
        site_pages = max(0, int(np.random.normal(3 + base_eng*0.2, 2)))
        email_opens = max(0, int(np.random.normal(1 + base_eng*0.15, 1.5)))
        email_clicks = max(0, np.random.binomial(n=max(1, email_opens), p=0.25))
        trial_started = int(campaign == "Trial") * np.random.binomial(1, 0.7 if source in ["Website","Paid_Search"] else 0.4)
        webinar_attended = int(campaign == "Webinar") * np.random.binomial(1, 0.6)

        if g==2:
            products_owned = random.sample(product_families, k=np.random.choice([1,2,3], p=[0.6,0.3,0.1]))
            seats = int(np.random.lognormal(mean=3.2, sigma=0.7)); seats=max(seats,1)
            seat_util = float(np.clip(np.random.normal(0.65, 0.15), 0, 1))
            renewal_days = int(np.random.normal(120, 90))
            tickets = max(0, int(np.random.normal(3, 4)))
            csat = float(np.clip(np.random.normal(0.78, 0.12), 0, 1))
            tenure = max(1, int(np.random.normal(30, 18)))
            arr_k = max(1, int(seats*np.random.uniform(0.5,2.0)))
            expansion = np.random.choice([-1,0,1], p=[0.15,0.65,0.20])
            existing = 1
        else:
            products_owned=[]; seats=np.nan; seat_util=np.nan; renewal_days=np.nan
            tickets=np.nan; csat=np.nan; tenure=np.nan; arr_k=np.nan; expansion=np.nan
            existing = 0

        domain_available = np.random.binomial(1, 0.85 if g==2 else 0.55)

        z = -0.5
        z += 0.6 if campaign=="Trial" else 0.0
        z += 0.3 if webinar_attended else 0.0
        z += 0.25*np.log1p(site_pages) + 0.20*np.log1p(email_clicks)
        z += 0.2 if industry in ["AEC","Manufacturing"] else -0.05
        z += 0.1 if region=="AMER" else 0.0
        z += 0.15 if senior in ["Manager","Director"] else (0.05 if senior in ["VP","C-Level"] else 0.0)
        if g==2:
            z += 0.35*(1-np.tanh(max(0,renewal_days)/365))
            z += 0.4*seat_util
            z -= 0.1*np.tanh((tickets if not np.isnan(tickets) else 0)/10)
            z += 0.15*np.tanh((tenure if not np.isnan(tenure) else 0)/60)
            z += 0.2*((expansion if not np.isnan(expansion) else 0))
            z += 0.1 if product_interest in products_owned else 0.0

        p_fit = 1/(1+np.exp(-z))
        fit_label = int(np.random.binomial(1, p_fit))

        product_aff = (product_interest if (g==1 or not products_owned) else
                       (product_interest if (product_interest in products_owned and np.random.rand()<0.6)
                        else random.choice(products_owned)))

        rows.append(dict(
            lead_id=f"LEAD-{i:06d}", created_at=lead_dates[i], group=g,
            region=region, industry=industry, product_interest=product_interest,
            lead_source=source, campaign_type=campaign, seniority=senior,
            job_title=title, company_size=size,
            site_pages=site_pages, email_opens=email_opens, email_clicks=email_clicks,
            trial_started=trial_started, webinar_attended=webinar_attended, domain_available=domain_available,
            existing_customer=existing, products_owned=";".join(products_owned),
            seats=seats, seat_utilization=seat_util, renewal_days=renewal_days,
            support_tickets_90d=tickets, csat=csat, tenure_months=tenure, arr_k=arr_k, expansion_6mo=expansion,
            fit_label=fit_label, product_affinity=product_aff,
            segment_key=f"{industry}|{region}|{product_aff}",
        ))
    return pd.DataFrame(rows).sort_values("created_at").reset_index(drop=True)



In [14]:
df = gen_synth(N=8000)
print (df.head().shape)
df.head()

(5, 30)


Unnamed: 0,lead_id,created_at,group,region,industry,product_interest,lead_source,campaign_type,seniority,job_title,company_size,site_pages,email_opens,email_clicks,trial_started,webinar_attended,domain_available,existing_customer,products_owned,seats,seat_utilization,renewal_days,support_tickets_90d,csat,tenure_months,arr_k,expansion_6mo,fit_label,product_affinity,segment_key
0,LEAD-007463,2024-09-09,2,AMER,Manufacturing,AutoCAD,Partner,Trial,Individual,Architect,201-1000,5,2,2,0,0,1,1,Maya;AutoCAD,20.0,0.493686,165.0,4.0,0.895185,23.0,16.0,0.0,1,AutoCAD,Manufacturing|AMER|AutoCAD
1,LEAD-005747,2024-09-09,2,AMER,Media_Entertainment,Fusion,Paid_Social,Trial,Individual,Product Designer,51-200,4,2,1,1,0,1,1,Fusion;Maya,28.0,0.620851,91.0,0.0,0.63301,19.0,40.0,1.0,0,Fusion,Media_Entertainment|AMER|Fusion
2,LEAD-006986,2024-09-09,1,AMER,Manufacturing,Revit,Website,Trial,Individual,Product Designer,201-1000,7,2,0,1,0,1,0,,,,,,,,,,1,Revit,Manufacturing|AMER|Revit
3,LEAD-004137,2024-09-10,2,EMEA,Manufacturing,AutoCAD,Email,Nurture,Director,Design Engineer,201-1000,4,2,0,0,0,1,1,3dsMax,23.0,0.531158,-23.0,11.0,0.621102,54.0,28.0,0.0,1,3dsMax,Manufacturing|EMEA|3dsMax
4,LEAD-004426,2024-09-10,1,AMER,Manufacturing,Fusion,Website,Nurture,Individual,Procurement Manager,201-1000,1,0,0,0,0,1,0,,,,,,,,,,1,Fusion,Manufacturing|AMER|Fusion


# 2) Vectorization (TF-IDF, One-Hot, Scaled numerics)

In [27]:
# %% [code]
from sklearn.pipeline import Pipeline as SKPipeline

TEXT_COL = "job_title"
CAT_COLS = ["region","industry","product_interest","lead_source","campaign_type","seniority","company_size","product_affinity"]
NUM_COLS = ["site_pages","email_opens","email_clicks","trial_started","webinar_attended","domain_available","existing_customer",
            "seats","seat_utilization","renewal_days","support_tickets_90d","csat","tenure_months","arr_k","expansion_6mo"]

cat_ohe = SKPipeline([("imp", SimpleImputer(strategy="most_frequent")),
                      ("oh", OneHotEncoder(handle_unknown="ignore"))])

num_proc = SKPipeline([("imp", SimpleImputer(strategy="median")),
                       ("sc", StandardScaler(with_mean=False))])

full = ColumnTransformer([
    ("txt", TfidfVectorizer(min_df=5, ngram_range=(1,2)), TEXT_COL),
    ("cat", cat_ohe, CAT_COLS),
    ("num", num_proc, NUM_COLS),
], sparse_threshold=0.3)

# Time split (simulating train -> future test)
df = df.sort_values("created_at").reset_index(drop=True)
cut = int(0.8*len(df))
train_df = df.iloc[:cut].copy().reset_index(drop=True)
test_df  = df.iloc[cut:].copy().reset_index(drop=True)

X_train = full.fit_transform(train_df)
X_test  = full.transform(test_df)
y_train = train_df["fit_label"].values.astype(np.float32)
y_test  = test_df["fit_label"].values.astype(np.float32)

# densify
X_train = X_train.toarray() if hasattr(X_train, "toarray") else X_train
X_test  = X_test.toarray() if hasattr(X_test, "toarray") else X_test

print (X_train.shape, y_train.shape)

print (X_test.shape, y_test.shape)

(6400, 87) (6400,)
(1600, 87) (1600,)


## Task index (each task = a segment)

In [13]:
# # %% [code]
# from collections import defaultdict

# def build_task_index_pos(frame: pd.DataFrame, min_per_class=20):
#     # store POSITIONAL indices, use .iloc later
#     tasks = defaultdict(list)
#     ys = frame["fit_label"].values
#     for pos, seg in enumerate(frame["segment_key"].tolist()):
#         tasks[seg].append(pos)
#     good = {}
#     for seg, pos_list in tasks.items():
#         y = ys[pos_list]
#         if (y.sum() >= min_per_class) and ((len(y)-y.sum()) >= min_per_class):
#             good[seg] = pos_list
#     return good

# train_tasks = build_task_index_pos(train_df, min_per_class=15)
# test_tasks  = build_task_index_pos(test_df , min_per_class=10)
# len(train_tasks), len(test_tasks)

from collections import defaultdict
import numpy as np

def build_task_index_pos(
    frame,                      # a DataFrame (train_df OR test_df)
    target_col="fit_label",     # the label you meta-learn on
    min_per_class=20            # require at least this many 0s and 1s in the segment
):
    # Use POSITIONAL indices (0..len(frame)-1) so we can slice with .iloc and align with X arrays.
    tasks = defaultdict(list)
    segs = frame["segment_key"].tolist()
    for pos, seg in enumerate(segs):
        tasks[seg].append(pos)

    # Filter: keep only segments with enough positives and negatives
    y = frame[target_col].values
    good = {}
    for seg, pos_list in tasks.items():
        ys = y[pos_list]
        pos_count = int(ys.sum())
        neg_count = int(len(ys) - pos_count)
        if pos_count >= min_per_class and neg_count >= min_per_class:
            good[seg] = pos_list
    return good


train_tasks = build_task_index_pos(train_df, min_per_class=15)
test_tasks  = build_task_index_pos(test_df , min_per_class=10)
len(train_tasks), len(test_tasks)   

# Example:
# train_tasks = build_task_index_pos(train_df, target_col="fit_label", min_per_class=15)
# test_tasks  = build_task_index_pos(test_df , target_col="fit_label", min_per_class=10)

(55, 27)

In [28]:
len(train_tasks["AEC|AMER|AutoCAD"])

278

In [33]:
train_df.head()

Unnamed: 0,lead_id,created_at,group,region,industry,product_interest,lead_source,campaign_type,seniority,job_title,company_size,site_pages,email_opens,email_clicks,trial_started,webinar_attended,domain_available,existing_customer,products_owned,seats,seat_utilization,renewal_days,support_tickets_90d,csat,tenure_months,arr_k,expansion_6mo,fit_label,product_affinity,segment_key
0,LEAD-007463,2024-09-09,2,AMER,Manufacturing,AutoCAD,Partner,Trial,Individual,Architect,201-1000,5,2,2,0,0,1,1,Maya;AutoCAD,20.0,0.493686,165.0,4.0,0.895185,23.0,16.0,0.0,1,AutoCAD,Manufacturing|AMER|AutoCAD
1,LEAD-005747,2024-09-09,2,AMER,Media_Entertainment,Fusion,Paid_Social,Trial,Individual,Product Designer,51-200,4,2,1,1,0,1,1,Fusion;Maya,28.0,0.620851,91.0,0.0,0.63301,19.0,40.0,1.0,0,Fusion,Media_Entertainment|AMER|Fusion
2,LEAD-006986,2024-09-09,1,AMER,Manufacturing,Revit,Website,Trial,Individual,Product Designer,201-1000,7,2,0,1,0,1,0,,,,,,,,,,1,Revit,Manufacturing|AMER|Revit
3,LEAD-004137,2024-09-10,2,EMEA,Manufacturing,AutoCAD,Email,Nurture,Director,Design Engineer,201-1000,4,2,0,0,0,1,1,3dsMax,23.0,0.531158,-23.0,11.0,0.621102,54.0,28.0,0.0,1,3dsMax,Manufacturing|EMEA|3dsMax
4,LEAD-004426,2024-09-10,1,AMER,Manufacturing,Fusion,Website,Nurture,Individual,Procurement Manager,201-1000,1,0,0,0,0,1,0,,,,,,,,,,1,Fusion,Manufacturing|AMER|Fusion


## 3) First-Order MAML (binary fit_label)

In [32]:
# %% [code]
class FunctionalMLP(nn.Module):
    def __init__(self, in_dim, hidden=128):
        super().__init__()
        # Meta-parameters
        self.fc1_w = nn.Parameter(torch.randn(hidden, in_dim)*0.02)
        self.fc1_b = nn.Parameter(torch.zeros(hidden))
        self.fc2_w = nn.Parameter(torch.randn(1, hidden)*0.02)
        self.fc2_b = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        h = F.linear(x, self.fc1_w, self.fc1_b); h = F.relu(h)
        out = F.linear(h, self.fc2_w, self.fc2_b).squeeze(-1)
        return out  # logit

    def functional_forward(self, x, params):
        h = F.linear(x, params["fc1_w"], params["fc1_b"]); h = F.relu(h)
        out = F.linear(h, params["fc2_w"], params["fc2_b"]).squeeze(-1)
        return out

    def named_params_dict(self):
        return {"fc1_w": self.fc1_w, "fc1_b": self.fc1_b, "fc2_w": self.fc2_w, "fc2_b": self.fc2_b}

def inner_adapt(model, params, x_s, y_s, inner_lr=0.05, steps=1):
    # First-order MAML (no second-order graph)
    for _ in range(steps):
        logits = model.functional_forward(x_s, params)
        loss = F.binary_cross_entropy_with_logits(logits, y_s)
        grads = torch.autograd.grad(loss, params.values(), create_graph=False)
        
        # Parameter update (task-specific)
        params = {k: p - inner_lr*g for (k,p), g in zip(params.items(), grads)}
    return params

def sample_support_query_pos(frame, idxs, k_shot=10, q_size=30):
    y = frame["fit_label"].values
    pos = [i for i in idxs if y[i]==1]
    neg = [i for i in idxs if y[i]==0]
    if len(pos)<k_shot or len(neg)<k_shot: return None

    # k-shot per class for support
    pos_s = np.random.choice(pos, k_shot, replace=False)
    neg_s = np.random.choice(neg, k_shot, replace=False)

    # remaining pool for query
    remain = list(set(idxs) - set(pos_s) - set(neg_s))
    y_rem = y[remain]
    pos_r = [remain[i] for i in range(len(y_rem)) if y_rem[i]==1]
    neg_r = [remain[i] for i in range(len(y_rem)) if y_rem[i]==0]
    if len(pos_r)<q_size or len(neg_r)<q_size: return None
        
    pos_q = np.random.choice(pos_r, q_size, replace=False)
    neg_q = np.random.choice(neg_r, q_size, replace=False)
    
    # return np.concatenate([pos_s, neg_s]), np.concatenate([pos_q, neg_q])
    support = np.concatenate([pos_s, neg_s])
    query   = np.concatenate([pos_q, neg_q])
    return support, query

# tensors
# X_train_t: torch.tensor of your vectorized train features
# y_train_t: torch.tensor of your train labels
X_train_t = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_t = torch.tensor(y_train, dtype=torch.float32).to(device)

model = FunctionalMLP(in_dim=X_train.shape[1], hidden=128).to(device)
meta_opt = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)

meta_iters = 40          # adjust up for better performance
tasks_per_batch = 8
k_shot = 12
q_size = 24

for it in range(1, meta_iters+1):
    meta_opt.zero_grad()
    keys = np.random.choice(list(train_tasks.keys()), min(tasks_per_batch, len(train_tasks)), replace=False)
    used = 0; tot_loss = 0.0
    for key in keys:
        pair = sample_support_query_pos(train_df, train_tasks[key], k_shot=k_shot, q_size=q_size)
        if pair is None: 
            continue
        s_idx, q_idx = pair
        x_s = X_train_t[s_idx]; y_s = y_train_t[s_idx]
        x_q = X_train_t[q_idx]; y_q = y_train_t[q_idx]
        params = {k: v.clone() for k,v in model.named_params_dict().items()}
        params = inner_adapt(model, params, x_s, y_s, inner_lr=0.05, steps=1)
        q_logits = model.functional_forward(x_q, params)
        q_loss = F.binary_cross_entropy_with_logits(q_logits, y_q)
        q_loss.backward()
        tot_loss += q_loss.item(); used += 1
    meta_opt.step()
    if it % 10 == 0:
        print(f"[MAML] iter {it:03d} | tasks {used} | avg query loss {tot_loss/max(1,used):.4f}")

[MAML] iter 010 | tasks 5 | avg query loss 0.6869
[MAML] iter 020 | tasks 4 | avg query loss 0.6831
[MAML] iter 030 | tasks 5 | avg query loss 0.6644
[MAML] iter 040 | tasks 6 | avg query loss 0.6724


## 4) Few-shot adaptation & prediction on held-out task

In [34]:
# %% [code]
X_test_t = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_t = torch.tensor(y_test, dtype=torch.float32).to(device)

# choose a held-out test segment
test_key = list(test_tasks.keys())[0]
idxs = test_tasks[test_key]

def test_support_query_pos(frame, idxs, k=10, q=200):
    y = frame["fit_label"].values
    pos = [i for i in idxs if y[i]==1]
    neg = [i for i in idxs if y[i]==0]
    k = min(k, len(pos), len(neg))
    pos_s = np.random.choice(pos, k, replace=False)
    neg_s = np.random.choice(neg, k, replace=False)
    support = np.concatenate([pos_s, neg_s])
    remain = list(set(idxs) - set(support))
    q = min(q, len(remain))
    query = np.random.choice(list(remain), q, replace=False)
    return support, query

s_idx, q_idx = test_support_query_pos(test_df, idxs, k=10, q=200)
x_s = X_test_t[s_idx]; y_s = y_test_t[s_idx]
x_q = X_test_t[q_idx]; y_q = y_test_t[q_idx]

# base (no adaptation)
with torch.no_grad():
    base_probs = torch.sigmoid(model.forward(x_q)).cpu().numpy()
    base_auc = roc_auc_score(y_q.cpu().numpy(), base_probs)

# after 1 inner step on support
params = {k: v.clone() for k,v in model.named_params_dict().items()}
params = inner_adapt(model, params, x_s, y_s, inner_lr=0.05, steps=1)
with torch.no_grad():
    adap_probs = torch.sigmoid(model.functional_forward(x_q, params)).cpu().numpy()
    adap_auc = roc_auc_score(y_q.cpu().numpy(), adap_probs)

print("Held-out segment:", test_key)
print(f"AUC (no adaptation):  {base_auc:.3f}")
print(f"AUC (after 1 step):   {adap_auc:.3f}")

Held-out segment: Manufacturing|AMER|Fusion
AUC (no adaptation):  0.595
AUC (after 1 step):   0.590


## Predict on new leads with MAML (given few labeled support)

In [35]:
# %% [code]
def predict_maml(new_df: pd.DataFrame, support_df: pd.DataFrame, inner_lr=0.05, steps=1):
    # Fit/transform using the global 'full' vectorizer (already fitted)
    assert "fit_label" in support_df.columns, "support_df must include 'fit_label' labels"
    X_sup = full.transform(support_df).toarray()
    X_new = full.transform(new_df).toarray()
    x_s = torch.tensor(X_sup, dtype=torch.float32).to(device)
    y_s = torch.tensor(support_df["fit_label"].values.astype(np.float32), dtype=torch.float32).to(device)
    x_n = torch.tensor(X_new, dtype=torch.float32).to(device)

    # adapt the meta-parameters on the support labels
    params = {k: v.clone() for k,v in model.named_params_dict().items()}
    params = inner_adapt(model, params, x_s, y_s, inner_lr=inner_lr, steps=steps)

    with torch.no_grad():
        probs = torch.sigmoid(model.functional_forward(x_n, params)).cpu().numpy()

    out = new_df.copy()
    out["maml_fit_prob"] = probs
    return out

# Example: take support from the held-out segment; predict on 5 new leads of the same segment
support_df = test_df.iloc[s_idx].copy()
new_pool = test_df.iloc[q_idx].sample(5, random_state=0).drop(columns=["fit_label"])
predict_maml(new_pool, support_df, inner_lr=0.05, steps=1)[["segment_key","maml_fit_prob"]]

Unnamed: 0,segment_key,maml_fit_prob
759,AEC|EMEA|Revit,0.465168
44,AEC|AMER|Revit,0.569308
628,AEC|AMER|Fusion,0.544458
601,Media_Entertainment|AMER|3dsMax,0.615876
748,Manufacturing|APAC|Revit,0.452017
