In [386]:
from dotenv import load_dotenv
load_dotenv()

import logging
from pathlib import Path
import json, re, random, math
from datetime import datetime
from typing import List, Dict, Any
import os
import re

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

import numpy as np
import pandas as pd
import tiktoken
import retry

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import roc_auc_score

from openai import OpenAI
from contextualbandits.online import LinUCB

SEED = 42
random.seed(SEED)
np.random.seed(SEED)

DATA_PATH = "artifacts/data/sanitized-sample.json"
PREPROCESSED_DATA_PATH = "artifacts/data/preprocessed-sanitized-sample.json"

CACHE_PATH = "artifacts/cache/"
FEATURES_CSV_PATH = "artifacts/features.csv"
FEATURES_JSON_PATH = "artifacts/features.json"
COACHING_CARDS_JSON = "artifacts/coaching_next.json"
REPORT_JSON_PATH = "artifacts/report.json"

EXTRACT_LLM_FEATURES_SYSTEM_PROMPT_PATH = "artifacts/system_prompts/extract_llm_features.txt"
COACHING_CARD_GENERATION_SYSTEM_PROMPT_PATH = "artifacts/system_prompts/generate_coaching_card.txt"

ACTIONS = ["clarity", "active_listening", "call_to_action", "friendliness"]

DEFAULT_LINUCB_ALPHA = 0.4
DEFAULT_SKILL_WEIGHT = 0.6
DEFAULT_OVERALL_WEIGHT = 0.4

DEFAULT_LLM_MODEL = "gpt-4.1-nano"
DEFAULT_TEMPERATURE = 0.01
FORCE_OVERWRITE_CACHE = False


In [387]:
# Utils

def preprocess_dataset(input_path: str=DATA_PATH, output_path: str=PREPROCESSED_DATA_PATH) -> None:
    with open(input_path, "r", encoding="utf-8-sig") as f:
        dataset = json.load(f)
    dataset = sorted(dataset, key=lambda x: parse_timestamp(x.get("start_time","1970-01-01T00:00:00Z")))

    mapping = {
        "clarity and enthusiasm in pitch":"clarity",
        "active listening and objection handling":"active_listening",
        "effective call to action":"call_to_action",
        "friendliness and respectful tone":"friendliness"
    }
    for i in range(len(dataset)):
        session = dataset[i]
        criteria = session.get("assessment_data",{}).get("criteria",{})
        for k,v in mapping.items():
            # score = criteria.get(k,{}).get("score", np.nan)
            session["assessment_data"]["criteria"][v] = criteria.get(k,{})
            session["assessment_data"]["criteria"].pop(k, None)
    with open(output_path, "w") as f:
        json.dump(dataset, f, indent=2)


def parse_timestamp(ts: str) -> datetime:
    try:
        return datetime.fromisoformat(ts.replace("Z","+00:00"))
    except Exception as e:
        logger.warning(f"Failed to parse timestamp {ts}: {e}")
        return datetime.strptime(ts.split(".")[0], "%Y-%m-%dT%H:%M:%S")

def parse_transcript_to_string(transcript: List[Dict[str,str]]) -> str:
    transcript_str = ""
    for t in transcript:
        role = t.get("type","").lower()
        if "agent_ai" in role:
            prefix = "AI Agent: "
        elif "user" in role:
            prefix = "User: "
        else:
            prefix = "Unknown: "
            logger.warning(f"Unknown role in transcript: {role}")
        transcript_str += f"{prefix}{t.get('data','')}\n"
    return transcript_str

def extract_json(text: str) -> Any:
    json_pattern = r"```json(.*?)```"
    match = re.search(json_pattern, text, re.DOTALL)
    if match:
        json_str = match.group(1).strip()
        try:
            return json.loads(json_str)
        except json.JSONDecodeError as e:
            logger.error(f"JSON decoding error: {e}")
            return {}
        
def count_tokens(text: str) -> int:
    encoding = tiktoken.encoding_for_model("gpt-4.1-nano")
    tokens = encoding.encode(text)
    return len(tokens)

def read_txt(file_path: str) -> str:
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()

In [388]:

class OpenAILLM:
    def __init__(self, model: str=DEFAULT_LLM_MODEL, temperature: float=DEFAULT_TEMPERATURE, cache_path: str=CACHE_PATH):
        self.client = OpenAI()
        self.model = model
        self.temperature = temperature
        self.cache = self.load_cache(cache_path=cache_path)

    def load_cache(self, cache_path: str) -> Dict[str, Any]:
        Path(cache_path).mkdir(parents=True, exist_ok=True)
        cache_file_path = os.path.join(cache_path, f"{self.model.replace('/', '__')}.json")
        if os.path.exists(cache_file_path):
            with open(cache_file_path, "r") as f:
                cache_json = json.load(f)
            cache_map = {}
            for item in cache_json:
                cache_map[item["prompt_hash"]] = item
            return cache_map
        return {}
    
    def update_cache(self, prompt_hash: str, prompt: str, response: str, cache_path: str=CACHE_PATH):
        self.cache[prompt_hash] = {
            "prompt_hash": prompt_hash,
            "prompt": prompt,
            "response": response
        }
        cache_file_path = os.path.join(cache_path, f"{self.model.replace('/', '__')}.json")
        Path(cache_path).mkdir(parents=True, exist_ok=True)
        with open(cache_file_path, "w") as f:
            json.dump(list(self.cache.values()), f, indent=2, ensure_ascii=False)
            
    @retry.retry(tries=3, delay=10)
    def _chat(self, messages: List[Dict[str, str]], force_overwrite_cache: bool=False) -> str:
        try:
            prompt_str = "\n".join([f"{m['role']}: {m['content']}" for m in messages])
            prompt_hash = str(hash(prompt_str))
            if not force_overwrite_cache and prompt_hash in self.cache:
                logger.info("Cache hit for prompt.")
                return self.cache[prompt_hash].get("response")
            completion = self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                temperature=self.temperature
            )
            response = completion.choices[0].message.content.strip()
            self.update_cache(prompt_hash, prompt_str, response)
            if not force_overwrite_cache:
                logger.info("Cache missed, updated with new prompt.")
            else:
                logger.info("Force overwrite cache, updated with new prompt.")
            return response
        except Exception as e:
            logger.error(f"Error in OpenAI chat completion: {e}")
            raise e

    def chat(self, messages: List[Dict[str, str]], force_overwrite_cache: bool=False) -> str:
        try:
            if messages is None or len(messages) == 0:
                raise ValueError("Messages cannot be empty.")
            return self._chat(messages, force_overwrite_cache)
        except Exception as e:
            logger.error(f"Error occurred in chat: {e}")
            raise e


In [389]:

class FeatureExtractor:
    def __init__(self, llm: OpenAILLM=None):
        self.llm = llm
        
    def get_rubric_features(self, session: dict) -> dict:
        feats = {}
        overall = session.get("assessment_data",{}).get("overall",{}).get("score", np.nan)
        feats["overall"] = overall/100 if overall is not None else np.nan
        criteria = session.get("assessment_data",{}).get("criteria",{})
        for k in ACTIONS:
            score = criteria.get(k,{}).get("score", np.nan)
            feats[k] = score/100 if score is not None else np.nan
        return feats

    def extract_baseline_features(self, session: dict) -> dict:
        feats = self.get_rubric_features(session)

        transcript = session.get("transcript",[])
        agent_messages = [t for t in transcript if "agent_ai" in t.get("type","").lower()]
        user_messages = [t for t in transcript if "user" in t.get("type","").lower()]
        agent_tokens = sum(count_tokens(t.get("data","")) for t in agent_messages)
        user_tokens = sum(count_tokens(t.get("data","")) for t in user_messages)
        feats["talk_ratio"] = agent_tokens / (agent_tokens + user_tokens)

        # latency
        latencies = []
        for i in range(len(transcript)-1):
            t1 = transcript[i]
            t2 = transcript[i+1]
            if "timestamp" in t1 and "timestamp" in t2:
                ts1 = parse_timestamp(t1["timestamp"])
                ts2 = parse_timestamp(t2["timestamp"])
                delta = (ts2 - ts1).total_seconds()
                latencies.append(delta)
            else:
                logger.warning(f"Missing timestamp in transcript entries: {t1}, {t2}")

        feats["latency"] = np.mean(latencies) if latencies else 0.0
        feats["duration"] = float(session.get("duration",0))
        return feats
    
    def extract_llm_features(self, session: dict) -> dict:
        transcript = session.get("transcript",[])
        system_prompt = read_txt(EXTRACT_LLM_FEATURES_SYSTEM_PROMPT_PATH)
        transcript_str = parse_transcript_to_string(transcript)
        user_prompt = f"""Here is the dialogue between an AI agent and a user:
{transcript_str}

Please provide your evaluation based on the criteria mentioned above in JSON format in JSON block (```json ... ```).
"""
        messages = [
            {"role":"system", "content": system_prompt},
            {"role":"user", "content": user_prompt}
        ]
        response = self.llm.chat(messages, force_overwrite_cache=FORCE_OVERWRITE_CACHE)
        try:
            feats = extract_json(response)
            if not feats:
                raise ValueError("Empty JSON extracted")
            for k in feats:
                feats[k] = feats[k]/100 if isinstance(feats[k], (int,float)) else 0.0
        except Exception as e:
            logger.warning(f"Failed to parse LLM response: {e}")
            feats = {
                "objection_monitoring": 0.0,
                "cta_explicitness": 0.0,
                "empathy_markers": 0.0,
                "you_we_orientation": 0.0,
                "collaborative_tone": 0.0
            }
        question_count = self.get_question_ratio(session)
        feats["question_ratio"] = question_count
        return feats
    
    def get_question_ratio(self, session: dict) -> dict:
        transcript = session.get("transcript",[])
        agent_messages = [t for t in transcript if "agent_ai" in t.get("type","").lower()]
        non_question_messages = [t for t in agent_messages if not t.get("data","").strip().endswith("?")]
        non_question_messages_str = "\n".join(f"{i+1}. {t.get('data','')}" for i, t in enumerate(non_question_messages))
        system_prompt = "You are a helpful assistant."
        user_prompt = f"""Here are some messages from an AI agent, these messages do NOT end with a question mark, please count how many of them are questions in nature (i.e. they are trying to elicit a response from the user):
{non_question_messages_str}

Please provide your answer as an integer number only.
"""
        messages = [
            {"role":"system", "content": system_prompt},
            {"role":"user", "content": user_prompt}
        ]
        response = self.llm.chat(messages, force_overwrite_cache=FORCE_OVERWRITE_CACHE)
        try:
            question_count = int(response.strip())
        except Exception as e:
            logger.warning(f"Failed to parse question count from LLM response: {response}")
            question_count = 0
        question_count += len(agent_messages) - len(non_question_messages)
        total = len(agent_messages)
        return question_count / total if total > 0 else 0.0


In [390]:

class CoachingCardGenerator:
    def __init__(self, llm: OpenAILLM):
        self.llm = llm

    def generate(self, focus: str) -> dict:
        system_prompt = read_txt(COACHING_CARD_GENERATION_SYSTEM_PROMPT_PATH)
        user_prompt = f"Given the focus skill: '{focus}'\nPlease generate a coaching card as per the above criteria in JSON format in JSON block (```json ... ```)."
        messages = [
            {"role":"system", "content": system_prompt},
            {"role":"user", "content": user_prompt}
        ]
        response = self.llm.chat(messages, force_overwrite_cache=FORCE_OVERWRITE_CACHE)
        
        try:
            coaching_card = {
                "focus": focus
            }
            coaching_card.update(extract_json(response))
        except Exception:
            logger.warning(f"Failed to parse coaching card from LLM response. Use fallback response.\nLLM response: {response}")
            coaching_card = {
                "focus": focus,
                "why": "Practice this skill to improve performance.",
                "exercises": ["Exercise 1","Exercise 2","Exercise 3"],
                "scenario_stub": {
                    "persona": "Generic friend",
                    "opening": "Why should I try this?",
                    "followups": ["Is it worth my time?","What’s fun about it?"]
                },
                "difficulty_upgrades": [
                    "Increase pushback on objections.",
                    "Reduce time to respond with clarity."
                ]
            }
        return coaching_card


In [391]:

class AdaptiveLearningEngine:
    def __init__(self, feature_extractor: FeatureExtractor, coaching_card_generator: CoachingCardGenerator, data_path: str=DATA_PATH):
        self.feature_extractor = feature_extractor
        self.coaching_card_generator = coaching_card_generator
        self.data_path = data_path

    def load_data(self, path=DATA_PATH):
        with open(path,"r",encoding="utf-8-sig") as f:
            return json.load(f)

    def compute_deltas(self, curr, nxt, skill: str):
        a = curr.get("assessment_data",{}).get("criteria",{}).get(skill,{}).get("score")
        b = nxt.get("assessment_data",{}).get("criteria",{}).get(skill,{}).get("score")
        if a is not None and b is not None and not np.isnan(a) and not np.isnan(b):
            return (b-a)/100.0
        logger.warning(f"Missing scores for skill '{skill}': curr={a}, next={b}")
        return 0.0

    def run(self):
        with open(self.data_path,"r",encoding="utf-8-sig") as f:
            data = json.load(f)
            
        rows, baseline_vecs, full_vecs, labels = [], [], [], []

        for i in range(len(data)-1):
            curr, nxt = data[i], data[i+1]
            session_id = curr.get("session_id", str(i))
            next_session_id = nxt.get("session_id", str(i+1))

            baseline_feats = self.feature_extractor.extract_baseline_features(curr)
            llm_feats = self.feature_extractor.extract_llm_features(curr)
            feats = {}
            feats.update(baseline_feats)
            feats.update(llm_feats)
            rows.append({"session_id": session_id, **feats})

            # baseline only vector
            base_vec = np.array(list(baseline_feats.values()), dtype=float)
            baseline_vecs.append(base_vec)

            # full vector (baseline + llm features)
            full_vec = np.array(list(feats.values()), dtype=float)
            full_vecs.append(full_vec)

            # delta overall
            ov_curr = curr.get("assessment_data",{}).get("overall",{}).get("score", np.nan)
            ov_next = nxt.get("assessment_data",{}).get("overall",{}).get("score", np.nan)
            delta_overall = 0.0 if (np.isnan(ov_curr) or np.isnan(ov_next)) else (ov_next-ov_curr)/100.0

            labels.append(1 if delta_overall>0 else 0)

        # save features csv + json
        pd.DataFrame(rows).to_csv(FEATURES_CSV_PATH,index=False)
        with open(FEATURES_JSON_PATH, "w") as f:
            json.dump(rows,f,indent=2,ensure_ascii=False)

        X_base, X_full = np.vstack(baseline_vecs), np.vstack(full_vecs)
        y = np.array(labels)     

        loo = LeaveOneOut()
        preds_base, preds_full, true = [], [], []
        for train, test in loo.split(X_base):
            # baseline
            baseline_scaler = MinMaxScaler()
            X_base[train] = baseline_scaler.fit_transform(X_base[train])
            X_base[test] = baseline_scaler.transform(X_base[test])
            clf_baseline = LogisticRegression(random_state=SEED)
            clf_baseline.fit(X_base[train], y[train])
            prob_b = clf_baseline.predict_proba(X_base[test])[:,1][0]
            preds_base.append(prob_b)

            # baseline+LLM
            full_scaler = MinMaxScaler()
            X_full[train] = full_scaler.fit_transform(X_full[train])
            X_full[test] = full_scaler.transform(X_full[test])
            clf_full = LogisticRegression(random_state=SEED)
            clf_full.fit(X_full[train], y[train])
            prob_f = clf_full.predict_proba(X_full[test])[:,1][0]
            preds_full.append(prob_f)
            true.append(y[test][0])

        auc_b = roc_auc_score(true, preds_base) if len(set(true))>1 else float("nan")
        auc_f = roc_auc_score(true, preds_full) if len(set(true))>1 else float("nan")

        linucb = LinUCB(nchoices=len(ACTIONS), alpha=DEFAULT_LINUCB_ALPHA, random_state=SEED)
        
        rewards_rl = []
        rewards_weak = []
        chosen_actions_rl = []
        coaching_cards = []
        for i in range(len(data)-1):
            curr, nxt = data[i], data[i+1]
            session_id = curr.get("session_id", str(i))
            next_session_id = nxt.get("session_id", str(i+1))
            ov_curr = curr.get("assessment_data",{}).get("overall",{}).get("score", np.nan)
            ov_next = nxt.get("assessment_data",{}).get("overall",{}).get("score", np.nan)
            delta_overall = 0.0 if (np.isnan(ov_curr) or np.isnan(ov_next)) else (ov_next-ov_curr)/100.0
            
            baseline_feats = self.feature_extractor.extract_baseline_features(curr)
            llm_feats = self.feature_extractor.extract_llm_features(curr)
            feats = {}
            feats.update(baseline_feats)
            feats.update(llm_feats)
            full_vec = np.array(list(feats.values()), dtype=float)
            
            # choose next action rl
            chosen_action_rl_index = linucb.predict(full_vec.reshape(1, -1))[0]
            chosen_action_rl = ACTIONS[chosen_action_rl_index]
            chosen_actions_rl.append(chosen_action_rl)
            delta_skill_rl = self.compute_deltas(curr,nxt, chosen_action_rl)
            rewards_rl.append(DEFAULT_SKILL_WEIGHT*delta_skill_rl + DEFAULT_OVERALL_WEIGHT*delta_overall)

            # generate coaching card
            coaching_card = self.coaching_card_generator.generate(chosen_action_rl)
            coaching_cards.append({"step": i, "session_id": session_id, "next_session_id": next_session_id, **coaching_card})
            
            # choose next action weak
            chosen_action_weak_index = np.argmin([baseline_feats.get(a,0.0) for a in ACTIONS])
            chosen_action_weak = ACTIONS[chosen_action_weak_index]
            delta_skill_weak = self.compute_deltas(curr,nxt, chosen_action_weak)
            rewards_weak.append(DEFAULT_SKILL_WEIGHT*delta_skill_weak + DEFAULT_OVERALL_WEIGHT*delta_overall)
            
            # train linucb for next step with all actions
            X_full = np.tile(full_vec, (len(ACTIONS),1))

            actions = []
            actions_reward = []
            for action_index, action in enumerate(ACTIONS):
                actions.append(action_index)
                delta_skill = self.compute_deltas(curr,nxt, action)
                actions_reward.append(DEFAULT_SKILL_WEIGHT*delta_skill + DEFAULT_OVERALL_WEIGHT*delta_overall)

            actions = np.array(actions)
            actions_reward = np.array(actions_reward)
            # print(X_full.shape, actions.shape, actions_reward.shape)
            linucb.partial_fit(X_full, actions, actions_reward)
    
        avg_lin, avg_weak = np.mean(rewards_rl), np.mean(rewards_weak)
            
        with open(COACHING_CARDS_JSON,"w") as f: 
            json.dump(coaching_cards,f,indent=2)

        report = {
            "sessions": len(data),
            "steps": len(data)-1,
            "policy": {
                "linucb_alpha": DEFAULT_LINUCB_ALPHA,
                "skill_weight": DEFAULT_SKILL_WEIGHT,
                "overall_weight": DEFAULT_OVERALL_WEIGHT
            },
            "results": {
                "rl_linucb_mean_reward": float(avg_lin),
                "weakest_mean_reward": float(avg_weak)
            },
            "feature_ablation": {
                "baseline_auc": float(auc_b), "full_auc": float(auc_f)
            }
        }
        with open(REPORT_JSON_PATH,"w") as f: 
            json.dump(report,f,indent=2)

        print("-------- REPORT ---------")
        print(json.dumps(report, indent=2, ensure_ascii=False))


In [392]:

if __name__=="__main__":
    preprocess_dataset(DATA_PATH, PREPROCESSED_DATA_PATH)
    llm = OpenAILLM(model=DEFAULT_LLM_MODEL, temperature=DEFAULT_TEMPERATURE)
    feature_extractor = FeatureExtractor(llm)
    coaching_card_generator = CoachingCardGenerator(llm)
    engine = AdaptiveLearningEngine(feature_extractor, coaching_card_generator, PREPROCESSED_DATA_PATH)
    engine.run()


INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.


INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for prompt.
INFO:__main__:Cache hit for 

-------- REPORT ---------
{
  "sessions": 19,
  "steps": 18,
  "policy": {
    "linucb_alpha": 0.4,
    "skill_weight": 0.6,
    "overall_weight": 0.4
  },
  "results": {
    "rl_linucb_mean_reward": -0.02061111111111111,
    "weakest_mean_reward": 0.026388888888888882
  },
  "feature_ablation": {
    "baseline_auc": 0.3538461538461538,
    "full_auc": 0.5076923076923077
  }
}
