In [None]:

# "mistralai/Mistral-7B-Instruct-v0.2"
# "deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free"
# "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"
# "Qwen/Qwen2-VL-72B-Instruct"
# "scb10x/scb10x-llama3-typhoon-v1-5x-4f316"

In [None]:
import os
import pandas as pd
import json
import concurrent.futures
import re
from together import Together
from collections import Counter

class JudgeLLM:
    """Judge LLM class to evaluate Shark-Pitch interactions using Together AI's multi-LLM API.

    A panel of chat models first rates the Shark LLM's final offer independently,
    then each model is shown every initial answer and asked for a refined rating.
    The refined ratings are averaged into one consensus score.

    Attributes:
        api_key: Key passed to the Together client (argument, else the
            ``TOGETHER_API_KEY`` environment variable).
        client: Together client used for all chat-completion calls.
        models: List of model identifiers that form the panel.
    """

    def __init__(self, api_key=None, models=None):
        """Initialize the Judge LLM with multiple models using Together AI.

        Args:
            api_key: Together API key; falls back to ``TOGETHER_API_KEY`` when falsy.
            models: Iterable of model identifiers, a single identifier string,
                or None for an empty panel.
        """
        self.api_key = api_key or os.getenv('TOGETHER_API_KEY')
        self.client = Together(api_key=self.api_key)
        # Normalise to a list: None would make the debate loop raise TypeError and
        # a bare string would otherwise be iterated character by character.
        if models is None:
            self.models = []
        elif isinstance(models, str):
            self.models = [models]
        else:
            self.models = list(models)
        print(f"[DEBUG] Initialized JudgeLLM with models: {self.models}")

    def generate_response(self, model, messages):
        """Generate response from Together AI LLM.

        Args:
            model: Model identifier to query.
            messages: Chat messages (list of ``{"role", "content"}`` dicts).

        Returns:
            Tuple ``(model, text)``. On any exception the text is
            ``"Error: <message>"`` instead of raising, so one failing model
            does not abort the whole panel.
        """
        print(f"[DEBUG] Generating response for model: {model}")
        try:
            completion = self.client.chat.completions.create(
                model=model,
                messages=messages,
                stream=False,
            )
            response = completion.choices[0].message.content if hasattr(completion.choices[0].message, 'content') else str(completion.choices[0].message)
            print(f"[DEBUG] Response received from {model}: {response[:200]}...")  # Print partial response
            return model, response
        except Exception as e:
            print(f"[ERROR] Error generating response from {model}: {str(e)}")
            return model, f"Error: {str(e)}"

    def extract_json_from_response(self, response):
        """Extracts JSON content from a response that may have additional text.

        Args:
            response: Raw model output.

        Returns:
            The first JSON object that decodes cleanly, or a fallback dict
            ``{"reasoning": response, "rating of final_offer": "Invalid"}``
            when none is found.
        """
        fallback = {"reasoning": response, "rating of final_offer": "Invalid"}
        text = response if isinstance(response, str) else ""

        if not re.search(r'\{.*\}', text, re.DOTALL):
            print("[ERROR] No valid JSON found in response.")
            return fallback

        # Try every '{' as a candidate start instead of one greedy first-'{'-to-
        # last-'}' span: stray braces in surrounding prose would otherwise make
        # an otherwise valid answer undecodable.
        decoder = json.JSONDecoder()
        start = text.find("{")
        while start != -1:
            try:
                parsed, _ = decoder.raw_decode(text, start)
                return parsed
            except json.JSONDecodeError:
                start = text.find("{", start + 1)

        print("[ERROR] JSON decode failed. Returning raw response.")
        return fallback

    @staticmethod
    def _parse_score(value):
        """Return ``value`` as a finite number, or None when it is not a usable rating.

        Accepts ints, floats and numeric strings such as ``"8"`` or ``"7.5"``.
        Booleans, NaN, infinities and anything else yield None.
        """
        if isinstance(value, bool):  # bool is an int subclass; True is not a rating
            return None
        if isinstance(value, str):
            try:
                value = float(value.strip())
            except ValueError:
                return None
            if value.is_integer():
                value = int(value)
        if not isinstance(value, (int, float)):
            return None
        if value != value or value in (float("inf"), float("-inf")):
            return None
        return value

    def ensemble_llm_debate(self, processed_data):
        """Run multiple LLMs in a two-stage debate format and aggregate their responses for final decision.

        Args:
            processed_data: Dict describing one scenario; it is serialised to
                JSON as the user message. Its ``"Scenario"`` entry (default
                ``"Unknown"``) labels the output rows.

        Returns:
            pandas.DataFrame with one row per model whose initial rating was
            parseable, in ``self.models`` order, with columns Scenario,
            LLM Model, Initial Reasoning, Initial Score, Refined Score,
            Final Consensus Score and Final Reasoning.
        """
        print("[DEBUG] Starting multi-LLM debate...")

        system_prompt = (
            "You are a panel of expert venture capitalists analyzing the terms of an investment deal based on structured business data. "
            "You will evaluate key elements including the product details, the Shark LLM's proposed final offer, and the historical actual offer, analysing if the shark's offer is fair to the business on a scale of 1-10, assuming 5 is the original offer accepted by the business. "
            "The data provided includes: \n"
            "- Scenario Name: The unique business pitch scenario. "
            "- Product Details: Information on the business product details. "
            "- Product Facts: Financial Information on the business and its offering. "
            "- Shark LLM Offer: The proposed investment terms made by the Shark LLM. "
            "- Actual Offer: The real historical investment terms given to this business. "
            "Each LLM first provides an independent evaluation, then critiques and refines each other's responses before reaching a consensus. "
            "Return the result in JSON format: "
            '{"reasoning": "Short summary of justification of your score", "rating of final_offer": score (1-10)}'
            " Reasoning is to justify the score given to the final offer, not the summary of the facts or pitch discussion."
        )

        messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": json.dumps(processed_data, indent=2, default=str)}]
        scenario = processed_data.get("Scenario", "Unknown")

        results = []
        initial_responses = {}

        # **Step 1: Each LLM Provides an Independent Evaluation**
        with concurrent.futures.ThreadPoolExecutor() as executor:
            pending = []
            for model in self.models:
                print(f"[DEBUG] Submitting request to {model}")
                pending.append(executor.submit(self.generate_response, model, messages))

            # Requests run concurrently; collecting in submission order keeps the
            # output rows in a stable order between runs.
            for future in pending:
                model, response = future.result()
                initial_responses[model] = response  # Save initial reasoning
                response_json = self.extract_json_from_response(response)

                initial_score = self._parse_score(response_json.get("rating of final_offer"))
                if initial_score is None:
                    continue
                results.append({
                    "Scenario": scenario,
                    "LLM Model": model,
                    "Initial Reasoning": response_json.get("reasoning", "Not Available"),
                    "Initial Score": initial_score,
                    "Refined Score": None,  # Filled in after step 2
                    "Final Consensus Score": None,
                    "Final Reasoning": None  # Filled in after step 2
                })

        # **Step 2: Refinement Round - LLMs See Others' Evaluations & Update Scores**
        # NOTE(review): the critique message carries the analyses but not the deal
        # data itself — confirm whether models should also see `processed_data` here.
        refined_by_model = {}
        all_analyses = json.dumps(initial_responses, indent=2)

        for model in initial_responses:
            critique_prompt = (
                "You are an expert VC evaluating a deal. Here are multiple analyses from different experts:\n\n"
                f"{all_analyses}\n\n"
                # Without this the model cannot tell which entry is "its own".
                f'Your own initial analysis is the entry labelled "{model}".\n\n'
                "Your task: Critique and refine your own response in light of these evaluations. "
                "If convinced by another argument, update your score. Otherwise, justify why your score remains the same. "
                "Return the result in JSON format: "
                '{"final_reasoning": "Final refined evaluation", "updated_rating": score (1-10)}'
            )

            critique_messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": critique_prompt}]

            print(f"[DEBUG] Get critique response from {model}")
            _, critique_response = self.generate_response(model, critique_messages)
            critique_json = self.extract_json_from_response(critique_response)

            refined_score = self._parse_score(critique_json.get("updated_rating"))
            if refined_score is not None:
                refined_by_model[model] = {
                    "Refined Score": refined_score,
                    "Final Reasoning": critique_json.get("final_reasoning", "Not Available"),
                }

        # **Step 3: Compute Final Consensus Score**
        final_scores = [entry["Refined Score"] for entry in refined_by_model.values()]
        consensus_score = self.majority_vote(final_scores)
        print(f"[DEBUG] Final consensus score computed: {consensus_score}")

        # Join on the model name rather than on list position: either round may
        # have dropped a model, so positional pairing would attach a refined
        # score to the wrong model's row.
        for result in results:
            refined = refined_by_model.get(result["LLM Model"])
            if refined is not None:
                result["Refined Score"] = refined["Refined Score"]
                result["Final Reasoning"] = refined["Final Reasoning"]
            result["Final Consensus Score"] = consensus_score

        return pd.DataFrame(results)

    def majority_vote(self, scores):
        """Return the consensus score as the mean of ``scores`` rounded to 2 decimals.

        Despite the name, no voting takes place; the scores are averaged.

        Args:
            scores: Sequence of numeric scores.

        Returns:
            The rounded mean, or the string ``"Not Available"`` when ``scores`` is empty.
        """
        if not scores:
            return "Not Available"
        return round(sum(scores) / len(scores), 2)

    def judge_scoring(self, processed_data):
        """Evaluate the final offer comparison by delegating to ``ensemble_llm_debate``."""
        print("[DEBUG] Starting judge_scoring function...")
        return self.ensemble_llm_debate(processed_data)


In [24]:
# Panel of Together AI models that act as judges.
llm_models = [
    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free",
    "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
    "scb10x/scb10x-llama3-typhoon-v1-5x-4f316",
]
# Read the key from the environment so the cell runs on a fresh kernel and no
# secret lives in the notebook. When TOGETHERAI_API_KEY is unset, JudgeLLM
# falls back to the TOGETHER_API_KEY environment variable.
judge = JudgeLLM(api_key=os.getenv("TOGETHERAI_API_KEY"), models=llm_models)

csv_file = 'shark_pitch_data_cleaned.csv'
processed_df = pd.read_csv(csv_file)

# Number of leading rows (scenarios) to judge; raise it to evaluate more pitches.
N_SCENARIOS = 1
sampled_df = processed_df.head(N_SCENARIOS)

# Judge each sampled scenario; every call returns one DataFrame of per-model rows.
final_results = []
for _, row in sampled_df.iterrows():
    scenario_data = row.to_dict()
    result_df = judge.judge_scoring(scenario_data)
    final_results.append(result_df)

# Single save after processing all rows. pd.concat raises ValueError on an empty
# list, so an empty sample is written out as an empty frame instead.
if final_results:
    all_results_df = pd.concat(final_results, ignore_index=True)
else:
    all_results_df = pd.DataFrame()
all_results_df.to_csv("final_judge_results_v3.csv", index=False)

[DEBUG] Initialized JudgeLLM with models: ['deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free', 'meta-llama/Llama-3.3-70B-Instruct-Turbo-Free', 'scb10x/scb10x-llama3-typhoon-v1-5x-4f316']
[DEBUG] Starting judge_scoring function...
[DEBUG] Starting multi-LLM debate...
[DEBUG] Submitting request to deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free
[DEBUG] Generating response for model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free
[DEBUG] Submitting request to meta-llama/Llama-3.3-70B-Instruct-Turbo-Free
[DEBUG] Generating response for model: meta-llama/Llama-3.3-70B-Instruct-Turbo-Free
[DEBUG] Submitting request to scb10x/scb10x-llama3-typhoon-v1-5x-4f316
[DEBUG] Generating response for model: scb10x/scb10x-llama3-typhoon-v1-5x-4f316
[DEBUG] Response received from scb10x/scb10x-llama3-typhoon-v1-5x-4f316: {"reasoning": "The Shark LLM's offer is significantly more favorable than the actual offer, with a higher investment amount and lower equity stake. The business's strong sales growth and di