In [None]:

# "mistralai/Mistral-7B-Instruct-v0.2"
# "deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free"
# "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"
# "Qwen/Qwen2-VL-72B-Instruct"
# "scb10x/scb10x-llama3-typhoon-v1-5x-4f316"
# "scb10x/scb10x-llama3-typhoon-v1-5x-4f316"

In [39]:
import pandas as pd
import json
import re

# Input files: one Excel workbook and one JSON file. `json_data` is later indexed
# by scenario name in extract_product_info.
excel_file_path = "2025-03-21_22-55-23_orchestrator-pitches-basic_gab_shak.xlsx"
json_file_path = "../data/all_processed_facts.json"

# Read the "Sheet1" worksheet into a DataFrame and parse the whole JSON file into memory.
df_excel = pd.read_excel(excel_file_path, sheet_name="Sheet1")
with open(json_file_path, "r", encoding="utf-8") as file:
    json_data = json.load(file)

# Function to clean the 'shark_offer' column and ensure it's in JSON/dictionary format
def clean_shark_offer(offer):
    """Parse one 'shark_offer' cell into a Python object.

    Non-string values are handed back untouched. Strings have every
    "```json" opener and every "```" fence removed and are then parsed as
    JSON; when parsing fails, an error marker dict carrying the fence-stripped
    text is returned instead.
    """
    # Anything that is not text is passed through unchanged.
    if not isinstance(offer, str):
        return offer

    # Strip the markdown code fences before parsing.
    without_opener = re.sub(r"```json\s*", "", offer)
    unfenced = re.sub(r"\s*```", "", without_opener)

    try:
        parsed = json.loads(unfenced)
    except json.JSONDecodeError:
        # Keep the unparseable text so it can be inspected later.
        return {"error": "Invalid JSON format", "raw_data": unfenced}
    return parsed

# Clean the 'shark_offer' column: every cell is passed through clean_shark_offer and the
# column is overwritten with the results (non-string cells come back unchanged).
df_excel["shark_offer"] = df_excel["shark_offer"].apply(clean_shark_offer)

# Extract product details and product facts from JSON data
def extract_product_info(scenario):
    """Look up one scenario in the module-level `json_data` mapping.

    Returns a 3-tuple ``(product_description, facts, final_offer)``. Each item
    defaults to an empty dict when the scenario, or that particular key, is
    absent.
    """
    key = f"{scenario}"  # the lookup key is the scenario rendered as text

    # Unknown scenario: three fresh empty dicts.
    if key not in json_data:
        return {}, {}, {}

    record = json_data[key]
    return (
        record.get("product_description", {}),
        record.get("facts", {}),
        record.get("final_offer", {}),
    )

# Apply extraction function.
# The three columns are built from an explicit list instead of
# `a, b, c = zip(*series.apply(...))`: with a zero-row sheet `zip(*...)` yields
# nothing and the 3-way unpack raises ValueError, whereas the list form simply
# produces three empty columns.
product_info = [extract_product_info(scenario) for scenario in df_excel["scenario"]]
df_excel["Product details"] = [details for details, _facts, _offer in product_info]
df_excel["Product facts"] = [facts for _details, facts, _offer in product_info]
df_excel["final_offer"] = [offer for _details, _facts, offer in product_info]

# Select final columns
df_final = df_excel[["scenario", "model_identifier", "Product details", "Product facts", "shark_offer", "final_offer"]]

# Save to CSV (dict-valued cells are written as their Python string representation)
output_csv_path = "processed_shark_tank_data2.csv"
df_final.to_csv(output_csv_path, index=False)



In [16]:
import os
import pandas as pd
import json
import re

def process_excel_files(folder_path, json_file_path, output_file):
    """Process all Excel files in a folder and merge data with JSON into a single CSV file.

    Args:
        folder_path: Directory that is scanned (non-recursively) for ``.xlsx`` files.
        json_file_path: JSON file mapping scenario name -> dict with optional
            ``product_description``, ``facts`` and ``final_offer`` entries.
        output_file: Path of the merged CSV that is written.

    Raises:
        ValueError: If ``folder_path`` contains no usable ``.xlsx`` file.

    Each workbook's "Sheet1" must provide the columns ``scenario``,
    ``model_identifier`` and ``shark_offer``.
    """
    final_columns = ["scenario", "model_identifier", "Product details", "Product facts", "shark_offer", "final_offer"]

    # Load the JSON file
    with open(json_file_path, "r", encoding="utf-8") as json_file:
        json_data = json.load(json_file)

    # Helpers are defined once, not once per workbook.
    def clean_shark_offer(offer):
        """Strip markdown code fences from a string cell and parse it as JSON."""
        if isinstance(offer, str):
            offer = re.sub(r"```json\s*", "", offer)  # Remove leading ```json
            offer = re.sub(r"\s*```", "", offer)  # Remove trailing ```
            try:
                return json.loads(offer)  # Convert to dictionary if valid JSON
            except json.JSONDecodeError:
                return {"error": "Invalid JSON format", "raw_data": offer}
        return offer  # Non-string values are returned as is

    def extract_product_info(scenario):
        """Return (product_description, facts, final_offer) for a scenario, {} where missing."""
        scenario_key = f"{scenario}"
        if scenario_key not in json_data:
            return {}, {}, {}
        entry = json_data[scenario_key]
        return (
            entry.get("product_description", {}),
            entry.get("facts", {}),
            entry.get("final_offer", {}),
        )

    # Sorted for a reproducible row order (os.listdir order is arbitrary).
    # Names starting with "~$" are lock files Excel creates next to an open
    # workbook; they end in .xlsx but are not readable workbooks.
    excel_names = sorted(
        name
        for name in os.listdir(folder_path)
        if name.lower().endswith(".xlsx") and not name.startswith("~$")
    )
    if not excel_names:
        # Same exception type pd.concat([]) used to raise, with an actionable message.
        raise ValueError(f"No .xlsx files found in folder: {folder_path}")

    all_data = []
    for name in excel_names:
        excel_file_path = os.path.join(folder_path, name)
        df_excel = pd.read_excel(excel_file_path, sheet_name="Sheet1")

        # Clean the 'shark_offer' column
        df_excel["shark_offer"] = df_excel["shark_offer"].apply(clean_shark_offer)

        # Build the three JSON-derived columns from an explicit list; unlike
        # `a, b, c = zip(*...)` this also works for a sheet with zero rows.
        product_info = [extract_product_info(scenario) for scenario in df_excel["scenario"]]
        df_excel["Product details"] = [info[0] for info in product_info]
        df_excel["Product facts"] = [info[1] for info in product_info]
        df_excel["final_offer"] = [info[2] for info in product_info]

        df_final = df_excel[final_columns]
        if not df_final.empty:
            all_data.append(df_final)

    # Concatenate all processed DataFrames; if every sheet was empty, still
    # write a header-only CSV so downstream readers find the expected columns.
    if all_data:
        final_df = pd.concat(all_data, ignore_index=True)
    else:
        final_df = pd.DataFrame(columns=final_columns)

    # Save the merged DataFrame to a single CSV file
    final_df.to_csv(output_file, index=False)
    print(f"[INFO] All processed files merged and saved: {output_file}")

# Example usage: merge every workbook under "multi" with the facts JSON into one CSV.
# Note these assignments rebind the notebook-level `json_file_path` (same value as the
# earlier cell) and introduce `folder_path` / `output_file` as globals.
folder_path = "multi"
json_file_path = "../data/all_processed_facts.json"
output_file = "processed_shark_tank_data3.csv"

process_excel_files(folder_path, json_file_path, output_file)

[INFO] All processed files merged and saved: processed_shark_tank_data3.csv


## 2-Round Debate: independent scoring, then critique

In [18]:
import os
import pandas as pd
import json
import concurrent.futures
import threading
import re
from together import Together
from collections import Counter
import time

class JudgeLLM:
    """Panel of LLM judges that rates a shark offer in two rounds.

    Round 1: every model in ``models`` scores the deal independently (in parallel).
    Round 2: every model is shown all round-1 answers and asked to refine its score.
    The "consensus" is the mean of the refined scores.
    """

    def __init__(self, api_key=None, models=None):
        """Create the Together client.

        Args:
            api_key: Together API key; falls back to the TOGETHER_API_KEY
                environment variable when falsy.
            models: Iterable of model identifiers used as judges.
        """
        self.api_key = api_key or os.getenv('TOGETHER_API_KEY')
        self.client = Together(api_key=self.api_key)
        self.models = models
        # Kept as a public attribute for existing callers. The debate itself
        # collects results only in the calling thread and does not need it.
        self.lock = threading.Lock()
        print(f"[DEBUG] Initialized JudgeLLM with models: {self.models}")

    def generate_response(self, model, messages):
        """Run one non-streaming chat completion.

        Returns:
            ``(model, response_text, inference_seconds)``. On any exception the
            text is ``"Error: <message>"`` and the time is ``None``; nothing is raised.
        """
        print(f"[DEBUG] Generating response for model: {model}")
        try:
            start_time = time.time()  # Start timing
            completion = self.client.chat.completions.create(
                model=model,
                messages=messages,
                stream=False,
            )
            end_time = time.time()  # End timing

            inference_time = round(end_time - start_time, 3)  # Compute inference time
            response = completion.choices[0].message.content
            print(f"[DEBUG] Response received from {model} (Time Taken: {inference_time}s): {response[:200]}...")
            return model, response, inference_time
        except Exception as e:
            print(f"[ERROR] Error generating response from {model}: {str(e)}")
            return model, f"Error: {str(e)}", None

    def extract_json_from_response(self, response, model_name=None):
        """Pull the JSON object out of a model reply.

        Args:
            response: Raw reply text (markdown code fences are tolerated).
            model_name: Judge that produced the reply; when it contains "Qwen",
                a string ``reasoning`` field is cut to its first paragraph.

        Returns:
            The parsed dict (unwrapped from a nested ``{"response": {...}}`` if
            present), or ``{"reasoning": <text>, "rating_of_final_offer": "Invalid"}``
            when no JSON object can be parsed.
        """
        if not isinstance(response, str):
            # e.g. a completion whose content is None: there is nothing to parse.
            print(f"[ERROR] Non-text response from {model_name}: {response!r}")
            return {"reasoning": response, "rating_of_final_offer": "Invalid"}

        # Remove triple backticks if present
        cleaned = re.sub(r"```json\s*", "", response)
        cleaned = re.sub(r"\s*```", "", cleaned)

        # First try the text as-is. If that fails, retry without <think>...</think>
        # blocks: braces inside such a block make the greedy {...} match span
        # non-JSON text.
        candidates = [cleaned]
        without_think = re.sub(r"<think>.*?</think>", "", cleaned, flags=re.DOTALL)
        if without_think != cleaned:
            candidates.append(without_think)

        for candidate in candidates:
            json_match = re.search(r'\{.*\}', candidate, re.DOTALL)
            if not json_match:
                continue
            try:
                extracted_json = json.loads(json_match.group())
            except json.JSONDecodeError:
                continue

            # Check if response is nested and extract the correct part
            if "response" in extracted_json and isinstance(extracted_json["response"], dict):
                extracted_json = extracted_json["response"]

            # For Qwen judges keep only the first paragraph of the reasoning.
            if model_name and "Qwen" in model_name:
                reasoning = extracted_json.get("reasoning")
                if isinstance(reasoning, str):
                    extracted_json["reasoning"] = reasoning.split("\n\n")[0]

            print(f"[DEBUG] Successfully extracted JSON from {model_name}: {extracted_json}")
            return extracted_json

        print(f"[ERROR] JSON decode failed for response: {cleaned[:200]}")
        # Key matches the one the prompt asks for and the debate checks.
        return {"reasoning": cleaned, "rating_of_final_offer": "Invalid"}

    def ensemble_llm_debate(self, processed_data):
        """Run the two-round debate for one deal.

        Args:
            processed_data: Dict describing the deal (one CSV row); it is sent
                to the judges as JSON. ``scenario`` (or ``Scenario``) and
                ``model_identifier`` are copied into the output rows.

        Returns:
            DataFrame with one row per judge whose round-1 reply contained a
            numeric ``rating_of_final_offer``. ``Refined Score`` /
            ``Final Reasoning`` / ``Inference Critique Time (s)`` stay empty for
            a judge whose critique reply had no numeric ``updated_rating``.
        """
        print("[DEBUG] Starting multi-LLM debate...")

        # NOTE(review): the first two sentences are near-duplicates; the prompt
        # text is left untouched so scores stay comparable with earlier runs.
        system_prompt = (
            "You are a panel of expert venture capitalists analyzing an investment deal based on structured business data. "
            "You are a panel of expert venture capitalists analyzing the terms of an investment deal based on structured business data. "
            "You will evaluate key elements including the product details, the Shark LLM's proposed final offer, and the historical actual offer, analysing if the shark's offer is fair to the business on a scale of 1-10. "
            "When evaluating the offer, consider the context of the business and the investment terms. Also use your own knowledge and experience on what is a fair offer in the industry and investment landscape. "
            "If the shark offer is perceived equivalent to the actual offer, the score should be 5. "
            "If the actual situation did not gave an offer to the entrepreneur but the Shark LLM did, you should still consider if the offer is fair with respect to the context of the actual decision. "
            "If neither the Shark LLM nor the actual situation gave an offer, you can assign a score of 5 but still provide a reasoning if the shark's decision is fair to the entrepreneur. "
            "The data provided includes: \n"
            "- Scenario Name: The unique business pitch scenario. "
            "- Product Details: Information on the business product details. "
            "- Product Facts: Financial Information on the business and its offering. "
            "- Shark LLM Offer: The proposed investment terms made by the Shark LLM. "
            "- Actual Offer: The real historical investment terms given to this business. "
            "Each LLM first provides an independent evaluation, then critiques and refines each other's responses before reaching a consensus. "
            "Return the result in JSON format: "
            '{"reasoning": "Short summary of justification of your score", "rating_of_final_offer": score (1-10)}'
            "Reasoning is to justify the score given to the final offer, not the summary of the facts or pitch discussion."
        )

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": json.dumps(processed_data, indent=2, default=str)},
        ]

        # The CSV column is lower-case "scenario"; accept the capitalised form too
        # so both rounds report the same value.
        scenario_name = processed_data.get("scenario", processed_data.get("Scenario", "Unknown"))
        shark_model = processed_data.get("model_identifier", "Unknown")
        judge_models = list(self.models or [])

        results = []
        initial_responses = {}

        # Round 1: independent scores, requested in parallel. The loop body runs in
        # this thread (only generate_response runs in workers), so no lock is needed.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_model = {executor.submit(self.generate_response, model, messages): model for model in judge_models}

            for future in concurrent.futures.as_completed(future_to_model):
                model, response, inference_time = future.result()
                initial_responses[model] = response
                response_json = self.extract_json_from_response(response, model_name=model)
                print(f"[DEBUG] Extracted JSON from {model}: {response_json}")

                if isinstance(response_json.get("rating_of_final_offer"), (int, float)):
                    results.append({
                        "Scenario": scenario_name,
                        "Shark Model": shark_model,
                        "Judge Model": model,
                        "Initial Reasoning": response_json.get("reasoning", "Not Available"),
                        "Initial Score": response_json["rating_of_final_offer"],
                        "Refined Score": None,
                        "Final Consensus Score": None,
                        "Final Reasoning": None,
                        "Inference Time (s)": inference_time,
                        "Inference Critique Time (s)": None,
                    })

        # Round 2: each judge sees every round-1 answer and refines its own.
        # NOTE(review): the critique message carries only the round-1 answers, not
        # the deal data itself — confirm this is intended.
        critique_results = []
        for model in initial_responses:
            critique_prompt = (
                "You are an expert VC evaluating a deal. Here are multiple analyses from different experts:\n\n"
                f"{json.dumps(initial_responses, indent=2)}\n\n"
                "Your task: Critique and refine your own response in light of these evaluations. "
                "If convinced by another argument, update your score. Otherwise, justify why your score remains the same."
                "Return the result in JSON format: "
                '{"final_reasoning": "Final refined evaluation", "updated_rating": score (1-10)}'
            )

            critique_messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": critique_prompt}]
            start_time = time.time()  # Start critique timing
            _, critique_response, _ = self.generate_response(model, critique_messages)
            end_time = time.time()  # End critique timing
            total_time = round(end_time - start_time, 3)  # Compute total critique time
            critique_json = self.extract_json_from_response(critique_response, model_name=model)

            if isinstance(critique_json.get("updated_rating"), (int, float)):
                critique_results.append({
                    "Scenario": scenario_name,
                    "Shark Model": shark_model,
                    "Judge Model": model,
                    "Refined Score": critique_json["updated_rating"],
                    "Final Consensus Score": None,
                    "Final Reasoning": critique_json.get("final_reasoning", "Not Available"),
                    "Inference Critique Time (s)": total_time,
                })

        final_scores = [result["Refined Score"] for result in critique_results if isinstance(result["Refined Score"], (int, float))]
        consensus_score = self.majority_vote(final_scores)
        print(f"[DEBUG] Final consensus score computed: {consensus_score}")

        # Merge by judge name. Both lists are filtered independently (a judge can
        # fail either round), so pairing them by position would attach a critique
        # to the wrong judge.
        critique_by_model = {critique["Judge Model"]: critique for critique in critique_results}
        for result in results:
            result["Final Consensus Score"] = consensus_score
            critique = critique_by_model.get(result["Judge Model"])
            if critique is None:
                continue
            result["Refined Score"] = critique["Refined Score"]
            result["Final Reasoning"] = critique["Final Reasoning"]
            result["Inference Critique Time (s)"] = critique["Inference Critique Time (s)"]

        return pd.DataFrame(results)

    def majority_vote(self, scores):
        """Return the mean of ``scores`` rounded to 2 decimals (despite the name,
        not a mode), or "Not Available" for an empty list."""
        return round(sum(scores) / len(scores), 2) if scores else "Not Available"

    def judge_scoring(self, processed_data):
        """Public entry point: run the debate for one deal and return its DataFrame."""
        print("[DEBUG] Starting judge_scoring function...")
        return self.ensemble_llm_debate(processed_data)

## 3-Round Debate: independent scoring, critique, then final consensus

In [None]:
import os
import pandas as pd
import json
import concurrent.futures
import threading
import re
from together import Together
from collections import Counter

class JudgeLLM:
    """Panel of LLM judges that rates a shark offer in three rounds.

    Round 1: every model scores the deal independently (in parallel).
    Round 2: every model sees all round-1 answers and refines its score.
    Round 3: every model sees the refined evaluations and proposes a final
    consensus rating.
    """

    def __init__(self, api_key=None, models=None):
        """Create the Together client.

        Args:
            api_key: Together API key; falls back to the TOGETHER_API_KEY
                environment variable when falsy.
            models: Iterable of model identifiers used as judges.
        """
        self.api_key = api_key or os.getenv('TOGETHER_API_KEY')
        self.client = Together(api_key=self.api_key)
        self.models = models
        # Kept as a public attribute for existing callers. The debate itself
        # collects results only in the calling thread and does not need it.
        self.lock = threading.Lock()
        print(f"[DEBUG] Initialized JudgeLLM with models: {self.models}")

    def generate_response(self, model, messages):
        """Run one non-streaming chat completion.

        Returns:
            ``(model, response_text)``. On any exception the text is
            ``"Error: <message>"``; nothing is raised.
        """
        print(f"[DEBUG] Generating response for model: {model}")
        try:
            completion = self.client.chat.completions.create(
                model=model,
                messages=messages,
                stream=False,
            )
            response = completion.choices[0].message.content
            print(f"[DEBUG] Response received from {model}: {response[:200]}...")
            return model, response
        except Exception as e:
            print(f"[ERROR] Error generating response from {model}: {str(e)}")
            return model, f"Error: {str(e)}"

    def extract_json_from_response(self, response):
        """Parse the JSON object embedded in a model reply.

        Returns the parsed object, or
        ``{"reasoning": <response>, "rating_of_final_offer": "Invalid"}`` when the
        reply is not text or contains no parseable JSON object.
        """
        if isinstance(response, str):
            # First try the text as-is. If that fails, retry without
            # <think>...</think> blocks: braces inside such a block make the
            # greedy {...} match span non-JSON text.
            candidates = [response]
            without_think = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL)
            if without_think != response:
                candidates.append(without_think)

            for candidate in candidates:
                json_match = re.search(r'\{.*\}', candidate, re.DOTALL)
                if not json_match:
                    continue
                try:
                    return json.loads(json_match.group())
                except json.JSONDecodeError:
                    continue
            print("[ERROR] JSON decode failed. Returning raw response.")
        return {"reasoning": response, "rating_of_final_offer": "Invalid"}

    def ensemble_llm_debate(self, processed_data):
        """Run the three-round debate for one deal.

        Args:
            processed_data: Dict describing the deal (one CSV row); it is sent
                to the judges as JSON. ``scenario`` (or ``Scenario``) is copied
                into the output rows.

        Returns:
            DataFrame with one row per judge whose round-1 reply contained a
            numeric ``rating_of_final_offer``. ``Refined Score`` is that judge's
            round-2 rating; ``Final Consensus Score`` / ``Final Reasoning`` are that
            judge's round-3 answer. Each stays empty when the corresponding reply
            could not be parsed.
        """
        print("[DEBUG] Starting multi-LLM debate...")

        # NOTE(review): this prompt ends in "..." and looks like a shortened
        # placeholder — confirm the full rubric is not meant to be used here.
        system_prompt = (
            "You are a panel of expert venture capitalists analyzing an investment deal based on structured business data..."
        )

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": json.dumps(processed_data, indent=2, default=str)},
        ]

        # The CSV written earlier in this notebook names the column "scenario"
        # (lower-case); accept the capitalised form as well.
        scenario_name = processed_data.get("scenario", processed_data.get("Scenario", "Unknown"))
        judge_models = list(self.models or [])

        results = []
        initial_responses = {}

        # Round 1: independent scores, requested in parallel. The loop body runs in
        # this thread (only generate_response runs in workers), so no lock is needed.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_model = {executor.submit(self.generate_response, model, messages): model for model in judge_models}

            for future in concurrent.futures.as_completed(future_to_model):
                model, response = future.result()
                initial_responses[model] = response
                response_json = self.extract_json_from_response(response)

                if isinstance(response_json.get("rating_of_final_offer"), (int, float)):
                    results.append({
                        "Scenario": scenario_name,
                        "LLM Model": model,
                        "Initial Reasoning": response_json.get("reasoning", "Not Available"),
                        "Initial Score": response_json["rating_of_final_offer"],
                        "Refined Score": None,
                        "Final Consensus Score": None,
                        "Final Reasoning": None,
                    })

        # Round 2: each judge sees every round-1 answer and refines its own.
        critique_results = []
        for model in initial_responses:
            critique_prompt = (
                "You are an expert VC evaluating a deal. Here are multiple analyses from different experts:\n\n"
                f"{json.dumps(initial_responses, indent=2)}\n\n"
                "Your task: Critique and refine your own response in light of these evaluations. "
                "If convinced by another argument, update your score. Otherwise, justify why your score remains the same."
                "Return the result in JSON format: "
                '{"final_reasoning": "Final refined evaluation", "updated_rating": score (1-10)}'
            )

            critique_messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": critique_prompt}]
            _, critique_response = self.generate_response(model, critique_messages)
            critique_json = self.extract_json_from_response(critique_response)

            if isinstance(critique_json.get("updated_rating"), (int, float)):
                critique_results.append({
                    "Scenario": scenario_name,
                    "LLM Model": model,
                    "Refined Score": critique_json["updated_rating"],
                    "Final Consensus Score": None,
                    "Final Reasoning": critique_json.get("final_reasoning", "Not Available"),
                })

        final_scores = [result["Refined Score"] for result in critique_results if isinstance(result["Refined Score"], (int, float))]
        consensus_score = self.majority_vote(final_scores)
        print(f"[DEBUG] Final consensus score computed: {consensus_score}")

        # Round 3: the prompt must actually contain the refined evaluations it asks
        # the judges to consider; previously they were never sent.
        refined_evaluations = {
            critique["LLM Model"]: {
                "final_reasoning": critique["Final Reasoning"],
                "updated_rating": critique["Refined Score"],
            }
            for critique in critique_results
        }
        final_consensus_prompt = (
            "The expert panel has now provided individual critiques. "
            "Now, reach a final consensus as a panel of experts by considering the refined evaluations. "
            "You should collectively determine the fairest possible rating.\n\n"
            f"Refined evaluations:\n{json.dumps(refined_evaluations, indent=2, default=str)}\n\n"
            "Return the final consensus in JSON format: "
            '{"consensus_reasoning": "Final agreement or key disagreements", "final_consensus_rating": score (1-10)}'
        )

        final_consensus_messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": final_consensus_prompt}]

        final_consensus_results = []
        for model in judge_models:
            _, final_consensus_response = self.generate_response(model, final_consensus_messages)
            final_consensus_json = self.extract_json_from_response(final_consensus_response)

            if isinstance(final_consensus_json.get("final_consensus_rating"), (int, float)):
                final_consensus_results.append({
                    "Scenario": scenario_name,
                    "LLM Model": model,
                    "Final Consensus Score": final_consensus_json["final_consensus_rating"],
                    "Final Reasoning": final_consensus_json.get("consensus_reasoning", "Not Available"),
                })

        # Merge by judge name. Each round's list is filtered independently (a judge
        # can fail any round), so pairing lists by position would attach values to
        # the wrong judge.
        critique_by_model = {critique["LLM Model"]: critique for critique in critique_results}
        consensus_by_model = {consensus["LLM Model"]: consensus for consensus in final_consensus_results}
        for result in results:
            critique = critique_by_model.get(result["LLM Model"])
            if critique is not None:
                result["Refined Score"] = critique["Refined Score"]
            consensus = consensus_by_model.get(result["LLM Model"])
            if consensus is not None:
                result["Final Consensus Score"] = consensus["Final Consensus Score"]
                result["Final Reasoning"] = consensus["Final Reasoning"]

        return pd.DataFrame(results)

    def majority_vote(self, scores):
        """Return the mean of ``scores`` rounded to 2 decimals (despite the name,
        not a mode), or "Not Available" for an empty list."""
        return round(sum(scores) / len(scores), 2) if scores else "Not Available"

    def judge_scoring(self, processed_data):
        """Public entry point: run the debate for one deal and return its DataFrame."""
        print("[DEBUG] Starting judge_scoring function...")
        return self.ensemble_llm_debate(processed_data)

In [None]:
# Model identifiers handed to JudgeLLM as the judge panel.
llm_models = ["deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free", "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free", "Qwen/Qwen2.5-7B-Instruct-Turbo", "Qwen/QwQ-32B"]
# NOTE(review): TOGETHERAI_API_KEY is not defined in any visible cell; it must already
# exist in the kernel or this line raises NameError — confirm where it is set.
judge = JudgeLLM(api_key=TOGETHERAI_API_KEY, models=llm_models)

csv_file = 'processed_shark_tank_data3.csv'
processed_df = pd.read_csv(csv_file)
# Full slice: every row of the CSV is judged (no sampling is applied here).
sampled_df = processed_df[:]   

# Judge each row; every call returns a DataFrame that is collected for one final concat.
final_results = []
for _, row in sampled_df.iterrows():
    scenario_data = row.to_dict()
    result_df = judge.judge_scoring(scenario_data)  # DataFrame of judge results for this row
    final_results.append(result_df)

# Single save after processing all rows
all_results_df = pd.concat(final_results, ignore_index=True)
all_results_df.to_csv("final_judge_results_gan.csv", index=False)

[DEBUG] Initialized JudgeLLM with models: ['deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free', 'meta-llama/Llama-3.3-70B-Instruct-Turbo-Free', 'Qwen/Qwen2.5-7B-Instruct-Turbo', 'Qwen/QwQ-32B']
[DEBUG] Starting judge_scoring function...
[DEBUG] Starting multi-LLM debate...
[DEBUG] Generating response for model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free
[DEBUG] Generating response for model: meta-llama/Llama-3.3-70B-Instruct-Turbo-Free
[DEBUG] Generating response for model: Qwen/Qwen2.5-7B-Instruct-Turbo
[DEBUG] Generating response for model: Qwen/QwQ-32B
[DEBUG] Response received from Qwen/Qwen2.5-7B-Instruct-Turbo (Time Taken: 2.617s): ```json
{
  "reasoning": "The Shark LLM's offer of $250,000 for 15% equity in a business with significant sales and a 20% profit margin is quite generous. However, the final decision not to make a dea...
[DEBUG] Successfully extracted JSON from None: {'reasoning': "The Shark LLM's offer of $250,000 for 15% equity in a business with significant sales a

In [20]:
# Model identifiers handed to JudgeLLM as the judge panel.
llm_models = ["deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free", "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free", "Qwen/Qwen2.5-7B-Instruct-Turbo", "Qwen/QwQ-32B"]
# NOTE(review): TOGETHERAI_API_KEY is not defined in any visible cell; it must already
# exist in the kernel or this line raises NameError — confirm where it is set.
judge = JudgeLLM(api_key=TOGETHERAI_API_KEY, models=llm_models)

csv_file = 'processed_shark_tank_data2.csv'
processed_df = pd.read_csv(csv_file)
# Full slice: every row of the CSV is judged (no sampling is applied here).
sampled_df = processed_df[:]   

# Judge each row; every call returns a DataFrame that is collected for one final concat.
final_results = []
for _, row in sampled_df.iterrows():
    scenario_data = row.to_dict()
    result_df = judge.judge_scoring(scenario_data)  # DataFrame of judge results for this row
    final_results.append(result_df)

# Single save after processing all rows
all_results_df = pd.concat(final_results, ignore_index=True)
all_results_df.to_csv("final_judge_results_gab.csv", index=False)

[DEBUG] Initialized JudgeLLM with models: ['deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free', 'meta-llama/Llama-3.3-70B-Instruct-Turbo-Free', 'Qwen/Qwen2.5-7B-Instruct-Turbo', 'Qwen/QwQ-32B']
[DEBUG] Starting judge_scoring function...
[DEBUG] Starting multi-LLM debate...
[DEBUG] Generating response for model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free
[DEBUG] Generating response for model: meta-llama/Llama-3.3-70B-Instruct-Turbo-Free
[DEBUG] Generating response for model: Qwen/Qwen2.5-7B-Instruct-Turbo
[DEBUG] Generating response for model: Qwen/QwQ-32B
[DEBUG] Response received from Qwen/Qwen2.5-7B-Instruct-Turbo (Time Taken: 2.068s): ```json
{
  "reasoning": "The Sharks appreciated the product and the entrepreneurs' efforts, but they concluded that the business was strong enough to grow without giving up equity at this stage. This...
[DEBUG] Successfully extracted JSON from None: {'reasoning': "The Sharks appreciated the product and the entrepreneurs' efforts, but they concluded t

In [None]:
# Model identifiers handed to JudgeLLM as the judge panel.
llm_models = ["deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free", "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free", "Qwen/Qwen2.5-7B-Instruct-Turbo", "Qwen/QwQ-32B"]
# NOTE(review): TOGETHERAI_API_KEY is not defined in any visible cell; it must already
# exist in the kernel or this line raises NameError — confirm where it is set.
judge = JudgeLLM(api_key=TOGETHERAI_API_KEY, models=llm_models)

# NOTE(review): 'processed_shark_tank_data.csv' is not written by any visible cell
# (the visible cells write ...data2.csv and ...data3.csv) — confirm its origin.
csv_file = 'processed_shark_tank_data.csv'
processed_df = pd.read_csv(csv_file)
# Only the first 301 rows of the CSV are judged.
sampled_df = processed_df[:301]   

# Judge each row; every call returns a DataFrame that is collected for one final concat.
final_results = []
for _, row in sampled_df.iterrows():
    scenario_data = row.to_dict()
    result_df = judge.judge_scoring(scenario_data)  # DataFrame of judge results for this row
    final_results.append(result_df)

# Single save after processing all rows
all_results_df = pd.concat(final_results, ignore_index=True)
all_results_df.to_csv("final_judge_results_kz.csv", index=False)