### Environment setup
```!pip install openai numpy tqdm tiktoken```

### Import Packages

In [1]:
from openai import OpenAI
import numpy as np
from typing import List, Tuple, Dict
import os
import json
import random
from tqdm import tqdm
import tiktoken
import re

### Utility Functions

In [2]:
def clean_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r"\$", "", text)
    text = re.sub(r"(?s).*#### ", "", text)
    text = re.sub(r"\.$", "", text)
    text = re.sub(r",", "", text)
    
    if not text:
        return "-1000000000"
    
    return text

def extract_value(text: str) -> str:
    pattern = r"(-?[$0-9.,]{2,})|(-?[0-9]+)"
    matches = re.findall(pattern, text)
    
    if matches:
        for match_groups in matches[::-1]:
            for group in match_groups:
                if group:
                    return clean_text(group)
    
    return "-1000000000"

def load_dataset(file_path: str, sample_size: int = 20) -> List[Dict]:
    """Load and sample from dataset."""
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            data.append(json.loads(line))
    return random.sample(data, sample_size)

def count_tokens(text: str, model: str) -> int:
    """Count tokens in text using tiktoken."""
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))

def evaluate_accuracy(predictions: List[str], ground_truth: List[str]) -> float:
    """Calculate accuracy of predictions."""
    correct = 0
    for pred, truth in zip(predictions, ground_truth):
        try:
            if int(pred) == int(truth):
                correct += 1
        except:
            pass
    return correct / len(predictions)

### Chain-of-Thoughts Implementation

1. Visit [DeepInfra](https://deepinfra.com/) and **register an account**. Familiarize yourself with how to use the API by referring to the [documentation](https://deepinfra.com/docs).  

2. Test the **API call** functionality provided by DeepInfra to ensure proper integration.


In [3]:
class ChainOfThought:
    def __init__(self, api_key: str, base_url: str = "https://api.deepinfra.com/v1/openai",
                 model: str = "Qwen/Qwen2.5-7B-Instruct", temperature: float = 0.7):
        self.client = OpenAI(
            api_key=api_key,
            base_url=base_url
        )
        self.model = "gpt-40"
        self.temperature = temperature
        self.total_tokens = 0

    def solve(self, question: str) -> Tuple[str, int]:
        prompt = f"""Please solve this math problem step by step.
Question: {question}
Let's think step by step."""

        try:
            messages = [{"role": "user", "content": prompt}]
            response = self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                temperature=self.temperature,
                max_completion_tokens=256,
            )
            tokens_used = response.usage.total_tokens
            self.total_tokens += tokens_used
            return response.choices[0].message.content, tokens_used
        except Exception as e:
            print(f"Error in API call: {e}")
            return "", 0

### Tree-of-Thoughts Implementation

1. You are *highly encouraged* to read the [original paper](http://arxiv.org/abs/2501.02497) and run the [codebase](https://github.com/princeton-nlp/tree-of-thought-llm) first.  
    - Otherwise, you may have no idea what ToT is doing.
    
    - For simplicity, start by running the basic configuration:
        - Search algorithm: BFS

        - Thought generator: propose prompt

        - Task: Game of 24

2. Regarding your own implementation below, feel free to experiment with various hyperparameters, including:
    - API call parameters (e.g., `temperature`)
    
    - ToT implementation parameters (e.g., `max_steps`, `n_samples_per_step`)

3. [Optional] You could try other **7B-level** base models instead of `Qwen2.5-7B-Instruct` in DeepInfra. You might achieve better results with proper implementation.

4. [Optional] Multi-model setups are also allowed—for example, using one model to generate thoughts and another to evaluate them (reward model?).

In [121]:
class TreeOfThoughts:
    def __init__(self, api_key: str, base_url: str = "https://api.deepinfra.com/v1/openai", 
                 model: str = "Qwen/Qwen2.5-7B-Instruct", temperature: float = 0.6):
        self.client = OpenAI(
            api_key=api_key,
            base_url=base_url
        )
        self.model = model
        self.temperature = temperature
        self.total_tokens = 0
        self.evaluation_cache = {}  # For caching evaluations

    def chat_with_gpt(self, prompt: str, n: int = 1, stop: str = None) -> List[str]:
        """Get multiple completions from the model with proper stop sequences."""
        try:
            messages = [{"role": "user", "content": prompt}]
            response = self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                temperature=self.temperature,
                n=n,
                stop=[stop] if stop else None,
                max_tokens=256
            )
            completions = [choice.message.content for choice in response.choices]
            return completions
        except Exception as e:
            print(f"API Error: {e}")
            return []

    def generate_thoughts(self, question: str, current_thought: str = "", n_samples: int = 3) -> List[str]:
        prompt = (
            f"Question: {question}\n\n"
            f"So far, the reasoning is:\n{current_thought}\n\n"
            f"Act as a Phd in Mathematics and solve the poroblem with that outlook, Please propose the next step in the reasoning. Be concise."
        )
        # Use the prompt in chat_with_gpt to return candidate thoughts.
        completion = self.chat_with_gpt(prompt, n=n_samples)
        return completion

    def evaluate_thought(self, question: str, thought: str, cache: bool = True) -> float:
        """Evaluate the quality of a thought using self-evaluation."""
        if cache and thought in self.evaluation_cache:
            return self.evaluation_cache[thought]
        
        prompt = f'''Rate how likely this thought will lead to the correct answer (1-10):
        Question: {question}
        Thought: {thought}
        Use the above Thought to Rate from 1 to 10 how likely this path is correct. so that we can move with that thought.
        Rating (1-10): '''
        
        response = self.chat_with_gpt(prompt, n=1)
        if not response: return 0.0
        
        try:
            score = float(re.search(r'\d+', response[0]).group())
            score = max(1.0, min(10.0, score))  # Clamp to 1-10
            if cache:
                self.evaluation_cache[thought] = score
            return score / 10.0  # Normalize to 0-1
        except:
            return 0.0

    def select_best_thoughts(self, thoughts: List[str], scores: List[float], k: int = 2) -> List[str]:
        """Select top-k thoughts based on evaluation scores."""
        combined = sorted(zip(thoughts, scores), key=lambda x: x[1], reverse=True)
        return [thought for thought, score in combined[:k]]

    def solve(self, question: str, max_steps: int = 8, n_samples_per_step: int = 3, 
             k_best_thoughts: int = 2) -> str:
        """Solve a problem using Tree-of-Thoughts BFS."""
        current_thoughts = [""]
        final_answer = ""
        
        for step in range(max_steps):
            new_thoughts = []
            scores = []
            
            for thought in current_thoughts:
                # Generate new thoughts
                candidates = self.generate_thoughts(question, thought, n_samples_per_step)
                # print(candidates)
                if not candidates:
                    continue
                
                # Evaluate candidates
                candidate_scores = [self.evaluate_thought(question, f"{thought} {c}".strip()) 
                                   for c in candidates]
                
                # Select and accumulate best candidates
                best_candidates = self.select_best_thoughts(candidates, candidate_scores, k_best_thoughts)
                new_thoughts.extend([f"{thought} {c}".strip() for c in best_candidates])
                scores.extend(candidate_scores[:k_best_thoughts])
            
            if not new_thoughts:
                break
                
            # Select best thoughts for next iteration
            current_thoughts = self.select_best_thoughts(new_thoughts, scores, k_best_thoughts)

            # print(current_thoughts)
            
            # Early termination if final answer detected
            for t in current_thoughts:
                if "final answer" in t.lower():
                    final_answer = t
                    # print(final_answer)
                    prompt = f"""Get the final numeric answer to the question : {question}
                    Using the current logic: {final_answer} 
                    Important : Your Answer should end with the number which is the final answer to the question asked.
                            """
                    # print(prompt)
                    # print("prompt ends here")
                    response = self.chat_with_gpt(prompt, n=1)
                    return final_answer.split("final answer")[-1].strip() + "final answer is" + str(response[0])
                
        
        # print(current_thoughts)
        # print("current thought ends here")
                
        prompt = f"""Get the final numeric answer to the question : {question}
        Using the current logic: {str(current_thoughts[0])} 
        Important : Your Answer should end with the number which is the final answer to the question asked.
        """
        # print(prompt)
        # print("prompt ends here")
        response = self.chat_with_gpt(prompt, n=1)
        # print(response)
        return current_thoughts[0]+ " answer is " +str(response[0]) if current_thoughts else ""

        
        # return current_thoughts[0] if current_thoughts else ""

In [4]:
# class TreeOfThoughts:
#     def __init__(self, api_key: str, base_url: str = "https://api.deepinfra.com/v1/openai", 
#                  model: str = "Qwen/Qwen2.5-7B-Instruct", temperature: float = 0.7):
#         """Initialize the Tree-of-Thoughts solver."""
#         # TODO: Initialize the OpenAI client and other necessary attributes
#         self.client = OpenAI(base_url=base_url, api_key=api_key)
#         self.model = model
#         self.temperature = temperature
#         self.evaluation_cache = {}
        

#     def chat_with_gpt(self, prompt: str, n: int = 1, stop: str = None, ) -> List[str]:
#         """Get completions from GPT model."""
#         # [IMPORTANT] `stop` is important here. You can refer to the implementation of the Game of 24 in the original ToT codebase.
#         # TODO: Implement the chat completion function
#         try:
#             response = self.client.chat.completions.create(
#                 model=self.model,
#                 messages=[{"role": "user", "content": prompt}],
#                 temperature=self.temperature,
#                 n=n,
#                 stop=stop,
#                 # max_tokens=max_tokens
#             )
#             return [choice.message.content for choice in response.choices]
#         except Exception as e:
#             print(f"Error in chat_with_gpt: {e}")
#             return []
        

#     def generate_thoughts(self, question: str, current_thought: str = "", n_samples: int = 3) -> List[str]:
#         """Generate multiple possible next steps in reasoning."""
#         # [IMPORTANT] A one-shot example can help the model follow your instructions precisely.
#         # TODO: Implement thought generation function
#         example_prompt = """Example:
#             Question: If Jane has 3 apples and gives 2 to Bob, how many does she have?
#             Current thought: 
#             Possible next steps:
#             1. Subtract the 2 apples given away from the initial 3: 3 - 2 = 1.
#             2. Check if there are any other apples involved; since none, the answer is 1.
#             3. Verify by adding Bob's apples to Jane's remaining: 1 + 2 = 3, which matches the initial count.
#             """
#         prompt = f"""You are a logical thinker and math problem solver. Given a question and current thought, list {n_samples} possible next steps to solve the problem. Each step must be a concise sentence. Follow the example format.

#             {example_prompt}
#             Question: {question}
#             Current thought: {current_thought}
#             Possible next steps:
#             """

# #         example_prompt = '''
# # Example:
# # Question: If Alice has 4 books and buys 7 more, how many does she have?
# # Current Thought: Alice started with 4 books.
# # Possible next thoughts:
# # 1. She adds the new books: 4 + 7
# # 2. Check if there are any books lost: 4 - 0 + 7
# # 3. Verify purchase quantity: 7 books purchased'''
        
# #         prompt = f'''{example_prompt.strip()}
        
# # Now generate {n_samples} possible next thoughts for:
# # Question: {question}
# # Current Thought: {current_thought or 'None'}
# # Possible next thoughts (numbered 1-{n_samples}):\n
# # Also if you think this is the final answer mention the word 'final answer' in it'''
        
#         response = self.chat_with_gpt(prompt, n=1, stop=["\n\n"])[0]
#         thoughts = []
#         for line in response.split('\n'):
#             line = line.strip()
#             if line:
#                 thought = re.sub(r'^[\d\-*]+\s*\.?\s*', '', line).strip()
#                 if thought:
#                     thoughts.append(thought)
#         return thoughts[:n_samples]

# #     def evaluate_thought(self, question: str, thought: str, cache: bool = True) -> float:
# #         """Evaluate the likelihood that a thought process leads to the correct answer."""
# #         # [IMPORTANT] You can use the base model (Qwen2.5-7B-Instruct) for self-evaluation; however, it is not the only option.
# #         # TODO: Implement thought evaluation function

# #         eval_prompt = f"""Evaluate the quality of the following thought in solving the given problem. Provide a numerical score between 0.0 and 1.0, where 1.0 means the thought is definitely leading to the correct answer, and 0.0 means it's irrelevant or incorrect.

# #             Question: {question}
# #             Thought: {thought}

# #             Your score: """
# # #         eval_prompt = f'''Rate how likely this thought will lead to the correct answer (0-1):
# # # Question: {question}
# # # Thought: {thought}
# # # Rating (1-10): '''
# #         response = self.chat_with_gpt(eval_prompt, n=1, stop=["\n"])[0].strip()
# #         numbers = re.findall(r"\d*\.?\d+", response)
# #         if numbers:
# #             score = float(numbers[0])
# #             return max(0.0, min(1.0, score))
# #         return 0.0


#     def evaluate_thought(self, question: str, thought: str, cache: bool = True) -> float:
#         """Evaluate the quality of a thought using self-evaluation."""
#         if cache and thought in self.evaluation_cache:
#             return self.evaluation_cache[thought]
        
#         prompt = f'''Rate how likely this thought will lead to the correct answer (1-10):
# Question: {question}
# Thought: {thought}
# Rating (1-10): '''
        
#         response = self.chat_with_gpt(prompt, n=1)
#         if not response: return 0.0
        
#         try:
#             score = float(re.search(r'\d+', response[0]).group())
#             score = max(1.0, min(10.0, score))  # Clamp to 1-10
#             if cache:
#                 self.evaluation_cache[thought] = score
#             return score / 10.0  # Normalize to 0-1
#         except:
#             return 0.0
        
#     def select_best_thoughts(self, thoughts: List[str], scores: List[float], k: int = 2) -> List[str]:
#         """Select the k best thoughts based on their scores."""
#         # TODO: Implement thought selection function
#         if not thoughts:
#             return []
    
#         combined = list(zip(thoughts, scores))
#         combined.sort(key=lambda x: x[1], reverse=True)
#         return [thought for thought, _ in combined[:k]]

#     def solve(self, question: str, max_steps: int = 8, n_samples_per_step: int = 3, 
#              k_best_thoughts: int = 2) -> str:
#         """Solve a problem using Tree-of-Thoughts reasoning."""
#         # TODO: Implement the main solving function
#         current_thoughts = [""]
#         for step in range(max_steps):
#             candidates = []
#             for thought in current_thoughts:
#                 next_steps = self.generate_thoughts(question, thought, n_samples_per_step)
#                 for next_step in next_steps:
#                     new_thought = f"{thought} {next_step}".strip() if thought else next_step
#                     score = self.evaluate_thought(question, new_thought)
#                     candidates.append((new_thought, score))
#                     print(candidates)
#             if not candidates:
#                 break
#             candidates.sort(key=lambda x: x[1], reverse=True)
#             current_thoughts = [thought for thought, _ in candidates[:k_best_thoughts]]
#             if candidates[0][1] >= 0.99:
#                 prompt = f"""
#                 Get the final numeric answer to the question : {question}
#                 Using the current thoughts: {candidates[-1][0]} 
#                 Important : Your Answer should end with the number which is the final answer to the question asked.
#                 """
#                 response = self.chat_with_gpt(prompt, n=1, stop=["\n"])
#                 # print(response)
#                 # print(candidates[-1][0])
#                 return candidates[-1][0]+" answer is "+str(response[0])
        
#         prompt = f"""
#                 Get the final numeric answer to the question : {question}
#                 Using the current thoughts: {current_thoughts[0]} 
#                 Important : Your Answer should end with the number which is the final answer to the question asked.
#                 """
#         response = self.chat_with_gpt(prompt, n=1, stop=["\n"])
#         # print(response)
#         return current_thoughts[0]+" answer is "+str(response[0]) if current_thoughts else ""
#         # return response

### Test ToT Using One Simple Example

In [7]:
def test_tot():
    # Set your API key
    api_key = os.getenv("DEEPINFRA_TOKEN", "")
    if not api_key:
        print("Please set DEEPINFRA_TOKEN environment variable")
        return

    # Test with a single example
    test_question = "Jean is two years older than Mark.  Two years ago Mark was 5 years older than half Jan's age.  If Jan is 30 how old is Jean?"
    tot_solver = TreeOfThoughts(api_key)
    solution = tot_solver.solve(
        question=test_question,
        max_steps=6,
        n_samples_per_step=3,
        k_best_thoughts=3
    )
    
    print(solution)
    answer = extract_value(solution)
    print("Solution:", solution)
    print("Extracted answer:", answer)

test_tot()

Given:
- Jean is two years older than Mark: \( J = M + 2 \)
- Two years ago, Mark was 5 years older than half of Jan's age: \( M - 2 = \frac{1}{2}(J - 2) + 5 \)
- Jan is 30 years old: \( J = 30 \)

Substitute \( J = 30 \) into the second equation:
\[ M - 2 = \frac{1}{2}(30 - 2) + 5 \]
\[ M - 2 = \frac{1}{2}(28) + 5 \]
\[ M - 2 = 14 + 5 \]
\[ M - 2 = 19 \]
\[ M = 21 \]

Now, use \( M = 21 \) in the first equation to find Jean's age:
\[ J = M + 2 \]
\[ J = 21 + 2 \]
\[ J = 23 \]

Thus, Jean is 23 years old. Given:
- \( J = M + 2 \)
- \( M - 2 = \frac{1}{2}(J - 2) + 5 \)
- \( J = 30 \)

Substitute \( J = 30 \) into the second equation:
\[ M - 2 = \frac{1}{2}(30 - 2) + 5 \]
\[ M - 2 = \frac{1}{2}(28) + 5 \]
\[ M - 2 = 14 + 5 \]
\[ M - 2 = 19 \]
\[ M = 21 \]

Now, use \( M = 21 \) in the first equation:
\[ J = M + 2 \]
\[ J = 21 + 2 \]
\[ J = 23 \]

Thus, Jean is \(\boxed{23}\) years old. Given:
- \( J = M + 2 \)
- \( M - 2 = \frac{1}{2}(J - 2) + 5 \)
- \( J = 30 \)

Substitute \( J = 30 \)

### Experiment on the Validation Set

- You can use multithreading to accelerate the process.

In [None]:
# 1. load DeepInfra api key
api_key = os.getenv("DEEPINFRA_TOKEN", "")

# 2. Load and sample dataset
dataset = load_dataset("cs5260_val_random300.jsonl", sample_size=25)

# 3. Initialize reasoning solvers
tot_solver = TreeOfThoughts(api_key)
cot_solver = ChainOfThought(api_key)

# 4. process dataset
tot_results, cot_results = [], []
tot_tokens, cot_tokens = 0, 0

print("\nProcessing questions...")
for item in tqdm(dataset):
    question = item["question"]
    true_answer = item["answer"]
    
    # ToT solving
    tot_solution = tot_solver.solve(
        question=question,
        max_steps=4, 
        n_samples_per_step=3, 
        k_best_thoughts=2 
    )
    tot_answer = extract_value(tot_solution)
    tot_results.append({
        "question_id": item["question_id"],
        "predicted": tot_answer,
        "true": true_answer,
    })
    
    # CoT solving
    cot_solution, tokens = cot_solver.solve(question)
    cot_answer = extract_value(cot_solution)
    cot_results.append({
        "question_id": item["question_id"],
        "predicted": cot_answer,
        "true": true_answer,
    })
    cot_tokens += tokens

# Calculate metrics
tot_accuracy = evaluate_accuracy([r["predicted"] for r in tot_results], 
                                [r["true"] for r in tot_results])
cot_accuracy = evaluate_accuracy([r["predicted"] for r in cot_results], 
                                [r["true"] for r in cot_results])

# Print results
print("\n=== Validation Results ===")
print(f"ToT Accuracy: {tot_accuracy:.2%}")
print(f"CoT Accuracy: {cot_accuracy:.2%}")
print(f"ToT Total Tokens: {tot_solver.total_tokens}")
print(f"CoT Total Tokens: {cot_tokens}")


Processing questions...


  0%|          | 0/25 [00:06<?, ?it/s]


KeyboardInterrupt: 

### Submission

- Refer to [here](https://www.kaggle.com/competitions/cs-5260-spring-2025-assignment-1/data) for submission format

In [None]:
# TODO: Evaluate on test set and build submission file.
# 1. load DeepInfra api key
api_key = os.getenv("DEEPINFRA_TOKEN", "")

# 2. Load and sample dataset
dataset = load_dataset("cs5260_val_random300.jsonl", sample_size=3)

# 3. Initialize reasoning solvers
tot_solver = TreeOfThoughts(api_key)

tot_results = []

print("\nProcessing questions...")
for item in tqdm(dataset):
    question = item["question"]
    question_id = item["question_id"]

    tot_solution = tot_solver.solve(
        question=question,
        max_steps=8, 
        n_samples_per_step=3, 
        k_best_thoughts=2 
    )
    tot_answer = extract_value(tot_solution)
    tot_results.append({
        "question_id": item["question_id"],
        "predicted": tot_answer,
    })

import csv
with open('A0297803U.csv', 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=["question_id", "answer"])
    writer.writeheader()
    writer.writerows(tot_results)

In [16]:
import os
import concurrent.futures
from tqdm import tqdm

def process_item(item, api_key):
    """Thread worker function to process one question with both solvers"""
    # Initialize fresh solvers for thread safety
    tot_solver = TreeOfThoughts(api_key)
    # cot_solver = ChainOfThought(api_key)
    
    # ToT processing
    tot_solution = tot_solver.solve(
        question=item["question"],
        max_steps=8,
        n_samples_per_step=4,
        k_best_thoughts=4
    )
    
    # CoT processing
    # cot_solution, cot_tokens = cot_solver.solve(item["question"])
    
    return {
        "tot_entry": {
            "question_id": item["question_id"],
            "predicted": extract_value(tot_solution),
            "true": item["answer"]
        },
        # "cot_entry": {
        #     "question_id": item["question_id"],
        #     "predicted": extract_value(cot_solution),
        #     "true": item["answer"]
        # },
        "tot_tokens": tot_solver.total_tokens,
        # "cot_tokens": cot_tokens
    }

# Main execution flow
if __name__ == "__main__":
    api_key = os.getenv("DEEPINFRA_TOKEN", "")
    dataset = load_dataset("cs5260_val_random300.jsonl", sample_size=50)
    
    tot_results, cot_results = [], []
    tot_tokens_total, cot_tokens_total = 0, 0

    print("\nProcessing questions...")
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_item, item, api_key) for item in dataset]
        
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            result = future.result()
            tot_results.append(result["tot_entry"])
            # cot_results.append(result["cot_entry"])
            # tot_tokens_total += result["tot_tokens"]
            # cot_tokens_total += result["cot_tokens"]

    # Calculate and display metrics
    tot_accuracy = evaluate_accuracy([r["predicted"] for r in tot_results], 
                                    [r["true"] for r in tot_results])
    # cot_accuracy = evaluate_accuracy([r["predicted"] for r in cot_results], 
    #                                 [r["true"] for r in cot_results])

    print("\n=== Validation Results ===")
    print(f"ToT Accuracy: {tot_accuracy:.2%}")
    # print(f"CoT Accuracy: {cot_accuracy:.2%}")
    print(f"ToT Total Tokens: {tot_tokens_total}")
    # print(f"CoT Total Tokens: {cot_tokens_total}")


Processing questions...


100%|██████████| 50/50 [10:46<00:00, 12.93s/it]


=== Validation Results ===
ToT Accuracy: 84.00%
ToT Total Tokens: 0





In [10]:
#multithread for test set
import os
import csv
import concurrent.futures
from tqdm import tqdm

def process_question(item, api_key):
    """Process a single question using TreeOfThoughts solver"""
    # Create new solver instance for thread safety
    tot_solver = TreeOfThoughts(api_key)
    
    tot_solution = tot_solver.solve(
        question=item["question"],
        max_steps=5, 
        n_samples_per_step=3, 
        k_best_thoughts=2 
    )
    
    tot_answer = extract_value(tot_solution)
    return {
        "question_id": item["question_id"],
        "predicted": tot_answer,
    }

def main():
    # 1. Load DeepInfra API key
    api_key = os.getenv("DEEPINFRA_TOKEN", "")

    # 2. Load and sample dataset
    dataset = load_dataset("cs5260_test_random300.jsonl", sample_size=300)

    # 3. Process questions using ThreadPoolExecutor
    tot_results = []
    print("\nProcessing questions...")
    
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Submit tasks for each question
        futures = [executor.submit(process_question, item, api_key) 
                  for item in dataset]
        
        # Collect results as they complete
        for future in tqdm(concurrent.futures.as_completed(futures), 
                         total=len(futures)):
            tot_results.append(future.result())

    # 4. Create CSV file
    with open('A0297803U_2.csv', 'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=["question_id", "predicted"])
        writer.writeheader()
        writer.writerows(tot_results)
    
    print("Processing complete. CSV file 'A0297803U.csv' created.")

if __name__ == "__main__":
    main()


Processing questions...


100%|██████████| 300/300 [10:59<00:00,  2.20s/it]

Processing complete. CSV file 'A0297803U.csv' created.



