# In [4]:
from openai import OpenAI
import json
import logging
import tiktoken
import os
import pickle
import numpy as np
from sklearn.neighbors import NearestNeighbors
from tenacity import retry, stop_after_attempt, wait_random_exponential, RetryError
from tqdm import tqdm
import time
import signal
import networkx as nx
import matplotlib.pyplot as plt
import io
import random
from queue import PriorityQueue
import re
import concurrent.futures
from threading import Lock
from functools import lru_cache
import math
from datetime import datetime, timedelta
import pandas as pd
import pypdf  # Updated import
import ast  # New import for syntax checking

# Logging: send INFO-and-above records to both the console and analysis.log.
# NOTE: this runs at import time and creates analysis.log in the working directory.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("analysis.log")
    ]
)

# Shared OpenAI client used by the classes below. The key is read from the
# OPENAI_API_KEY environment variable so it never has to be pasted into source;
# when the variable is unset this falls back to the original empty placeholder.
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY', ''))

class DataLoader:
    """Load the finance PDF and the student CSV tables into memory.

    Attributes:
        pdf_texts: One extracted-text string per PDF page (empty list when the
            PDF could not be opened).
        csv_tables: Mapping of table name -> pandas DataFrame for every CSV
            that loaded successfully; tables that fail to load are omitted.
    """

    # Default input locations, used when the caller does not override them.
    DEFAULT_PDF_PATH = '/Users/rohit/Desktop/ASU/Finances.pdf'
    DEFAULT_CSV_PATHS = {
        'Student_Financial_Aid_and_Application_Information.csv': '/Users/rohit/Desktop/ASU/Student_Financial_Aid_and_Application_Information.csv',
        'Scholarships__Grants__and_Payment_Options.csv': '/Users/rohit/Desktop/ASU/Scholarships__Grants__and_Payment_Options.csv',
        'Employment__Loans__and_Additional_Financial_Resources.csv': '/Users/rohit/Desktop/ASU/Employment__Loans__and_Additional_Financial_Resources.csv',
    }

    def __init__(self, pdf_path=None, csv_paths=None):
        """Load all inputs eagerly.

        Args:
            pdf_path: Path of the PDF to extract; defaults to DEFAULT_PDF_PATH.
            csv_paths: Mapping of table name -> CSV path; defaults to
                DEFAULT_CSV_PATHS.
        """
        logging.info("Initializing DataLoader")
        if pdf_path is None:
            pdf_path = self.DEFAULT_PDF_PATH
        if csv_paths is None:
            csv_paths = self.DEFAULT_CSV_PATHS
        self.pdf_texts = self.load_pdf(pdf_path)
        self.csv_tables = self.load_csv_tables(csv_paths)

    def load_pdf(self, pdf_path):
        """Return the extracted text of each page of ``pdf_path``.

        Errors are logged rather than raised; pages extracted before the
        failure (possibly none) are still returned.
        """
        logging.info(f"Loading PDF file: {pdf_path}")
        texts = []
        try:
            with open(pdf_path, 'rb') as f:
                reader = pypdf.PdfReader(f)
                for page in reader.pages:
                    # Guard against a None result so downstream string
                    # operations (slicing, join) never see a non-string page.
                    texts.append(page.extract_text() or "")
            logging.info(f"Successfully loaded and extracted text from {pdf_path}")
        except Exception as e:
            logging.error(f"Error loading PDF file {pdf_path}: {e}")
        return texts

    def load_csv_tables(self, csv_paths):
        """Read each CSV in ``csv_paths`` (name -> path) into a DataFrame.

        A file that cannot be read is logged and skipped, so the returned
        dict may contain fewer entries than ``csv_paths``.
        """
        logging.info("Loading CSV tables")
        tables = {}
        for table_name, csv_path in csv_paths.items():
            logging.info(f"Loading CSV file: {csv_path}")
            try:
                tables[table_name] = pd.read_csv(csv_path)
                logging.info(f"Successfully loaded {csv_path}")
            except Exception as e:
                logging.error(f"Error loading CSV file {csv_path}: {e}")
        return tables

class VectorSearch:
    """Embedding-based nearest-neighbour search over PDF page texts.

    Embeddings are requested from the OpenAI embeddings endpoint via the
    module-level ``client`` and cached on disk as a pickle inside
    ``embeddings_cache``.
    """

    def __init__(self, embedding_model="text-embedding-ada-002"):
        self.embedding_model = embedding_model
        # Width of the zero vector substituted when an embedding request fails.
        self.embedding_dim = 1536 if "ada-002" in embedding_model else 768
        self.pdf_embeddings = None
        self.pdf_texts = []
        self.pdf_nn = None
        self.embeddings_dir = 'embeddings_cache'
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(self.embeddings_dir, exist_ok=True)
        # Tokenizer for the specific model
        self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo-instruct")

    def count_tokens(self, text):
        """Return the number of tokens ``text`` encodes to."""
        return len(self.encoding.encode(text))

    def load_embeddings(self, file_name):
        """Load embeddings from file if it exists, otherwise return None."""
        file_path = os.path.join(self.embeddings_dir, file_name)
        if os.path.exists(file_path):
            logging.info(f"Loading embeddings from {file_path}")
            # SECURITY NOTE: pickle.load executes arbitrary code from the file;
            # only load cache files this program wrote itself.
            with open(file_path, 'rb') as f:
                return pickle.load(f)
        else:
            logging.info(f"Embeddings file {file_path} not found.")
            return None

    def save_embeddings(self, file_name, embeddings_data):
        """Save embeddings to file."""
        file_path = os.path.join(self.embeddings_dir, file_name)
        logging.info(f"Saving embeddings to {file_path}")
        with open(file_path, 'wb') as f:
            pickle.dump(embeddings_data, f)

    @retry(stop=stop_after_attempt(5), wait=wait_random_exponential(min=1, max=10))
    def generate_embedding(self, text):
        """Return the embedding vector for ``text``.

        Failures are logged and re-raised so the ``retry`` decorator can try
        again (up to 5 attempts with randomized exponential back-off).
        """
        try:
            logging.info(f"Generating embedding for text: {text[:50]}...")
            response = client.embeddings.create(
                input=text,
                model=self.embedding_model
            )
            embedding = response.data[0].embedding
            time.sleep(0.1)  # Small delay between requests
            logging.info("Embedding generated successfully")
            return embedding
        except Exception as e:
            logging.error(f"Failed to generate embedding for text: {text[:50]}. Error: {e}")
            raise

    def load_or_build_pdf_vectors(self, pdf_texts):
        """Populate the PDF index from the on-disk cache, or build and cache it.

        NOTE(review): when a cache file exists, its stored texts are used and
        ``pdf_texts`` is ignored, so a changed PDF needs the cache deleted.
        """
        embeddings_file = 'pdf_embeddings.pkl'
        # Load previously saved embeddings if available
        saved_data = self.load_embeddings(embeddings_file)
        if saved_data:
            self.pdf_embeddings, self.pdf_texts = saved_data
            self.pdf_nn = NearestNeighbors(n_neighbors=5, metric='cosine').fit(self.pdf_embeddings)
            logging.info("PDF embeddings loaded from file successfully.")
            return
        # Nothing to index: fitting on an empty array would raise, and caching
        # an empty result would mask the PDF once it becomes available.
        if len(pdf_texts) == 0:
            logging.warning("No PDF texts provided; skipping embedding build.")
            self.pdf_embeddings = None
            self.pdf_texts = []
            self.pdf_nn = None
            return
        # Otherwise, build embeddings
        logging.info("Building embeddings for PDF texts")
        embeddings = []
        for text in tqdm(pdf_texts, desc="Generating embeddings for PDF"):
            try:
                embeddings.append(self.generate_embedding(text))
            except Exception as e:
                logging.error(f"Error generating embedding for PDF text: {e}")
                # Keep row alignment with pdf_texts by substituting a zero vector.
                embeddings.append([0.0]*self.embedding_dim)
        self.pdf_embeddings = np.array(embeddings)
        self.pdf_texts = pdf_texts
        self.pdf_nn = NearestNeighbors(n_neighbors=5, metric='cosine').fit(self.pdf_embeddings)
        self.save_embeddings(embeddings_file, (self.pdf_embeddings, self.pdf_texts))
        logging.info("PDF embeddings built and saved successfully.")

    def search_pdf(self, query, top_k=5):
        """Return up to ``top_k`` page texts closest to ``query``.

        Returns an empty list when the index has not been built, when
        ``top_k`` is not positive, or when the query cannot be embedded.
        """
        logging.info(f"Searching PDF for query: {query[:50]}...")
        # Check the index first so no embedding request is spent on a search
        # that cannot succeed.
        if self.pdf_nn is None or len(self.pdf_texts) == 0:
            logging.warning("PDF index has not been built. Returning empty list.")
            return []
        # kneighbors raises when asked for more neighbours than indexed rows.
        n_neighbors = min(top_k, len(self.pdf_texts))
        if n_neighbors <= 0:
            return []
        try:
            query_embedding = self.generate_embedding(query)
        except Exception as e:
            logging.error(f"Error generating embedding for query: {e}")
            return []
        if query_embedding is None:
            logging.warning("Failed to generate embedding for query. Returning empty list.")
            return []
        query_embedding = np.array(query_embedding).reshape(1, -1)
        _, indices = self.pdf_nn.kneighbors(query_embedding, n_neighbors=n_neighbors)
        relevant_texts = [self.pdf_texts[idx] for idx in indices[0]]
        logging.info(f"Found {len(relevant_texts)} relevant PDF texts.")
        return relevant_texts

class ExecutorModule:
    """Fetch supporting data for a thought: PDF passages and per-student CSV rows."""

    # Variable name exposed to the generated code -> key in data_loader.csv_tables.
    _TABLE_VARS = {
        'df1': 'Student_Financial_Aid_and_Application_Information.csv',
        'df2': 'Scholarships__Grants__and_Payment_Options.csv',
        'df3': 'Employment__Loans__and_Additional_Financial_Resources.csv',
    }

    def __init__(self, data_loader, vector_search, student_id):
        self.data_loader = data_loader
        self.vector_search = vector_search
        self.student_id = student_id  # Store the student ID

    def fetch_pdf_chunks(self, query):
        """Return the PDF texts the vector search considers relevant to ``query``."""
        return self.vector_search.search_pdf(query)

    @staticmethod
    def _strip_code_fences(code):
        """Return ``code`` stripped, without a surrounding markdown code fence."""
        text = code.strip()
        fenced = re.match(r"^```[a-zA-Z0-9_+-]*\s*\n(.*?)\n?```\s*$", text, re.DOTALL)
        return fenced.group(1).strip() if fenced else text

    def _run_generated_code(self, code):
        """Execute ``code`` with df1/df2/df3 and ``pd`` in scope; return its ``result``.

        Raises whatever the executed code raises, and KeyError when one of the
        expected tables was not loaded.
        """
        # SECURITY NOTE: this executes model-generated code with full
        # interpreter privileges; it is not sandboxed.
        # A single namespace is used for globals and locals so that names
        # assigned by the snippet (and df1..df3) are visible inside
        # comprehensions, lambdas and functions the snippet defines.
        namespace = {"pd": pd}
        for var_name, table_name in self._TABLE_VARS.items():
            # Copies keep the snippet from mutating the shared loaded tables.
            namespace[var_name] = self.data_loader.csv_tables[table_name].copy()
        exec(code, namespace)
        return namespace.get('result', None)

    def fetch_csv_data(self, query):
        """Ask the LLM for pandas code answering ``query`` and run it.

        Returns:
            Whatever the generated code assigned to ``result`` (expected to be
            a DataFrame for this student), or None when generation, the syntax
            check, or execution fails.
        """
        # Use LLM to generate a query to fetch data from CSV files
        prompt = f"""
You are an expert data assistant. Write a pandas code snippet to extract data for Student ID {self.student_id} from the appropriate DataFrames for the following requirement:

Requirement: {query}

Available DataFrames:

- df1: Student_Financial_Aid_and_Application_Information.csv
  Columns: Student_ID, Name, Financial_Aid_Applied, FAFSA_Submitted, Priority_Filing_Date_Met, Financial_Need_Level, Aid_Package_Reviewed, Financial_Aid_Amount

- df2: Scholarships__Grants__and_Payment_Options.csv
  Columns: Student_ID, Scholarship_Awarded, Scholarship_Type, Scholarship_Amount, Institutional_Grant, ASU_Payment_Plan, Summer_Financial_Aid_Eligible, Private_Loan_Required

- df3: Employment__Loans__and_Additional_Financial_Resources.csv
  Columns: Student_ID, Work_Study_Eligible, Work_Study_Accepted, Parent_PLUS_Loan, Parent_Plus_Loan_Status, Private_Loan, Student_Employment, Alternative_Options

Note: The DataFrames are preloaded and named as df1, df2, and df3. Use these variable names in your code.

Important:

- Use `pd.concat` instead of `append` to combine DataFrames.
- Ensure that the final DataFrame is assigned to a variable named 'result' and that it only contains data for Student ID {self.student_id}.
- Provide only the pandas code to execute the query. Do not include explanations.
"""
        try:
            response = client.completions.create(
                model="gpt-3.5-turbo-instruct",
                prompt=prompt,
                max_tokens=500,  # Increased max_tokens
                temperature=0,
            )
            code = self._strip_code_fences(response.choices[0].text)
            logging.info(f"Generated pandas code: {code}")

            # Check if code is syntactically correct
            try:
                ast.parse(code)
            except SyntaxError as se:
                logging.error(f"Syntax error in generated code: {se}")
                return None

            # The code is executed exactly as validated. An earlier textual
            # 'append' -> 'pd.concat' rewrite was removed: it ran after the
            # syntax check and turned valid snippets into invalid ones.
            return self._run_generated_code(code)
        except Exception as e:
            logging.error(f"Error generating or executing pandas query: {e}")
            return None

class ThoughtNode:
    """One thought in the search tree, with its fetched data and search statistics."""

    def __init__(self, description, parent=None, depth=0):
        # Position in the tree
        self.description = description
        self.parent = parent
        self.depth = depth
        self.children = []

        # Data fetched for this thought (filled in later by the search)
        self.fetched_pdf = None
        self.fetched_csv_data = None

        # Search statistics; both Q-values start optimistically at 1.0
        self.q_value = 1.0
        self.target_q_value = 1.0
        self.heuristic = 0
        self.visits = 0

        # Result of the "can the question be answered from here?" check
        self.answer_possible = None
        self.answer_confidence = 0

        # Bookkeeping
        self.timestamp = datetime.now()
        self.node_type = 'regular'

    def add_child(self, child_node):
        """Append ``child_node`` to this node's children."""
        self.children.append(child_node)

    def get_unique_id(self):
        """Return the root-to-self chain of descriptions joined by '->'."""
        chain = []
        cursor = self
        while cursor:
            # Prepending while walking upward yields root-first order directly.
            chain.insert(0, cursor.description)
            cursor = cursor.parent
        return '->'.join(chain)

    def __lt__(self, other):
        """Order nodes so that a larger q_value + heuristic compares as "smaller"."""
        own_score = self.q_value + self.heuristic
        other_score = other.q_value + other.heuristic
        return own_score > other_score

class GraphOfThoughts:
    def __init__(self, question, data_loader, vector_search, student_id, config=None):
        """Set up the search state for answering ``question`` about one student.

        Args:
            question: The student's question; also used to label the root thought.
            data_loader: Source of the loaded PDF texts and CSV tables.
            vector_search: Vector index used for PDF retrieval and token counting.
            student_id: Student whose CSV rows should be fetched.
            config: Optional dict of overrides; any missing key (or a missing /
                empty config) falls back to the defaults listed below.
        """
        logging.info(f"Starting analysis for question: {question}")
        # Normalise once so every lookup below works when config is None.
        # (A bare config.get(...) used to raise AttributeError in that case.)
        cfg = config or {}

        self.question = question
        self.data_loader = data_loader
        self.vector_search = vector_search
        self.student_id = student_id  # Store the student ID
        self.executor_module = ExecutorModule(data_loader, vector_search, student_id)
        self.visited_nodes = set()
        self.queue = PriorityQueue()
        self.iteration_limit = cfg.get('iteration_limit', 1000)
        self.iteration_limit_without_improvement = cfg.get('iteration_limit_without_improvement', 100)
        self.iterations = 0
        self.graph = nx.DiGraph()
        self.root = ThoughtNode(f"Analyze the question: {question}", depth=0)
        self.root.node_type = 'root'  # Set root node type
        self.graph.add_node(self.root.description, node=self.root)
        self.pos = None
        self.traversed_path = []
        self.recurring_missing_info = set()
        self.recurring_threshold = cfg.get('recurring_threshold', 3)
        self.max_iterations = cfg.get('max_iterations', 800)
        self.max_depth = cfg.get('max_depth', 25)
        self.min_exploration_depth = cfg.get('min_exploration_depth', 3)
        self.answer_confidence_threshold = cfg.get('answer_confidence_threshold', 0.7)
        self.convergence_threshold = cfg.get('convergence_threshold', 0.005)
        self.breadth = cfg.get('breadth', 7)
        self.visualization_interval = cfg.get('visualization_interval', 15)

        # Initialize dynamic parameters
        self.alpha = 0.3  # Initial learning rate
        self.gamma = 0.8  # Initial discount factor
        self.epsilon = 0.7  # Initial exploration rate
        self.epsilon_min = 0.1  # Minimum epsilon value
        self.epsilon_decay_factor = 0.995  # Slow decay

        # Weights for priority calculation
        self.q_weight = 0.7  # Weight for Q-value
        self.h_weight = 0.3  # Weight for heuristic

        # For summary generation
        self.analysis_steps = []

        # Initialize a lock for thread-safe operations
        self.lock = Lock()

        # Initialize answer cache to reduce redundant API calls
        self.answer_cache = {}

        # To store multiple answer findings
        self.answer_findings = []

        # Experience replay buffer
        self.experience_replay_buffer = []
        self.replay_buffer_size = cfg.get('replay_buffer_size', 100)

        # UCB exploration parameter
        self.c = cfg.get('exploration_constant', 1.9)

        # For dynamic parameter adjustment
        self.iterations_without_improvement = 0
        self.performance_history = []

        # Initialize high-value paths
        self.high_value_paths = set()
        self.explored_thoughts = set()

        # Per-node timestamps for temporal rewards
        self.node_timestamps = {}

        # Initialize target network parameters
        self.target_q_update_frequency = cfg.get('target_q_update_frequency', 100)
        self.iterations_since_target_update = 0  # Counter to track when to update target network

    def compute_priority(self, node):
        # Combine q_value and heuristic with weights
        return -(self.q_weight * node.q_value + self.h_weight * node.heuristic)

    def calculate_heuristic(self, node):
        with self.lock:
            # Calculate relevance scores
            data_relevance = len(node.fetched_pdf) if node.fetched_pdf else 0
            csv_relevance = len(node.fetched_csv_data) if node.fetched_csv_data is not None else 0

        # Depth Factor: Encourage deeper exploration but penalize excessive depth
        depth_penalty = (node.depth / self.max_depth) ** 2  # Quadratic penalty for depth

        # Include a factor for unexplored nodes
        if node.get_unique_id() not in self.visited_nodes:
            unexplored_bonus = 2.0
        else:
            unexplored_bonus = 0.0

        # Adjusted Heuristic Calculation
        heuristic = (data_relevance + csv_relevance) * (1 - depth_penalty) + unexplored_bonus

        return heuristic

    def update_q_value(self, node, reward):
        # Store experience
        self.experience_replay_buffer.append((node, reward))
        if len(self.experience_replay_buffer) > self.replay_buffer_size:
            self.experience_replay_buffer.pop(0)

        # Sample from experience replay buffer
        experiences = random.sample(self.experience_replay_buffer, min(len(self.experience_replay_buffer), 10))
        for exp_node, exp_reward in experiences:
            if exp_node.parent:
                # Use target_q_value for max_child_q
                max_child_q = max([child.target_q_value for child in exp_node.children], default=0)
                exp_node.q_value += self.alpha * (exp_reward + self.gamma * max_child_q - exp_node.q_value)
                logging.debug(f"Updated Q-value for node '{exp_node.description}': {exp_node.q_value}")

    def select_next_node(self):
        total_visits = sum(node.visits for _, node in self.queue.queue) + 1  # Avoid division by zero

        def ucb_score(node):
            if node.visits == 0:
                return float('inf')
            exploitation = node.q_value
            exploration = self.c * math.sqrt(math.log(total_visits) / node.visits)
            return exploitation + exploration

        if not self.queue.empty():
            # Select the node with the highest UCB score
            selected_item = max(self.queue.queue, key=lambda x: ucb_score(x[1]))
            self.queue.queue.remove(selected_item)
            selected_node = selected_item[1]
            logging.debug(f"UCB selected node: {selected_node.description}")
            return selected_node
        return None

    def fetch_relevant_data(self, node):
        # Fetch relevant PDF chunks
        node.fetched_pdf = self.executor_module.fetch_pdf_chunks(node.description)
        # Fetch relevant data from CSV tables
        node.fetched_csv_data = self.executor_module.fetch_csv_data(node.description)
        # Log the fetched data for summary
        step_detail = {
            "Thought": node.description,
            "Fetched PDF": node.fetched_pdf,
            "Fetched CSV Data": node.fetched_csv_data
        }
        self.analysis_steps.append(step_detail)

    def check_if_answer_possible(self, node):
        """
        Uses the LLM to determine whether the current data is sufficient to answer the student's question.
        """
        logging.info(f"Checking if answer is possible for node: {node.description}")

        # Limit the number of PDF chunks and CSV data included
        max_pdf_chunks = 2  # Adjust as needed
        max_csv_entries = 1  # Adjust as needed

        # Truncate or summarize the fetched PDF information
        fetched_pdf_info = node.fetched_pdf[:max_pdf_chunks] if node.fetched_pdf else []
        fetched_pdf_info_text = '\n'.join(fetched_pdf_info)

        # Truncate or summarize the fetched CSV data
        fetched_csv_data = [node.fetched_csv_data] if node.fetched_csv_data is not None else []
        fetched_csv_data_text = '\n'.join([str(df.head()) if isinstance(df, pd.DataFrame) else str(df) for df in fetched_csv_data])

        prompt = f"""
You are an expert academic advisor.

Based on the current thought, the fetched PDF information, and the fetched CSV data, determine whether there is enough information to answer the student's question.

Student's question:
"{self.question}"

Current thought:
"{node.description}"

Fetched PDF Information:
{fetched_pdf_info_text}

Fetched CSV Data:
{fetched_csv_data_text}

Is there enough information to answer the student's question? Answer "YES" or "NO" and provide a brief justification.

Also, provide a confidence score between 0 and 1 indicating how confident you are that the information is sufficient, where 1 means absolutely certain and 0 means not at all certain.

Your response should be in the following format:

Answer: [YES/NO]
Confidence: [confidence score]
Justification: [brief justification]
"""

        # Check token length and truncate if necessary
        max_total_tokens = 4097
        max_completion_tokens = 150
        max_prompt_tokens = max_total_tokens - max_completion_tokens

        prompt_tokens = self.vector_search.count_tokens(prompt)
        if prompt_tokens > max_prompt_tokens:
            # Truncate the fetched data further
            fetched_pdf_info_text = self.truncate_text(fetched_pdf_info_text, max_length=1000)
            fetched_csv_data_text = self.truncate_text(fetched_csv_data_text, max_length=1000)

            # Reconstruct the prompt
            prompt = f"""
You are an expert academic advisor.

Based on the current thought, the fetched PDF information, and the fetched CSV data, determine whether there is enough information to answer the student's question.

Student's question:
"{self.question}"

Current thought:
"{node.description}"

Fetched PDF Information:
{fetched_pdf_info_text}

Fetched CSV Data:
{fetched_csv_data_text}

Is there enough information to answer the student's question? Answer "YES" or "NO" and provide a brief justification.

Also, provide a confidence score between 0 and 1 indicating how confident you are that the information is sufficient, where 1 means absolutely certain and 0 means not at all certain.

Your response should be in the following format:

Answer: [YES/NO]
Confidence: [confidence score]
Justification: [brief justification]
"""
            # Recalculate tokens
            prompt_tokens = self.vector_search.count_tokens(prompt)
            if prompt_tokens > max_prompt_tokens:
                logging.error("Prompt is still too long even after truncation.")
                node.answer_possible = False
                node.answer_confidence = 0
                return False

        try:
            response = client.completions.create(
                model="gpt-3.5-turbo-instruct",
                prompt=prompt,
                max_tokens=150,
                temperature=0,
            )
            answer = response.choices[0].text.strip()
            logging.info(f"LLM response for answer possibility:\n{answer}")
            match = re.search(r'Answer:\s*(YES|NO)', answer, re.IGNORECASE)
            confidence_match = re.search(r'Confidence:\s*([0-9]*\.?[0-9]+)', answer)
            if match and confidence_match:
                node.answer_possible = True if match.group(1).strip().upper() == 'YES' else False
                node.answer_confidence = float(confidence_match.group(1))
            else:
                node.answer_possible = False
                node.answer_confidence = 0
            return node.answer_possible
        except Exception as e:
            logging.error(f"Error checking answer possibility: {e}")
            node.answer_possible = False
            node.answer_confidence = 0
            return False

    def expand_node(self, node):
        """Expand ``node``: fetch its data, test for an answer, and queue child thoughts.

        A node whose unique id was already expanded is skipped (its visit
        count is still incremented). Otherwise data is fetched, the answer
        check runs once, and each generated next thought becomes a child
        node that is added to the priority queue and the graph.
        """
        logging.info(f"Expanding node: {node.description} at depth {node.depth}")
        node.visits += 1  # Increment visit count
        node_id = node.get_unique_id()

        # Cycle detection: check if node has been visited before
        if node_id in self.visited_nodes:
            logging.info("Cycle detected. Skipping node expansion.")
            return
        self.visited_nodes.add(node_id)

        self.fetch_relevant_data(node)

        # Run the LLM check exactly once per expansion; calling it again for
        # the low-confidence branch would repeat the same API request.
        answer_possible = self.check_if_answer_possible(node)

        # Collect answer findings from multiple nodes
        if answer_possible and node.answer_confidence >= self.answer_confidence_threshold:
            logging.info("Answer criteria met with sufficient confidence for this node.")
            node.node_type = 'answer'  # Mark node as an answer node
            self.answer_findings.append(node)
            self.high_value_paths.add(node.description)  # Add to high-value paths
            # Continue exploration to find more information
        elif answer_possible and node.depth >= self.min_exploration_depth:
            # Reaching this branch implies the confidence is below the threshold.
            logging.info("Answer possible but confidence is low; continuing exploration.")

        next_thoughts = self.generate_next_thoughts(node)

        for thought_description in next_thoughts:
            child_node = ThoughtNode(thought_description, parent=node, depth=node.depth + 1)

            # Check for cycles before adding child
            child_id = child_node.get_unique_id()
            if child_id in self.visited_nodes:
                logging.info(f"Cycle detected for child node '{child_node.description}'. Skipping addition.")
                continue

            node.add_child(child_node)
            child_node.heuristic = self.calculate_heuristic(child_node)
            priority = self.compute_priority(child_node)
            self.queue.put((priority, child_node))
            # NOTE: graph nodes are keyed by description, so two thoughts with
            # identical text share one graph node.
            self.graph.add_node(child_node.description, node=child_node)
            self.graph.add_edge(node.description, child_node.description)
            logging.info(f"Added child node: {child_node.description}")

        self.traversed_path.append(node.description)
        logging.debug(f"Current Traversal Path: {self.traversed_path}")

    def generate_next_thoughts(self, node):
        prompt_template = f"""
You are an expert academic advisor. Based on the current thought and the information fetched from the PDF and CSV tables, determine the next steps to answer the student's question.

Current thought:
"{node.description}"

Fetched PDF Information:
{node.fetched_pdf}

Fetched CSV Data:
{node.fetched_csv_data}

Generate {self.breadth} specific next thoughts to proceed with answering the question.

Example output:
- Thought 1: Review the student's financial aid application status.
- Thought 2: Check if the FAFSA was submitted before the priority filing date.
- Thought 3: Explore scholarship opportunities available to the student.
- Thought 4: Analyze the student's eligibility for work-study programs.
"""

        # Limit the size of the fetched data in the prompt
        max_pdf_chunks = 2
        max_csv_entries = 1

        # Adjust fetched PDF and CSV data
        fetched_pdf_info = node.fetched_pdf[:max_pdf_chunks] if node.fetched_pdf else []
        fetched_pdf_info_text = '\n'.join(fetched_pdf_info)

        fetched_csv_data = [node.fetched_csv_data] if node.fetched_csv_data is not None else []
        fetched_csv_data_text = '\n'.join([str(df.head()) if isinstance(df, pd.DataFrame) else str(df) for df in fetched_csv_data])

        prompt = f"""
You are an expert academic advisor. Based on the current thought and the information fetched from the PDF and CSV tables, determine the next steps to answer the student's question.

Student's question:
"{self.question}"

Current thought:
"{node.description}"

Fetched PDF Information:
{fetched_pdf_info_text}

Fetched CSV Data:
{fetched_csv_data_text}

Generate {self.breadth} specific next thoughts to proceed with answering the question.

Example output:
- Thought 1: Review the student's financial aid application status.
- Thought 2: Check if the FAFSA was submitted before the priority filing date.
- Thought 3: Explore scholarship opportunities available to the student.
- Thought 4: Analyze the student's eligibility for work-study programs.
"""

        max_total_tokens = 4097  # Model's maximum context length
        max_completion_tokens = 500
        max_prompt_tokens = max_total_tokens - max_completion_tokens

        prompt_tokens = self.vector_search.count_tokens(prompt)
        if prompt_tokens > max_prompt_tokens:
            # Truncate the fetched data
            fetched_pdf_info_text = self.truncate_text(fetched_pdf_info_text, max_length=1000)
            fetched_csv_data_text = self.truncate_text(fetched_csv_data_text, max_length=1000)
            # Reconstruct the prompt
            prompt = f"""
You are an expert academic advisor. Based on the current thought and the information fetched from the PDF and CSV tables, determine the next steps to answer the student's question.

Student's question:
"{self.question}"

Current thought:
"{node.description}"

Fetched PDF Information:
{fetched_pdf_info_text}

Fetched CSV Data:
{fetched_csv_data_text}

Generate {self.breadth} specific next thoughts to proceed with answering the question.

Example output:
- Thought 1: Review the student's financial aid application status.
- Thought 2: Check if the FAFSA was submitted before the priority filing date.
- Thought 3: Explore scholarship opportunities available to the student.
- Thought 4: Analyze the student's eligibility for work-study programs.
"""
            prompt_tokens = self.vector_search.count_tokens(prompt)
            if prompt_tokens > max_prompt_tokens:
                logging.error("Prompt is too long even after truncation.")
                return ["Review current information for additional insights"]

        try:
            response = client.completions.create(
                model="gpt-3.5-turbo-instruct",
                prompt=prompt,
                max_tokens=500,
                temperature=0,
            )
            next_thoughts_text = response.choices[0].text.strip()
            logging.debug(f"Received next thoughts from API: {next_thoughts_text}")
        except Exception as e:
            logging.error(f"Error in API call: {e}")
            next_thoughts_text = "Unable to generate next thoughts due to an error."

        next_thoughts = []
        for line in next_thoughts_text.split('\n'):
            line = line.strip()
            if line.startswith('- Thought'):
                thought_description = line.split(':', 1)[1].strip() if ':' in line else line.replace('- Thought', '').strip()
                if thought_description and thought_description not in next_thoughts:
                    next_thoughts.append(thought_description)

        if not next_thoughts:
            next_thoughts.append("Review current information for additional insights")

        # Limit to the configured number of thoughts
        return next_thoughts[:self.breadth]

    def calculate_reward(self, node):
        # Reward for fetched data
        data_reward = len(node.fetched_pdf) if node.fetched_pdf else 0
        csv_reward = 1 if node.fetched_csv_data is not None else 0

        # Additional reward if the node meets answer criteria with high confidence
        if node.answer_possible and node.answer_confidence >= self.answer_confidence_threshold:
            answer_reward = 10.0 * node.answer_confidence
        else:
            answer_reward = 0.0

        # Penalty for shallow depth conclusions
        depth_penalty = -1.0 if node.depth < self.min_exploration_depth else 0.0

        # Final reward calculation
        total_reward = data_reward + csv_reward + answer_reward + depth_penalty

        return total_reward

    @staticmethod
    def truncate_text(text, max_length):
        if len(text) <= max_length:
            return text
        else:
            return text[:max_length//2] + "\n[...]\n" + text[-max_length//2:]

    def adjust_epsilon(self, iterations_without_improvement):
        # Adaptive epsilon adjustment based on iterations without improvement
        if iterations_without_improvement > 0 and iterations_without_improvement % 10 == 0:
            # Every 10 iterations without improvement, increase epsilon
            old_epsilon = self.epsilon
            self.epsilon = min(1.0, self.epsilon + 0.05)
            logging.debug(f"Increased epsilon from {old_epsilon} to {self.epsilon} due to lack of improvement.")
        else:
            # Otherwise, decay epsilon
            old_epsilon = self.epsilon
            self.epsilon = max(0.1, self.epsilon * self.epsilon_decay_factor)
            logging.debug(f"Decayed epsilon from {old_epsilon} to {self.epsilon}")

    def generate_answer(self, node):
        """Build the final answer for the student from the path ending at ``node``.

        Steps:
          1. Set ``node_type = 'path'`` on ``node`` and every ancestor.
          2. Gather thoughts, PDF text and CSV tables via ``collect_analysis``.
          3. Keep only CSV rows whose ``Student_ID`` equals ``self.student_id``
             and render them as text.
          4. Build the prompt, truncating the PDF/CSV sections further if the
             token count exceeds the prompt budget.
          5. Ask the completion model for the answer.

        Args:
            node: The node the answer is generated for; its ``description``
                and ``parent`` chain are read.

        Returns:
            The stripped completion text, or a fixed apology string if the
            completion request raises.
        """
        logging.info(f"Generating answer for node: {node.description}")

        # Mark nodes in the traversed path.
        # NOTE(review): this also overwrites 'root' and 'answer' node types on
        # the path, which visualize_graph styles separately - confirm intended.
        current = node
        while current:
            current.node_type = 'path'
            current = current.parent

        # Collect all available data with limits
        available_data = self.collect_analysis(node, max_thoughts=5)

        # Process PDF data
        fetched_pdf_info_text = '\n'.join(available_data['Fetched PDF'] if available_data['Fetched PDF'] else [])
        fetched_pdf_info_text = self.truncate_text(fetched_pdf_info_text, max_length=1000)

        # Process CSV data: keep only this student's rows from each table
        fetched_csv_data_text = ""
        try:
            processed_dfs = []
            for df in available_data['Fetched CSV Data'] or []:
                if not isinstance(df, pd.DataFrame) or df.empty:
                    continue
                if 'Student_ID' not in df.columns:
                    # One table without the key column must not discard the
                    # usable data from the other tables.
                    logging.warning("Skipping CSV table without a 'Student_ID' column")
                    continue
                temp_df = df[df['Student_ID'] == self.student_id]
                if temp_df.empty:
                    continue
                # Remove duplicate columns
                processed_dfs.append(temp_df.loc[:, ~temp_df.columns.duplicated()])

            if processed_dfs:
                try:
                    # Combine all tables side by side
                    combined_df = pd.concat(processed_dfs, axis=1)
                    # Remove duplicate columns again after concatenation
                    combined_df = combined_df.loc[:, ~combined_df.columns.duplicated()]
                    fetched_csv_data_text = combined_df.to_string()
                except Exception as e:
                    logging.error(f"Error concatenating DataFrames: {e}")
                    # Fallback: just convert each DataFrame to string separately
                    fetched_csv_data_text = "\n".join(df.to_string() for df in processed_dfs)
        except Exception as e:
            logging.error(f"Error processing CSV data: {e}")
            fetched_csv_data_text = "Error processing CSV data"

        # Truncate CSV text if needed
        fetched_csv_data_text = self.truncate_text(fetched_csv_data_text, max_length=1000)

        def build_prompt(pdf_text, csv_text):
            # Single source of truth for the prompt so the first and the
            # re-truncated version cannot drift apart.
            return f"""
    You are an expert academic advisor. Based on the analysis below, provide a detailed answer to the student's question, incorporating specific data related to Student ID {self.student_id}.

    Student's question:
    "{self.question}"

    Thought Process:
    {available_data['Thoughts']}

    Fetched PDF Information:
    {pdf_text}

    Fetched CSV Data for Student ID {self.student_id}:
    {csv_text}

    Provide a clear and detailed answer to the student's question, using the information above, and reference the specific data for Student ID {self.student_id}.
    """

        prompt = build_prompt(fetched_pdf_info_text, fetched_csv_data_text)

        # Check token length and adjust if necessary
        max_total_tokens = 4097
        max_completion_tokens = 500
        max_prompt_tokens = max_total_tokens - max_completion_tokens

        if self.vector_search.count_tokens(prompt) > max_prompt_tokens:
            # Truncate further if needed and rebuild the prompt
            fetched_pdf_info_text = self.truncate_text(fetched_pdf_info_text, max_length=500)
            fetched_csv_data_text = self.truncate_text(fetched_csv_data_text, max_length=500)
            prompt = build_prompt(fetched_pdf_info_text, fetched_csv_data_text)
            if self.vector_search.count_tokens(prompt) > max_prompt_tokens:
                # The thought list and question are not truncated, so the
                # prompt can still be too long; surface that in the log.
                logging.warning("Prompt still exceeds the token budget after truncation")

        try:
            response = client.completions.create(
                model="gpt-3.5-turbo-instruct",
                prompt=prompt,
                max_tokens=max_completion_tokens,
                temperature=0,
            )
            answer = response.choices[0].text.strip()
            logging.info(f"Answer generated: {answer}")
            return answer
        except Exception as e:
            logging.error(f"Error generating answer: {e}")
            return "I apologize, but I encountered an error while generating the answer. Please try asking your question again."

    def collect_analysis(self, node, max_thoughts=5):
        """Gather thoughts and fetched data along the parent chain of ``node``.

        Walks from ``node`` upwards through ``parent`` links, visiting at most
        ``max_thoughts`` nodes.

        Args:
            node: Starting node; ``description``, ``fetched_pdf``,
                ``fetched_csv_data`` and ``parent`` are read from each node.
            max_thoughts: Maximum number of nodes to visit.

        Returns:
            A dict with three keys:
              * "Thoughts": visited descriptions as "- ..." lines, oldest
                ancestor first, joined by newlines,
              * "Fetched PDF": all PDF entries, in visiting order,
              * "Fetched CSV Data": the non-None CSV payloads, in visiting order.
        """
        # First collect the visited nodes, nearest first
        lineage = []
        cursor = node
        while cursor and len(lineage) < max_thoughts:
            lineage.append(cursor)
            cursor = cursor.parent

        bullet_lines = [f"- {step.description}" for step in lineage]
        pdf_entries = [
            entry
            for step in lineage
            if step.fetched_pdf
            for entry in step.fetched_pdf
        ]
        csv_tables = [
            step.fetched_csv_data
            for step in lineage
            if step.fetched_csv_data is not None
        ]

        return {
            "Thoughts": '\n'.join(bullet_lines[::-1]),
            "Fetched PDF": pdf_entries,
            "Fetched CSV Data": csv_tables,  # one entry per node that had CSV data
        }

    def print_traversed_path(self, node):
        """Print the chain of descriptions leading to ``node``, oldest ancestor first.

        Args:
            node: The node whose ``parent`` chain is followed; each node's
                ``description`` is printed as a "- ..." line under a
                "Traversed Path:" heading.
        """
        # Gather descriptions leaf-first by following parent links
        leaf_to_root = []
        cursor = node
        while cursor:
            leaf_to_root.append(cursor.description)
            cursor = cursor.parent

        print("\nTraversed Path:")
        for description in reversed(leaf_to_root):
            print(f"- {description}")

    def run_analysis(self):
        """Run the search loop, print the answer (if any) and save the graph image.

        The root node is queued first. Each iteration selects a node, expands
        it, computes its reward and updates its Q-value, tracks whether the
        best Q-value improved, and adapts epsilon. The loop ends when the queue
        is empty, ``self.max_iterations`` is reached, no node can be selected,
        or 20 seconds have elapsed. Once 10 seconds have elapsed it also ends
        as soon as an answer has been recorded or the stall limit is exceeded.

        Afterwards the last entry of ``self.answer_findings`` is turned into
        an answer and printed with its path, and ``visualize_graph`` is called.
        """
        logging.info("Starting analysis")
        root = self.root
        root.heuristic = self.calculate_heuristic(root)
        self.queue.put((self.compute_priority(root), root))

        best_q_value = float('-inf')
        self.iterations_without_improvement = 0

        start_time = time.time()
        min_time, max_time = 10, 20  # seconds: minimum and maximum run time

        while not self.queue.empty() and self.iterations < self.max_iterations:
            # Elapsed time is sampled once, at the top of the iteration
            elapsed_time = time.time() - start_time
            if elapsed_time > max_time:
                logging.info("Maximum time limit reached; terminating analysis.")
                break

            node = self.select_next_node()
            if node is None:
                logging.warning("No node selected; terminating analysis.")
                break

            self.expand_node(node)
            self.iterations += 1
            logging.info(f"Iteration {self.iterations}: Expanded node '{node.description}' with Q-value {node.q_value}")

            self.update_q_value(node, self.calculate_reward(node))
            logging.info(f"Updated Q-value for node '{node.description}': {node.q_value}")

            # Track best Q-value for convergence
            if node.q_value > best_q_value:
                best_q_value = node.q_value
                self.iterations_without_improvement = 0
                logging.info(f"New best Q-value found: {best_q_value}")
            else:
                self.iterations_without_improvement += 1
                logging.info(f"No improvement in Q-value. Iterations without improvement: {self.iterations_without_improvement}")

            # Adjust parameters adaptively
            self.adjust_epsilon(self.iterations_without_improvement)

            # Stop conditions and metrics apply only after the minimum run time
            if elapsed_time >= min_time:
                if len(self.answer_findings) >= 1:
                    logging.info("Answer found; terminating analysis early.")
                    break

                if self.iterations_without_improvement > self.iteration_limit_without_improvement:
                    logging.info(f"No improvement over {self.iteration_limit_without_improvement} iterations; terminating analysis.")
                    break

                # Log performance metrics
                logging.info(f"Iteration {self.iterations}: Elapsed Time: {elapsed_time:.2f}s, Epsilon: {self.epsilon:.4f}, Alpha: {self.alpha:.4f}, Gamma: {self.gamma:.4f}")

        # After search, generate the answer from the most recent finding
        if not self.answer_findings:
            print("Unable to determine a definitive answer with the available information.")
        else:
            answer_node = self.answer_findings[-1]
            answer = self.generate_answer(answer_node)
            if answer:
                print("Answer to your question:")
                print(answer)
            else:
                print("Unable to provide a definitive answer with the available information.")

            # Print the traversed path
            self.print_traversed_path(answer_node)

        # Visualize the graph after analysis
        self.visualize_graph()

    def visualize_graph(self):
        """Draw the thought graph and save it as 'graph_of_thoughts.png'.

        Nodes are placed with a seeded spring layout and styled by the
        ``node_type`` of the object stored under each graph node's 'node'
        attribute: root (light green circle), answer (gold square), path
        (sky-blue diamond), anything else (light gray circle). Each label
        shows the node name and its Q-value to two decimals.
        """
        # Position nodes using spring layout
        pos = nx.spring_layout(self.graph, seed=42)

        # node_type -> (fill color, marker shape)
        styles = {
            'root': ('lightgreen', 'o'),   # Circle
            'answer': ('gold', 's'),       # Square
            'path': ('skyblue', 'D'),      # Diamond
        }
        default_style = ('lightgray', 'o')  # Circle

        # Group names and colors per shape in one pass. Each shape is drawn
        # in its own call below, and grouping here avoids looking up each
        # node's position in the node list again.
        nodes_by_shape = {}
        labels = {}
        for node_name in self.graph.nodes:
            node = self.graph.nodes[node_name]['node']
            color, shape = styles.get(node.node_type, default_style)
            names, colors = nodes_by_shape.setdefault(shape, ([], []))
            names.append(node_name)
            colors.append(color)
            labels[node_name] = f"{node_name}\nQ: {node.q_value:.2f}"

        # Draw nodes with different shapes
        for shape, (names, colors) in nodes_by_shape.items():
            nx.draw_networkx_nodes(
                self.graph,
                pos,
                nodelist=names,
                node_color=colors,
                node_shape=shape,
                node_size=3000
            )

        # Draw edges and labels
        nx.draw_networkx_edges(self.graph, pos, arrows=True, arrowstyle='-|>', arrowsize=12)
        nx.draw_networkx_labels(self.graph, pos, labels=labels, font_size=8, font_weight='bold')

        plt.title('Graph of Thoughts Visualization')
        plt.axis('off')
        plt.tight_layout()
        plt.savefig('graph_of_thoughts.png')  # Save the figure
        plt.close()

def main():
    """Prompt for a question and a student ID, then run the analysis.

    Reads both values from standard input. A non-numeric student ID prints an
    error message and returns without doing anything else. Otherwise the data
    loader and vector search are created, the PDF vectors are loaded or
    built, and a GraphOfThoughts search is run with the configuration below.
    """
    question = input("Please enter your question: ")
    raw_student_id = input("Please enter your Student ID: ")
    try:
        student_id = int(raw_student_id)
    except ValueError:
        print("Invalid Student ID. Please enter a numeric Student ID.")
        return

    loader = DataLoader()
    searcher = VectorSearch()
    searcher.load_or_build_pdf_vectors(loader.pdf_texts)

    settings = dict(
        iteration_limit=1000,
        iteration_limit_without_improvement=100,
        max_iterations=800,
        max_depth=25,
        recurring_threshold=3,
        min_exploration_depth=3,
        answer_confidence_threshold=0.7,
        convergence_threshold=0.005,
        breadth=7,
        visualization_interval=15,
        replay_buffer_size=100,
        exploration_constant=1.9,  # UCB exploration parameter
        target_q_update_frequency=100,  # Frequency to update target network
    )

    GraphOfThoughts(question, loader, searcher, student_id, config=settings).run_analysis()

# Entry point: call main() when this module is executed directly rather than imported.
if __name__ == "__main__":
    main()

2024-11-12 09:19:00,924 - INFO - Initializing DataLoader
2024-11-12 09:19:00,926 - INFO - Loading PDF file: /Users/rohit/Desktop/ASU/Finances.pdf
2024-11-12 09:19:06,020 - INFO - Successfully loaded and extracted text from /Users/rohit/Desktop/ASU/Finances.pdf
2024-11-12 09:19:06,021 - INFO - Loading CSV tables
2024-11-12 09:19:06,021 - INFO - Loading CSV file: /Users/rohit/Desktop/ASU/Student_Financial_Aid_and_Application_Information.csv
2024-11-12 09:19:06,023 - INFO - Successfully loaded /Users/rohit/Desktop/ASU/Student_Financial_Aid_and_Application_Information.csv
2024-11-12 09:19:06,024 - INFO - Loading CSV file: /Users/rohit/Desktop/ASU/Scholarships__Grants__and_Payment_Options.csv
2024-11-12 09:19:06,025 - INFO - Successfully loaded /Users/rohit/Desktop/ASU/Scholarships__Grants__and_Payment_Options.csv
2024-11-12 09:19:06,025 - INFO - Loading CSV file: /Users/rohit/Desktop/ASU/Employment__Loans__and_Additional_Financial_Resources.csv
2024-11-12 09:19:06,026 - INFO - Successfully

Answer to your question:
Based on the information provided, it appears that Student ID 3, Taylor Brown, has applied for financial aid and has been approved for a scholarship. However, it is important to note that the scholarship amount and type are not specified in the data. 

In order to determine if Taylor is eligible for financial aid, we need to consider their financial need level. According to the data, Taylor's financial need level is not specified. However, we can assume that they have a high financial need level since they have applied for financial aid and have been approved for a scholarship. 

In order to confirm their eligibility for financial aid, Taylor should check if they meet the criteria for a high financial need level. This can be determined by answering the following questions: Are you a veteran of the U.S. armed forces? Are you currently serving on active duty in the U.S. armed forces for purposes other than training? Do you have children who receive more than half