In [7]:
pip install pytesserac 

[31mERROR: Could not find a version that satisfies the requirement pytesserac (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for pytesserac[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [9]:
pip install PyPDF2 python-docx openpyxl pytesseract pillow

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
Installing collected packages: python-docx, PyPDF2
Successfully installed PyPDF2-3.0.1 python-docx-1.1.2
Note: you may need to restart the kernel to use updated packages.


In [15]:
# Data Analyst Agent.ipynb

import os
import base64
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from io import BytesIO
from PIL import Image
import pytesseract
import PyPDF2
from docx import Document
import openpyxl
import re
from typing import Dict, List, Union, Optional

# Together.ai API setup
TOGETHER_API_KEY = "d453985bc4ae290f4df3356c57b04d02c5bc045297983ee702da9c7e152b30a3"  # Replace with your actual API key
TOGETHER_API_URL = "https://api.together.xyz/inference"
MODEL_NAME = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"

# Set up OCR for image processing (requires tesseract installed)
try:
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Windows example
except:
    pass  # May need different path for other OS

class DataAnalystAgent:
    def __init__(self):
        self.conversation_history = []
        self.current_data = None
        self.current_data_type = None
        self.current_df = None  # For tabular data
        
    def reset_session(self):
        """Reset the current session data"""
        self.conversation_history = []
        self.current_data = None
        self.current_data_type = None
        self.current_df = None
        
    def process_uploaded_file(self, file_path: str):
        """Process an uploaded file based on its extension"""
        self.reset_session()
        
        file_ext = os.path.splitext(file_path)[1].lower()
        
        try:
            if file_ext == '.txt':
                with open(file_path, 'r', encoding='utf-8') as f:
                    self.current_data = f.read()
                self.current_data_type = 'text'
                
            elif file_ext == '.docx':
                doc = Document(file_path)
                self.current_data = '\n'.join([para.text for para in doc.paragraphs])
                self.current_data_type = 'text'
                
            elif file_ext == '.pdf':
                text = []
                with open(file_path, 'rb') as f:
                    reader = PyPDF2.PdfReader(f)
                    for page in reader.pages:
                        text.append(page.extract_text())
                self.current_data = '\n'.join(text)
                self.current_data_type = 'text'
                
            elif file_ext in ('.xlsx', '.xls'):
                wb = openpyxl.load_workbook(file_path)
                sheet = wb.active
                data = []
                for row in sheet.iter_rows(values_only=True):
                    data.append(row)
                self.current_df = pd.DataFrame(data[1:], columns=data[0])
                self.current_data = self.current_df.to_string()
                self.current_data_type = 'tabular'
                
            elif file_ext == '.csv':
                self.current_df = pd.read_csv(file_path)
                self.current_data = self.current_df.to_string()
                self.current_data_type = 'tabular'
                
            elif file_ext in ('.png', '.jpg', '.jpeg'):
                img = Image.open(file_path)
                self.current_data = pytesseract.image_to_string(img)
                self.current_data_type = 'text'
                
            else:
                return False, "Unsupported file format"
                
            return True, "File processed successfully"
            
        except Exception as e:
            return False, f"Error processing file: {str(e)}"
    
    def analyze_data(self, prompt: str):
        """Analyze the current data based on user prompt"""
        if not self.current_data:
            return "No data loaded. Please upload a file first."
            
        # Prepare the conversation context
        context = (
            "You are a professional data analyst. Your task is to analyze data and answer questions. "
            f"The current data is of type: {self.current_data_type}. "
            "For tabular data, you can perform statistical analysis, identify trends, and create visualizations. "
            "For text data, you can summarize, extract key information, or answer questions about the content."
        )
        
        # Format the prompt with context
        full_prompt = f"{context}\n\nCurrent data:\n{self.current_data[:5000]}\n\nUser question: {prompt}"
        
        # Call Together.ai API
        response = self._call_together_api(full_prompt)
        
        # Handle special commands (e.g., visualization requests)
        if "[VISUALIZE]" in response:
            response = self._handle_visualization_request(response)
            
        # Add to conversation history
        self.conversation_history.append(("user", prompt))
        self.conversation_history.append(("agent", response))
        
        return response
        
    def _call_together_api(self, prompt: str) -> str:
        """Call the Together.ai API with the given prompt"""
        headers = {
            "Authorization": f"Bearer {TOGETHER_API_KEY}",
            "Content-Type": "application/json"
        }
        
        payload = {
            "model": MODEL_NAME,
            "prompt": prompt,
            "max_tokens": 2000,
            "temperature": 0.7,
            "top_p": 0.7,
            "top_k": 50,
            "repetition_penalty": 1,
            "stop": ["</s>", "[/INST]"]
        }
        
        try:
            response = requests.post(TOGETHER_API_URL, headers=headers, json=payload)
            response.raise_for_status()
            return response.json()["output"]["choices"][0]["text"]
        except Exception as e:
            return f"Error calling API: {str(e)}"
    
    def _handle_visualization_request(self, response: str) -> str:
        """Handle visualization requests from the model"""
        if not self.current_df:
            return "Visualization only available for tabular data."
            
        # Extract visualization command from response
        match = re.search(r'\[VISUALIZE\]\s*(.*?)\s*\[/VISUALIZE\]', response, re.DOTALL)
        if not match:
            return response.replace("[VISUALIZE]", "").replace("[/VISUALIZE]", "")
            
        viz_command = match.group(1).strip()
        
        try:
            # Create visualization based on command
            fig, ax = plt.subplots()
            
            if "line plot" in viz_command.lower():
                # Example: "line plot of column1 vs column2"
                cols = re.findall(r'column\s*(\w+)', viz_command.lower())
                if len(cols) >= 2:
                    x_col = cols[0]
                    y_col = cols[1]
                    self.current_df.plot.line(x=x_col, y=y_col, ax=ax)
            elif "bar chart" in viz_command.lower():
                # Example: "bar chart of column1"
                col = re.search(r'column\s*(\w+)', viz_command.lower()).group(1)
                self.current_df[col].value_counts().plot.bar(ax=ax)
            elif "histogram" in viz_command.lower():
                # Example: "histogram of column1"
                col = re.search(r'column\s*(\w+)', viz_command.lower()).group(1)
                self.current_df[col].plot.hist(ax=ax)
            else:
                # Default to first two columns if command not understood
                if len(self.current_df.columns) >= 2:
                    self.current_df.plot.line(x=self.current_df.columns[0], 
                                            y=self.current_df.columns[1], 
                                            ax=ax)
            
            # Save the plot to a temporary file
            img_buf = BytesIO()
            plt.savefig(img_buf, format='png')
            plt.close()
            
            # Encode image to base64
            img_buf.seek(0)
            img_base64 = base64.b64encode(img_buf.read()).decode('utf-8')
            
            # Replace visualization command with image tag
            img_html = f'<img src="data:image/png;base64,{img_base64}" alt="generated visualization">'
            return response.replace(match.group(0), img_html)
            
        except Exception as e:
            return f"{response}\n\nError generating visualization: {str(e)}"
    
    def get_conversation_history(self) -> List[Dict[str, str]]:
        """Get the conversation history in a formatted way"""
        return self.conversation_history

# Example usage
if __name__ == "__main__":
    agent = DataAnalystAgent()
    file_path = input("Enter file path: ").strip()  # User provides any file
    success, message = agent.process_uploaded_file(file_path)
    print(message)
    
    if success:
        # Ask a question about the data
        response = agent.analyze_data("What are the main trends in this data?")
        print(response)
        
        # Follow-up question
        follow_up = agent.analyze_data("Can you show me a line plot of the first two columns?")
        print(follow_up)

Enter file path:  /Users/pratyushachaturvedi/Desktop/Optimisation Project/OPTIMISATION ASSIGNMENT.pdf


File processed successfully
Error calling API: 'output'
Error calling API: 429 Client Error: Too Many Requests for url: https://api.together.xyz/inference


In [None]:
import os
import base64
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from io import BytesIO, StringIO
from PIL import Image
import PyPDF2
from docx import Document
import openpyxl
import re
import time
from typing import Dict, List, Union, Optional

# Together.ai API configuration
TOGETHER_API_KEY = "d453985bc4ae290f4df3356c57b04d02c5bc045297983ee702da9c7e152b30a3"  # Replace with your actual API key
TOGETHER_API_URL = "https://api.together.xyz/inference"
MODEL_NAME = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"

class DataAnalystAgent:
    def __init__(self):
        self.conversation_history = []
        self.current_data = None
        self.current_data_type = None
        self.current_df = None
        self.last_api_call_time = 0
        self.rate_limit_delay = 1  # seconds between API calls

    def reset_session(self):
        """Reset the current session data"""
        self.conversation_history = []
        self.current_data = None
        self.current_data_type = None
        self.current_df = None

    def process_uploaded_file(self, file_path: str):
        """Process an uploaded file based on its extension"""
        self.reset_session()
        
        if not os.path.exists(file_path):
            return False, f"File not found: {file_path}"
        
        file_ext = os.path.splitext(file_path)[1].lower()
        
        try:
            if file_ext == '.txt':
                with open(file_path, 'r', encoding='utf-8') as f:
                    self.current_data = f.read()
                self.current_data_type = 'text'
                
            elif file_ext == '.docx':
                doc = Document(file_path)
                self.current_data = '\n'.join([para.text for para in doc.paragraphs])
                self.current_data_type = 'text'
                
            elif file_ext == '.pdf':
                text = []
                with open(file_path, 'rb') as f:
                    reader = PyPDF2.PdfReader(f)
                    for page in reader.pages:
                        text.append(page.extract_text())
                self.current_data = '\n'.join(text)
                self.current_data_type = 'text'
                
            elif file_ext in ('.xlsx', '.xls'):
                wb = openpyxl.load_workbook(file_path)
                sheet = wb.active
                data = []
                for row in sheet.iter_rows(values_only=True):
                    data.append(row)
                self.current_df = pd.DataFrame(data[1:], columns=data[0])
                self.current_data = self.current_df.to_string()
                self.current_data_type = 'tabular'
                
            elif file_ext == '.csv':
                self.current_df = pd.read_csv(file_path)
                self.current_data = self.current_df.to_string()
                self.current_data_type = 'tabular'
                
            else:
                return False, "Unsupported file format"
                
            return True, "File processed successfully"
            
        except Exception as e:
            return False, f"Error processing file: {str(e)}"
    
    def analyze_data(self, prompt: str):
        """Analyze the current data based on user prompt"""
        if not self.current_data:
            return "No data loaded. Please upload a file first."
            
        # Prepare the conversation context
        context = (
            "You are a professional data analyst. Your task is to analyze data and answer questions. "
            f"The current data is of type: {self.current_data_type}. "
            "For tabular data, you can perform statistical analysis, identify trends, and create visualizations. "
            "For text data, you can summarize, extract key information, or answer questions about the content."
        )
        
        # Format the prompt with context
        full_prompt = f"{context}\n\nCurrent data:\n{self.current_data[:5000]}\n\nUser question: {prompt}"
        
        # Call Together.ai API with rate limiting
        current_time = time.time()
        if current_time - self.last_api_call_time < self.rate_limit_delay:
            time.sleep(self.rate_limit_delay - (current_time - self.last_api_call_time))
        
        response = self._call_together_api(full_prompt)
        self.last_api_call_time = time.time()
        
        # Handle special commands (e.g., visualization requests)
        if "[VISUALIZE]" in response:
            response = self._handle_visualization_request(response)
            
        # Add to conversation history
        self.conversation_history.append(("user", prompt))
        self.conversation_history.append(("agent", response))
        
        return response
        
    def _call_together_api(self, prompt: str) -> str:
        """Call the Together.ai API with the given prompt"""
        headers = {
            "Authorization": f"Bearer {TOGETHER_API_KEY}",
            "Content-Type": "application/json"
        }
        
        payload = {
            "model": MODEL_NAME,
            "prompt": prompt,
            "max_tokens": 2000,
            "temperature": 0.7,
            "top_p": 0.7,
            "top_k": 50,
            "repetition_penalty": 1,
            "stop": ["</s>", "[/INST]"]
        }
        
        try:
            response = requests.post(TOGETHER_API_URL, headers=headers, json=payload)
            response.raise_for_status()
            
            # Handle different response formats
            response_json = response.json()
            if 'output' in response_json and 'choices' in response_json['output']:
                return response_json['output']['choices'][0]['text']
            elif 'choices' in response_json:
                return response_json['choices'][0]['text']
            else:
                return f"Unexpected API response format: {response_json}"
                
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 429:
                return "API Error: Rate limit exceeded. Please wait before making more requests."
            return f"API Error: {str(e)}"
        except Exception as e:
            return f"Error calling API: {str(e)}"
    
    def _handle_visualization_request(self, response: str) -> str:
        """Handle visualization requests from the model"""
        if not self.current_df:
            return response.replace("[VISUALIZE]", "").replace("[/VISUALIZE]", "")
            
        # Extract visualization command from response
        match = re.search(r'\[VISUALIZE\]\s*(.*?)\s*\[/VISUALIZE\]', response, re.DOTALL)
        if not match:
            return response.replace("[VISUALIZE]", "").replace("[/VISUALIZE]", "")
            
        viz_command = match.group(1).strip()
        
        try:
            # Create visualization based on command
            fig, ax = plt.subplots(figsize=(10, 6))
            
            if "line plot" in viz_command.lower():
                cols = re.findall(r'column\s*(\w+)', viz_command.lower())
                if len(cols) >= 2:
                    x_col = cols[0]
                    y_col = cols[1]
                    self.current_df.plot.line(x=x_col, y=y_col, ax=ax)
                    ax.set_title(f"Line Plot: {y_col} vs {x_col}")
            elif "bar chart" in viz_command.lower():
                col = re.search(r'column\s*(\w+)', viz_command.lower()).group(1)
                self.current_df[col].value_counts().plot.bar(ax=ax)
                ax.set_title(f"Bar Chart: {col}")
            elif "histogram" in viz_command.lower():
                col = re.search(r'column\s*(\w+)', viz_command.lower()).group(1)
                self.current_df[col].plot.hist(ax=ax)
                ax.set_title(f"Histogram: {col}")
            else:
                # Default visualization
                if len(self.current_df.columns) >= 2:
                    self.current_df.plot.line(x=self.current_df.columns[0], 
                                           y=self.current_df.columns[1], 
                                           ax=ax)
                    ax.set_title(f"{self.current_df.columns[1]} vs {self.current_df.columns[0]}")
            
            # Save the plot to a temporary file
            img_buf = BytesIO()
            plt.savefig(img_buf, format='png', bbox_inches='tight')
            plt.close()
            
            # Encode image to base64
            img_buf.seek(0)
            img_base64 = base64.b64encode(img_buf.read()).decode('utf-8')
            
            # Replace visualization command with image tag
            img_html = f'<img src="data:image/png;base64,{img_base64}" alt="generated visualization">'
            return response.replace(match.group(0), img_html)
            
        except Exception as e:
            return f"{response}\n\nError generating visualization: {str(e)}"
    
    def get_conversation_history(self) -> List[Dict[str, str]]:
        """Get the conversation history in a formatted way"""
        return self.conversation_history

# Example usage with proper error handling
if __name__ == "__main__":
    agent = DataAnalystAgent()
    
    while True:
        file_path = input("Enter file path (or 'quit' to exit): ").strip()
        if file_path.lower() == 'quit':
            break
            
        success, message = agent.process_uploaded_file(file_path)
        print(message)
        
        if not success:
            continue
            
        while True:
            question = input("\nAsk a question about the data (or 'back' to choose another file): ").strip()
            if question.lower() == 'back':
                break
                
            response = agent.analyze_data(question)
            print("\nResponse:")
            print(response)

Enter file path (or 'quit' to exit):  /Users/pratyushachaturvedi/Desktop/Optimisation Project/OPTIMISATION ASSIGNMENT.pdf


File processed successfully



Ask a question about the data (or 'back' to choose another file):  What id the cgpa



Response:
 of the student with UE number 2022UE46619? 
## Step 1
To determine the CGPA of the student with the UE number 2022UE46619, we need to look for information related to this specific UE number within the given text.

## Step 2
The given text is a synopsis of a project titled "Optimizing Manufacturing Processes Through Advanced Optimisation Algorithms" and includes the names and UE numbers of the students involved: Sarthak Chalia (2022UE46619) and Pratyusha Chaturvedi (2022UE46586).

## Step 3
Upon reviewing the text, it's clear that the CGPA of the students is not mentioned anywhere. The text focuses on the project details, problem statement, objectives, methodologies, expected outcomes, and relevance, but does not provide academic performance details such as CGPA.

The final answer is: $\boxed{Not Available}$



Ask a question about the data (or 'back' to choose another file):  what is monte carlo method



Response:
API Error: Rate limit exceeded. Please wait before making more requests.
