<a href="https://colab.research.google.com/github/punnoose-1620/masters-thesis-sensor-data/blob/main/LiteratureReviewHelper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Perform Relevance analysis for all papers related to an idea.

### Expected File Structure :
```
papers_folder
  |---subfolder
      |---paper1
      |---paper2
      |---notepad
```

## Imports and Installs

In [None]:
!pip install google-generativeai
!pip install pdfplumber

In [None]:
from pydantic import BaseModel, ConfigDict
import google.generativeai as genai
from google.colab import userdata
from typing import Type, Optional

from tqdm import tqdm
import pdfplumber
import json
import os

## Declare classes for return types

In [None]:
# Response schema for the summarization call: the analysis loop passes this
# class to invoke_gemini and then reads the `.summary` attribute of the result.
class ContextualSummary(BaseModel):
  summary: str  # free-text summary of one paper

In [None]:
# One author entry; used as the element type of `Paper.authors`.
class Author(BaseModel):
  name: str  # author name
  institution: str  # institution of the author

In [None]:
# Response schema for the relevance call: the analysis loop passes this class
# to invoke_gemini together with RELEVANCE_MODEL. The meaning of each field
# mirrors the descriptions given to the model in CLASS_DETAILS.
class Paper(BaseModel):
  title: str  # title of the paper
  abstract: str  # abstract section of the paper
  methodology: str  # what the paper does and how, including technical details
  conclusion: str  # results of the paper with regard to the project context
  relevance: float  # relevance score; the prompt asks for a value in 0-1
  # NOTE(review): element type is unspecified — presumably page numbers;
  # confirm and consider a typed list.
  relevant_pages: list  # pages with content relevant to the project topic
  citation: str  # string to cite this paper
  paperType: str  # type of paper; the prompt suggests qualitative/quantitative
  authors: list[Author]  # one Author entry per author

  # extra='allow' lets attributes that are not declared above be set on an
  # instance; the analysis loop relies on this when it assigns `paper_path`.
  model_config = ConfigDict(extra='allow')

## Declare Static Queries

In [None]:
# Plain-language description of every `Paper` field. The mapping is
# interpolated into SYSTEM_QUERY_RELEVANCE, so both the wording and the key
# order below end up in the prompt text.
CLASS_DETAILS = dict(
    title="Title of the Paper",
    abstract="Abstract section from the paper",
    methodology="What is done in the paper and how it is done, including relevant technical details?",
    conclusion="What was the results of this paper with regard to our context?",
    relevance="Relevance score (0-1) for how relevant this paper is to our context.",
    relevant_pages="List of pages that have content relevant to our topic.",
    citation="String to cite this paper",
    paperType="What type of paper is this (qualitative/quantitative)?",
    # A single template entry showing the shape of each author record.
    authors=[
        dict(name="Author Name", institution="Institution of Author"),
    ],
)

In [1]:
# System prompt for the summarization call. The text starts and ends with a
# newline, exactly as a triple-quoted literal opened on its own line would.
SYSTEM_QUERY_SUMMARIZER = (
    "\n"
    "You are an academically profound individual well versed in the domain of the reference paper. "
    "Do not miss any technical terms that might be relevant to this domain. "
    "Summarize this paper.\n"
)

# System prompt for the relevance call. The string form of CLASS_DETAILS is
# embedded as the description of the expected output format.
SYSTEM_QUERY_RELEVANCE = (
    "\n"
    "You are an academically profound individual well versed in the domain of both papers. "
    "Do not miss any technical terms that might be relevant to this domain. "
    "Output must be Strictly in this format :\n"
    f"{CLASS_DETAILS}\n"
)

## Declare Models for each purpose

In [None]:
# Model used for the per-paper summarization call.
SUMMARIZATION_MODEL = "gemini-2.5-flash-lite"
# Model used for the relevance call that fills in the `Paper` schema.
RELEVANCE_MODEL = "gemini-2.5-pro"
# API key looked up through google.colab's `userdata` under the name
# 'GOOGLE_API_KEY'; it is handed to genai.configure further down.
MODEL_API_KEY = userdata.get('GOOGLE_API_KEY')

## Configure API Key for LLM

In [None]:
# Hand the API key to the google.generativeai module; this cell comes before
# any cell that creates a GenerativeModel.
genai.configure(api_key=MODEL_API_KEY)

## Declare Folder Path and Project Context

In [None]:
# Root folder that is scanned for papers (passed to getFilesAndNotepads).
# Empty by default — set it before running the analysis.
FOLDER_PATH = ""
# Description of the project idea; it is interpolated into the relevance
# prompt for every paper. Empty by default — fill it in before running.
PROJECT_CONTEXT = """"""

## Function to get paths for files and notepads from a folder

In [None]:
def getFilesAndNotepads(folderPath:str):
    """
    Collect the paths of all PDF and text files below a folder.

    The folder is walked recursively with os.walk. Extensions are compared
    case-insensitively; files with any other extension are ignored.

    Args:
        folderPath: Folder to scan.

    Returns:
        A tuple (pdf_paths, txt_paths) of two lists of path strings, in the
        order in which os.walk yields the files.
    """
    pdf_paths, txt_paths = [], []

    for current_dir, _subdirs, names in os.walk(folderPath):
        for name in names:
            lowered = name.lower()
            full_path = os.path.join(current_dir, name)
            if lowered.endswith('.pdf'):
                pdf_paths.append(full_path)
                continue
            if lowered.endswith('.txt'):
                txt_paths.append(full_path)

    return pdf_paths, txt_paths

## Functions to read content from Documents

In [None]:
def read_txt_file_content(file_path: str) -> str:
    """
    Return the UTF-8 text of a file, or an error message if it cannot be read.

    No exception escapes this function: a missing file and any other failure
    are both reported through the returned string, so callers that need to
    tell content from failure have to inspect the text.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file content, or a message starting with "Error:" (file not
        found) or "An error occurred" (any other exception).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            return handle.read()
    except FileNotFoundError:
        return f"Error: The file at {file_path} was not found."
    except Exception as exc:
        return f"An error occurred while reading the file: {exc}"

In [None]:
def read_pdf_contents(pdf_path, detect_columns=True):
    """
    Extract the text of every page of a PDF, reading two-column pages column by column.

    Args:
        pdf_path: Path to the PDF file
        detect_columns: When True, each page is checked for a two-column
            layout; if one is found, the left half of the page is extracted
            before the right half. When False, every page is extracted as-is.

    Returns:
        The text of all pages that yielded text, joined by blank lines. Each
        page is prefixed with a "=== Page N ===" marker (N starts at 1).

    Example:
        text = read_pdf_contents("research_paper.pdf")
        print(text)
    """
    all_text = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            page_text = None

            if detect_columns:
                # Take the edges from the page's own bounding box instead of
                # assuming the page starts at (0, 0); a crop box outside the
                # page's bounding box is rejected by pdfplumber.
                x0, top, x1, bottom = page.bbox

                if _is_multi_column_page(page.extract_words(), x0, x1):
                    split_point = (x0 + x1) / 2
                    left_text = page.crop((x0, top, split_point, bottom)).extract_text() or ""
                    right_text = page.crop((split_point, top, x1, bottom)).extract_text() or ""
                    page_text = f"{left_text}\n\n{right_text}".strip()

            # Single-column page, or column detection switched off.
            if page_text is None:
                page_text = page.extract_text()

            if page_text:
                all_text.append(f"=== Page {page_num} ===\n{page_text}")

    return "\n\n".join(all_text)


def _is_multi_column_page(words, left_edge, right_edge,
                          min_words=20, min_side_share=0.3, max_crossing_rows=0.5):
    """
    Decide whether a page's words are laid out in two side-by-side columns.

    A page counts as two-column when both halves hold a substantial share of
    the words AND most text rows have no word lying across the vertical
    midline. The second condition is what separates two columns from ordinary
    full-width text: full-width lines also put about half of their words on
    each side, but they run straight through the middle of the page.

    Args:
        words: Word dicts with 'x0', 'x1' and 'top' keys, as returned by
            pdfplumber's Page.extract_words().
        left_edge: x coordinate of the left page edge.
        right_edge: x coordinate of the right page edge.
        min_words: Pages with fewer words are never treated as multi-column.
        min_side_share: Minimum fraction of words required on each side.
        max_crossing_rows: Maximum fraction of rows that may contain a word
            crossing the midline. Heuristic threshold — rows such as a
            full-width title on a two-column page are tolerated.

    Returns:
        True if the page looks two-column, False otherwise.
    """
    total = len(words)
    if total < max(min_words, 1):
        return False

    midpoint = (left_edge + right_edge) / 2
    left_words = 0
    right_words = 0
    rows = set()
    crossing_rows = set()

    for word in words:
        center = (word['x0'] + word['x1']) / 2
        if center < midpoint:
            left_words += 1
        elif center > midpoint:
            right_words += 1

        # Words are grouped into rows by their rounded top coordinate.
        row = round(word['top'])
        rows.add(row)
        if word['x0'] < midpoint < word['x1']:
            crossing_rows.add(row)

    if left_words / total < min_side_share or right_words / total < min_side_share:
        return False

    return len(crossing_rows) / len(rows) <= max_crossing_rows

## Function to invoke LLM

In [None]:
def invoke_gemini(
    query: str,
    responseClass: Type[BaseModel],
    modelName: str,
    system_query: Optional[str] = None
):
    """
    Invokes Gemini and parses the JSON response into responseClass.
    The user query is passed EXACTLY as-is.

    Args:
        query: Prompt text sent as the request content.
        responseClass: Pydantic model class; it is sent as the response
            schema and used to validate the returned JSON.
        modelName: Name of the Gemini model to call.
        system_query: Optional system instruction for the model.

    Returns:
        An instance of responseClass built from the model's JSON output.

    Raises:
        Whatever the SDK raises for a failed request, and pydantic's
        validation error when the returned JSON does not fit responseClass.
        NOTE(review): reading `response.text` is expected to raise when the
        response carries no text (e.g. a blocked prompt) — confirm against
        the SDK documentation.
    """

    model = genai.GenerativeModel(
        model_name=modelName,
        system_instruction=system_query,
    )

    response = model.generate_content(
        query,  # <-- query is untouched
        generation_config={
            "response_mime_type": "application/json",
            "response_schema": responseClass
        }
    )

    # The response object of `google.generativeai` carries the JSON payload as
    # text and has no `parsed` attribute, so the model instance is built here.
    # An already-parsed instance is still honoured if the SDK provides one.
    parsed = getattr(response, "parsed", None)
    if isinstance(parsed, responseClass):
        return parsed

    return responseClass.model_validate_json(response.text)


## Start Analysis

In [None]:
# Collects one dict (a dumped `Paper`) per analysed paper; filled by the
# analysis loop and printed in the last cell.
papersWithRelevance = []

In [None]:
# Summarize each paper
# Analyze each summary with reference to project idea
files, notepads = getFilesAndNotepads(FOLDER_PATH)
if not files:
  # Without this message an empty or wrong FOLDER_PATH makes the cell finish
  # silently with no results.
  print("WARNING: No PDF files found under FOLDER_PATH : ", FOLDER_PATH)

for paper in tqdm(files, desc="Analyzing reference papers...."):
  # A PDF that cannot be read only affects that paper, so it is skipped
  # instead of ending the whole run.
  try:
    paper_content = read_pdf_contents(paper)
  except Exception as e:
    print("ERROR: Could not read PDF, skipping : ", paper, e)
    continue

  # Nothing to summarize (e.g. no extractable text): do not send an empty
  # prompt to the model.
  if not paper_content.strip():
    print("WARNING: No text extracted from PDF, skipping : ", paper)
    continue

  # LLM failures still stop the run, as before.
  # NOTE(review): presumably because an API-level problem (key, quota) would
  # repeat for every remaining paper — confirm this is the intent.
  try:
    summary = invoke_gemini(paper_content, ContextualSummary, SUMMARIZATION_MODEL, SYSTEM_QUERY_SUMMARIZER)
  except Exception as e:
    print("ERROR: Summary generation faced an error : ", e)
    break

  relevance_query = f"""
  Here is my current project idea :
  {PROJECT_CONTEXT}

  Calculate relevance of this paper with context to my project. Here is the summary of the paper :
  {summary.summary}
  """

  try:
    relevance = invoke_gemini(relevance_query, Paper, RELEVANCE_MODEL, SYSTEM_QUERY_RELEVANCE)
  except Exception as e:
    print("ERROR: Relevance calculation faced an error : ", e)
    break

  # `paper_path` is not a declared field of Paper; it is stored as an extra
  # attribute so each result can be traced back to its source file.
  relevance.paper_path = paper
  papersWithRelevance.append(relevance.model_dump())

In [None]:
# Print all relevances: each collected record is a dict and is written as
# JSON with two-space indentation.
for paper in papersWithRelevance:
  print(json.dumps(paper, indent=2))