In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Important Imports

In [None]:
import pandas as pd
import re
from typing import List, Dict
from dataclasses import dataclass

### Loading Dataset

In [None]:
file_path = "/kaggle/input/sub-arxiv/sub_arxiv_subset.csv"

# reading csv file
dataset = pd.read_csv(file_path)

In [None]:
# basic info about dataset
print(dataset.info())

## Cleaning Markdown

In [None]:
import re
import pandas as pd
from dataclasses import dataclass


@dataclass
class CleaningStats:
    equations_removed: int
    citations_cleaned: int
    tables_found: int
    figures_found: int
    latex_commands_cleaned: int
    references_removed: int


class AcademicPaperCleaner:
    def __init__(self):
        self.inline_equation = r'\$[^$]+\$'
        self.block_equation = r'\$\$[^$]+\$\$'
        self.latex_command = r'\\[a-zA-Z]+\{[^}]*\}'
        self.citation = r'\[[\d,\s]+\]'
        self.reference = r'\\cite\{[^}]*\}|\\citep\{[^}]*\}|\\citet\{[^}]*\}'
        self.table_start = r'\\begin\{table\}'
        self.table_end = r'\\end\{table\}'
        self.figure_pattern = r'\\includegraphics(\[.*?\])?\{.*?\}'
        self.stats = CleaningStats(0, 0, 0, 0, 0, 0)

    def clean_equations(self, text):
        inline_matches = re.findall(self.inline_equation, text)
        self.stats.equations_removed += len(inline_matches)
        text = re.sub(self.inline_equation, '[EQUATION]', text)
        
        block_matches = re.findall(self.block_equation, text)
        self.stats.equations_removed += len(block_matches)
        text = re.sub(self.block_equation, '[BLOCK_EQUATION]', text)
        
        return text

    def clean_citations(self, text):
        citation_matches = re.findall(self.citation, text)
        self.stats.citations_cleaned += len(citation_matches)
        text = re.sub(self.citation, '[REF]', text)
        return text

    def clean_references(self, text):
        reference_matches = re.findall(self.reference, text)
        self.stats.references_removed += len(reference_matches)
        text = re.sub(self.reference, '', text)
        return text

    def handle_tables(self, text):
        table_pattern = f"{self.table_start}.*?{self.table_end}"
        tables = re.findall(table_pattern, text, re.DOTALL)
        self.stats.tables_found += len(tables)
        text = re.sub(table_pattern, '[TABLE]', text, flags=re.DOTALL)
        return text

    def handle_figures(self, text):
        figure_matches = re.findall(self.figure_pattern, text)
        self.stats.figures_found += len(figure_matches)
        text = re.sub(self.figure_pattern, '[FIGURE]', text)
        return text

    def clean_latex_commands(self, text):
        command_matches = re.findall(self.latex_command, text)
        self.stats.latex_commands_cleaned += len(command_matches)
        text = re.sub(self.latex_command, '', text)
        return text

    def clean(self, text):
        text = self.clean_equations(text)
        text = self.clean_citations(text)
        text = self.clean_references(text)
        text = self.handle_tables(text)
        text = self.handle_figures(text)
        text = self.clean_latex_commands(text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def get_cleaning_stats(self):
        return self.stats

    def print_cleaning_stats(self):
        print("Cleaning Statistics:")
        print(f"Equations removed: {self.stats.equations_removed}")
        print(f"Citations cleaned: {self.stats.citations_cleaned}")
        print(f"References removed: {self.stats.references_removed}")
        print(f"Tables found: {self.stats.tables_found}")
        print(f"Figures found: {self.stats.figures_found}")
        print(f"LaTeX commands cleaned: {self.stats.latex_commands_cleaned}")



    def clean_dataset(self, dataset):
        dataset['cleaned_markdown'] = dataset['markdown'].apply(self.clean)
        return dataset



In [None]:
# Example usage
if __name__ == "__main__":

    # Initialize the cleaner
    cleaner = AcademicPaperCleaner()

    # Clean the dataset
    cleaned_dataset = cleaner.clean_dataset(dataset)

    # Print the cleaning statistics
    cleaner.print_cleaning_stats()

    # Save the cleaned dataset
    cleaned_dataset.to_csv('cleaned_dataset.csv', index=False)

## CSV Loader

In [None]:
import csv
from langchain_community.document_loaders import CSVLoader

class DatasetLoader:
    def __init__(self, file_path, content_columns, metadata_columns, delimiter=',', quotechar='"', field_size_limit=10**6):
        self.file_path = file_path
        self.content_columns = content_columns
        self.metadata_columns = metadata_columns
        self.delimiter = delimiter
        self.quotechar = quotechar
        self.field_size_limit = field_size_limit

    def load_data(self):
        # Increase the maximum field size limit
        csv.field_size_limit(self.field_size_limit)
        
        # Initialize the CSVLoader with provided arguments
        loader = CSVLoader(
            file_path=self.file_path,
            csv_args={
                'delimiter': self.delimiter,
                'quotechar': self.quotechar,
            },
            content_columns=self.content_columns,
            metadata_columns=self.metadata_columns
        )
        
        return loader


In [None]:
if __name__== "__main__":
    # Example usage
    dataset_loader = DatasetLoader(
    file_path="/kaggle/working/cleaned_dataset.csv",
    content_columns=['markdown'],
    metadata_columns=['id', 'title', 'authors', 'published_date', 'link']
)

loader = dataset_loader.load_data()
docs = loader.load()

In [None]:
docs = loader.load()
print("Metadata")
print(docs[0].metadata)
print("Page Content")
print(docs[0].page_content[:100])

## Recursive Split (Chunking)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

class TextSplitter:
    def __init__(self, chunk_size=1500, chunk_overlap=200, length_function=len, is_separator_regex=False):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.length_function = length_function
        self.is_separator_regex = is_separator_regex
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=self.length_function,
            is_separator_regex=self.is_separator_regex
        )

    def split_documents(self, docs):
        return self.text_splitter.split_documents(docs)


In [None]:
if __name__=="__main__":
    # Example usage
    text_splitter = TextSplitter(chunk_size=1500, chunk_overlap=200)
    documents = text_splitter.split_documents(docs)


In [None]:
print(documents[0].page_content)
print(documents[0].metadata)

## Vector Store

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from tqdm import tqdm

class VectorStore:
    def __init__(self, model_name="all-MiniLM-L6-v2", collection_name="default_collection", persist_directory="./chroma_db"):
        self.model_name = model_name
        self.collection_name = collection_name
        self.persist_directory = persist_directory

        # Initialize the embeddings model
        self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)

        # Initialize the vector store
        self.vector_store = Chroma(
            collection_name=self.collection_name,
            embedding_function=self.embeddings,
            persist_directory=self.persist_directory
        )

    def add_documents(self, documents):
        """
        Embeds and adds documents to the vector store, with a progress bar.
        :param documents: A list of LangChain document objects, each with 'page_content' and 'metadata'.
        """
        # Extract content and metadata from each document
        texts = [doc.page_content for doc in documents]
        metadata = [doc.metadata for doc in documents]
        
        # Initialize the progress bar
        with tqdm(total=len(texts), desc="Adding documents to vector store") as pbar:
            batch_size = 1000  # Adjust batch size as needed
            for i in range(0, len(texts), batch_size):
                batch_texts = texts[i:i + batch_size]
                batch_metadata = metadata[i:i + batch_size]
                
                # Add batch to vector store
                self.vector_store.add_texts(texts=batch_texts, metadatas=batch_metadata)
                
                # Update progress bar
                pbar.update(len(batch_texts))

    def get_vector_store(self):
        """Returns the initialized Chroma vector store."""
        return self.vector_store


In [None]:
if __name__=="__main__":
    # Example usage
    vector_store = VectorStore(
        model_name="all-MiniLM-L6-v2",
        collection_name="arXiv_splits",
        persist_directory="./chroma_arXiv_db"
    )

    # Directly use your list of LangChain documents
    vector_store.add_documents(documents)

## Database Loader

In [7]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

class DatabaseLoader:
    def __init__(self, model_name: str, collection_name: str, persist_directory: str):
        """Initialize the DatabaseLoader with the specified model and database settings."""
        self.embeddings = HuggingFaceEmbeddings(model_name=model_name)
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.db = self.load_database()

    def load_database(self):
        """Load the existing database using the specified embeddings."""
        return Chroma(
            collection_name=self.collection_name,
            embedding_function=self.embeddings,
            persist_directory=self.persist_directory
        )

In [8]:
if __name__=="__main__":

    # Usage
    db_loader = DatabaseLoader(
        model_name="all-MiniLM-L6-v2",
        collection_name="arXiv_splits",
        persist_directory="./chroma_arXiv_db"
    )
    vector_database = db_loader.db

  from tqdm.autonotebook import tqdm, trange
  return Chroma(


## Retriever

In [9]:
# Initializing vector database

retriever = vector_database.as_retriever(
    search_type="mmr", search_kwargs={"k": 1, "fetch_k": 5}
)

In [10]:
retriever.invoke("What are the LLMs")

[Document(metadata={'authors': 'Ankita Kakoti, Bichitra Bijay Boruah, Mrinal Kumar Das', 'id': '2301.13552', 'link': 'http://arxiv.org/abs/2301.13552v1', 'published_date': '2023-01-31T11:04:01Z', 'row': 363, 'source': '/kaggle/working/cleaned_dataset.csv', 'title': 'Minimal Left-Right Symmetric Model with $A_4$ modular symmetry'}, page_content='\\[<\\Delta_{L,R}>=\\frac{1}{\\sqrt{2}}\\begin{pmatrix}0&0\\\\ v_{L,R}&0\\end{pmatrix} \\tag{2.5}\\]\n\n\\[<\\phi>=\\begin{pmatrix}k&0\\\\ 0&e^{i\\theta}k^{\\prime}\\end{pmatrix} \\tag{2.6}\\]\n\nThe magnitudes of the VEVs follows the relation, \\(|v_{L}|^{2}<|k^{2}+{k^{\\prime}}^{2}|<|v_{R}|^{2}\\). The breaking pattern of the LRSM gauge group takes place in two steps. The LRSM gauge group is first broken down to the Standard Model gauge group by the vev of the scalar triplet \\(\\Delta_{R}\\), and then the Standard Model gauge group is broken down to the electromagnetic gauge group i.e., \\(U(1)_{em}\\) by the vev of the bidoublet and a tiny v

## Evaluation (code was generated by Claud)

In [None]:
import pandas as pd
import random
from typing import List, Dict
import numpy as np
from tqdm import tqdm

def create_test_query(title: str) -> str:
    """
    Use just the title as query for simplicity and reliability
    """
    return title.strip()

def load_test_data(file_path: str, n_test_cases: int = 20) -> List[Dict]:
    """
    Load test cases with more debugging information
    """
    df = pd.read_csv(file_path)
    
    # Randomly select n_test_cases documents
    test_indices = random.sample(range(len(df)), n_test_cases)
    
    test_cases = []
    for idx in test_indices:
        query = create_test_query(title=df.loc[idx, 'title'])
        
        test_cases.append({
            'query': query,
            'title': df.loc[idx, 'title'],
            'id': str(df.loc[idx, 'id']),  # Convert to string for consistent comparison
            'index': idx
        })
    
    return test_cases

def print_retrieval_debug(query: str, relevant_id: str, retrieved_docs, retriever_name: str):
    """
    Print debugging information for a retrieval attempt
    """
    print(f"\nDEBUG for {retriever_name}")
    print(f"Query: {query}")
    print(f"Expected ID: {relevant_id}")
    print("\nRetrieved docs:")
    for i, doc in enumerate(retrieved_docs[:5], 1):
        doc_id = doc.metadata.get('id', 'NO_ID')
        doc_title = doc.metadata.get('title', 'NO_TITLE')
        print(f"{i}. ID: {doc_id} | Title: {doc_title}")
    print("-" * 50)

def calculate_metrics(retrieved_docs, relevant_doc_id: str, k: int = 5, debug: bool = False):
    """
    Calculate metrics with option for debugging output
    """
    # Convert retrieved IDs to strings for consistent comparison
    retrieved_ids = [str(doc.metadata.get('id', '')) for doc in retrieved_docs[:k]]
    
    if debug:
        print(f"Relevant ID: {relevant_doc_id}")
        print(f"Retrieved IDs: {retrieved_ids}")
    
    metrics = {
        'hit@k': 0,
        'mrr': 0,
        'rank': k + 1
    }
    
    if relevant_doc_id in retrieved_ids:
        rank = retrieved_ids.index(relevant_doc_id) + 1
        metrics['hit@k'] = 1
        metrics['mrr'] = 1.0 / rank
        metrics['rank'] = rank
    
    return metrics

def evaluate_retriever(text_retriever, test_cases: List[Dict], k: int = 5):
    """
    Evaluate text retriever with debugging information
    """
    results = {
        'text': {'hits': 0, 'mrr': [], 'ranks': [], 'queries': []}
    }
    
    # Print first few retrievals for debugging
    debug_count = 2  # Number of queries to debug in detail
    
    for i, test_case in enumerate(tqdm(test_cases, desc="Evaluating text retriever")):
        query = test_case['query']
        relevant_id = test_case['id']
        
        debug = (i < debug_count)  # Debug only first few queries
        
        # Test text retriever
        try:
            text_docs = text_retriever.get_relevant_documents(query)
            if debug:
                print_retrieval_debug(query, relevant_id, text_docs, "Text Retriever")
            
            text_metrics = calculate_metrics(text_docs, relevant_id, k, debug)
            results['text']['hits'] += text_metrics['hit@k']
            results['text']['mrr'].append(text_metrics['mrr'])
            results['text']['ranks'].append(text_metrics['rank'])
            results['text']['queries'].append({
                'query': query,
                'success': text_metrics['hit@k'] == 1
            })
        except Exception as e:
            print(f"Error in text retriever: {e}")
    
    return results

def format_results(results: Dict, n_queries: int):
    """
    Format and print evaluation results
    """
    print("\nEvaluation Results:")
    print("-" * 50)
    
    for retriever_name, metrics in results.items():
        hit_rate = (metrics['hits'] / n_queries) * 100
        mrr = np.mean(metrics['mrr'])
        median_rank = np.median(metrics['ranks'])
        
        print(f"\n{retriever_name.upper()} Retriever:")
        print(f"Hit@5 Rate: {hit_rate:.2f}%")
        print(f"Mean Reciprocal Rank: {mrr:.3f}")
        print(f"Median Rank: {median_rank:.1f}")
        
        # Show some example successful and failed queries
        successful_queries = [q['query'] for q in metrics['queries'] if q['success']]
        failed_queries = [q['query'] for q in metrics['queries'] if not q['success']]
        
        print(f"\nTotal Successful Queries: {len(successful_queries)}")
        print(f"Total Failed Queries: {len(failed_queries)}")

In [None]:
# Load test cases
test_cases = load_test_data('/kaggle/working/cleaned_dataset.csv', n_test_cases=20)

# Evaluate retrievers
results = evaluate_retrievers(retriever, retriever, test_cases, k=5)

# Format and print results
format_results(results, n_queries=len(test_cases))

## Retriever

In [1]:
%pip install langchain_community
%pip install langchain-chroma
%pip install -qU langchain-huggingface

Collecting langchain_community
  Downloading langchain_community-0.3.14-py3-none-any.whl.metadata (2.9 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.14 (from langchain_community)
  Downloading langchain-0.3.14-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.29 (from langchain_community)
  Downloading langchain_core-0.3.29-py3-none-any.whl.metadata (6.3 kB)
Collecting langsmith<0.3,>=0.1.125 (from langchain_community)
  Downloading langsmith-0.2.10-py3-none-any.whl.metadata (14 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.3 (from langchain<0.4.0,>=0.3.14->langchain_community)
  Downloading langchain_text_splitters-0.3.4-py3-none-any.whl.metadata (2.3 kB)
Collecting packaging<25,>=23.2 (from langchai