# In [8]:
import os
import re
import random
import PyPDF2
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
import spacy
import numpy as np
from numpy import dot
from numpy.linalg import norm
import pickle
import time
from google import genai
from langchain.embeddings.base import Embeddings
from google.genai import types

class GeminiEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter that delegates to the Gemini embedding API.

    Both methods send their requests through the module-level ``client``,
    which therefore has to exist by the time a method is called.
    """

    def embed_query(self, text: str) -> list[float]:
        """Return the embedding values Gemini produces for a single ``text``."""
        response = client.models.embed_content(
            contents=text,
            model="gemini-embedding-exp-03-07",
        )
        first_embedding = response.embeddings[0]
        return first_embedding.values

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed every text with its own request, keeping the input order."""
        vectors = []
        for text in texts:
            vectors.append(self.embed_query(text))
        return vectors


# Folder that holds the reference papers as .txt files; build_vector_store
# also looks for the cached embeddings file inside this folder.
txt_folder = "Enter your file path here"
# Gemini API key (placeholder: replace before running, and avoid committing a real key).
gemini_api_key_lab="Enter your API key here"
# Shared Gemini client used for every embedding request in this file.
client = genai.Client(api_key=gemini_api_key_lab)


# spaCy English pipeline, loaded once and reused for sentence splitting.
nlp = spacy.load("en_core_web_sm")
def split_sentences_with_spacy(text):
    """Split ``text`` into sentences with the module-level spaCy pipeline.

    Returns a list holding one whitespace-stripped string per sentence that
    spaCy detects, in document order.
    """
    stripped_sentences = []
    for span in nlp(text).sents:
        stripped_sentences.append(span.text.strip())
    return stripped_sentences

def extract_text_from_pdf(pdf_folder):
    """Return the text of one PDF, truncated before its trailing back matter.

    ``pdf_folder`` is, despite its name, the path of a single PDF file.
    The text of all pages is concatenated without a separator. For each
    known section marker the last occurrence is located, and the text is
    cut at the earliest of those positions. When no marker occurs, the
    complete text is returned.
    """
    with open(pdf_folder, "rb") as handle:
        # Pages must be read while the file is still open.
        page_texts = [page.extract_text() for page in PyPDF2.PdfReader(handle).pages]
    text = "".join(page_texts)

    section_markers = ("references", "References", "Acknowledgements", "REFERENCES")
    last_hits = [text.rfind(marker) for marker in section_markers]
    found = [position for position in last_hits if position != -1]
    if not found:
        return text
    return text[:min(found)]
    
def extract_text_from_txt(txt_folder):
    """Return the content of one UTF-8 text file, truncated before its back matter.

    ``txt_folder`` is, despite its name, the path of a single .txt file.
    For each known section marker the last occurrence is located, and the
    text is cut at the earliest of those positions. When no marker occurs,
    the complete text is returned.
    """
    with open(txt_folder, "r", encoding='UTF-8') as source:
        text = source.read()

    section_markers = ("references", "References", "Acknowledgements", "REFERENCES")
    last_hits = [text.rfind(marker) for marker in section_markers]
    found = [position for position in last_hits if position != -1]
    if not found:
        return text
    return text[:min(found)]


def extract_numbers(file_name):
    """Parse the leading ``<digits>-<digits>`` pair of a file name.

    Returns the two numbers as a tuple of ints. When the name does not
    start with such a pair, ``(inf, inf)`` is returned instead.
    """
    leading_pair = re.match(r"(\d+)-(\d+)", file_name)
    if leading_pair is None:
        return float('inf'), float('inf')
    first, second = leading_pair.groups()
    return int(first), int(second)

def process_reference_papers(pdf_folder):
    """Load every ``.pdf`` file in ``pdf_folder`` as a ``Document``.

    Each file is read with ``extract_text_from_pdf`` (which drops the
    trailing reference section) and wrapped in a ``Document`` whose
    ``source`` metadata is the file name.

    Returns:
        The list of created documents; empty when the folder holds no
        ``.pdf`` files.
    """
    documents = []
    for filename in os.listdir(pdf_folder):
        if not filename.endswith(".pdf"):
            continue
        filepath = os.path.join(pdf_folder, filename)
        print(filename)
        text = extract_text_from_pdf(filepath)
        documents.append(Document(page_content=text, metadata={"source": filename}))
    # The list was previously built but never returned, so callers got None.
    return documents

def process_reference_papers_txt(txt_folder):
    """Load every ``.txt`` file in ``txt_folder`` as a ``Document``.

    Each file is read with ``extract_text_from_txt`` and wrapped in a
    ``Document`` whose ``source`` metadata is the file name. The resulting
    list is returned (empty when no ``.txt`` file is present).
    """
    documents = []
    txt_names = (name for name in os.listdir(txt_folder) if name.endswith(".txt"))
    for filename in txt_names:
        print(filename)
        body = extract_text_from_txt(os.path.join(txt_folder, filename))
        documents.append(Document(page_content=body, metadata={"source": filename}))
    return documents

def preprocess_introduction(intro_text):
    """Replace numeric citation markers with numbered placeholders.

    Every bracketed citation such as ``[3]``, ``[1,4]`` or ``[2-5]`` (a
    hyphen or an en dash may denote a range) is replaced, in order of
    appearance, by ``[ref1]``, ``[ref2]``, ...

    Returns:
        A ``(cleaned_intro, ground_truth)`` tuple where ``ground_truth``
        maps each placeholder to the sorted list of reference numbers the
        original marker cited (ranges expanded inclusively).
    """
    citation_pattern = r"\[(\d+(?:[-\u2013]\d+)?(?:,\s*\d+(?:[-\u2013]\d+)?)*)\]"

    def expand_range(range_str):
        # Normalise en dashes to hyphens and drop spaces before splitting.
        normalized = range_str.replace('\u2013', '-').replace(' ', '')
        expanded = set()
        for token in normalized.split(','):
            if '-' not in token:
                expanded.add(int(token))
                continue
            low, high = map(int, token.split('-'))
            expanded.update(range(low, high + 1))
        return sorted(expanded)

    ground_truth = {}

    def to_placeholder(match):
        # Placeholders are numbered by how many citations were seen so far.
        placeholder = f"[ref{len(ground_truth) + 1}]"
        ground_truth[placeholder] = expand_range(match.group(1))
        return placeholder

    cleaned_intro = re.sub(citation_pattern, to_placeholder, intro_text)
    return cleaned_intro, ground_truth

# Utility function for cosine similarity
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Both inputs are converted to arrays and flattened to one dimension
    first. If either vector has zero norm the result is ``0``.
    """
    left = np.asarray(vec1).ravel()
    right = np.asarray(vec2).ravel()

    denominator = np.linalg.norm(left) * np.linalg.norm(right)
    # Guard against division by zero for all-zero vectors.
    return np.dot(left, right) / denominator if denominator != 0 else 0

# Save and Load Embeddings
def save_embeddings(documents, file_name=".pkl"): #embedding file name insert
    """Pickle ``documents`` into ``file_name``, overwriting any existing file.

    The default ``".pkl"`` is a placeholder file name meant to be replaced
    with the real embeddings file name.
    """
    with open(file_name, mode="wb") as sink:
        pickle.dump(documents, sink)

def load_embeddings(file_name=".pkl"): #embedding file name insert
    """Unpickle and return the object stored in ``file_name``.

    Raises ``FileNotFoundError`` when the file does not exist. Unpickling
    can execute arbitrary code, so only load files that this script (or
    another trusted source) wrote.
    """
    with open(file_name, mode="rb") as source:
        cached = pickle.load(source)
    return cached

# Phase 2: Build Vector Store
def build_vector_store(documents, embedding_file=".pkl", chunk_size=1000, chunk_overlap=200):  #embedding file name insert
    """Create a FAISS vector store from documents, caching Gemini embeddings on disk.

    The cache file lives at ``os.path.join(txt_folder, embedding_file)``.
    When it exists, the pickled chunks (each carrying its vector under
    ``metadata["embedding"]``) are reused. Otherwise ``documents`` are split
    into chunks, each chunk is embedded through the Gemini API, and the
    result is written to that same path.

    Args:
        documents: Documents to split and index (only used when no cache
            file is found).
        embedding_file: File name of the embeddings cache, resolved
            relative to the module-level ``txt_folder``.
        chunk_size: Maximum chunk size handed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        A FAISS vector store built from the pre-computed chunk embeddings.
    """
    # Load and save must agree on the location. The cache used to be read
    # from ``txt_folder`` but written to the working directory, so a freshly
    # written cache was never found on the next run.
    cache_path = os.path.join(txt_folder, embedding_file)

    try:
        # NOTE: the cache is a pickle file; only use caches from a trusted source.
        split_documents = load_embeddings(cache_path)
        print(" Loaded embeddings from file.")
    except FileNotFoundError:
        print(" Embeddings file not found. Creating embeddings...")

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        split_documents = text_splitter.split_documents(documents)

        _embed_chunks(split_documents)
        save_embeddings(split_documents, cache_path)

    texts = [doc.page_content for doc in split_documents]
    metadatas = [doc.metadata for doc in split_documents]
    embeddings = [doc.metadata["embedding"] for doc in split_documents]

    text_embedding_pairs = list(zip(texts, embeddings))

    # The embedding model is only needed to embed queries later on; the
    # chunk vectors themselves are passed in pre-computed.
    embedding_model = GeminiEmbeddings()

    vector_store = FAISS.from_embeddings(
        text_embeddings=text_embedding_pairs,
        embedding=embedding_model,
        metadatas=metadatas
    )

    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print(" Total documents in vector store:", vector_store.index.ntotal)
    unique_sources = {doc.metadata.get("source", "N/A") for doc in split_documents}
    print(" Unique document sources:", len(unique_sources))

    return vector_store


def _embed_chunks(split_documents, fallback_dim=768):
    """Store a Gemini embedding in each chunk's ``metadata["embedding"]``, in place.

    A chunk whose request fails gets an all-zero vector, so one bad chunk
    does not abort the whole run. The zero vector takes its length from the
    embeddings that did succeed, because vectors of different lengths cannot
    be combined into one index. ``fallback_dim`` is only used when no request
    succeeded at all.
    """
    total = len(split_documents)
    failed_docs = []
    observed_dim = None

    for idx, doc in enumerate(split_documents):
        try:
            result = client.models.embed_content(
                model="gemini-embedding-exp-03-07",
                contents=doc.page_content
            )
            vector = result.embeddings[0].values
        except Exception as e:
            # Best effort: report the failure and fill in a placeholder below.
            print(f" Error embedding document {idx + 1}: {e}")
            failed_docs.append(doc)
            continue

        doc.metadata["embedding"] = vector
        if observed_dim is None:
            observed_dim = len(vector)
        print(f" Generated embedding for chunk {idx + 1}/{total}")
        # Short pause between successful requests to stay gentle on the API.
        time.sleep(0.5)

    placeholder_dim = observed_dim if observed_dim is not None else fallback_dim
    for doc in failed_docs:
        doc.metadata["embedding"] = [0.0] * placeholder_dim

# Phase 3: Reference Prediction
def predict_references_with_RAG(vector_store, cleaned_intro, k=60, max_refs=5):
    """Predict references for each ``[refN]`` placeholder in the introduction using RAG.

    For every placeholder, the sentence context around it is used as a
    similarity query against ``vector_store``. The distinct ``source``
    values of the hits are collected in ranking order.

    Args:
        vector_store: Store exposing ``similarity_search(query, k=...)``
            whose hits carry a ``"source"`` entry in their metadata.
        cleaned_intro: Introduction text with ``[refN]`` placeholders.
        k: Number of chunks to retrieve per placeholder.
        max_refs: Maximum number of distinct sources kept per placeholder.

    Returns:
        Dict mapping each placeholder to its list of predicted sources.
    """
    faiss_references_dict = {}

    for placeholder in re.findall(r"\[ref\d+\]", cleaned_intro):
        query_text = extract_context_for_ref(cleaned_intro, placeholder, sentences_before=1, sentences_after=0)

        unique_references = []
        seen = set()

        # An empty query means no context was found for the placeholder, so
        # there is nothing meaningful to search for; skip the embedding call.
        all_documents = vector_store.similarity_search(query_text, k=k) if query_text else []

        for doc in all_documents:
            if len(unique_references) >= max_refs:
                break  # enough distinct sources collected
            source = doc.metadata["source"]
            if source not in seen:
                unique_references.append(source)
                seen.add(source)

        faiss_references_dict[placeholder] = unique_references
        print(f"FAISS references for {placeholder}: {unique_references}")

    return faiss_references_dict

# Placeholder context extraction
def extract_context_for_ref(cleaned_intro, placeholder, sentences_before=1, sentences_after=0):
    """Return the sentence window around the first sentence containing ``placeholder``.

    The introduction is split into sentences; the window spans
    ``sentences_before`` sentences ahead of the matching one and
    ``sentences_after`` sentences behind it (clipped to the text bounds).
    The window's sentences are joined with ``'. '``. If no sentence
    contains the placeholder, a warning is printed and ``""`` is returned.
    """
    sentences = split_sentences_with_spacy(cleaned_intro)
    hit_index = next(
        (index for index, sentence in enumerate(sentences) if placeholder in sentence),
        None,
    )
    if hit_index is None:
        print(f"Warning: Placeholder {placeholder} not found in any sentence.")
        return ""

    first = max(0, hit_index - sentences_before)
    last = min(len(sentences), hit_index + sentences_after + 1)
    window = sentences[first:last]
    return '. '.join(window).strip()


def evaluate_predictions(ground_truth, faiss_references_dict):
    """Evaluate predictions against ground truth and calculate the overall score.

    Args:
        ground_truth: Dict mapping each placeholder to the reference
            numbers it really cites.
        faiss_references_dict: Dict mapping each placeholder to predicted
            source names of the form ``<prefix>-<number>.<ext>``; the
            number after the last ``-`` is the predicted reference number.

    Returns:
        ``(detailed_report, overall_score)``. The report has one dict per
        placeholder with the keys ``"Placeholder"``, ``"Ground Truth"`` and
        ``"predicted_refs"`` (both values are sets of ints). The score is
        the number of matched references divided by the number of
        ground-truth references, or ``0`` when there are none.
    """
    total_matched_refs = 0
    total_ground_truth_refs = 0
    detailed_report = []

    for placeholder, true_refs in ground_truth.items():
        true_refs = set(true_refs)

        predicted_refs = set()
        for source in faiss_references_dict.get(placeholder, []):
            ref_number = _reference_number_from_source(source)
            if ref_number is None:
                # A name without a trailing number cannot match any
                # reference, so skip it instead of aborting the evaluation.
                print(f"Warning: could not parse a reference number from source {source!r}; skipping.")
                continue
            predicted_refs.add(ref_number)

        # Count matched references directly
        total_matched_refs += len(predicted_refs & true_refs)
        total_ground_truth_refs += len(true_refs)

        detailed_report.append({
            "Placeholder": placeholder,
            "Ground Truth": true_refs,
            "predicted_refs": predicted_refs
        })

    # Calculate overall score as total matched references / total ground truth references
    overall_score = total_matched_refs / total_ground_truth_refs if total_ground_truth_refs > 0 else 0

    return detailed_report, overall_score


def _reference_number_from_source(source):
    """Return the integer after the last ``-`` of a source name, ignoring its extension.

    Works for any extension (``.txt``, ``.pdf``, ...). Returns ``None`` when
    the trailing part is not an integer.
    """
    stem = os.path.splitext(source)[0]
    tail = stem.rsplit("-", 1)[-1]
    try:
        return int(tail)
    except ValueError:
        return None


# Phase 7: Main Execution

# Step 1: Process reference papers
# Every .txt file in txt_folder becomes one Document (reference section removed).
print("Processing reference papers...")
documents = process_reference_papers_txt(txt_folder)

# Step 2: Build vector store
# Uses a cached embeddings file when one is found; otherwise the chunks are
# embedded through the Gemini API.
print("Building vector store...")
vector_store = build_vector_store(documents)

# Step 3: Input introduction section
# Introduction text under evaluation. Bracketed numbers such as [6,25,29,70]
# are the citation markers that preprocess_introduction extracts as ground truth.
# NOTE(review): the spacing/hyphenation irregularities look like PDF-extraction
# artefacts and are part of the test input; confirm before "cleaning" them.
intro_text = """     The term “globalisation ” is often used in a similar way , indeed almost interchangeably 
,to internationalisation when discussing higher education policy and
trends.Some authors,however,seek to make a distinction between the two,seeing
internationalisation as a contemporary expression of internationalism ,
encompassing responses to the “forces ” of globalisation , which are viewed as
being far from benign in nature[6,25,29,70].
Inpractice, some limit-ations are normally placed upon the search,such asinterms of the date,
place , and language of publication . In this case, the search was limited to items
published inthe English language ,and there was aparticular focus on most recent
publications , given that a related systematic review[73] had recently been undertaken.
The items identi ﬁed were
then checked for relevance ;where relevant ,copies were obtained for scrutiny and
analysis,with any additional items identiﬁed through the irreferencesfol-lowedup.
The analysis presented in this article builds on and extends an earlier review
of research on globalisation and internationalisation in higher education[73].
Related analyses have also been carried out by others: for example,Kos-mützkyand
Putty ’s review of the literature on transnational ,oﬀshore ,cross -border ,
and borderless higher education[46]; Bedenlier et al.’s examination of 2
decades ofresearch into the internationalisation of higher education published in
the Journal of Studies in International Education [11];

"""
    
# Replace each citation marker with a [refN] placeholder and keep the cited
# reference numbers per placeholder as ground truth.
cleaned_intro, ground_truth = preprocess_introduction(intro_text)

# Step 4: Predict references
# One similarity query per [refN] placeholder; the result maps each
# placeholder to its predicted source file names.
print("Predicting references...")
faiss_references_dict = predict_references_with_RAG(vector_store, cleaned_intro)

# Step 5: Assess predictions (if in assessment mode)
# overall_score = matched references / ground-truth references.
print("Assessing predictions...")
detailed_report, overall_score = evaluate_predictions(ground_truth, faiss_references_dict)

# Output results
# One report line per placeholder, then the overall score as a percentage.
print("\nDetailed Report:")
for item in detailed_report:
    print(item)
print(f"\noverall_score: {overall_score*100:.2f}%")
