# PaperFinder
A scientific article recommendation system using Sentence Transformers and cosine similarity.

# Install all requirements

In [None]:
!pip install pandas numpy sentence_transformers rich
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

# Import necessary libraries

In [None]:
import torch
import pandas as pd
import ast
import numpy as np
from sentence_transformers import SentenceTransformer, util
from rich.console import Console
from rich.table import Table
from rich import box

# Determine whether to use GPU or CPU

In [None]:
# Pick the compute device once: CUDA when PyTorch reports a usable GPU,
# otherwise the CPU. Later cells read this module-level `device`.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Shared rich console used for all formatted output in this notebook.
console = Console()
console.print(f"[bold green]Using device: {device}[/bold green]")

# Load and preprocess dataset

In [None]:
# Read the dataset, discard rows that have no summary, and renumber the rows
# from 0 so positional indices line up with the filtered frame.
df = (
    pd.read_csv('arXiv_scientific dataset.csv')
    .dropna(subset=['summary'])
    .reset_index(drop=True)
)

# Clean summary text by removing unnecessary whitespace and line breaks

In [None]:
# Collapse every run of whitespace (spaces, tabs, newlines, carriage returns)
# into a single space, then trim the ends. A single regex pass is used because
# replacing '  ' with ' ' once still leaves double spaces behind whenever a
# run contains three or more whitespace characters.
df['summary'] = (
    df['summary']
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# Load the SentenceTransformer model for semantic similarity search

In [None]:
# Instantiate the sentence-embedding model on the previously chosen `device`.
model = SentenceTransformer(
    'paraphrase-multilingual-MiniLM-L12-v2',
    device=device,
)

# Encode summaries into dense vector representations

In [None]:
# Embed every summary in one call. The result is requested as a tensor
# (convert_to_tensor=True) so it can be compared against query embeddings later.
summaries = df['summary'].tolist()
desc_embeddings = model.encode(
    summaries,
    convert_to_tensor=True,
    device=device,
)

In [None]:
def suggest_article(query, model, desc_embeddings, df, top_n=5):
    """Return the ``top_n`` articles whose summaries best match ``query``.

    The query is embedded with ``model`` and compared to ``desc_embeddings``
    by cosine similarity. Rows are looked up positionally, so this assumes
    row ``i`` of ``desc_embeddings`` was computed from ``df.iloc[i]``.

    Args:
        query: Free-text search string.
        model: Object exposing ``encode(text, convert_to_tensor=..., device=...)``.
        desc_embeddings: Embeddings of the article summaries.
        df: DataFrame holding at least the columns ``title``, ``category``,
            ``published_date``, ``authors`` and ``summary``.
        top_n: Maximum number of articles to return. Values larger than the
            number of embedded articles are clamped; values <= 0 yield an
            empty result.

    Returns:
        A copy of the selected rows (the five columns listed above), ordered
        from best to worst match, with an added ``match_percentage`` column
        equal to the cosine similarity multiplied by 100.

    Raises:
        KeyError: If ``df`` lacks one of the required columns.
    """
    # NOTE: relies on the module-level `device` defined earlier in the notebook.
    query_embedding = model.encode(query, convert_to_tensor=True, device=device)
    # cos_sim returns a (1, n) matrix for a single query; keep the only row.
    scores = util.cos_sim(query_embedding, desc_embeddings)[0].cpu().numpy()

    # Never request more results than there are candidates: argpartition
    # raises ValueError when kth is out of bounds.
    k = min(top_n, scores.shape[0])
    if k > 0:
        # Partition once so the k best scores occupy the first k slots
        # (unordered), then order just those k by descending similarity.
        candidates = np.argpartition(-scores, k - 1)[:k]
        top_indices = candidates[np.argsort(-scores[candidates])]
    else:
        top_indices = np.empty(0, dtype=np.intp)

    columns = ['title', 'category', 'published_date', 'authors', 'summary']
    suggested_df = df.iloc[top_indices][columns].copy()
    suggested_df['match_percentage'] = scores[top_indices] * 100

    return suggested_df

In [None]:
def _format_authors(raw):
    """Render a raw ``authors`` cell as a comma-separated string."""
    if isinstance(raw, (list, tuple, set)):
        names = raw
    elif isinstance(raw, str):
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError):
            # Not a Python literal (e.g. a plain "A. Author" string): show as-is.
            return raw
        # A lone literal such as "'A. Author'" must not be joined per character.
        names = parsed if isinstance(parsed, (list, tuple, set)) else [parsed]
    else:
        # None / NaN: there is no author text to show.
        return 'N/A'
    return ", ".join(str(name) for name in names)


def _plain_cell(value):
    """Convert a scalar cell to text, mapping None/NaN to ``'N/A'``."""
    # `value != value` is only true for NaN.
    if value is None or (isinstance(value, float) and value != value):
        return 'N/A'
    return str(value)


def display_suggestions(suggestions: pd.DataFrame):
    """Print the recommended articles as a rich table on the shared console.

    Args:
        suggestions: One row per article. The columns ``title``, ``category``,
            ``published_date``, ``authors``, ``summary`` and
            ``match_percentage`` are read; a missing column falls back to a
            placeholder value.

    Dataset text is escaped before rendering so that square brackets inside
    titles or summaries are shown literally instead of being parsed as rich
    markup. Missing (None/NaN) cells are shown as ``N/A``.
    """
    # Local import keeps this cell self-contained; rich is already a dependency.
    from rich.markup import escape

    table = Table(
        title="[bold bright_blue]Recommended articles[/bold bright_blue]",
        title_style="bold underline",
        box=box.DOUBLE_EDGE,
        border_style="bright_green",
        show_lines=True,
        padding=(0, 1)
    )
    table.add_column("Title", style="bold cyan", overflow="fold")
    table.add_column("Category", style="green", overflow="fold", justify="left")
    table.add_column("Published date", style="magenta")
    table.add_column("Authors", style="yellow", overflow="fold")
    table.add_column("Summary", style="orchid", overflow="fold")
    table.add_column("Match", style="bright_red")

    for _, row in suggestions.iterrows():
        title = escape(_plain_cell(row.get('title', 'N/A')))
        table.add_row(
            f"[bold]{title}[/bold]",
            escape(_plain_cell(row.get('category', 'N/A'))),
            escape(_plain_cell(row.get('published_date', 'N/A'))),
            escape(_format_authors(row.get('authors', '[]'))),
            escape(_plain_cell(row.get('summary', 'N/A'))),
            f"{row.get('match_percentage', 0):.2f}%"
        )

    console.print(table)

# Interactive loop for querying articles

In [None]:
# Keep prompting for queries until the user types 'exit' (any letter case,
# surrounding whitespace ignored) or input ends.
while True:
    try:
        query = input("Enter query (or 'exit' to stop): ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the cell was interrupted: stop without a traceback.
        break

    if query.lower() == 'exit':
        break
    if not query:
        # A blank line carries nothing to search for; prompt again.
        continue

    suggestions = suggest_article(query, model, desc_embeddings, df, top_n=5)
    display_suggestions(suggestions)