In [None]:
import re
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Location of the arXiv CSV on the mounted Google Drive.
file_path = "/content/drive/MyDrive/Colab Notebooks/arxiv_data.csv"
df = pd.read_csv(file_path)

# Display basic information and first few rows.
# `df.info()` prints its report and returns None, so it is called as a plain
# statement. Bundling it into a tuple with `df.head()` would display a stray
# `None` and fall back to the plain-text repr of the frame.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51774 entries, 0 to 51773
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   titles     51774 non-null  object
 1   summaries  51774 non-null  object
 2   terms      51774 non-null  object
dtypes: object(3)
memory usage: 1.2+ MB


(None,
                                               titles  \
 0  Survey on Semantic Stereo Matching / Semantic ...   
 1  FUTURE-AI: Guiding Principles and Consensus Re...   
 2  Enforcing Mutual Consistency of Hard Regions f...   
 3  Parameter Decoupling Strategy for Semi-supervi...   
 4  Background-Foreground Segmentation for Interio...   
 
                                            summaries  \
 0  Stereo matching is one of the widely used tech...   
 1  The recent advancements in artificial intellig...   
 2  In this paper, we proposed a novel mutual cons...   
 3  Consistency training has proven to be an advan...   
 4  To ensure safety in automated driving, the cor...   
 
                          terms  
 0           ['cs.CV', 'cs.LG']  
 1  ['cs.CV', 'cs.AI', 'cs.LG']  
 2           ['cs.CV', 'cs.AI']  
 3                    ['cs.CV']  
 4           ['cs.CV', 'cs.LG']  )

In [None]:
# Take a smaller subset for now (like 10000 papers).
# Sampling only happens when the frame is larger than the target size:
#   * `DataFrame.sample` raises ValueError when `n` exceeds the row count
#     (sampling without replacement), and
#   * skipping the call on an already-reduced frame keeps this cell
#     idempotent -- re-running it would otherwise reshuffle the rows.
SAMPLE_SIZE = 10000
if len(df) > SAMPLE_SIZE:
    df = df.sample(n=SAMPLE_SIZE, random_state=42)
df = df.reset_index(drop=True)

In [None]:
# Stopword vocabulary: common English function words followed by a handful of
# terms that show up in almost every abstract ("paper", "method", ...).
custom_stopwords = {
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself",
    "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
    "what", "which", "who", "whom", "this", "that", "these", "those",
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing",
    "a", "an", "the", "and", "but", "if", "or", "because", "as",
    "until", "while", "of", "at", "by", "for", "with", "about",
    "against", "between", "into", "through", "during", "before", "after",
    "above", "below", "to", "from", "up", "down", "in", "out", "on", "off",
    "over", "under", "again", "further", "then", "once", "here", "there",
    "when", "where", "why", "how", "all", "any", "both", "each", "few",
    "more", "most", "other", "some", "such", "no", "nor", "not", "only",
    "own", "same", "so", "than", "too", "very", "s", "t", "can", "will",
    "just", "don", "should", "now",
    # Domain-specific filler words
    "paper", "approach", "propose", "method", "methods", "results",
    "present", "performance", "problem", "data", "using", "use", "model",
}


# Text preprocessing without NLTK
def preprocess_text_basic(text):
    """Turn raw text into a space-separated string of content words.

    The text is lowercased, every character other than a-z and whitespace is
    deleted (so digits and punctuation vanish and hyphenated words fuse),
    the result is split on whitespace, and tokens found in
    `custom_stopwords` are dropped.

    Args:
        text: The string to clean.

    Returns:
        The surviving tokens joined by single spaces; "" if none survive.
    """
    lowered = text.lower()
    letters_only = re.sub(r"[^a-z\s]", "", lowered)
    return " ".join(
        token for token in letters_only.split() if token not in custom_stopwords
    )

# Clean both text columns with the same routine; the results go into new
# "processed_*" columns so the raw text stays available.
for source_column in ("titles", "summaries"):
    df[f"processed_{source_column}"] = df[source_column].apply(preprocess_text_basic)

# Peek at the cleaned text
df[["processed_titles", "processed_summaries"]].head()

Unnamed: 0,processed_titles,processed_summaries
0,enforcing geometric constraints virtual normal...,monocular depth prediction plays crucial role ...
1,chart autoencoders manifold structured,deep generative models made tremendous advance...
2,saso joint d semanticinstance segmentation via...,novel d point cloud segmentation framework nam...
3,revisiting knowledge transfer training object ...,revisit knowledge transfer training object det...
4,selfattention based bilstmcnn classifier predi...,heart failure major component healthcare expen...


In [None]:
# Combine processed titles and summaries into one document per paper
df["combined_text"] = df["processed_titles"] + " " + df["processed_summaries"]

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,   # limit the vocabulary to the top 10000 terms
    ngram_range=(1, 3),   # unigrams, bigrams and trigrams
    sublinear_tf=True,
    norm='l2',
    min_df=2,
    max_df=0.8
)

# Fit and transform the combined text (result is a sparse matrix)
tfidf_matrix = tfidf_vectorizer.fit_transform(df["combined_text"])

# Optional DataFrame view for inspection. Built directly from the sparse
# matrix: calling `.toarray()` here would allocate a dense
# (documents x features) float array -- roughly 800 MB at 10000 x 10000 --
# only to hold mostly zeros.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    tfidf_matrix,
    columns=tfidf_vectorizer.get_feature_names_out(),
)

In [None]:
# Vocabulary terms of the fitted TF-IDF model, used below as column labels
feature_names = tfidf_vectorizer.get_feature_names_out()

# Densify only the first 5 documents so the scores can be shown as a table
sample_indices = list(range(5))
tfidf_sample = tfidf_matrix[sample_indices].toarray()
tfidf_sample_df = pd.DataFrame(data=tfidf_sample, columns=feature_names)

# Show only the first 10 vocabulary columns for readability
tfidf_sample_df.iloc[:, :10]

Unnamed: 0,abdominal,abilities,ability,ability capture,ability generate,ability learn,ablation,ablation studies,ablation study,able
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.064116
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.084648


In [None]:
# Top-scoring TF-IDF terms of a single document
def get_top_tfidf_words(doc_index, top_n=5):
    """Return the highest-weighted TF-IDF terms of one document.

    Reads the module-level `tfidf_matrix` and `feature_names`.

    Args:
        doc_index: Row of `tfidf_matrix` to inspect.
        top_n: Maximum number of terms to return.

    Returns:
        A list of (term, score) tuples in decreasing score order. Terms with
        a score of zero are left out, so the list may be shorter than `top_n`.
    """
    scores = tfidf_matrix[doc_index].toarray().ravel()  # sparse row -> 1-D array
    ranked = np.argsort(scores)[::-1][:top_n]           # best-scoring columns first

    top_words = []
    for idx in ranked:
        if scores[idx] > 0:  # skip terms that do not occur in the document
            top_words.append((feature_names[idx], scores[idx]))
    return top_words

# Collect the top terms of the first 5 documents under 1-based "Paper N" labels
top_tfidf_words = {}
for paper_idx in range(5):
    top_tfidf_words[f"Paper {paper_idx + 1}"] = get_top_tfidf_words(paper_idx)

# Display results
top_tfidf_words

{'Paper 1': [('geometric constraints', np.float64(0.3166927951567094)),
  ('depth prediction', np.float64(0.29376547100252565)),
  ('depth', np.float64(0.20421780287243982)),
  ('normal', np.float64(0.19891373640945037)),
  ('geometric', np.float64(0.19123321071633861))],
 'Paper 2': [('cae', np.float64(0.33879368694080136)),
  ('manifold', np.float64(0.2604811263804938)),
  ('flat', np.float64(0.21134078217727334)),
  ('latent space', np.float64(0.19975830307067174)),
  ('proximity', np.float64(0.19441486267669666))],
 'Paper 3': [('instance segmentation', np.float64(0.19994373975363)),
  ('segmentation task', np.float64(0.18249761293666444)),
  ('clustering', np.float64(0.17896265698296684)),
  ('association', np.float64(0.17625536332188196)),
  ('segmentation tasks', np.float64(0.1700385748550361))],
 'Paper 4': [('knowledge transfer', np.float64(0.2762324679932268)),
  ('objectness', np.float64(0.2020168306270958)),
  ('weakly supervised', np.float64(0.20146968989692426)),
  ('weak

In [None]:
# Compute pairwise cosine similarity between papers.
# Only `tfidf_matrix` is passed, so each row is compared with every row of the
# same matrix: entry [i, j] is the similarity between paper i and paper j.
# NOTE(review): the result is presumably a dense n x n array (about 800 MB of
# float64 for 10000 papers) -- confirm memory headroom before scaling up.
similarity_matrix = cosine_similarity(tfidf_matrix)

In [None]:
def get_similar_papers(paper_index, top_n=5):
    """Return the papers most similar to the paper at `paper_index`.

    Reads the module-level `similarity_matrix`.

    Args:
        paper_index: Row of `similarity_matrix` for the query paper.
        top_n: Maximum number of neighbours to return.

    Returns:
        A list of (index, score) tuples in decreasing score order. The query
        paper itself is never included.
    """
    sim_scores = similarity_matrix[paper_index]

    # Resolve a negative index so the self-exclusion below compares like with like.
    self_index = paper_index if paper_index >= 0 else paper_index + len(sim_scores)

    ranked = sim_scores.argsort()[::-1]
    # Exclude the query paper explicitly. Dropping position 0 instead assumes
    # the paper always ranks first, which fails when another paper ties with
    # it (e.g. a duplicate abstract) -- the paper would then be returned as
    # its own neighbour and a genuine match would be skipped.
    ranked = ranked[ranked != self_index][:top_n]
    return [(i, sim_scores[i]) for i in ranked]

In [None]:
def recommend_papers_partial(query, top_n=5, boost_per_term=0.05, max_boost=2.0):
    """Recommend papers for a free-text query using boosted TF-IDF similarity.

    The query is cleaned with `preprocess_text_basic`, vectorized with the
    fitted `tfidf_vectorizer`, and compared with every row of `tfidf_matrix`
    by cosine similarity. Each score is then multiplied by a boost that grows
    with the number of times the query's terms occur in the document.

    Args:
        query: Query text (e.g. a title plus abstract).
        top_n: Number of recommendations to return.
        boost_per_term: Amount added to the multiplier per query-term
            occurrence in a document.
        max_boost: Upper bound on the multiplier, so that keyword counts
            cannot dominate the similarity score.

    Returns:
        A list of (title, boosted_score) tuples in decreasing score order.
    """
    processed_query = preprocess_text_basic(query)
    query_vector = tfidf_vectorizer.transform([processed_query])
    sim_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    query_terms = set(processed_query.split())

    def keyword_boost(doc):
        # Count every token once, then look up each query term, instead of
        # rescanning the whole document for each term.
        term_counts = Counter(doc.split())
        match_count = sum(term_counts[term] for term in query_terms)
        # Cap the boost to prevent domination by raw keyword frequency.
        return min(1 + boost_per_term * match_count, max_boost)

    # Multiply by a plain array: the positional alignment with `sim_scores`
    # is then explicit and does not depend on the DataFrame's index.
    boosts = df["combined_text"].apply(keyword_boost).to_numpy()
    sim_scores = sim_scores * boosts

    top_indices = np.argsort(sim_scores)[::-1][:top_n]
    titles = df["titles"]
    return [(titles.iloc[i], sim_scores[i]) for i in top_indices]


In [None]:
def preprocess_text_improved(text):
    """Clean text like the basic preprocessor, additionally dropping short tokens.

    The text is lowercased, every character other than a-z and whitespace is
    deleted, and the result is split on whitespace. Tokens that are in
    `custom_stopwords` or have two characters or fewer are discarded.

    Args:
        text: The string to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Lowercase, then keep only letters and whitespace
    cleaned = re.sub(r'[^a-z\s]', '', text.lower())

    kept = []
    for token in cleaned.split():
        # Skip very short words (length <= 2) and stopwords
        if len(token) <= 2 or token in custom_stopwords:
            continue
        kept.append(token)

    return ' '.join(kept)


In [None]:
# Title and abstract of the query paper, kept verbatim
query_abstract = """
Personalized Image Semantic Segmentation Semantic segmentation models trained on public datasets have achieved great
success in recent years. However, these models didn't consider the
personalization issue of segmentation though it is important in practice. In
this paper, we address the problem of personalized image segmentation. The
objective is to generate more accurate segmentation results on unlabeled
personalized images by investigating the data's personalized traits. To open up
future research in this area, we collect a large dataset containing various
users' personalized images called PIS (Personalized Image Semantic
Segmentation). We also survey some recent researches related to this problem
and report their performance on our dataset. Furthermore, by observing the
correlation among a user's personalized images, we propose a baseline method
that incorporates the inter-image context when segmenting certain images.
Extensive experiments show that our method outperforms the existing methods on
the proposed dataset. The code and the PIS dataset will be made publicly
available.

"""

# Clean the query with the stricter preprocessor (also drops tokens of length <= 2)
processed_query = preprocess_text_improved(query_abstract)


In [None]:
# Recommend the five papers that best match the query abstract.
# NOTE: `processed_query` was already cleaned by preprocess_text_improved;
# recommend_papers_partial runs preprocess_text_basic on it once more.
recommend_papers_partial(processed_query, top_n=5)

[('Mask-based Data Augmentation for Semi-supervised Semantic Segmentation',
  np.float64(0.37028058308620115)),
 ('CaDIS: Cataract Dataset for Image Segmentation',
  np.float64(0.34970948217824155)),
 ('Real time backbone for semantic segmentation',
  np.float64(0.316072162052734)),
 ('MAVNet: an Effective Semantic Segmentation Micro-Network for MAV-based Tasks',
  np.float64(0.3061565289792893)),
 ('Attention-based fusion of semantic boundary and non-boundary information to improve semantic segmentation',
  np.float64(0.306055141102605))]