In [2]:
import tensorflow_hub as hub
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Load the Universal Sentence Encoder (version 4) from TensorFlow Hub. The
# returned object is called directly on a list of strings further down to
# produce one embedding per text.
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")














In [3]:
# Read the paper metadata CSV and preview the first rows.
# NOTE(review): the relative path assumes the notebook is run from the directory
# that contains dblp-v10.csv — confirm before sharing.
papers = pd.read_csv('dblp-v10.csv')
print(papers.head())

                                            abstract  \
0  In this paper, a robust 3D triangular mesh wat...   
1  We studied an autoassociative neural network w...   
2  It is well-known that Sturmian sequences are t...   
3  One of the fundamental challenges of recognizi...   
4  This paper generalizes previous optimal upper ...   

                                             authors  n_citation  \
0             ['S. Ben Jabra', 'Ezzeddine Zagrouba']          50   
1  ['Joaquín J. Torres', 'Jesús M. Cortés', 'Joaq...          50   
2           ['Genevi eve Paquin', 'Laurent Vuillon']          50   
3  ['Yaser Sheikh', 'Mumtaz Sheikh', 'Mubarak Shah']         221   
4  ['Efraim Laksman', 'Håkan Lennerstad', 'Magnus...           0   

                                          references  \
0  ['09cb2d7d-47d1-4a85-bfe5-faa8221e644b', '10aa...   
1  ['4017c9d2-9845-4ad2-ad5b-ba65523727c5', 'b118...   
2  ['1c655ee2-067d-4bc4-b8cc-bc779e9a7f10', '2e4e...   
3  ['056116c1-9e7a-4f9b-a918-4

In [4]:
# Missing-value counts per column before cleaning.
print(papers.isnull().sum())

# A paper needs an id (to be looked up), and a title and abstract (to be embedded).
papers = papers.dropna(subset=["id", "title", "abstract"])

# Missing-value counts after cleaning; id/title/abstract must now be zero.
print(papers.isnull().sum())


# Only the first `max_papers` cleaned rows are embedded. Slice *before* building
# the text so strings are not concatenated for every row in the frame and then
# mostly thrown away, and use vectorised Series concatenation instead of a
# row-wise DataFrame.apply.
max_papers = 5000
papers_head = papers.head(max_papers)
texts = (papers_head["title"] + " " + papers_head["abstract"]).tolist()

batch_size = 3000
embeddings = []

# Encode in slices so the model is never called on more than `batch_size`
# texts at once; each call yields one array of embeddings for that slice.
for i in range(0, len(texts), batch_size):
    batch = texts[i:i + batch_size]
    batch_embeddings = use_model(batch).numpy()
    embeddings.append(batch_embeddings)


# Stack the per-batch arrays into a single array; row k corresponds to
# papers.iloc[k] because `texts` was built from the leading rows of `papers`.
embeddings = np.vstack(embeddings)

abstract      172467
authors            2
n_citation         0
references    124417
title              0
venue         177755
year               0
id                 0
dtype: int64
abstract           0
authors            1
n_citation         0
references     40309
title              0
venue         103765
year               0
id                 0
dtype: int64


In [5]:
# Pairwise cosine similarity between every pair of embedded papers.
similarity_matrix = cosine_similarity(embeddings, embeddings)

# Ids of the papers that were actually embedded: the leading rows of `papers`,
# one per embedding row. The same labels are used for both axes.
embedded_ids = papers.iloc[:len(embeddings)]["id"]

similarity_df = pd.DataFrame(similarity_matrix, index=embedded_ids, columns=embedded_ids)

print(similarity_df)

id                                    4ab3735c-80f1-472d-b953-fa0557fed28b  \
id                                                                           
4ab3735c-80f1-472d-b953-fa0557fed28b                              1.000000   
4ab39729-af77-46f7-a662-16984fb9c1db                              0.344426   
4ab3a4cf-1d96-4ce5-ab6f-b3e19fc260de                              0.276626   
4ab3a98c-3620-47ec-b578-884ecf4a6206                              0.509301   
4ab3b585-82b4-4207-91dd-b6bce7e27c4e                              0.391865   
...                                                                    ...   
4c6e6a06-c1c9-4ce5-b4c0-4517c4f6554a                              0.329636   
4c6e6bc2-a8eb-4960-8644-ca7897c5f995                              0.322888   
4c6e7ec2-5629-4bc4-99eb-438b3f134448                              0.239762   
4c6e8dd2-3d46-4267-923d-a428c91b6902                              0.402745   
4c6eade4-29cf-452b-a3f2-e2c7c5b6617b                            

In [10]:
def recommend_papers(paper_id, similarity_df, top_n=3, papers_df=None):
    """Print the ``top_n`` papers most similar to ``paper_id``.

    Output, in order: the similarity scores of the selected neighbours, the
    title/abstract of the query paper, then one entry per recommended paper
    listed from most to least similar.

    Parameters
    ----------
    paper_id
        Id of the query paper. Must be a column label of ``similarity_df`` and
        appear in the ``"id"`` column of the metadata frame.
    similarity_df : pandas.DataFrame
        Square similarity table whose index and columns are paper ids.
    top_n : int, default 3
        Number of recommendations to print. Values <= 0 print none.
    papers_df : pandas.DataFrame, optional
        Metadata frame with ``"id"``, ``"title"`` and ``"abstract"`` columns.
        Defaults to the notebook-level ``papers`` frame, which is what this
        function always used before the parameter existed.

    Returns
    -------
    None
        Everything is written to stdout.

    Raises
    ------
    KeyError
        If ``paper_id`` is not a column of ``similarity_df``.
    IndexError
        If ``paper_id`` has no row in the metadata frame.
    """
    # Fall back to the global frame so existing three-argument calls still work.
    source = papers if papers_df is None else papers_df

    # Remove the query paper by label rather than assuming it sorts first:
    # another paper with identical text also scores 1.0, and a positional
    # "skip the first row" could then list the query itself as a recommendation.
    similar_papers = (
        similarity_df[paper_id]
        .drop(labels=[paper_id], errors="ignore")
        .sort_values(ascending=False)
        .iloc[:max(top_n, 0)]
    )

    recommended_ids = similar_papers.index.tolist()

    # The isin() filter returns rows in dataset order; re-order them by their
    # position in the similarity ranking so the best match is printed first.
    rank = {pid: pos for pos, pid in enumerate(recommended_ids)}
    recommended_papers = source.loc[source["id"].isin(recommended_ids), ["id", "title", "abstract"]]
    order = recommended_papers["id"].map(rank).to_numpy().argsort(kind="stable")
    recommended_papers = recommended_papers.iloc[order]

    # First metadata row matching the query id; unpacks to (title, abstract).
    original_title, original_abstract = source.loc[source["id"] == paper_id, ["title", "abstract"]].iloc[0]
    print(similar_papers)
    print("\n\n**Original Paper**")
    print(f"\n**Paper ID** {paper_id}\n **Title:** {original_title}\n\n **Abstract:** {original_abstract}\n{'-'*80}")

    print("\n**Recommended papers**")

    for _, row in recommended_papers.iterrows():
        score = similar_papers.loc[row["id"]]
        print(f"\n**Paper ID** {row['id']}\n **Similarity Score** {score}\n **Title:** {row['title']}\n\n **Abstract:** {row['abstract']}\n{'-'*80}")


# Example run: show the 5 nearest neighbours of the paper whose id labels the
# first row/column of the similarity table printed above.
recommend_papers("4ab3735c-80f1-472d-b953-fa0557fed28b", similarity_df, top_n=5)

id
4ac41ffa-6888-4ab8-9229-06f537601fba    0.725780
4be7d99a-c1b5-4715-888c-24aa3154f7a5    0.720463
4b56b243-db11-4dab-aa62-3b93e3bfdc36    0.707107
4afe9fc0-269b-43df-8662-1d3dfacbc4be    0.699947
4b0e003c-1e6e-4759-837e-5b92b20dc4a3    0.697228
Name: 4ab3735c-80f1-472d-b953-fa0557fed28b, dtype: float32


**Original Paper**

**Paper ID** 4ab3735c-80f1-472d-b953-fa0557fed28b
 **Title:** A new approach of 3D watermarking based on image segmentation

 **Abstract:** In this paper, a robust 3D triangular mesh watermarking algorithm based on 3D segmentation is proposed. In this algorithm three classes of watermarking are combined. First, we segment the original image to many different regions. Then we mark every type of region with the corresponding algorithm based on their curvature value. The experiments show that our watermarking is robust against numerous attacks including RST transformations, smoothing, additive random noise, cropping, simplification and remeshing.
-------------------