To cluster the documents and annotate the clusters according to the main topic, we can use various natural language 
processing and machine learning techniques. Here is a general framework that can be used to accomplish this task:

1. Load the dataset into a pandas DataFrame.
2. Clean the data by removing any null values or duplicates.
3. Preprocess the textual data (i.e., the abstracts) by removing stopwords, punctuation, and other noise. This can be done using libraries such as NLTK or spaCy.
4. Convert the preprocessed abstracts into a numerical representation using a technique such as TF-IDF or Doc2Vec.
5. Find the relevant documents.
6. To find the main topic of each document, use a topic modeling algorithm such as Latent Dirichlet Allocation (LDA) or Non-negative Matrix Factorization (NMF).
7. For clustering and visualization, use a dimensionality reduction technique such as Principal Component Analysis (PCA) or t-SNE, and a visualization library such as Bokeh.

In [1]:
'''Packages for preprocessing'''
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
import re
import nltk 
from nltk.tokenize import word_tokenize,sent_tokenize

'''Pakages to load dataset'''
import pandas as pd

import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.decomposition import NMF
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from bokeh.plotting import figure, show, output_file, ColumnDataSource
from bokeh.models import HoverTool


import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 


# Fetch the NLTK resources this notebook relies on: 'punkt' for word_tokenize,
# 'stopwords' for the English stop-word list, and 'wordnet' for
# WordNetLemmatizer (without it the first lemmatize() call raises LookupError
# on a machine where the corpus has not been downloaded yet).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from sklearn.metrics.pairwise import cosine_similarity



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sitas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sitas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read the raw dataset from the CSV file next to the notebook.
DATASET_PATH = "Dataset_1.csv"
data = pd.read_csv(DATASET_PATH)


# # Prompt the user to enter one or more search queries separated by commas
# query_str = input('Enter one or more search queries separated by commas: ')
# queries = [q.strip() for q in query_str.split(',')]

FileNotFoundError: [Errno 2] No such file or directory: 'papers.csv'

In [None]:
# Shared preprocessing resources used by clean_text below: the English
# stop-word list (held in a set) and a WordNet lemmatizer.
# stemmer = SnowballStemmer("english")
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Patterns are compiled once at definition time instead of on every call.
_RE_WSPACE = re.compile(r"\s+", re.IGNORECASE)  # any run of whitespace
_RE_TAGS = re.compile(r"<[^>]+>")  # HTML-like tags
# Default mode: everything except letters (A-Z, a-z, À-ž) and spaces.
_RE_NON_ALPHA = re.compile(r"[^A-Za-zÀ-ž ]", re.IGNORECASE)
_RE_SINGLECHAR = re.compile(r"\b[A-Za-zÀ-ž]\b", re.IGNORECASE)
# Embedding mode: the punctuation marks , . ! ? are additionally kept.
_RE_NON_ALPHA_EMB = re.compile(r"[^A-Za-zÀ-ž,.!? ]", re.IGNORECASE)
_RE_SINGLECHAR_EMB = re.compile(r"\b[A-Za-zÀ-ž,.!?]\b", re.IGNORECASE)


def clean_text(text, for_embedding=False):
    """
    Normalise a raw text (e.g. an abstract) into one cleaned string.

    Steps, in order:
        - replace HTML-like tags (< /br> often found) with a space
        - replace every character that is not a letter (A-Z, a-z, À-ž) or a
          space with a space, so digits are removed; with for_embedding=True
          the punctuation marks , . ! ? are kept as well
        - blank out single-character words
        - collapse all whitespace (tabs etc.) to single spaces
        - tokenize with word_tokenize
        if not for embedding (but e.g. tf-idf):
        - lowercase every token
        - remove stop words and lemmatize the remaining tokens

    Parameters
    ----------
    text : object
        The raw text; it is converted with str(). None and float NaN are
        treated as missing and yield an empty string.
    for_embedding : bool, default False
        If True, keep case, punctuation and stop words and skip lemmatizing.

    Returns
    -------
    str
        The processed tokens joined by single spaces ("" for missing input).
    """
    # A missing value would otherwise be stringified to the literal word
    # "None"/"nan" and survive as a fake one-word document.
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ""

    if for_embedding:
        re_non_alpha, re_singlechar = _RE_NON_ALPHA_EMB, _RE_SINGLECHAR_EMB
    else:
        re_non_alpha, re_singlechar = _RE_NON_ALPHA, _RE_SINGLECHAR

    text = _RE_TAGS.sub(" ", str(text))
    text = re_non_alpha.sub(" ", text)
    text = re_singlechar.sub(" ", text)
    text = _RE_WSPACE.sub(" ", text)

    word_tokens = word_tokenize(text)

    if for_embedding:
        # no lemmatizing, lowering or punctuation / stop-word removal
        words_filtered = word_tokens
    else:
        # Lowercase first so the stop-word lookup is case-insensitive.
        words_filtered = [
            lemmatizer.lemmatize(word)
            for word in (token.lower() for token in word_tokens)
            if word not in stop_words
        ]

    return " ".join(words_filtered)


In [None]:
# Attach the cleaned abstracts, drop the raw 'abstract' column, and remove
# every row that has a missing or empty-string value in any column.
#
# The previous version called dropna(how='any', thresh=None, ...): passing
# `how` and `thresh` together is rejected with a TypeError by recent pandas
# releases. That first dropna was also redundant, because the replace('') +
# dropna() pair below already removes every row it would have removed.
data = (
    data
    .assign(clean_abstract=data['abstract'].map(clean_text))
    .drop(columns='abstract')
    .replace('', np.nan)  # empty strings count as missing values
    .dropna()
)

display(data)

In [None]:
# Vectorise the cleaned abstracts with TF-IDF, factorise the matrix with NMF,
# then label every document with its dominant topic and that topic's top words.

vectorizer = TfidfVectorizer(stop_words='english')
word_vector = normalize(vectorizer.fit_transform(data['clean_abstract']))

# Five-component NMF with the 'nndsvd' initialisation.
nmf_model = NMF(n_components=5, init='nndsvd')
nmf_model.fit(word_vector)

# A document's topic is the component with the largest weight in its row.
doc_topic_weights = nmf_model.transform(word_vector)
data['topic'] = doc_topic_weights.argmax(axis=1)

# Describe each topic by its n_words highest-weighted vocabulary terms,
# listed from heaviest to lightest.
n_words = 5
feature_names = vectorizer.get_feature_names_out()
topic_words = [
    ', '.join(feature_names[term_idx] for term_idx in component.argsort()[::-1][:n_words])
    for component in nmf_model.components_
]

data['topic_words'] = [topic_words[topic_id] for topic_id in data['topic']]
data

In [None]:
# For every topic id, collect the distinct 'topic_words' labels of its rows.
grouped = data.groupby('topic')['topic_words'].unique()

# Build one summary line per topic, then print them in group order.
topic_lines = [
    f"Topic {topic}: {', '.join(words)}"
    for topic, words in grouped.items()
]
for line in topic_lines:
    print(line)

In [None]:
# Densify the sparse TF-IDF matrix once and feed the same array to both
# reducers (it was previously converted twice).
dense_word_vector = word_vector.toarray()

# Perform dimensionality reduction using PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(dense_word_vector)

# Perform dimensionality reduction using t-SNE
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` --
# confirm against the installed version before upgrading.
tsne = TSNE(n_components=2, perplexity=30, learning_rate=200, n_iter=1000, random_state=42)
X_tsne = tsne.fit_transform(dense_word_vector)


In [None]:
# Interactive scatter plot of the t-SNE embedding, one marker per document,
# coloured by the document's assigned topic.
topic_palette = {0:'red', 1:'blue', 2:'green', 3:'purple', 4:'orange'}

# Columns offered to the plot: coordinates, colour, and per-document metadata.
plot_columns = dict(
    x=X_tsne[:,0],
    y=X_tsne[:,1],
    color=data['topic'].map(topic_palette),
    topic=data['topic'],
    topics=data['topic_words'],
    title=data['title'],
    authors=data['authors'],
    journal=data['journal'],
    doi=data['doi'],
    citation=data['citation_count'],
    published=data['published_at'],
)
source = ColumnDataSource(data=plot_columns)

# NOTE(review): plot_width / plot_height look like the older Bokeh keyword
# names (newer releases use width / height) -- confirm the installed version.
p = figure(title='Topic Clustering of Documents', plot_width=800, plot_height=800, tools='hover,box_zoom,reset')

# One legend entry per distinct 'topics' value (the topic's top words).
p.scatter(x='x', y='y', color='color', source=source, size=10, legend_group='topics')
p.legend.title = 'Topic'

# Fields shown when hovering over a marker.
hover_fields = [
    ('Title', '@title'),
    ('Authors', '@authors'),
    ('doi', '@doi')
]
p.hover.tooltips = hover_fields
show(p)

In [None]:

'''-----------------------------------------------------------------------------------------------------------------'''

In [None]:

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

# Rank documents by a citation count predicted from their abstracts and
# report the NDCG of the predictions on a held-out test split.

# load the dataset
# NOTE(review): this cell reads 'dataset.csv' and the column 'citation count',
# while the cells above use 'Dataset_1.csv' and 'citation_count' -- confirm
# which file and column name are intended.
data = pd.read_csv('dataset.csv')

# preprocess the data: drop rows with any missing value and renumber the index
data = data.dropna()
data = data.reset_index(drop=True)

# split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data['abstract'], data['citation count'], test_size=0.2, random_state=42)

# extract features using TF-IDF vectorization (fit on the training split only)
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# train a logistic regression model to predict citation count
# (each distinct citation count is treated as a class label)
clf = LogisticRegression(random_state=42, max_iter=1000)
clf.fit(X_train, y_train)

# make predictions on the test set and calculate NDCG score
y_pred = clf.predict(X_test)
# y_test is a pandas Series, which has no .reshape(); convert it to a NumPy
# array before building the single-row 2-D input passed to ndcg_score.
ndcg = ndcg_score(y_test.to_numpy().reshape(1, -1), y_pred.reshape(1, -1))

# rank the documents based on predicted citation count
data['citation count predicted'] = clf.predict(vectorizer.transform(data['abstract']))
ranked_data = data.sort_values(by=['citation count predicted'], ascending=False)

# print the ranked dataframe
print(ranked_data)

# print the NDCG score
print('NDCG score:', ndcg)
