In [1]:
import json

# Read JSON file
def read_json_file(file_path):
    """Parse the UTF-8 encoded JSON document stored at ``file_path``.

    Args:
        file_path: Path of the JSON file to open for reading.

    Returns:
        The decoded Python object (whatever ``json.load`` produces for the file).
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

# Extract titles and summaries
def extract_titles_and_summaries(json_data):
    """Split a sequence of record dicts into parallel title and abstract lists.

    Args:
        json_data: Sequence of dict-like records.

    Returns:
        ``(titles, abstracts)``: two lists aligned with ``json_data``. A record
        that lacks the ``"title"`` or ``"abstract"`` key contributes ``""``.
    """
    def column(key):
        # One full pass over the records per requested field.
        return [record.get(key, "") for record in json_data]

    return column("title"), column("abstract")

# File path (update with actual file path)
# Location of the metadata JSON inside the Kaggle input mount (update when running elsewhere).
file_path = "/kaggle/input/hepth-metadata-file/abs_metadata.json"

# Load every record, then split out the parallel title / abstract lists.
data = read_json_file(file_path)
titles, abstracts = extract_titles_and_summaries(data)

# Optional debugging aid: print each title next to its abstract.
# for title, summary in zip(titles, abstracts):
#     print(f"Title: {title}\nSummary: {summary}\n")

# Sanity check: both lists should contain one entry per loaded record.
len(titles), len(abstracts)

(29555, 29555)

In [2]:
import gensim
# NOTE: this rebinds `data` from the loaded JSON records to the list of abstract
# strings; every later cell that reads `data` sees the abstracts, not the records.
data = abstracts

def sent_to_words(sentences):
    """Lazily tokenize each sentence with gensim's ``simple_preprocess``.

    Every item is coerced with ``str()`` first. ``deacc=True`` asks gensim to
    strip accent marks from the tokens.

    Args:
        sentences: Iterable of texts to tokenize.

    Yields:
        One list of tokens per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

# Materialize the generator into a list so the token lists can be traversed
# more than once by the cells below.
data_words = list(sent_to_words(data))

In [3]:
# Build the bigram and trigram phrase models from the tokenized abstracts.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold -> fewer phrases.
# The trigram model is trained on the bigram-transformed corpus, so it can join
# an already-merged bigram with a neighbouring token.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Wrap the trained models in Phraser objects, which are faster to apply
# when clubbing a sentence into bigrams / trigrams.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [4]:
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords

# Check that the NLTK stop-word corpus is accessible (requires the 'stopwords'
# resource; see the commented-out nltk.download call above).
stop_words = stopwords.words('english')
# Prints the complete stop-word list as a quick visual check.
print(stop_words)

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document and drop every token found in ``stop_words``.

    Args:
        texts: Iterable of documents. Each one is passed through ``str()``
            before being tokenized by ``simple_preprocess``.

    Returns:
        A list containing one list of kept tokens per input document.

    ``simple_preprocess`` and ``stop_words`` are module-level names that are
    resolved when the function is called, so both must be defined before the
    first call.
    """
    # Snapshot the stop words into a set once per call: each membership test is
    # then a hash lookup instead of a scan of the whole list for every token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` phrase model to every document.

    Args:
        texts: Iterable of token lists.

    Returns:
        A list with the result of ``bigram_mod[doc]`` for each document, in order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Args:
        texts: Iterable of token lists.

    Returns:
        A list with the result of ``trigram_mod[bigram_mod[doc]]`` for each
        document, in order.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize documents, keeping only tokens with an allowed part of speech.

    Each token list is joined with single spaces and run through the
    module-level spaCy pipeline ``nlp``, which must be loaded before the call.
    Tag reference: https://spacy.io/api/annotation

    Args:
        texts: Iterable of token lists.
        allowed_postags: Coarse POS tags (``token.pos_`` values) to keep.

    Returns:
        A list with one list of lemmas per input document.
    """
    def kept_lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [kept_lemmas(sent) for sent in texts]

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [5]:
from gensim.utils import simple_preprocess
import spacy

# NLTK Stop words
from nltk.corpus import stopwords
# Reload the English stop words and add a few corpus-specific extras.
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form phrases.
# NOTE(review): despite the variable name, this calls make_trigrams, i.e. it
# applies the bigram model and then the trigram model — confirm this is intended.
data_words_bigrams = make_trigrams(data_words_nostops)

# Initialize the spaCy English pipeline with the parser and NER components
# disabled (for efficiency).
# Install the model with: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [6]:
import gensim.corpora as corpora

# Create the Dictionary (token <-> integer id mapping) from the lemmatized documents.
id2word = corpora.Dictionary(data_lemmatized)

# The corpus is built from the same lemmatized documents.
texts = data_lemmatized

# Bag-of-words corpus: one doc2bow result (token id / count pairs) per document.
corpus = [id2word.doc2bow(text) for text in texts]
# Prints a short summary of the dictionary (vocabulary size and sample tokens).
print(id2word)

Dictionary<16326 unique tokens: ['airy', 'algebraic', 'aspect', 'behaviour', 'character']...>


In [7]:
from gensim.models.ldamodel import LdaModel
from gensim import corpora

# Load a previously trained LDA model from a Kaggle input dataset rather than
# retraining it here; the path is specific to that environment.
# NOTE(review): the model is queried below with bags-of-words from `id2word` —
# confirm it was trained with the same vocabulary.
# dictionary = corpora.Dictionary(texts)
lda_model = LdaModel.load("/kaggle/input/sciqagtopic-model-output/lda_model/model_lda_100_6.model")

In [8]:
# Toy document used to try out topic inference.
new_doc = "This is a new document to infer topic distribution from."
# NOTE(review): `id2word` was built from lemmatized, phrase-merged tokens, so
# plain lowercase + whitespace splitting may miss vocabulary entries.
new_doc_tokens = new_doc.lower().split()  # Apply your actual preprocessing here
# Convert the tokens to a bag-of-words vector with the corpus dictionary.
new_doc_bow = id2word.doc2bow(new_doc_tokens)

In [9]:
from gensim.models import Nmf

# Load a previously trained NMF topic model from a Kaggle input dataset.
# (Plain string literal: the path has no placeholders, so no f-string is needed.)
nmf = Nmf.load("/kaggle/input/sciqagtopic-model-output/nmf_model/model_nmf_100_17.model")

In [10]:
# Prerequisites: `nmf` (the loaded Nmf model) and `id2word` (the Dictionary
# built from the lemmatized corpus) must already exist from the cells above.

# New document
new_doc = "deep learning in computer vision applications"

# Step 1: Preprocess (strip stop words, then lowercase + tokenize)
from gensim.utils import simple_preprocess
# Import gensim's string-level stop-word filter under an alias. Importing it as
# `remove_stopwords` would rebind the notebook's own token-list helper of the
# same name and break any later re-run of the preprocessing cell.
from gensim.parsing.preprocessing import remove_stopwords as strip_stopwords

tokens = simple_preprocess(strip_stopwords(new_doc))

# Step 2: Convert to BoW
bow_vector = id2word.doc2bow(tokens)

# Step 3: Infer topics (minimum_probability=0.00 asks for low-weight topics too)
topics = nmf.get_document_topics(bow_vector, minimum_probability=0.00)
print(topics)
# Output inferred topics
for topic_id, prob in topics:
    print(f"Topic {topic_id}: Probability = {prob:.4f}")


[(0, 0.017756917854348808), (4, 0.09999500069294077), (6, 0.2269878932152492), (8, 0.09574597512918757), (10, 0.18604006085773608), (11, 0.291873557487767), (12, 0.055104131658698416), (16, 0.02649646310407202)]
Topic 0: Probability = 0.0178
Topic 4: Probability = 0.1000
Topic 6: Probability = 0.2270
Topic 8: Probability = 0.0957
Topic 10: Probability = 0.1860
Topic 11: Probability = 0.2919
Topic 12: Probability = 0.0551
Topic 16: Probability = 0.0265


In [12]:
# Infer the LDA topic mixture of the toy document; the result is a list of
# (topic_id, probability) pairs using the model's default probability cutoff.
topic_distribution = lda_model.get_document_topics(new_doc_bow)
print(topic_distribution)

[(0, 0.17805512), (1, 0.063830435), (2, 0.18805428), (3, 0.3780252), (4, 0.027414372), (5, 0.16462056)]


In [13]:
# Ask for the topic mixture again with the probability cutoff lowered to 0.0.
dense_dist = lda_model.get_document_topics(new_doc_bow, minimum_probability=0.0)
# Keep only the probabilities.
# NOTE(review): assumes the pairs cover every topic and arrive ordered by
# topic id, so position i corresponds to topic i — confirm.
dense_vector = [prob for topic_id, prob in dense_dist]

In [14]:
# Display the probability vector built in the previous cell.
dense_vector

[0.17804936, 0.063830435, 0.18805426, 0.37803096, 0.027414372, 0.1646206]

In [19]:
# Infer a dense NMF topic vector for every abstract in `data`.
num_topics = nmf.num_topics  # loop-invariant, so read it once
topic_distributions = []     # one dense vector per abstract, in the order of `data`

for abstract in data:
    # NOTE(review): `id2word` was built from lemmatized, phrase-merged tokens,
    # so plain whitespace splitting may miss vocabulary entries.
    new_doc_tokens = abstract.lower().split()  # Apply your actual preprocessing here
    new_doc_bow = id2word.doc2bow(new_doc_tokens)

    # Query the model with THIS abstract's bag-of-words. The previous code passed
    # `bow_vector`, a leftover from the single-document demo cell, so every
    # abstract received the same distribution.
    # (To use the LDA model instead, call lda_model.get_document_topics here.)
    topics_sparse = dict(nmf.get_document_topics(new_doc_bow, minimum_probability=0.0))

    # Topics missing from the sparse result are filled with 0.0 so every vector
    # has exactly `num_topics` entries.
    full_probs = [topics_sparse.get(i, 0.0) for i in range(num_topics)]

    # Store the result; previously it was overwritten on each iteration and lost.
    topic_distributions.append(full_probs)


In [19]:
# Peek at the first paper record.
# NOTE(review): `datas` is not defined in any cell visible in this notebook —
# confirm where it is loaded so the notebook survives Restart & Run All.
datas[0]

{'id': 'http://arxiv.org/abs/hep-ph/9907233v3',
 'guidislink': True,
 'link': 'http://arxiv.org/abs/hep-ph/9907233v3',
 'updated': '2000-02-24T05:29:39Z',
 'updated_parsed': [2000, 2, 24, 5, 29, 39, 3, 55, 0],
 'published': '1999-07-05T14:17:06Z',
 'published_parsed': [1999, 7, 5, 14, 17, 6, 0, 186, 0],
 'title': 'The Radiative Decay of Vector Mesons',
 'title_detail': {'type': 'text/plain',
  'language': None,
  'base': '',
  'value': 'The Radiative Decay of Vector Mesons'},
 'summary': "In this paper, radiative decays $\\rho^0 \\to \\pi^+\\pi^-\\gamma,\n\\pi^0\\pi^0\\gamma$ ,$\\phi \\to K^+K^-\\gamma, K^0 \\bar{K^0}\\gamma$ are studied\nsystematically in the U(3)$_L\\timesU(3)_R$ chiral theory of mesons. The\ntheoretical differential spectrum with respect to photon energy and branch\nratio for $\\rho^0 \\to \\pi^+\\pi^-\\gamma$ agree well with the experimental data.\nDifferential spectrums and branch ratios for $\\rho^0 \\to \\pi^0\\pi^0\\gamma, \\phi\n\\to K^+ K^-\\gamma,\\phi \\to 

In [20]:
import json
import itertools
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict

# Build a co-authorship graph whose nodes and edges carry averaged topic vectors.
#
#   coauthor_topics: (author_a, author_b) -> topic vectors of their joint papers
#   author_topics:   author               -> topic vectors of all their papers
coauthor_topics = defaultdict(list)
author_topics = defaultdict(list)

for paper in datas:
    topic_dis = paper.get("topic_dis")
    # Skip papers without a topic vector for BOTH nodes and edges. Previously
    # the edge pass substituted a truthy [0, 0, 0, 0] placeholder, which either
    # diluted the edge averages or produced ragged arrays when the real vectors
    # had a different length, while the node pass skipped those papers.
    if not topic_dis:
        continue

    # De-duplicate names (keeping first-seen order) so a name listed twice on a
    # paper neither counts the paper twice nor creates an (a, a) self-loop.
    authors = list(dict.fromkeys(
        a["name"] for a in paper.get("authors", []) if "name" in a
    ))

    for author in authors:
        author_topics[author].append(topic_dis)

    # combinations() over the sorted names yields each unordered pair once,
    # already in canonical (sorted) order; single-author papers yield no pairs.
    for pair in itertools.combinations(sorted(authors), 2):
        coauthor_topics[pair].append(topic_dis)

# Average the topic vectors of every co-author pair.
edges = {
    pair: np.mean(topic_dis_list, axis=0)
    for pair, topic_dis_list in coauthor_topics.items()
}

G = nx.Graph()

# Node attribute `weight`: the author's mean topic vector as a plain list.
for author, distributions in author_topics.items():
    avg_topic = np.mean(distributions, axis=0)
    G.add_node(author, weight=avg_topic.tolist())

# Edge attribute `weight`: the pair's mean topic vector as a plain list.
# NOTE(review): `weight` holds a list here, not a number — confirm no downstream
# graph algorithm expects a scalar edge weight.
for (author1, author2), topic_vec in edges.items():
    G.add_edge(author1, author2, weight=topic_vec.tolist())



In [None]:
# Map every author name to the ids of the articles that list them.
author_articles = {}

for paper in datas:
    paper_id = paper.get('id')
    for entry in paper.get('authors', []):
        # Author entries without a name are grouped under a shared placeholder key.
        author_name = entry.get('name', 'Unknown Author')
        author_articles.setdefault(author_name, []).append(paper_id)


In [22]:
import pickle

# Serialize the co-authorship graph to graph.pkl in the working directory.
with open("graph.pkl", "wb") as graph_file:
    pickle.dump(G, graph_file)

In [None]:
# Shorten every stored article id by removing the arXiv abstract URL prefix.
for author in author_articles:
    shortened = []
    for article_id in author_articles[author]:
        shortened.append(article_id.replace("http://arxiv.org/abs/", ""))
    # Rebinding the value of an existing key is safe while iterating the dict.
    author_articles[author] = shortened

In [None]:
# Display the author -> article-id mapping (this renders the whole dictionary).
author_articles

In [None]:
import json

# Write the author -> article-id mapping as indented JSON, keeping non-ASCII
# characters unescaped.
with open("author_papers.json", "w", encoding="utf-8") as out_file:
    json.dump(author_articles, out_file, ensure_ascii=False, indent=4)