# Topic modeling
Topic modeling on citation network datasets.

We can perform topic modeling on titles and abstracts to get a better sense of what's in our dataset. We followed [this tutorial](https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0) to get started; however, since our results didn't make a ton of sense compared to what we actually observed in the dataset, we're going to implement some suggestions from [another tutorial](https://towardsdatascience.com/6-tips-to-optimize-an-nlp-topic-model-for-interpretability-20742f3047e2) to try and improve the usefulness of our results.

In [6]:
import jsonlines
import re
import gensim
from gensim.utils import simple_preprocess
import nltk
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
import gensim.corpora as corpora
import pyLDAvis
import pyLDAvis.gensim_models
from langdetect import detect
import spacy
import pandas as pd

## Defining modeling functions
First, we'll write functions to perform topic modeling with various options so that we can run this easily on titles, abstarcts, and various datasets.

### Preprocessing functions

In [27]:
def get_text(dataset, text_type):
    """
    Helper for preprocess_for_tm, gets text to preprocess.
    
    parameters:
        dataset, list of dict: flattened list of dataset papers, each element is a document
        text_type, str: "abstract", "title", or "both",, only articles posessing an abstract will be included
    
    returns:
        texts, dict: keys are paperId's, values are texts
    """
    dropped_papers = 0
    texts = {}
    if text_type == 'title':
        for paper in dataset:
            pid = paper['paperId']
            texts[pid] = paper['title']
    elif text_type == 'abstract':
        for paper in dataset:
            pid = paper['paperId']
            try:
                abst = paper['abstract']
                if abst is not None:
                    texts[pid] = abst
                else:
                    dropped_papers += 1
            except KeyError:
                dropped_papers += 1
        print(f'{dropped_papers} were dropped because they did not have an abstract')
    elif text_type == 'both':
        for paper in dataset:
            pid = paper['paperId']
            title = paper['title']
            try:
                abst = paper['abstract']
                if abst is not None:
                    texts[pid] = title + ' ' + abst
                else:
                    dropped_papers += 1
            except KeyError:
                dropped_papers += 1
        print(f'{dropped_papers} were dropped because they did not have an abstract')

    return texts

In [68]:
# Adapted from first tutorial
stop_words = stopwords.words('english')
def sent_to_words(sentence):
    return gensim.utils.simple_preprocess(str(sentence), deacc=True)
def remove_stopwords(text):
    return [word for word in simple_preprocess(str(text)) 
             if word not in stop_words]

In [29]:
# From second tutorial
# Filter for bigrams with only noun-type structures
def bigram_filter(bigram):
    tag = nltk.pos_tag(bigram)
    if tag[0][1] not in ['JJ', 'NN'] and tag[1][1] not in ['NN']:
        return False
    if bigram[0] in stop_words or bigram[1] in stop_words:
        return False
    if 'n' in bigram or 't' in bigram:
        return False
    if '-PRON-' in bigram:
        return False
    return True
# Filter for trigrams with only noun-type structures
def trigram_filter(trigram):
    tag = nltk.pos_tag(trigram)
    if tag[0][1] not in ['JJ', 'NN'] and tag[1][1] not in ['JJ','NN']:
        return False
    if trigram[0] in stop_words or trigram[-1] in stop_words or trigram[1] in stop_words:
        return False
    if 'n' in trigram or 't' in trigram:
         return False
    if '-PRON-' in trigram:
        return False
    return True 

In [30]:
def get_grams(text_df):
    """
    Code from second tutorial, for getting significant bigrams and trigrams from text.
    
    parameters:
        text_df, df: preprocessed text
        
    returns:
        bigrams, list of str: bigrams
        trigrams, list of str: trigrams
    """
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = nltk.collocations.BigramCollocationFinder.from_documents([comment for comment in text_df.processed_text])
    # Filter only those that occur at least 50 times
    finder.apply_freq_filter(50)
    bigram_scores = finder.score_ngrams(bigram_measures.pmi)
    
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    finder = nltk.collocations.TrigramCollocationFinder.from_documents([comment for comment in text_df.processed_text])
    # Filter only those that occur at least 50 times
    finder.apply_freq_filter(50)
    trigram_scores = finder.score_ngrams(trigram_measures.pmi)

    bigram_pmi = pd.DataFrame(bigram_scores)
    bigram_pmi.columns = ['bigram', 'pmi']
    bigram_pmi.sort_values(by='pmi', axis = 0, ascending = False, inplace = True)

    trigram_pmi = pd.DataFrame(trigram_scores)
    trigram_pmi.columns = ['trigram', 'pmi']
    trigram_pmi.sort_values(by='pmi', axis = 0, ascending = False, inplace = True)
    
    # Can set pmi threshold to whatever makes sense - eyeball through and select threshold where n-grams stop making sense
    # choose top 500 ngrams in this case ranked by PMI that have noun like structures
    filtered_bigram = bigram_pmi[bigram_pmi.apply(lambda bigram:\
                                                  bigram_filter(bigram['bigram'])\
                                                  and bigram.pmi > 5, axis = 1)][:500]

    filtered_trigram = trigram_pmi[trigram_pmi.apply(lambda trigram: \
                                                     trigram_filter(trigram['trigram'])\
                                                     and trigram.pmi > 5, axis = 1)][:500]


    bigrams = [' '.join(x) for x in filtered_bigram.bigram.values if len(x[0]) > 2 or len(x[1]) > 2]
    trigrams = [' '.join(x) for x in filtered_trigram.trigram.values if len(x[0]) > 2 or len(x[1]) > 2 and len(x[2]) > 2]
    
    return bigrams, trigrams

In [31]:
# From second tutorial
# Concatenate n-grams
def replace_ngram(x):
    for gram in trigrams:
        x = x.replace(gram, '_'.join(gram.split()))
    for gram in bigrams:
        x = x.replace(gram, '_'.join(gram.split()))
    return x

In [32]:
# From second tutorial
# Filter for only nouns
def noun_only(x):
    pos_comment = nltk.pos_tag(x)
    filtered = [word[0] for word in pos_comment if word[1] in ['NN']]
    # to filter both noun and verbs
    #filtered = [word[0] for word in pos_comment if word[1] in ['NN','VB', 'VBD', 'VBG', 'VBN', 'VBZ']]
    return filtered

In [91]:
def preprocess_for_tm(dataset, text_type='title', use_lemmas=False, use_grams=False, only_nouns=False, shuffle=False):
    """
    Perform preprocessing steps for topic modeling. Default steps are lowercasing, removing punctuation, and removing
    stop words. All other steps are optional and are passed as parameters.
    
    parameters:
        dataset, list of dict: flattened list of dataset papers, each list element is a document
        text_type, str: "abstract", "title", or "both". For "abstract" and "both", only articles posessing an abstract
            will be included
        use_lemmas, bool: whether or not to lemmatize text as part of pre-processing
        use_grams, bool: whether or not to use high-relevance bi- and tri-grams in place of their component unigrams
        only_nouns, bool: whether or not to remove any words/grams that are not nouns
        shuffle, bool: whether or not to shuffle the dataset before returning
    
    returns:
        text_df, df: index is paperId, columns are 'text' and 'processed_text', where 'text' is the raw text and
            'processed_text' should be used downstream
    """
    # Get the text
    texts = get_text(dataset, text_type)
    text_df = pd.DataFrame.from_dict(texts, orient='index', columns=['text'])
    
    # Remove punct and lowercase
    text_df['processed_text'] = text_df['text'].map(lambda x: re.sub('[,.!?:;]', '', x))
    text_df['processed_text'] = text_df['processed_text'].map(lambda x: x.lower())
    
    # Lemmatize if requested, remove stopwords regardless
    if use_lemmas:
        nlp = spacy.load("en_core_sci_sm", disable=['parser', 'ner'])
        text_df['processed_text'] = text_df['processed_text'].map(lambda x: sent_to_words(x))
        text_df['processed_text'] = text_df['processed_text'].map(lambda x: [nlp(' '.join(sent)) for sent in remove_stopwords(x)])
        print(text_df.iloc[0,1])
        text_df['processed_text'] = text_df['processed_text'].map(lambda x: [token.lemma_ for token in x])
    else:
        text_df['processed_text'] = text_df['processed_text'].map(lambda x: sent_to_words(x))
        text_df['processed_text'] = text_df['processed_text'].map(lambda x: remove_stopwords(x))
        
    # Get bi- and tri-grams if requested
    if use_grams:
        bigrams, trigrams = get_grams(text_df)
        text_df['processed_text'] = text_df['processed_text'].str.join(' ')
        text_df['processed_text'] = text_df['processed_text'].map(lambda x: replace_ngram(x))
        text_df = text_df['processed_text'].map(lambda x: [word for word in x.split()
                                                 if word not in stop_words and len(word) > 2])

    # Keep only nouns if requested
    if only_nouns:
        text_df['processed_text'] = text_df['processed_text'].map(lambda x: noun_only(x))
    
    # Shuffle if requested
    if shuffle:
        return text_df.sample(frac=1)
    else:
        return text_df

### Modeling functions

In [34]:
def create_corpus(text_df):
    """
    Does final preprocessing for LDA model.
    
    parameters:
        text_df, df: index is paperId, columns are 'text' and 'processed_text', where 'text' is the raw text and
            'processed_text' should be used downstream
    
    returns:
        corpus, list: list of texts formatted for LDA input
        id2word, Dictionary object: vocabulary for model
    """
    # Create Dictionary
    id2word = corpora.Dictionary(text_df['processed_text'])
    # Create Corpus
    texts = text_df['processed_text']
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]
    
    return corpus, id2word

In [35]:
def optimize_topic_num(text_df, id2word, savename_prefix):
    """
    Optimizes the number of topics to use in a topic model. Code from the second tutorial.
    
    parameters:
        text_df, df: index is paperId, columns are 'text' and 'processed_text', where 'text' is the raw text and
            'processed_text' should be used downstream
        id2word, Dictionary object: vocabulary for model
        savename_prefix, str: string to prepend to save name
    """
    doc_term_matrix = [id2word.doc2bow(doc) for doc in text_df['processed_text']]
    coherence = []
    for k in range(5,25):
        print('Round: '+str(k))
        Lda = gensim.models.ldamodel.LdaModel
        ldamodel = Lda(doc_term_matrix, num_topics=k, 
                   id2word = id2word, passes=40,
                   iterations=200, chunksize = 10000, eval_every = None)

        cm = gensim.models.coherencemodel.CoherenceModel(
             model=ldamodel, texts=text_df['processed_text'],
             dictionary=id2word_titles, coherence='c_v')   

        coherence.append((k,cm.get_coherence()))
        
    x_val = [x[0] for x in coherence]
    y_val = [x[1] for x in coherence]
    to_save = pd.DataFrame({'x': x_val, 'y': y_val})
    to_save.to_csv(f'../data/citation_network/{savename_prefix}_topic_modeling_number_opt_numerical_results.csv')
    
    plt.plot(x_val,y_val)
    plt.scatter(x_val,y_val)
    plt.title('Number of Topics vs. Coherence')
    plt.xlabel('Number of Topics')
    plt.ylabel('Coherence')
    plt.xticks(x_val)
    plt.savefig(f'../data/citation_network/{savename_prefix}_topic_modeling_number_optimization_plot.png', format='png', dpi=600, bbox_inches='tight')

## Read in the data

In [16]:
with jsonlines.open('../data/semantic_scholar/vitrification_2000_22Nov2023.jsonl') as reader:
    vitrification = []
    for obj in reader:
        vitrification.append(obj)

In [21]:
def flatten_papers(papers):
    """
    Flatten the output of pull_papers.py
    """
    pids = []
    flat_papers = []
    for paper in papers:
        if paper['paperId'] not in pids:
            flat_papers.append({k: v for k,v in paper.items() if k != 'references'})
            pids.append(paper['paperId'])
        for ref in paper['references']:
            if ref['paperId'] not in pids:
                flat_papers.append(ref)
                pids.append(ref['paperId'])

    assert len(flat_papers) == len(set(pids))
    print(f'There are {len(flat_papers)} unique papers in this dataset.')

    return flat_papers

In [22]:
vitrification_flat = flatten_papers(vitrification)

There are 19398 unique papers in this dataset.


## Perform topic modeling

### Most basic preprocessing
Let's start with just the preprocessing options recommended by the original tutorial and only titles.

In [77]:
text_df = preprocess_for_tm(vitrification_flat, text_type='title')

In [78]:
text_df.head()

Unnamed: 0,text,processed_text
0edd534ccb9179f19583ef3fc416cd609039a387,Vitrification and Nanowarming of Kidneys,"[vitrification, nanowarming, kidneys]"
bf386927425fcc6ed46ddcb063fd95bfaa0f5d5c,"Perfusion, cryopreservation, and nanowarming o...","[perfusion, nanowarming, whole, hearts, using,..."
ee125be2113053283312659836d845135caf247d,Diffusion Limited Cryopreservation of Tissue w...,"[diffusion, limited, tissue, radiofrequency, h..."
55151ba4f1901f890e9ccfd2652be48c6a53222e,Magnetic heating of nanoparticles as a scalabl...,"[magnetic, heating, nanoparticles, scalable, t..."
adf4dbd6cd3e7ad5f09995c10051af6fbc023ea2,Hypothermic machine perfusion is superior to s...,"[hypothermic, machine, perfusion, superior, st..."


In [79]:
corpus, id2word = create_corpus(text_df)

In [83]:
num_topics = 10
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)

In [84]:
lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
pyLDAvis.display(lda_display)

### Shuffling only
How does shuffling affect these results?

In [85]:
text_df = preprocess_for_tm(vitrification_flat, text_type='title', shuffle=True)

In [86]:
text_df.head()

Unnamed: 0,text,processed_text
13dd77b5430943115ca57c46e49d25204c842032,Setting characteristics and cavity adaptation ...,"[setting, characteristics, cavity, adaptation,..."
ff0231446dbbc789b08397508b8870a805eae426,Collection and Characterization of Citrus indi...,"[collection, citrus, indica, tanaka, macropter..."
c5f3b640c64494c8b8ed7b0cb1fa94429ee57e0c,Clinical review 126: Roles and novel regimens ...,"[clinical, review, roles, novel, regimens, lut..."
916d95391f9215ce0631e4ae15f9da8744712668,Cryopreservation of kiwi shoot tips.,"[kiwi, shoot, tips]"
05c90f3979c4c1ce1788395df1bccd6877f04efc,First-order dissolution rate law and the role ...,"[first, order, dissolution, rate, law, role, s..."


In [87]:
corpus, id2word = create_corpus(text_df)
num_topics = 10
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)

In [88]:
lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
pyLDAvis.display(lda_display)

It looks like this dataset gets dominated by the animal/medical papers, which anecdotally appears to be true based on my small sample maunal classification and looking at the descriptive stats. What happens if we add the rest of the preprocessing options?

### All options

In [92]:
text_df = preprocess_for_tm(vitrification_flat, text_type='title', use_lemmas=True, use_grams=True, only_nouns=True, shuffle=True)

[v i t r i f i c a t i o n, n a n o w a r m i n g, k i d n e y s]


AttributeError: 'spacy.tokens.doc.Doc' object has no attribute 'lemma_'

In [None]:
text_df.head()