In [1]:
import pickle
import re

import gensim
import nltk
import numpy as np
import pandas as pd
import spacy
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.test.utils import datapath
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from tqdm.auto import tqdm


### Training the model

In [3]:
# Sample of abstracts used to train the LDA model.
abstracts = pd.read_csv(filepath_or_buffer='./data/lda_abstacts_sample.csv')

In [6]:
# Strip basic punctuation and lower-case in a single pass.  Previously the
# second assignment re-read the raw 'abstract' column, which silently threw
# away the punctuation removal performed on the line before it.
abstracts['abstract_processed'] = abstracts['abstract'].map(
    lambda text: re.sub(r'[,.!?]', '', text).lower()
)

In [7]:
def sent_to_words(sentences):
    """Lazily tokenize every item of ``sentences``.

    Each item is coerced with ``str`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``; one result is
    yielded per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

In [8]:
# Pull the cleaned abstracts out as a plain Python list, then tokenize them all.
data = abstracts['abstract_processed'].values.tolist()
data_words = [tokens for tokens in sent_to_words(data)]

KeyboardInterrupt: 

In [13]:
# Fit the bigram phrase model on the tokenized abstracts (the trigram model is
# currently disabled).
bigram = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=300,
)
# trigram = gensim.models.Phrases(bigram[data_words], threshold=300)

# Wrap the fitted model in a Phraser, which is what gets applied to documents.
bigram_mod = gensim.models.phrases.Phraser(bigram)
# trigram_mod = gensim.models.phrases.Phraser(trigram)

NameError: name 'bigram' is not defined

In [8]:
# Extra stop words appended to NLTK's English list when `stop_words` is built.
additional_stop_words = [
    'use', 'study', 'result', 'also', 'may',
    'find', 'method', 'system', 'however', 'suggest',
    'paper', 'include', 'increase', 'solution', 'change',
    'process', 'different', 'base', 'effect', 'rate',
]

In [9]:
# NLTK's English stop words followed by the extra stop words defined above.
stop_words = [*stopwords.words('english'), *additional_stop_words]

def remove_stopwords(texts):
    """Re-tokenize every document and drop the tokens that are stop words.

    Args:
        texts: iterable of documents; each one is converted with ``str`` and
            tokenized by ``simple_preprocess``.

    Returns:
        A list with one token list per document, without any token that
        appears in the notebook-level ``stop_words``.
    """
    # Build the lookup set once per call: testing membership against the
    # ``stop_words`` list is linear in its length and happens for every token.
    blocked = set(stop_words)
    return [
        [token for token in simple_preprocess(str(doc)) if token not in blocked]
        for doc in texts
    ]
def make_bigrams(texts):
    """Apply the notebook-level ``bigram_mod`` to every document.

    Returns a list containing ``bigram_mod[doc]`` for each ``doc`` of
    ``texts``, in order.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged
def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    NOTE(review): the line that would build ``trigram_mod`` is commented out in
    this notebook, so calling this raises ``NameError`` unless ``trigram_mod``
    is defined elsewhere first.
    """
    def _apply(tokens):
        return trigram_mod[bigram_mod[tokens]]

    return list(map(_apply, texts))
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize documents, keeping only tokens with an allowed POS tag.

    See https://spacy.io/api/annotation for the tag set.

    Args:
        texts: iterable of token lists; each list is joined with single spaces
            before being handed to the notebook-level spaCy pipeline ``nlp``.
        allowed_postags: coarse POS tags (``token.pos_``) to keep. The default
            list is only read, never mutated.

    Returns:
        A list with one list of lemmas (``token.lemma_``) per input document.
    """
    # nlp.pipe processes the texts in batches, which avoids paying the
    # per-call overhead of nlp(text) once for every document.
    joined_docs = (" ".join(sent) for sent in texts)
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

In [10]:
# Stop words are dropped first; the bigram model is then applied to what is left.
data_words_nostops = remove_stopwords(texts=data_words)
data_words_bigrams = make_bigrams(texts=data_words_nostops)

In [11]:
# Load the small English spaCy pipeline with the parser and NER components
# disabled (for efficiency).
nlp = spacy.load(
    "en_core_web_sm",
    disable=['parser', 'ner'],
)
# Lemmatize the bigrammed documents, keeping nouns, adjectives, verbs and adverbs.
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

In [12]:
# Show the first lemmatized document (still wrapped in a list).
print(data_lemmatized[0:1])

[['amino', 'acid', 'use', 'protein', 'formation', 'methylation', 'cancer', 'cell', 'require', 'particularly', 'high', 'methionine', 'supply', 'homeostasis', 'successful', 'approach', 'decrease', 'methionine', 'concentration', 'base', 'systemic', 'delivery', 'methionine', 'lyase', 'study', 'demonstrate', 'efficacy', 'cancer', 'therapy', 'mechanism', 'explain', 'cancer', 'cell', 'suffer', 'absence', 'significantly', 'non', 'malignant', 'cell', 'still', 'unclear', 'analyze', 'human', 'colorectal', 'adenocarcinoma', 'cancer', 'cell', 'line', 'exposure', 'monitor', 'cell', 'viability', 'expression', 'histone', 'post', 'translational', 'modification', 'presence', 'spurious', 'transcription', 'rationale', 'verify', 'reduce', 'methionine', 'supply', 'would', 'affect', 'decondensation', 'change', 'level', 'histone', 'methylation', 'therefore', 'increase', 'genomic', 'instability', 'treatment', 'show', 'time', 'dependent', 'cancer', 'cell', 'µg', 'hs', 'normal', 'cell', 'less', 'affected', 'sub'

In [13]:
# Token -> integer id mapping built from the lemmatized documents.
id2word = corpora.Dictionary(data_lemmatized)
# `texts` is kept as an alias of the lemmatized documents.
texts = data_lemmatized
# Bag-of-words encoding: one doc2bow result per document.
corpus = list(map(id2word.doc2bow, texts))
# Peek at the first encoded document.
print(corpus[0:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 6), (11, 8), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 2), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 2), (29, 1), (30, 1), (31, 1), (32, 5), (33, 2), (34, 1), (35, 1), (36, 1), (37, 2), (38, 1), (39, 1), (40, 1), (41, 1), (42, 2), (43, 1), (44, 1), (45, 1), (46, 2), (47, 1), (48, 1), (49, 1), (50, 5), (51, 4), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 2), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 2), (78, 1), (79, 1), (80, 2), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 3), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1)]]


In [14]:
# Persist the bag-of-words corpus, the dictionary and the lemmatized documents.
for target, payload in (
    ('./topic_modeling/corpus_newstop.pickle', corpus),
    ('./topic_modeling/id2word_newstop.pickle', id2word),
    ('./topic_modeling/data_lemmatized_newstop.pickle', data_lemmatized),
):
    with open(target, 'wb') as handle:
        pickle.dump(payload, handle)

In [21]:
# with open('./topic_modeling/corpus.pickle', 'wb') as c:
#     pickle.dump(corpus, c)
# with open('./topic_modeling/id2word.pickle', 'wb') as i:
#     pickle.dump(id2word, i)
# with open('./topic_modeling/data_words.pickle', 'wb') as d:
#     pickle.dump(data_words, d)
# with open('./topic_modeling/data_words_bigrams.pickle', 'wb') as d:
#     pickle.dump(data_words_bigrams, d)
# with open('./topic_modeling/data_lemmatized.pickle', 'wb') as d:
#     pickle.dump(data_lemmatized, d)

In [3]:
# Reload the artefacts pickled earlier in this notebook.  Each file is opened
# in a `with` block so the handle is closed right after loading (the bare
# `pickle.load(open(...))` form left all three handles open).
# NOTE: pickle.load can execute arbitrary code -- only load files you produced.
with open('./topic_modeling/corpus_newstop.pickle', 'rb') as fh:
    corpus = pickle.load(fh)
with open('./topic_modeling/id2word_newstop.pickle', 'rb') as fh:
    id2word = pickle.load(fh)
with open('./topic_modeling/data_lemmatized_newstop.pickle', 'rb') as fh:
    data_lemmatized = pickle.load(fh)

In [None]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, texts=None):
    """Train a ``k``-topic LDA model, save it, and return its c_v coherence.

    Args:
        corpus: bag-of-words corpus passed to ``LdaMulticore``.
        dictionary: id-to-word mapping used both for training and for the
            coherence computation.
        k: number of topics.
        texts: tokenized documents for the coherence model. Defaults to the
            notebook-level ``data_lemmatized`` so existing three-argument
            calls behave as before.

    Returns:
        The value of ``CoherenceModel.get_coherence()`` for the trained model.

    Side effects:
        Saves the trained model under
        ``/home/lliang06/sloan/topic_modeling/lda<k>d_newstop``.
    """
    if texts is None:
        # Backward-compatible fallback to the notebook-level documents.
        texts = data_lemmatized

    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k,
                                           random_state=42,
                                           chunksize=500)

    # Score with the same `dictionary` the model was trained on; the previous
    # version ignored the parameter and read the global `id2word` instead.
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')

    temp_file = datapath("/home/lliang06/sloan/topic_modeling/lda" + str(k) + "d_newstop")
    lda_model.save(temp_file)

    return coherence_model_lda.get_coherence()


In [None]:
# Train the 110-topic model with explicit alpha / eta priors and save it.
k = 110
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=k,
    random_state=42,
    chunksize=1000,
    passes=5,
    alpha=0.01,
    eta=0.31,
)

# NOTE(review): the coherence model is built here but get_coherence() is never
# called in this cell.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)

temp_file = datapath(
    "".join(["/home/lliang06/sloan/topic_modeling/lda", str(k), "d_tuned_alpha_beta_newstop"])
)
lda_model.save(temp_file)

### Computing LDA topic vectors for all abstracts

In [6]:
# Candidate abstracts to be scored with the trained LDA model.
abstracts = pd.read_parquet(path='potential_abstracts.parquet')

In [7]:
# Second source of abstracts, concatenated with `abstracts` further down.
dimension_energy_pi_pd = pd.read_csv(
    filepath_or_buffer='./dimension_energy_pi_abstracts.csv'
)

In [13]:
# Number of rows loaded from the parquet file.
len(abstracts.index)

5866978

In [8]:
# Stack both sources and drop fully duplicated rows.
dimension_abstracts_df = (
    pd.concat(objs=[abstracts, dimension_energy_pi_pd])
    .drop_duplicates()
)

In [11]:
# Strip basic punctuation and lower-case in a single pass.  Previously the
# second assignment re-read the raw 'abstract' column, which silently threw
# away the punctuation removal performed on the line before it.
dimension_abstracts_df['abstract_processed'] = dimension_abstracts_df['abstract'].map(
    lambda text: re.sub(r'[,.!?]', '', text).lower()
)

In [None]:
# Cache the id and cleaned-text columns so later cells can start from the file.
dimension_abstracts_df.loc[:, ['publication_id', 'abstract_processed']].to_parquet(
    './all_abstracts.parquet',
    index=False,
)

In [2]:
# Reload the cached id / cleaned-text table.
dimension_abstracts_df = pd.read_parquet(path='./all_abstracts.parquet')

In [3]:
def sent_to_words(sentences):
    """Lazily tokenize every item of ``sentences``.

    Each item is coerced with ``str`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``; one result is
    yielded per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

# Extra stop words appended to NLTK's English list when `stop_words` is built.
additional_stop_words = [
    'use', 'study', 'result', 'also',
    'may', 'find', 'method', 'system',
    'however', 'suggest', 'paper', 'include',
    'increase', 'solution', 'change', 'process',
    'different', 'base', 'effect', 'rate',
]
        
# NLTK's English stop words followed by the extra stop words defined above.
stop_words = [*stopwords.words('english'), *additional_stop_words]

def remove_stopwords(texts):
    """Re-tokenize every document and drop the tokens that are stop words.

    Args:
        texts: iterable of documents; each one is converted with ``str`` and
            tokenized by ``simple_preprocess``.

    Returns:
        A list with one token list per document, without any token that
        appears in the notebook-level ``stop_words``.
    """
    # Build the lookup set once per call: testing membership against the
    # ``stop_words`` list is linear in its length and happens for every token.
    blocked = set(stop_words)
    return [
        [token for token in simple_preprocess(str(doc)) if token not in blocked]
        for doc in texts
    ]
def make_bigrams(texts):
    """Apply the notebook-level ``bigram_mod`` to every document.

    Returns a list containing ``bigram_mod[doc]`` for each ``doc`` of
    ``texts``, in order.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged
def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    NOTE(review): no visible cell in this notebook builds ``trigram_mod``, so
    calling this raises ``NameError`` unless it is defined elsewhere first.
    """
    def _apply(tokens):
        return trigram_mod[bigram_mod[tokens]]

    return list(map(_apply, texts))
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize documents, keeping only tokens with an allowed POS tag.

    See https://spacy.io/api/annotation for the tag set.

    Args:
        texts: iterable of token lists; each list is joined with single spaces
            before being handed to the notebook-level spaCy pipeline ``nlp``.
        allowed_postags: coarse POS tags (``token.pos_``) to keep. The default
            list is only read, never mutated.

    Returns:
        A list with one list of lemmas (``token.lemma_``) per input document.
    """
    # nlp.pipe processes the texts in batches, which avoids paying the
    # per-call overhead of nlp(text) once for every document.
    joined_docs = (" ".join(sent) for sent in texts)
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

In [6]:
# Small English spaCy pipeline with the parser and NER components disabled.
nlp = spacy.load(
    "en_core_web_sm",
    disable=['parser', 'ner'],
)


# Preprocess the abstracts in fixed-size chunks and pickle each chunk's
# lemmatized documents to ./full_abstracts/.
chunk_size = 1000000
# Ceiling division: `len // chunk_size + 1` produced an extra, empty trailing
# chunk whenever the row count was an exact multiple of the chunk size.
n_chunks = -(-len(dimension_abstracts_df) // chunk_size)

for i in tqdm(range(n_chunks)):
    left = i * chunk_size
    right = left + chunk_size

    # Positional slicing: `.loc[left:right]` is label-based and only selects
    # the intended rows when the index is exactly 0..n-1.
    data = dimension_abstracts_df.iloc[left:right].abstract_processed.values.tolist()
    data_words = list(sent_to_words(data))

    # NOTE(review): the bigram model is re-fitted on every chunk, so the merged
    # phrases can differ between chunks -- confirm this is intended.
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=300)
    bigram_mod = gensim.models.phrases.Phraser(bigram)

    data_words_nostops = remove_stopwords(data_words)
    data_words_bigrams = make_bigrams(data_words_nostops)

    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    with open('./full_abstracts/data_full_abstracts_' + str(i) + '.pickle', 'wb') as c:
        pickle.dump(data_lemmatized, c)

#     id2word = corpora.Dictionary(data_lemmatized)

#     corpus = [id2word.doc2bow(text) for text in data_lemmatized]

#     with open('./full_abstracts/corpus_full_abstracts_' + str(i) + '.pickle', 'wb') as c:
#         pickle.dump(corpus, c)

    # Drop the per-chunk intermediates before the next chunk is loaded.
    del data
    del data_words
    del bigram
    del bigram_mod
    del data_words_nostops
    del data_words_bigrams
    del data_lemmatized
#     del id2word
#     del corpus

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=7.0), HTML(value='')))




In [17]:
# corpus = pickle.load(open('./topic_modeling/corpus_sampled_PI_old_id.pickle', 'rb'))
# Reload the dictionary pickled during training.  The `with` block closes the
# file handle, which the bare `pickle.load(open(...))` form never did.
# NOTE: pickle.load can execute arbitrary code -- only load files you produced.
with open('./topic_modeling/id2word_newstop.pickle', 'rb') as fh:
    id2word = pickle.load(fh)

In [12]:
# Load the saved 110-topic model from the path it was written to during training.
temp_file = datapath(
    "/home/lliang06/sloan/topic_modeling/lda110d_tuned_alpha_beta_newstop"
)
lda = LdaModel.load(temp_file)

In [34]:
# Infer a dense topic-probability vector for every abstract, one pickled chunk
# at a time, and store each chunk's matrix in its own .npy file.
#
# The previous version only created the .npy file once 10000 documents had
# been seen, so a chunk of 10000 or fewer documents failed at the final
# np.load (or appended to a stale file from an earlier run).  It also re-read
# and re-wrote the growing file every 10000 rows.  The matrix is now filled in
# memory and written exactly once per chunk; rows and their order are the same.
# NOTE(review): the chunk count (7) is hard-coded and must match the number of
# ./full_abstracts/data_full_abstracts_<j>.pickle files.
for j in range(7):
    # NOTE: pickle.load can execute arbitrary code -- only load files produced
    # by the preprocessing loop of this notebook.
    with open('./full_abstracts/data_full_abstracts_' + str(j) + '.pickle', 'rb') as f:
        data = pickle.load(f)
    corpus = [id2word.doc2bow(text) for text in data]
    vec_iter = lda.get_document_topics(corpus, minimum_probability = 0)

    all_vec = None
    for i, v in tqdm(enumerate(vec_iter), total=len(corpus)):
        # Keep only the probabilities of the (topic_id, probability) pairs.
        lda_vec = np.array([w[1] for w in v])
        if all_vec is None:
            # Allocate once, taking width and dtype from the first row; a later
            # row of a different length raises, as the old np.vstack did.
            all_vec = np.empty((len(corpus), lda_vec.shape[0]), dtype=lda_vec.dtype)
        all_vec[i] = lda_vec

    if all_vec is None:
        # Empty chunk: still write a well-formed array with zero rows.
        all_vec = np.empty((0, lda.num_topics))

    with open('./full_abstracts/lda_110d_fin_sampled_PI_abstracts_' + str(j) + '.npy', 'wb') as f:
        np.save(f, all_vec)

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [76]:
# Write the publication ids, in DataFrame order, to a CSV without the index.
dimension_abstracts_df.loc[:, ['publication_id']].to_csv(
    './lda_pubid.csv',
    index=False,
)