In [1]:
import pandas as pd

In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint


In [3]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [4]:
import spacy
import nltk

In [5]:
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt

In [6]:
# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [7]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\liuyi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
'is' in stop_words

True

In [9]:
df = pd.read_csv('start_db.csv')

In [10]:
data = df.abstract.values.tolist()

In [11]:
data[0]

'Two flexible subcomponents, namely tris(4-formylphenyl)phosphate and tris(2-aminoethyl)amine, are assembled into a tetrapodal [4\u2009+\u20094] cage depending on the solvent effect. Single-crystal structure analysis reveals that the caivity is surrounded by four phosphate uints. Good selectivity of CO2 adsorption over CH4 is demonstrated by the gas adsorption experiment.'

In [12]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
data_words = list(sent_to_words(data))

In [13]:
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['two', 'flexible', 'subcomponents', 'namely', 'tris', 'formylphenyl', 'phosphate', 'and', 'tris', 'aminoethyl', 'amine', 'are', 'assembled', 'into', 'tetrapodal', 'cage', 'depending', 'on', 'the', 'solvent', 'effect', 'single', 'crystal', 'structure', 'analysis', 'reveals', 'that', 'the', 'caivity', 'is', 'surrounded', 'by', 'four', 'phosphate', 'uints', 'good', 'selectivity', 'of', 'co', 'adsorption', 'over', 'ch', 'is', 'demonstrated', 'by', 'the', 'gas', 'adsorption', 'experiment']


In [14]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [15]:
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)


In [16]:
import sys
!{sys.executable} -m spacy download en

symbolic link created for C:\Users\liuyi\Anaconda3\envs\my_py_env\lib\site-packages\spacy\data\en <<===>> C:\Users\liuyi\Anaconda3\envs\my_py_env\lib\site-packages\en_core_web_sm
[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')
[+] Linking successful
C:\Users\liuyi\Anaconda3\envs\my_py_env\lib\site-packages\en_core_web_sm -->
C:\Users\liuyi\Anaconda3\envs\my_py_env\lib\site-packages\spacy\data\en
You can now load the model via spacy.load('en')


In [17]:
nlp = spacy.load('en', disable=['parser', 'ner'])

In [18]:
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [42]:
data_lemmatized[0]

['flexible',
 'subcomponent',
 'namely',
 'aminoethyl',
 'assemble',
 'cage',
 'depend',
 'solvent',
 'effect',
 'single',
 'crystal',
 'structure',
 'analysis',
 'reveal',
 'caivity',
 'surround',
 'phosphate',
 'good',
 'selectivity',
 'co',
 'adsorption',
 'demonstrate',
 'gas',
 'experiment']

In [19]:
id2word = corpora.Dictionary(data_lemmatized)

In [20]:
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1)]]


In [21]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=3, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [22]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.025*"polymer" + 0.023*"swnt" + 0.015*"high" + 0.013*"synthesis" + '
  '0.010*"conjugate" + 0.008*"demonstrate" + 0.008*"approach" + 0.008*"yield" '
  '+ 0.008*"condition" + 0.008*"use"'),
 (1,
  '0.020*"macrocycle" + 0.012*"polymer" + 0.012*"cage" + 0.009*"formation" + '
  '0.009*"synthesis" + 0.008*"crystal" + 0.008*"show" + 0.007*"material" + '
  '0.006*"imine" + 0.006*"group"'),
 (2,
  '0.013*"cage" + 0.013*"material" + 0.011*"imine" + 0.010*"precursor" + '
  '0.010*"assembly" + 0.010*"geometry" + 0.010*"link" + 0.009*"porous" + '
  '0.008*"high" + 0.008*"organic"')]


In [41]:
len(doc_lda)

19

In [48]:
for i, row in enumerate(doc_lda):
    r = sorted(row)

TypeError: '<' not supported between instances of 'tuple' and 'int'

In [23]:
print('\nPerplexity: ', lda_model.log_perplexity(corpus)) 


Perplexity:  -6.456093171717518


In [24]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word,coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

In [25]:
coherence_lda

0.4129310809353775

In [26]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In [55]:
pyLDAvis.save_html(vis, 'lda.html')


In [34]:
lda_model.show_topics()

[(0,
  '0.025*"polymer" + 0.023*"swnt" + 0.015*"high" + 0.013*"synthesis" + 0.010*"conjugate" + 0.008*"demonstrate" + 0.008*"approach" + 0.008*"yield" + 0.008*"condition" + 0.008*"use"'),
 (1,
  '0.020*"macrocycle" + 0.012*"polymer" + 0.012*"cage" + 0.009*"formation" + 0.009*"synthesis" + 0.008*"crystal" + 0.008*"show" + 0.007*"material" + 0.006*"imine" + 0.006*"group"'),
 (2,
  '0.013*"cage" + 0.013*"material" + 0.011*"imine" + 0.010*"precursor" + 0.010*"assembly" + 0.010*"geometry" + 0.010*"link" + 0.009*"porous" + 0.008*"high" + 0.008*"organic"')]

In [47]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

IndexError: list index out of range

In [49]:
lda_model.show_topic(0)

[('polymer', 0.025048088),
 ('swnt', 0.022585034),
 ('high', 0.015334685),
 ('synthesis', 0.012981965),
 ('conjugate', 0.010492758),
 ('demonstrate', 0.008078129),
 ('approach', 0.008076609),
 ('yield', 0.008076005),
 ('condition', 0.008074787),
 ('use', 0.008071121)]

In [54]:
lda_model[corpus][1]

([(2, 0.99852234)],
 [(4, [2]),
  (24, [2]),
  (25, [2]),
  (26, [2]),
  (27, [2]),
  (28, [2]),
  (29, [2]),
  (30, [2]),
  (31, [2]),
  (32, [2]),
  (33, [2]),
  (34, [2]),
  (35, [2]),
  (36, [2]),
  (37, [2]),
  (38, [2]),
  (39, [2]),
  (40, [2]),
  (41, [2]),
  (42, [2]),
  (43, [2]),
  (44, [2]),
  (45, [2]),
  (46, [2]),
  (47, [2]),
  (48, [2]),
  (49, [2]),
  (50, [2]),
  (51, [2]),
  (52, [2]),
  (53, [2]),
  (54, [2]),
  (55, [2]),
  (56, [2]),
  (57, [2]),
  (58, [2]),
  (59, [2]),
  (60, [2]),
  (61, [2]),
  (62, [2]),
  (63, [2]),
  (64, [2]),
  (65, [2]),
  (66, [2]),
  (67, [2])],
 [(4, [(2, 2.999971)]),
  (24, [(2, 0.9999236)]),
  (25, [(2, 0.9999235)]),
  (26, [(2, 0.9999765)]),
  (27, [(2, 0.99998254)]),
  (28, [(2, 0.9999236)]),
  (29, [(2, 0.9999237)]),
  (30, [(2, 0.99992365)]),
  (31, [(2, 0.9999237)]),
  (32, [(2, 0.99997646)]),
  (33, [(2, 0.9999236)]),
  (34, [(2, 0.99992526)]),
  (35, [(2, 0.9999236)]),
  (36, [(2, 0.99992365)]),
  (37, [(2, 0.9999236)]),
  