<a href="https://colab.research.google.com/github/pyaguega/LDA-document-ranking/blob/main/Merck_LDA_Document_Ranking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def extract_keywords(text, n_keywords=15):
    """Return the terms with the largest summed TF-IDF weight.

    Parameters
    ----------
    text : str or iterable of str
        Passages/sentences to analyse. A bare string is treated as a
        single passage.
    n_keywords : int, default 15
        Maximum number of keywords to return. Values <= 0 yield an
        empty list.

    Returns
    -------
    list of str
        Up to ``n_keywords`` vocabulary terms, ordered from the highest
        summed TF-IDF weight to the lowest.

    Raises
    ------
    ValueError
        Propagated from ``TfidfVectorizer.fit_transform`` when no
        vocabulary can be built (e.g. the input contains only stop words).
    """
    # A bare string would otherwise be rejected by the vectorizer, which
    # expects an iterable of documents.
    if isinstance(text, str):
        text = [text]

    # Guard explicitly: a slice such as [-0:] would select *every* term.
    if n_keywords <= 0:
        return []

    vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
    tfidf_matrix = vectorizer.fit_transform(text)

    # Vocabulary terms, aligned with the columns of tfidf_matrix.
    feature_names = vectorizer.get_feature_names_out()

    # Sum the weights of each term over all documents directly on the sparse
    # matrix, then flatten to a plain 1-D array. Working with np.matrix
    # objects here is what previously made the function return a nested
    # 2-D array instead of a flat list of strings.
    term_scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()

    # Indices of the best-scoring terms, highest score first.
    top_indices = np.argsort(term_scores)[::-1][:n_keywords]

    return feature_names[top_indices].tolist()

# Example usage: extract keywords from one abstract about design of
# experiments (DoE) in chromatography and print them.
# NOTE(review): `text` holds a single document, so every term presumably gets
# the same IDF and the ranking is effectively driven by term frequency --
# confirm this is the intended demo.
text = ["""The ability of a chromatographic method to successful separate, identify and quantitate species
is determined by many factors, many of which are in the control of the experimenter. When attempting to discover
the important factors and then optimise a response by tuning these factors, experimental design (design of experiments, DoE)
gives a powerful suite of statistical methodology. Advantages include modelling by empirical functions, not requiring detailed
knowledge of the underlying physico-chemical properties of the system, a defined number of experiments to be performed, and
available software to accomplish the task. Two uses of DoE in chromatography are for showing lack of significant effects in
robustness studies for method validation, and for identifying significant factors and then optimising a response with respect
to them in method development. Plackett–Burman designs are widely used in validation studies, and fractional factorial designs
and their extensions such as central composite designs are the most popular optimisers. Box–Behnken and Doehlert designs are
becoming more used as efficient alternatives. If it is not possible to practically realise values of the factors required by
experimental designs, or if there is a constraint on the total number of experiments that can be done, then D-optimal designs
can be very powerful. Examples of the use of DoE in chromatography are reviewed. Recommendations are given on how to report DoE
studies in the literature."""]
print(extract_keywords(text))

[array([['experimental', 'chromatography', 'significant', 'used',
        'response', 'design', 'number', 'validation', 'powerful',
        'studies', 'method', 'experiments', 'doe', 'factors', 'designs']],
      dtype=object)]


In [None]:
# LDA CODE BELOW

In [None]:
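# NOTE: the commented text below appears to be saved output from an earlier
# run of the 4-topic LDA cell near the end of this notebook; kept for reference.
# [(0,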
#   '0.083*"chromatography" + 0.029*"liquid" + 0.023*"used" + 0.021*"separation" '
#   '+ 0.019*"phase" + 0.017*"substances" + 0.017*"analysis" + 0.017*"form" + '
#   '0.016*"ion" + 0.016*"thermally"'),
#  (1,
#   '0.044*"chromatography" + 0.022*"liquid" + 0.022*"phase" + '
#   '0.022*"separation" + 0.019*"used" + 0.016*"ion" + 0.016*"technique" + '
#   '0.016*"form" + 0.016*"analysis" + 0.015*"organic"'),
#  (2,
#   '0.044*"chromatography" + 0.024*"liquid" + 0.019*"used" + 0.019*"phase" + '
#   '0.018*"separation" + 0.017*"thermally" + 0.016*"technique" + '
#   '0.016*"organic" + 0.016*"mobile" + 0.015*"ion"'),
#  (3,
#   '0.066*"chromatography" + 0.028*"liquid" + 0.024*"phase" + '
#   '0.024*"separation" + 0.022*"used" + 0.018*"mobile" + 0.018*"gas" + '
#   '0.018*"organic" + 0.017*"materials" + 0.016*"thermally"')]


# Topic: 0
# Words: chromatography|liquid|used|separation|phase|substances|analysis|form|ion|thermally|technique|materials|gas|organic|mobile|phases|particularly|various|increasing|proteins|species|affinities|reference|reagent|two|stationary|ions|allows|stable|simple
# Topic: 1
# Words: chromatography|liquid|phase|separation|used|ion|technique|form|analysis|organic|gas|mobile|materials|thermally|substances|exchange|chemicals|complex|cations|chemical|useful|thin|number|purity|analytical|branches|simple|scope|also|principles
# Topic: 2
# Words: chromatography|liquid|used|phase|separation|thermally|technique|organic|mobile|ion|substances|gas|materials|analysis|form|branches|compounds|volatile|main|assay|identification|analytical|determination|number|whether|allows|structure|mixture|useful|various
# Topic: 3
# Words: chromatography|liquid|phase|separation|used|mobile|gas|organic|materials|thermally|substances|form|ion|technique|analysis|molecular|exchange|principally|compounds|diverse|scope|divided|metal|individual|mixture|differences|ranging|determined|assay|volatile

In [None]:
pip install stop-words

Collecting stop-words
  Downloading stop-words-2018.7.23.tar.gz (31 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: stop-words
  Building wheel for stop-words (setup.py) ... [?25l[?25hdone
  Created wheel for stop-words: filename=stop_words-2018.7.23-py3-none-any.whl size=32896 sha256=c7e0439e825fdc69dd42e61e4448a77c0d61edaadb08be7a9cca4fdf1ce99055
  Stored in directory: /root/.cache/pip/wheels/d0/1a/23/f12552a50cb09bcc1694a5ebb6c2cd5f2a0311de2b8c3d9a89
Successfully built stop-words
Installing collected packages: stop-words
Successfully installed stop-words-2018.7.23


In [None]:
import pandas as pd
import os
import gensim
from stop_words import get_stop_words

from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Demo document: one abstract about chromatography, held as a single string.
abstract = "Chromatography is an analytical technique used in the quantitative determination of the purity of most organic and an increasing number of inorganic reagent chemicals and standard-grade reference materials. The broad scope of chromatography allows it to be used in the separation, identification, and assay of diverse chemical species, ranging from simple metal ions to compounds of complex molecular structure, such as proteins. In chromatography, the separation of individual components in a mixture is achieved when a mobile phase is passed over a stationary phase. Differences in affinities of various substances for these phases result in their separation. Chromatography can be divided into two main branches, depending on whether the mobile phase is a gas or a liquid. Gas chromatography is principally used for analysis of volatile, thermally stable materials. Liquid chromatography is particularly useful for analysis of nonvolatile or thermally unstable organic substances. Ion chromatography, a technique in which anions and cations can be determined by using the principles of ion exchange, is a form of liquid chromatography. Thin-layer chromatography, often called planar chromatography, is also a form of liquid chromatography."

# The same abstract, wrapped over several lines for easier reading
# (comment only -- the string above is what the code uses):

# "Chromatography is an analytical technique used in
# the quantitative determination of the purity of most organic and an
# increasing number of inorganic reagent chemicals and standard-grade
# reference materials. The broad scope of chromatography allows it to
# be used in the separation, identification, and assay of diverse
# chemical species, ranging from simple metal ions to compounds of
# complex molecular structure, such as proteins. In chromatography,
# the separation of individual components in a mixture is achieved
# when a mobile phase is passed over a stationary phase. Differences
# in affinities of various substances for these phases result in their
# separation. Chromatography can be divided into two main branches,
# depending on whether the mobile phase is a gas or a liquid.
# Gas chromatography is principally used for analysis of volatile,
# thermally stable materials. Liquid chromatography is particularly
# useful for analysis of nonvolatile or thermally unstable organic
# substances. Ion chromatography, a technique in which anions and
# cations can be determined by using the principles of ion exchange,
# is a form of liquid chromatography. Thin-layer chromatography,
# often called planar chromatography, is also a form of liquid
# chromatography."


# Show the raw text that the following cells tokenise.
print(abstract)

Chromatography is an analytical technique used in the quantitative determination of the purity of most organic and an increasing number of inorganic reagent chemicals and standard-grade reference materials. The broad scope of chromatography allows it to be used in the separation, identification, and assay of diverse chemical species, ranging from simple metal ions to compounds of complex molecular structure, such as proteins. In chromatography, the separation of individual components in a mixture is achieved when a mobile phase is passed over a stationary phase. Differences in affinities of various substances for these phases result in their separation. Chromatography can be divided into two main branches, depending on whether the mobile phase is a gas or a liquid. Gas chromatography is principally used for analysis of volatile, thermally stable materials. Liquid chromatography is particularly useful for analysis of nonvolatile or thermally unstable organic substances. Ion chromatograp

In [None]:
from nltk.tokenize import RegexpTokenizer
# Preprocessing tools: a tokenizer that keeps runs of word characters,
# a Porter stemmer, and the English stop-word list from `stop_words`.
tokenizer = RegexpTokenizer(r'\w+')
p_stemmer = PorterStemmer()
en_stop = get_stop_words('en')

# Lower-case the abstract, then split it into individual word tokens.
raw = abstract.lower()
tokens = tokenizer.tokenize(raw)

# Keep only the tokens that are not in the stop-word list.
stopped_tokens = list(filter(lambda token: token not in en_stop, tokens))

print(stopped_tokens)

['chromatography', 'analytical', 'technique', 'used', 'quantitative', 'determination', 'purity', 'organic', 'increasing', 'number', 'inorganic', 'reagent', 'chemicals', 'standard', 'grade', 'reference', 'materials', 'broad', 'scope', 'chromatography', 'allows', 'used', 'separation', 'identification', 'assay', 'diverse', 'chemical', 'species', 'ranging', 'simple', 'metal', 'ions', 'compounds', 'complex', 'molecular', 'structure', 'proteins', 'chromatography', 'separation', 'individual', 'components', 'mixture', 'achieved', 'mobile', 'phase', 'passed', 'stationary', 'phase', 'differences', 'affinities', 'various', 'substances', 'phases', 'result', 'separation', 'chromatography', 'can', 'divided', 'two', 'main', 'branches', 'depending', 'whether', 'mobile', 'phase', 'gas', 'liquid', 'gas', 'chromatography', 'principally', 'used', 'analysis', 'volatile', 'thermally', 'stable', 'materials', 'liquid', 'chromatography', 'particularly', 'useful', 'analysis', 'nonvolatile', 'thermally', 'unstab

In [None]:
# English stop-word list from NLTK.
stop_words = stopwords.words('english')

# Normalise case and split the document string into word tokens.
raw = abstract.lower()
tokens = tokenizer.tokenize(raw)

# Discard every token found in the stop-word list.
stopped_tokens = list(filter(lambda word: word not in stop_words, tokens))

# Stemmed form of the remaining tokens (computed, but not stored in `texts`).
stemmed_tokens = list(map(p_stemmer.stem, stopped_tokens))

# `texts` is the list of tokenised documents; here it holds the single
# un-stemmed document. Use `stemmed_tokens` instead to model word stems.
texts = [stopped_tokens]

print(texts)



[['chromatography', 'analytical', 'technique', 'used', 'quantitative', 'determination', 'purity', 'organic', 'increasing', 'number', 'inorganic', 'reagent', 'chemicals', 'standard', 'grade', 'reference', 'materials', 'broad', 'scope', 'chromatography', 'allows', 'used', 'separation', 'identification', 'assay', 'diverse', 'chemical', 'species', 'ranging', 'simple', 'metal', 'ions', 'compounds', 'complex', 'molecular', 'structure', 'proteins', 'chromatography', 'separation', 'individual', 'components', 'mixture', 'achieved', 'mobile', 'phase', 'passed', 'stationary', 'phase', 'differences', 'affinities', 'various', 'substances', 'phases', 'result', 'separation', 'chromatography', 'divided', 'two', 'main', 'branches', 'depending', 'whether', 'mobile', 'phase', 'gas', 'liquid', 'gas', 'chromatography', 'principally', 'used', 'analysis', 'volatile', 'thermally', 'stable', 'materials', 'liquid', 'chromatography', 'particularly', 'useful', 'analysis', 'nonvolatile', 'thermally', 'unstable', '

In [None]:
# Build the id <-> term dictionary from the tokenised documents.
dictionary = corpora.Dictionary(texts)

# Bag-of-words representation: each document becomes a list of
# (term id, number of occurrences in that document) pairs.
corpus = list(map(dictionary.doc2bow, texts))

print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 11), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 2), (26, 2), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 2), (33, 1), (34, 1), (35, 4), (36, 1), (37, 2), (38, 1), (39, 1), (40, 2), (41, 1), (42, 1), (43, 1), (44, 1), (45, 2), (46, 1), (47, 1), (48, 3), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 3), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 2), (69, 2), (70, 2), (71, 1), (72, 1), (73, 1), (74, 3), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1)]]


In [None]:
from pprint import pprint
# Fit a 2-topic LDA model.
#   corpus       - bag-of-words documents built above
#   id2word      - dictionary mapping term ids back to strings
#   num_topics   - number of topics to generate
#   passes       - number of passes the model makes through the corpus
#   random_state - fixed seed so the topics do not change arbitrarily on
#                  every re-run
# NOTE(review): with LdaMulticore, worker scheduling may still introduce
# small run-to-run differences -- confirm against the gensim documentation.
ldamodel = gensim.models.LdaMulticore(corpus=corpus,
                                      id2word=dictionary,
                                      num_topics=2,
                                      passes=10,
                                      random_state=42)
pprint(ldamodel.print_topics())

# Author's reading of the two topics from an earlier run (re-check after
# re-running, since topic order and content depend on the fitted model):
# topic 1 - separation (of chemicals)
# topic 2 - types (of chromatography)

[(0,
  '0.073*"chromatography" + 0.030*"liquid" + 0.023*"used" + 0.023*"phase" + '
  '0.022*"separation" + 0.017*"gas" + 0.017*"organic" + 0.017*"materials" + '
  '0.017*"ion" + 0.016*"substances"'),
 (1,
  '0.045*"chromatography" + 0.020*"liquid" + 0.019*"separation" + '
  '0.018*"phase" + 0.018*"used" + 0.015*"form" + 0.015*"technique" + '
  '0.015*"analysis" + 0.015*"thermally" + 0.015*"mobile"')]


In [None]:
import re
import numpy as np
import pandas as  pd
from pprint import pprint
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# spaCy for preprocessing
import spacy
# Plotting tools
import matplotlib.pyplot as plt
%matplotlib inline

# Per-word likelihood bound and perplexity.
# `log_perplexity` returns the per-word likelihood bound, not the perplexity
# itself: a higher (closer to zero) bound is better. The perplexity is
# derived from it as 2 ** (-bound); a lower perplexity is better.
per_word_bound = ldamodel.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', 2 ** (-per_word_bound))

# Coherence score (c_v): the higher the topic coherence, the more
# human-interpretable the topics are.
# NOTE(review): `texts` contains a single document and the recorded score is
# ~1.0, which likely reflects the tiny corpus rather than topic quality --
# confirm on a larger corpus.
coherence_model_lda = CoherenceModel(model=ldamodel, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -4.826256224868494

Coherence Score:  0.9999999999999998


In [None]:
# Print out the topics, the keywords for each topic, and the weight of
# each keyword within the topic.

from pprint import pprint

# Train a 4-topic model with a fixed seed so the printed topics do not change
# arbitrarily between runs.
# NOTE: this rebinds `ldamodel`, replacing the 2-topic model fitted earlier;
# any cell that evaluates `ldamodel` and is re-run after this one will score
# the 4-topic model instead.

# printing option 1: gensim's formatted summary of each topic
ldamodel = gensim.models.LdaMulticore(corpus=corpus,
                                      id2word=dictionary,
                                      num_topics=4,
                                      random_state=42)
pprint(ldamodel.print_topics())

print("\n")

# printing option 2: the 30 highest-weighted words per topic, pipe-separated
for idx, topic in ldamodel.show_topics(formatted=False, num_words=30):
    print('Topic: {} \nWords: {}'.format(idx, '|'.join(word for word, _ in topic)))



[(0,
  '0.075*"chromatography" + 0.033*"liquid" + 0.025*"used" + 0.021*"phase" + '
  '0.019*"separation" + 0.018*"ion" + 0.018*"thermally" + 0.017*"analysis" + '
  '0.017*"technique" + 0.017*"form"'),
 (1,
  '0.031*"chromatography" + 0.017*"liquid" + 0.016*"used" + 0.016*"separation" '
  '+ 0.016*"phase" + 0.015*"gas" + 0.015*"thermally" + 0.014*"substances" + '
  '0.014*"organic" + 0.014*"mobile"'),
 (2,
  '0.070*"chromatography" + 0.025*"liquid" + 0.024*"phase" + '
  '0.024*"separation" + 0.022*"used" + 0.018*"materials" + 0.017*"form" + '
  '0.017*"gas" + 0.017*"substances" + 0.016*"organic"'),
 (3,
  '0.059*"chromatography" + 0.025*"liquid" + 0.024*"separation" + '
  '0.020*"phase" + 0.018*"used" + 0.018*"organic" + 0.017*"mobile" + '
  '0.017*"ion" + 0.016*"substances" + 0.015*"materials"')]


Topic: 0 
Words: chromatography|liquid|used|phase|separation|ion|thermally|analysis|technique|form|gas|mobile|substances|materials|organic|standard|mixture|determination|volatile|purity|refe