In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm 
import pickle
import re
from pprint import pprint

import gensim
from gensim.models.ldamulticore import LdaModel
from gensim.test.utils import datapath
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.porter import PorterStemmer
from gensim.utils import simple_preprocess
from gensim.models.nmf import Nmf
import gensim.corpora as corpora
from nltk.corpus import stopwords
import spacy

from utils import cleantext

In [5]:
# Load the abstracts sample from a CSV file; the path is relative to the
# notebook's working directory.
# NOTE(review): the file name is spelled 'abstacts' — confirm it matches the file on disk.
abstracts = pd.read_csv('./data/lda_abstacts_sample.csv')

In [6]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
abstracts.shape

(640908, 2)

In [6]:
# Clean every raw abstract with the project helper and keep the result in a
# new column, leaving the original 'abstract' column untouched.
abstracts['abstract_processed'] = abstracts['abstract'].map(cleantext.clean_abstract)

In [10]:
# Extra words to filter out in addition to NLTK's English stop-word list.
additional_stop_words = [
    'use', 'study', 'result', 'also', 'may', 'find', 'method', 'system',
    'however', 'suggest', 'paper', 'include', 'increase', 'solution',
    'change', 'process', 'different', 'base', 'effect', 'rate', 'Â°',
]

# Combined stop-word list (kept as a list; it is also passed to other helpers).
stop_words = [*stopwords.words('english'), *additional_stop_words]

# spaCy pipeline with the dependency parser and NER components switched off.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

def remove_lowinfo_words(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize documents and keep only informative tokens.

    Every text is run through the module-level spaCy pipeline ``nlp``. A token
    contributes its lemma to the output only when all of the following hold:

    * its coarse part-of-speech tag (``token.pos_``) is in ``allowed_postags``;
    * neither its surface text nor its lemma is in the module-level
      ``stop_words`` collection;
    * both its surface text and its lemma are longer than one character.

    Tag reference: https://spacy.io/api/annotation

    Parameters
    ----------
    texts : iterable of str
        Documents to process, one string per document.
    allowed_postags : iterable of str, optional
        Coarse POS tags to keep.

    Returns
    -------
    list of list of str
        One list of lemmas per input document, in input order.
    """
    # Build hash-based lookups once; the original scanned a list for every
    # token, which is needlessly slow on a large corpus.
    allowed = frozenset(allowed_postags)
    stops = frozenset(stop_words)

    # tqdm cannot infer a length from the nlp.pipe generator, so pass it on
    # explicitly whenever the input is sized.
    total = len(texts) if hasattr(texts, '__len__') else None

    texts_out = []
    # nlp.pipe batches documents internally and yields the same Doc objects
    # that calling nlp(text) one at a time would produce.
    for doc in tqdm(nlp.pipe(texts), total=total):
        kept = []
        for token in doc:
            surface = str(token)
            lemma = str(token.lemma_)
            if token.pos_ not in allowed:
                continue
            if surface in stops or lemma in stops:
                continue
            if len(surface) <= 1 or len(lemma) <= 1:
                continue
            kept.append(lemma)
        texts_out.append(kept)
    return texts_out

In [11]:
# Lemmatize and filter every processed abstract; yields one token list per row.
processed_abstracts = abstracts['abstract_processed'].tolist()
unigram = remove_lowinfo_words(processed_abstracts)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=640908.0), HTML(value='')))




In [12]:
# Bigrams are built from the filtered unigram tokens (re-joined into a
# sentence and split again), not from the raw processed abstracts.
sents = [' '.join(words) for words in unigram]
bigram = [
    cleantext.make_ngrams(sentence.split(), ngram_range=(2, 2), stop_words=stop_words)
    for sentence in sents
]

In [13]:
# Bigram tokens (already joined with '_') to drop from the token lists.
# The list previously contained 'title_introduction' twice; the duplicate is
# removed, which leaves membership tests unchanged.
bi_stopwords = ['title_introduction', 'section_title', 'chapter_present', 'chapter_contain',
                'contain_section', 'article_doi', 'Â°_c', 'review_ref', 'review_recent', '%_%']

In [14]:
# Turn each space-separated n-gram into a single underscore-joined token.
bi_gram = [
    ['_'.join(ngram.split()) for ngram in doc_ngrams]
    for doc_ngrams in bigram
]

In [17]:
# Drop every bigram token that appears in the bigram stop list.
bi_gram_stop = [
    [token for token in doc_bigrams if token not in bi_stopwords]
    for doc_bigrams in bi_gram
]

In [20]:
# Per document: unigram tokens followed by the surviving bigram tokens.
tokens = [uni + bi for uni, bi in zip(unigram, bi_gram_stop)]

In [3]:
# Load previously pickled artifacts from disk. Context managers make sure each
# file handle is closed as soon as its object has been read.
# WARNING: pickle.load can execute arbitrary code — only load files you trust.
# Note: this rebinds `tokens` to the pickled value.
with open('./data/lda/tokens_lemm_bistop.pickle', 'rb') as fh:
    tokens = pickle.load(fh)

# NOTE(review): the name suggests a TF-IDF weighted corpus — confirm how it was built.
with open('./data/lda/corpus_sktoken_lemm_bistop.pickle', 'rb') as fh:
    corpus_tfidf = pickle.load(fh)

with open('./data/lda/id2word_sktoken_lemm_bistop.pickle', 'rb') as fh:
    id2word = pickle.load(fh)

In [4]:
# Train a 100-topic LDA model; alpha and eta are not passed, so the library
# defaults apply. random_state is fixed for repeatable runs.
lda_model = gensim.models.LdaMulticore(
    corpus=corpus_tfidf,
    id2word=id2word,
    num_topics=100,
    random_state=100,
    chunksize=100,
    passes=3,
    workers=8,
)

In [5]:
# Score the current model with the 'u_mass' coherence measure and print it.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    corpus=corpus_tfidf,
    dictionary=id2word,
    coherence='u_mass',
)
print(coherence_model_lda.get_coherence())

-14.427762400511604


In [6]:
# Show the 50 highest-weighted terms for each of the 100 topics.
lda_model.print_topics(num_topics=100, num_words=50)

[(0,
  '0.000*"flow_lead" + 0.000*"atom_occur" + 0.000*"adduct_form" + 0.000*"symmetry_analysis" + 0.000*"similar_distribution" + 0.000*"function_characterize" + 0.000*"follow_similar" + 0.000*"emission_region" + 0.000*"distribution_derive" + 0.000*"deterministic_stochastic" + 0.000*"analysis_distribution" + 0.000*"demonstrate_complete" + 0.000*"compare_analogous" + 0.000*"temperature_substrate" + 0.000*"substrate_water" + 0.000*"substrate_give" + 0.000*"solvent_medium" + 0.000*"reaction_equilibrium" + 0.000*"product_yield" + 0.000*"parameter_yield" + 0.000*"influence_reaction" + 0.000*"response_improve" + 0.000*"contain_molecule" + 0.000*"dmp" + 0.000*"form_structure" + 0.000*"formation_structure" + 0.000*"bed_bioreactor" + 0.000*"etoposide" + 0.000*"susceptibility_datum" + 0.000*"non_coding" + 0.000*"need_validate" + 0.000*"highlight_critical" + 0.000*"visible_absorption" + 0.000*"shift_nm" + 0.000*"quantum_size" + 0.000*"ordinate" + 0.000*"demonstrate_reaction" + 0.000*"ion_co" + 0.

In [None]:
# Train a 100-topic LDA model with explicit priors (alpha = eta = 0.6) and
# 10 passes over the corpus. This rebinds `lda_model`.
lda_model = gensim.models.LdaMulticore(
    corpus=corpus_tfidf,
    id2word=id2word,
    num_topics=100,
    random_state=100,
    chunksize=100,
    alpha=0.6,
    eta=0.6,
    passes=10,
    workers=8,
)

In [14]:
# Score whichever model `lda_model` currently refers to with 'u_mass'
# coherence and print the value.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    corpus=corpus_tfidf,
    dictionary=id2word,
    coherence='u_mass',
)
print(coherence_model_lda.get_coherence())

-2.0436493058460257


In [15]:
# Show the 50 highest-weighted terms for each of the 100 topics of the current model.
lda_model.print_topics(num_topics=100, num_words=50)

[(0,
  '0.001*"film" + 0.001*"surface" + 0.001*"structure" + 0.001*"temperature" + 0.001*"model" + 0.001*"material" + 0.001*"phase" + 0.001*"energy" + 0.001*"device" + 0.000*"reaction" + 0.000*"cell" + 0.000*"show" + 0.000*"crystal" + 0.000*"high" + 0.000*"thin" + 0.000*"metal" + 0.000*"optical" + 0.000*"interface" + 0.000*"light" + 0.000*"dynamic" + 0.000*"range" + 0.000*"ion" + 0.000*"yield" + 0.000*"large" + 0.000*"time" + 0.000*"single" + 0.000*"property" + 0.000*"growth" + 0.000*"substrate" + 0.000*"low" + 0.000*"observe" + 0.000*"density" + 0.000*"polymer" + 0.000*"condition" + 0.000*"potential" + 0.000*"bond" + 0.000*"measurement" + 0.000*"layer" + 0.000*"nanoparticle" + 0.000*"demonstrate" + 0.000*"interaction" + 0.000*"mechanism" + 0.000*"control" + 0.000*"molecular" + 0.000*"chemical" + 0.000*"oxide" + 0.000*"form" + 0.000*"laser" + 0.000*"solid" + 0.000*"charge"'),
 (1,
  '0.001*"surface" + 0.001*"film" + 0.001*"temperature" + 0.001*"high" + 0.001*"material" + 0.001*"device"

In [None]:
# Persist whichever model `lda_model` currently refers to.
# NOTE(review): the '66' suffix presumably encodes alpha = eta = 0.6 — confirm.
lda_model.save('./data/lda/d100/lda100_66.model')