In [97]:
import numpy as np
import pandas as pd
import re, nltk, gensim, spacy
# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
# Plotting tools
import pyLDAvis
# import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline
import PyPDF2
import os

In [99]:
# def extract_paragraphs_from_pdf(pdf_path):
#     paragraphs = []
    
#     with open(pdf_path, "rb") as pdf_file:
#         pdf_reader = PyPDF2.PdfReader(pdf_file)
        
#         for page_num in range(len(pdf_reader.pages)):
#             page = pdf_reader.pages[page_num]
#             text = page.extract_text()
#             paragraphs.extend(text.split('\n'))
    
#     return paragraphs

# pdf_path = r"C:\Users\phili\techlabs\nlp_delft\Al_rabiah, Abdulrahman_Process_2022.pdf"
# paragraphs = extract_paragraphs_from_pdf(pdf_path)

# # Create a DataFrame
# data = {'Paragraph': paragraphs}
# df = pd.DataFrame(data)

# # Display the DataFrame
# print(df)


In [100]:
# def extract_text_from_pdf(pdf_path):
#     with open(pdf_path, 'rb') as pdf_file:
#         pdf_reader = PyPDF2.PdfReader(pdf_file)
#         text = ''
#         for page in pdf_reader.pages:
#             text += page.extract_text()
#     return text


def extract_paragraphs_from_pdf(pdf_path):
    paragraphs = []
    
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text = page.extract_text()
            paragraphs.extend(text.split('\n'))
    
    return paragraphs

In [101]:
pdf_folder = r'C:\Users\phili\techlabs\nlp_delft\paper'
pdf_files = [file for file in os.listdir(pdf_folder) if file.endswith('.pdf')]

In [102]:
pdf_texts = []
for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_folder, pdf_file)
    text = extract_paragraphs_from_pdf(pdf_path)
    df_text = pd.DataFrame(text)
    pdf_texts.append(df_text)


In [103]:
# Concatenate DataFrames vertically
concatenated_df = pd.concat(pdf_texts, ignore_index=True)

print(concatenated_df)

                                                       0
0                  Citation: Al-Rabiah, A.A.; Alkathiri,
1                            R.R.; Bagabas, A.A. Process
2                        Development for Methyl Isobutyl
3                            Ketone Production Using the
4                        Low-Pressure One-Step Gas-Phase
...                                                  ...
16878  268. Yoshihiko, H.; Katsuhiko, H. Production o...
16879  269. Zhang, J. Catalyst for Synthesizing Isoph...
16880  Disclaimer/Publisher’s Note: The statements, o...
16881  author(s) and contributor(s) and not of MDPI a...
16882  people or property resulting from any ideas, m...

[16883 rows x 1 columns]


In [104]:
# df['count'] = df['Paragraph'].str.split().str.len()
concatenated_df = concatenated_df.rename(columns={0: "Paragraph"})
concatenated_df['count'] = concatenated_df['Paragraph'].str.split().str.len()


In [105]:
concatenated_df

Unnamed: 0,Paragraph,count
0,"Citation: Al-Rabiah, A.A.; Alkathiri,",4
1,"R.R.; Bagabas, A.A. Process",4
2,Development for Methyl Isobutyl,4
3,Ketone Production Using the,4
4,Low-Pressure One-Step Gas-Phase,3
...,...,...
16878,"268. Yoshihiko, H.; Katsuhiko, H. Production o...",18
16879,"269. Zhang, J. Catalyst for Synthesizing Isoph...",13
16880,"Disclaimer/Publisher’s Note: The statements, o...",17
16881,author(s) and contributor(s) and not of MDPI a...,20


In [106]:
df = concatenated_df

Data Cleaning

In [107]:
# Convert to list
data = df.Paragraph.values.tolist()
# Remove Emails
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]
# Remove new line characters
data = [re.sub(r'\s+', ' ', sent) for sent in data]
# Remove distracting single quotes
data = [re.sub(r"\'", "", sent) for sent in data]
pprint(data[:1])

['Citation: Al-Rabiah, A.A.; Alkathiri,']


In [108]:
data

['Citation: Al-Rabiah, A.A.; Alkathiri,',
 'R.R.; Bagabas, A.A. Process',
 'Development for Methyl Isobutyl',
 'Ketone Production Using the',
 'Low-Pressure One-Step Gas-Phase',
 'Selective Hydrogenation of Acetone.',
 'Processes 2022 ,10, 1992. https://',
 'doi.org/10.3390/pr10101992',
 'Academic Editor: Jean-Claude Assaf',
 'Received: 1 September 2022',
 'Accepted: 27 September 2022',
 'Published: 2 October 2022',
 'Publisher’s Note: MDPI stays neutral',
 'with regard to jurisdictional claims in',
 'published maps and institutional afﬁl-',
 'iations.',
 'Copyright: © 2022 by the authors.',
 'Licensee MDPI, Basel, Switzerland.',
 'This article is an open access article',
 'distributed under the terms and',
 'conditions of the Creative Commons',
 'Attribution (CC BY) license (https://',
 'creativecommons.org/licenses/by/',
 '4.0/).',
 'processes',
 'Article',
 'Process Development for Methyl Isobutyl Ketone Production',
 'Using the Low-Pressure One-Step Gas-Phase Selective',
 'Hydrogen

In [109]:
len(data)

16883

Tokenize

In [110]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
data_words = list(sent_to_words(data))
print(data_words[:1])

[['citation', 'al', 'rabiah', 'alkathiri']]


Stemming

In [111]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']): #'NOUN', 'ADJ', 'VERB', 'ADV'
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

In [112]:
# Initialize spacy ‘en’ model, keeping only tagger component (for efficiency)
# Run in terminal: python -m spacy download en
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'VERB']) #select noun and verb
print(data_lemmatized[:2])

['citation alkathiri', 'process']


In [113]:
len(data_lemmatized)

16883

In [114]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,

                             stop_words='english',             

                             lowercase=True,                   

                             token_pattern='[a-zA-Z0-9]{3,}',  

                             )
data_vectorized = vectorizer.fit_transform(data_lemmatized)

In [115]:
# Build LDA Model
lda_model = LatentDirichletAllocation(n_components=20,               # Number of topics
                                      max_iter=10,               
# Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          
# Random state
                                      batch_size=128,            
# n docs in each learning iter
                                      evaluate_every = -1,       
# compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               
# Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(data_vectorized)
print(lda_model)  # Model attributes

LatentDirichletAllocation(learning_method='online', n_components=20, n_jobs=-1,
                          random_state=100)


In [116]:
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
 evaluate_every=-1, learning_decay=0.7,
 learning_method='online', learning_offset=10.0,
 max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
 n_components=10, n_jobs=-1, perp_tol=0.1,
 random_state=100, topic_word_prior=None,
 total_samples=1000000.0, verbose=0)

In [117]:
# Define Search Param
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}
# Init the Model
lda = LatentDirichletAllocation(max_iter=5, learning_method='online', learning_offset=50.,random_state=0)
# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)
# Do the Grid Search
model.fit(data_vectorized)
GridSearchCV(cv=None, error_score='raise',
       estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1,
             perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0),n_jobs=1,
       param_grid={'n_topics': [10, 15, 20, 25, 30], 'learning_decay': [0.5, 0.7, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [126]:
# Best Model
best_lda_model = model.best_estimator_
# Model Parameters
print("Best Model's Params: ", model.best_params_)
# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.9, 'n_components': 10}
Best Log Likelihood Score:  -81144.98250100095
Model Perplexity:  465.02424296580364


In [127]:
# Create Document — Topic Matrix
lda_output = best_lda_model.transform(data_vectorized)
# column names
topicnames = ['Topic' + str(i) for i in range(best_lda_model.n_components)]
# index names
docnames = ['Doc' + str(i) for i in range(len(data))]
# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic
# Styling
def color_green(val):
 color = 'green' if val > .1 else 'black'
 return 'color: {col}'.format(col=color)
def make_bold(val):
 weight = 700 if val > .1 else 400
 return 'font-weight: {weight}'.format(weight=weight)
# Apply Style
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
Doc0,0.05,0.05,0.05,0.05,0.05,0.55,0.05,0.05,0.05,0.05,5
Doc1,0.55,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0
Doc2,0.03,0.03,0.03,0.03,0.03,0.03,0.77,0.03,0.03,0.03,6
Doc3,0.03,0.03,0.03,0.27,0.03,0.03,0.52,0.03,0.03,0.03,6
Doc4,0.02,0.02,0.02,0.02,0.82,0.02,0.02,0.02,0.02,0.02,4
Doc5,0.03,0.03,0.03,0.03,0.7,0.03,0.03,0.03,0.03,0.03,4
Doc6,0.37,0.03,0.03,0.37,0.03,0.03,0.03,0.03,0.03,0.03,0
Doc7,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0
Doc8,0.05,0.55,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,1
Doc9,0.05,0.05,0.05,0.55,0.05,0.05,0.05,0.05,0.05,0.05,3


In [132]:
# Topic-Keyword Matrix
df_topic_keywords = pd.DataFrame(best_lda_model.components_)
# Assign Column and Index
df_topic_keywords.columns = vectorizer.get_feature_names_out()
df_topic_keywords.index = topicnames
# View
df_topic_keywords.head()

Unnamed: 0,absorber,absorption,accept,access,accord,account,acetaldehyde,acetate,acetone,achieve,...,weight,wind,work,world,write,writing,www,year,yield,zeolite
Topic0,0.123892,0.112921,0.155779,0.109354,2.244197,0.120924,0.110786,0.111434,0.593155,0.115855,...,0.109928,0.110761,33.057783,0.111966,0.120586,0.110672,0.182158,0.114501,0.17287,0.114097
Topic1,0.129481,6.783568,0.109126,0.109702,0.111744,27.673511,0.297472,0.122899,0.387372,0.114218,...,9.46874,0.11015,0.124912,0.112354,0.110017,0.11026,0.108493,0.123749,26.45079,0.108625
Topic2,0.112389,0.110651,0.110878,0.111143,0.116659,0.111234,0.143868,0.114358,129.71122,0.112892,...,0.116795,0.110508,0.127889,0.110264,0.110538,0.110175,0.109747,0.112402,1.428716,0.118492
Topic3,0.109398,1.059315,0.132986,0.159043,0.113867,0.119431,0.112552,1.165307,0.185772,0.118329,...,0.116548,0.108099,0.125815,0.205962,0.11138,0.109555,12.458878,0.117669,49.396785,0.11336
Topic4,13.534032,0.943739,0.109119,0.111161,0.161669,0.113165,0.11012,0.109921,105.206629,86.551566,...,0.206687,0.109671,24.150836,0.118094,0.109758,0.109379,0.120794,0.110119,12.443179,0.110918


In [135]:
df_topic_keywords['process']

Topic0    1547.553083
Topic1       0.125779
Topic2       0.117633
Topic3       0.141266
Topic4     546.991015
Topic5       0.127254
Topic6       0.710533
Topic7      90.933882
Topic8       9.321654
Topic9       0.112728
Name: process, dtype: float64

In [134]:
df_topic_keywords['production']

Topic0     98.383176
Topic1      0.124122
Topic2      0.122356
Topic3    706.075914
Topic4     92.380567
Topic5      4.563065
Topic6      0.159052
Topic7      0.192292
Topic8      0.144863
Topic9      0.115852
Name: production, dtype: float64

In [130]:
# Show top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names_out())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=15)
# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,process,figure,heat,table,section,material,simulation,condition,waste,present,review,exchanger,integrate,production,balance
Topic 1,water,product,energy,reactor,result,utility,increase,cool,stream,content,extraction,reduce,separation,lead,heating
Topic 2,stream,hydrogen,case,feed,conversion,electrolysis,recycle,reactor,number,ratio,acetone,mix,methanol,stage,oxygen
Topic 3,production,methanol,formaldehyde,ton,obtain,design,chem,ethylene,chemical,market,formalin,crossref,methane,performance,impact
Topic 4,cost,gas,process,phase,pressure,step,steam,method,equipment,condensation,butanol,cation,liquid,reaction,hydrogenation
Topic 5,acid,column,reaction,produce,distillation,isophorone,use,component,mixture,selectivity,access,patent,point,composition,anhydride
Topic 6,catalyst,use,model,rate,electricity,synthesis,oxide,consumption,concentration,equation,technology,study,membrane,equilibrium,calculate
Topic 7,methanol,analysis,value,unit,solvent,dme,fuel,solution,carbon,plant,price,energy,power,source,footprint
Topic 8,base,use,cost,capital,year,estimate,investment,aspen,operate,decrease,recovery,application,apply,follow,property
Topic 9,temperature,datum,pem,vapor,propose,remove,variable,research,form,publish,guration,sub,exergy,effect,comparison
