In [3]:
from tqdm.notebook import tqdm
import visualize as vis
import glob
import pandas as pd
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from utils import unpickler, pickler

import utils as ut

## Model Coherence

In [4]:
def extract_topic_keys(file:str) -> list:
	"""Return the keyword list of every topic stored in a topic-keys file.

	The file is read as a header-less, tab-separated table. Column 0 is used
	as the row index and column 2 holds the keywords of one topic as a single
	space-separated string.

	Parameters
	----------
	file : str
		Path to the tab-separated topic-keys file.

	Returns
	-------
	list
		One list of keyword strings per row of the file, in file order.
	"""
	table = pd.read_csv(file, header=None, delimiter='\t')
	table = table.set_index(0, drop=True)
	# Trim surrounding whitespace, then break each keyword string on single spaces
	return [keywords.strip().split(" ") for keywords in table[2]]


In [1]:
def coherence_model(name:str, corpus:str) -> dict:
	"""Compute the c_v coherence of every topic-keys file found under ``results/{name}/``.

	Parameters
	----------
	name : str
		Sub-directory of ``results/`` that is searched recursively for
		``topickeys.txt`` files.
	corpus : str
		Path handed to ``unpickler``; the loaded object is used as the
		tokenized texts (expected to be a list of lists of str).

	Returns
	-------
	dict
		Maps the number of topics in a topic-keys file to its coherence score.
		Files with the same number of topics share a key, so the one processed
		last wins.
	"""
	# Every topickeys.txt below results/<name>/, at any depth
	search_pattern = fr'results/{name}/' + r'**/topickeys.txt'
	key_files = glob.glob(search_pattern, recursive=True)

	# Tokenized texts loaded from the pickled corpus
	texts = unpickler(corpus)

	# Gensim dictionary built from the tokenized texts
	vocabulary = Dictionary(texts)
	print('The dictionary contains', len(vocabulary), 'terms')

	scores = {}
	for key_file in tqdm(key_files):
		# Topics as lists of keywords
		topics = extract_topic_keys(key_file)
		# Score this set of topics against the texts
		scorer = CoherenceModel(
			topics=topics,
			texts=texts,
			dictionary=vocabulary,
			coherence='c_v',
		)
		scores[len(topics)] = scorer.get_coherence()

	return scores

# Model Selection and Visualization

### Patents: Coherence

In [5]:
# Score every topic-keys file under results/patents/ against the patents corpus
coherence = coherence_model(name='patents', corpus='corpus/patents_corpus.pkl')
# Persist the {number of topics: coherence} mapping
pickler('app_data/data/patents_coherence.pkl', coherence)
# Build the coherence figure; it is exported to disk in the next cell
fig1 = vis.display_coherence(coherence)

The dictionary contains 9006 terms


  0%|          | 0/6 [00:00<?, ?it/s]

In [6]:
# Export the patents coherence figure as a PNG using the 'kaleido' engine
fig1.write_image("img/patents_coherence.png", format='png',engine='kaleido')

### Patents: Topic Modeling for n-Topics

In [8]:
# Select model: n is only used here to build the path of the report to load
n = 20
report = f'results/patents/lda_mallet_model_{n}/mallet_output/topic-report.xml'
# Display
# Note: n is rebound to the second value returned by ut.read_report
topic_report, n = ut.read_report(report)
fig2 = vis.display_topics(topic_report, n)
# Generate wordclouds: one call per index of topic_report
for i in range(len(topic_report)):
	vis.generate_wordcloud(topic_report,i,'patents')

In [9]:
# Export the patents topics figure as a PNG using the 'kaleido' engine
fig2.write_image("img/patents_topics.png", format='png',engine='kaleido')

In [10]:
# Save topic_report data
# topic_report is the value loaded from the patents topic-report.xml above;
# it must be pickled before a later cell rebinds the name.
pickler('app_data/data/patents_topic_report.pkl', topic_report)

## Cordis Projects

### Cordis: Coherence

In [11]:
# Score every topic-keys file under results/cordis/ against the cordis corpus
# (rebinds `coherence`, which previously held the patents scores)
coherence = coherence_model(name='cordis', corpus='corpus/cordis_corpus.pkl')
# Persist the {number of topics: coherence} mapping
pickler('app_data/data/cordis_coherence.pkl', coherence)
# Build the coherence figure; it is exported to disk in the next cell
fig3 = vis.display_coherence(coherence)

The dictionary contains 74695 terms


  0%|          | 0/6 [00:00<?, ?it/s]

In [12]:
# Export the cordis coherence figure as a PNG using the 'kaleido' engine
fig3.write_image("img/cordis_coherence.png", format='png',engine='kaleido')

### Cordis: Topic Modeling for n-Topics

In [13]:
# Select model: n is only used here to build the path of the report to load
n = 15
report = f'results/cordis/lda_mallet_model_{n}/mallet_output/topic-report.xml'
# Display
# Note: n is rebound to the second value returned by ut.read_report
topic_report, n = ut.read_report(report)
fig4 = vis.display_topics(topic_report, n)

# Generate wordclouds: one call per index of topic_report
for i in range(len(topic_report)):
	vis.generate_wordcloud(topic_report,i,'cordis')

In [14]:
# Export the cordis topics figure as a PNG using the 'kaleido' engine
fig4.write_image("img/cordis_topics.png", format='png',engine='kaleido')

In [15]:
# Save topic_report data
# topic_report is the value loaded from the cordis topic-report.xml above;
# it must be pickled before a later cell rebinds the name.
pickler('app_data/data/cordis_topic_report.pkl', topic_report)


### Semantic Scholar: Coherence

In [None]:
# Score every topic-keys file under results/semantic_scholar_subsample/
# against the subsampled Semantic Scholar corpus
# NOTE(review): the topic-report cell for this dataset reads from
# results/semantic_scholar/ (no "_subsample" suffix) — confirm both paths are intended.
coherence = coherence_model(name='semantic_scholar_subsample', corpus='corpus/semantic_scholar_subsample_corpus.pkl')
# Persist the {number of topics: coherence} mapping
pickler('app_data/data/semantic_scholar_coherence.pkl', coherence)
# Build the coherence figure; it is exported to disk in the next cell
fig5 = vis.display_coherence(coherence)

In [None]:
# Export the Semantic Scholar coherence figure as a PNG using the 'kaleido' engine
# (fig5 only exists once the coherence cell above has been run)
fig5.write_image("img/semantic_scholar_coherence.png", format='png',engine='kaleido')

### Semantic Scholar: Topic Modeling for n-Topics

In [16]:
# Select model: n is only used here to build the path of the report to load
n = 20
report = f'results/semantic_scholar/lda_mallet_model_{n}/mallet_output/topic-report.xml'
# Display
# Note: n is rebound to the second value returned by ut.read_report
topic_report, n = ut.read_report(report)
fig6 = vis.display_topics(topic_report, n)
# Generate wordclouds: one call per index of topic_report
for i in range(len(topic_report)):
	vis.generate_wordcloud(topic_report,i,'semantic_scholar')

In [17]:
# Export the Semantic Scholar topics figure as a PNG using the 'kaleido' engine
fig6.write_image("img/semantic_scholar_topics.png", format='png',engine='kaleido')

In [18]:
# Save topic_report data
# topic_report here is the value loaded from the Semantic Scholar
# topic-report.xml, so it goes to its own pickle, separate from the
# patents and cordis topic reports.
pickler('app_data/data/semantic_scholar_topic_report.pkl', topic_report)