# **Data Acquisition**

load data in to the workspace
- AG News 
- COVID-19 Twitter

In [1]:
from IPython.display import clear_output
import gzip
import shutil
import os
import wget
import csv
import linecache
from shutil import copyfile
import ipywidgets as widgets

import pandas as pd
import numpy as np
import gensim
import pythainlp
from datasets import load_dataset

In [5]:
!pip install twarc 
!pip install tweepy 
!pip install argparse 
!pip install xtract 
!pip install wget
clear_output()

### **AG News**

In [6]:
ag_news = load_dataset("ag_news")
clear_output()

In [7]:
train_ag = ag_news['train']['text']
test_ag = ag_news['test']['text']

### **COVID-19 Twitter**

In [29]:
twitter = pd.read_csv("full_dataset-clean.tsv")

In [30]:
twitter.tail()

Unnamed: 0,tweet_id	date	time
7479935,1241575699707805698\t2020-3-22\t4:1:28
7479936,1241575699791699969\t2020-3-22\t4:1:28
7479937,1241575699921674240\t2020-3-22\t4:1:28
7479938,1241575703155363840\t2020-3-22\t4:1:29
7479939,1241575703247835136\t2020-3-22\t4:1:29


In [48]:
dataset_URL = "https://github.com/thepanacealab/covid19_twitter/blob/master/dailies/2022-03-19/2022-03-19-dataset.tsv.gz?raw=true"

#Downloads the dataset (compressed in a GZ format)
#!wget dataset_URL -O clean-dataset.tsv.gz
wget.download(dataset_URL, out='clean-dataset.tsv.gz')

#Unzips the dataset and gets the TSV dataset
with gzip.open('clean-dataset.tsv.gz', 'rb') as f_in:
    with open('clean-dataset.tsv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

#Deletes the compressed GZ file
os.unlink("clean-dataset.tsv.gz")

100% [..........................................................................] 2491959 / 2491959

In [53]:
df = pd.read_csv('clean-dataset.tsv', sep = "\t")
lang_list = df.lang.unique()
lang_list= sorted(np.append(lang_list,'all'))
lang_picker = widgets.Dropdown(options=lang_list, value="all")
lang_picker

Dropdown(options=('all', 'am', 'ar', 'bg', 'bn', 'ca', 'ckb', 'cs', 'cy', 'da', 'de', 'dv', 'el', 'en', 'es', …

In [54]:
#Creates a new clean dataset with the specified language (if specified)
filtered_language = lang_picker.value

#If no language specified, it will get all records from the dataset
if filtered_language == "":
  copyfile('clean-dataset.tsv', 'clean-dataset-filtered.tsv')

#If language specified, it will create another tsv file with the filtered records
else:
  filtered_tw = list()
  current_line = 1
  with open("clean-dataset.tsv") as tsvfile:
    tsvreader = csv.reader(tsvfile, delimiter="\t")

    if current_line == 1:
      filtered_tw.append(linecache.getline("clean-dataset.tsv", current_line))

      for line in tsvreader:
        if line[3] == filtered_language:
          filtered_tw.append(linecache.getline("clean-dataset.tsv", current_line))
        current_line += 1

  print('\033[1mShowing first 5 tweets from the filtered dataset\033[0m')
  print(filtered_tw[1:(6 if len(filtered_tw) > 6 else len(filtered_tw))])

  with open('clean-dataset-filtered.tsv', 'w') as f_output:
      for item in filtered_tw:
          f_output.write(item)

[1mShowing first 5 tweets from the filtered dataset[0m
['1505031523711344643\t2022-03-19\t04:00:52\ten\tNULL\n', '1505031527251394561\t2022-03-19\t04:00:52\ten\tNULL\n', '1505031528551948293\t2022-03-19\t04:00:53\ten\tNULL\n', '1505031529117982724\t2022-03-19\t04:00:53\ten\tNULL\n', '1505031530783121408\t2022-03-19\t04:00:53\ten\tNULL\n']


In [55]:
import json
import tweepy
from tweepy import OAuthHandler

# Authenticate
CONSUMER_KEY = "" #@param {type:"string"}
CONSUMER_SECRET_KEY = "" #@param {type:"string"}
ACCESS_TOKEN_KEY = "" #@param {type:"string"}
ACCESS_TOKEN_SECRET_KEY = "" #@param {type:"string"}

#Creates a JSON Files with the API credentials
with open('api_keys.json', 'w') as outfile:
    json.dump({
    "consumer_key":CONSUMER_KEY,
    "consumer_secret":CONSUMER_SECRET_KEY,
    "access_token":ACCESS_TOKEN_KEY,
    "access_token_secret": ACCESS_TOKEN_SECRET_KEY
     }, outfile)

# **Data Preparation**

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
import re

In [9]:
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

In [10]:
stop_words = set(stopwords.words('english'))

In [11]:
tokenizer = RegexpTokenizer(r'\w+')

In [12]:
ps = PorterStemmer()
lcst = LancasterStemmer()

In [13]:
def remove_special(sentence, output_option = "list"):
    # special character removal
#     pat = r'[^a-zA-z0-9.,!?/:;\"\'\s]' 
#     cleanr = re.compile('<.*?>')
#     cleantext = re.sub(pat, '', sentence)
    cleantext = sentence.lower()
    
    # tokenziation
    cleantext = tokenizer.tokenize(cleantext)
    
    # stop word removal
    cleantext = [w for w in cleantext if not w.lower() in stop_words]
    
    # remove number but not word that contains number
    cleantext = [w for w in cleantext if not w.isnumeric()]
    
    # remove 1 length word
    cleantext = [w for w in cleantext if len(w) > 1]
    # Stemmer or Lemmer
#     stem_words=[stemmer.stem(w) for w in filtered_words]
#     lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]
    if output_option == "list":
        return cleantext
    elif output_option == "text":
        return " ".join(cleantext)
    # return cleantext

In [14]:
remove_special(train_ag[10], "text")

'oil economy cloud stocks outlook new york reuters soaring crude prices plus worries economy outlook earnings expected hang stock market next week depth summer doldrums'

In [15]:
preprocess_ag = list(map(remove_special, train_ag))

In [43]:
# preprocess_ag_text = list(map(remove_special, train_ag))

In [17]:
preprocess_ag

[['wall',
  'st',
  'bears',
  'claw',
  'back',
  'black',
  'reuters',
  'reuters',
  'short',
  'sellers',
  'wall',
  'street',
  'dwindling',
  'band',
  'ultra',
  'cynics',
  'seeing',
  'green'],
 ['carlyle',
  'looks',
  'toward',
  'commercial',
  'aerospace',
  'reuters',
  'reuters',
  'private',
  'investment',
  'firm',
  'carlyle',
  'group',
  'reputation',
  'making',
  'well',
  'timed',
  'occasionally',
  'controversial',
  'plays',
  'defense',
  'industry',
  'quietly',
  'placed',
  'bets',
  'another',
  'part',
  'market'],
 ['oil',
  'economy',
  'cloud',
  'stocks',
  'outlook',
  'reuters',
  'reuters',
  'soaring',
  'crude',
  'prices',
  'plus',
  'worries',
  'economy',
  'outlook',
  'earnings',
  'expected',
  'hang',
  'stock',
  'market',
  'next',
  'week',
  'depth',
  'summer',
  'doldrums'],
 ['iraq',
  'halts',
  'oil',
  'exports',
  'main',
  'southern',
  'pipeline',
  'reuters',
  'reuters',
  'authorities',
  'halted',
  'oil',
  'export',


## **Bag-of-Word (Gensim)**

In [18]:
from gensim.corpora import Dictionary

In [19]:
dictionary = Dictionary(preprocess_ag)
clear_output()

In [20]:
print(dictionary)

Dictionary(63540 unique tokens: ['back', 'band', 'bears', 'black', 'claw']...)


In [23]:
corpus = [dictionary.doc2bow(doc) for doc in preprocess_ag] # convert each documents in preprocessed_ag to bag of word

In [24]:
corpus[:10]

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 2),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 2)],
 [(8, 2),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 2),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1)],
 [(8, 2),
  (29, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 2),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 2),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1)],
 [(8, 2),
  (49, 3),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 2),
  (69, 2),
  (70, 1),
  (71, 1),
  (72, 2),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 2),
  (78, 1)],
 [(45, 1),
  (49, 2),
  (52, 2),
  (79, 2),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 2

## **CountVectorizer sklearn**

In [51]:
vectorizer_test = CountVectorizer(tokenizer= remove_special)

In [65]:
vectorizer = CountVectorizer(preprocessor=None)
countvec_ag = vectorizer.fit_transform(preprocess_ag)
# vectorizer.get_feature_names_out()

AttributeError: 'list' object has no attribute 'lower'

In [61]:
vectorizer_test.vocabulary_

{'wall': 61092,
 'st': 53420,
 'bears': 6058,
 'claw': 11252,
 'back': 5173,
 'black': 7001,
 'reuters': 47449,
 'short': 51061,
 'sellers': 50190,
 'street': 54214,
 'dwindling': 17563,
 'band': 5495,
 'ultra': 58686,
 'cynics': 14207,
 'seeing': 50097,
 'green': 24161,
 'carlyle': 9655,
 'looks': 33252,
 'toward': 57510,
 'commercial': 11986,
 'aerospace': 2079,
 'private': 44083,
 'investment': 28913,
 'firm': 21047,
 'group': 24375,
 'reputation': 47084,
 'making': 34035,
 'well': 61618,
 'timed': 56982,
 'occasionally': 39496,
 'controversial': 12828,
 'plays': 42877,
 'defense': 14949,
 'industry': 28020,
 'quietly': 45145,
 'placed': 42742,
 'bets': 6592,
 'another': 3507,
 'part': 41378,
 'market': 34479,
 'oil': 39709,
 'economy': 17826,
 'cloud': 11447,
 'stocks': 53997,
 'outlook': 40495,
 'soaring': 52344,
 'crude': 13799,
 'prices': 43987,
 'plus': 43015,
 'worries': 62497,
 'earnings': 17661,
 'expected': 19718,
 'hang': 25058,
 'stock': 53979,
 'next': 38509,
 'week': 61

In [23]:
countvec_ag

NameError: name 'countvec_ag' is not defined

## **TF-IDF**

In [16]:
from gensim.models import TfidfModel

In [21]:
tfidf = TfidfModel(corpus)

collecting document frequencies
PROGRESS: processing document #0
PROGRESS: processing document #10000
PROGRESS: processing document #20000
PROGRESS: processing document #30000
PROGRESS: processing document #40000
PROGRESS: processing document #50000
PROGRESS: processing document #60000
PROGRESS: processing document #70000
PROGRESS: processing document #80000
PROGRESS: processing document #90000
PROGRESS: processing document #100000
PROGRESS: processing document #110000
TfidfModel lifecycle event {'msg': 'calculated IDF weights for 120000 documents and 63540 features (2657542 matrix non-zeros)', 'datetime': '2022-03-28T12:36:27.328966', 'gensim': '4.1.2', 'python': '3.8.12 (default, Oct 12 2021, 03:01:40) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22563-SP0', 'event': 'initialize'}


In [22]:
corpus_tfidf = tfidf[corpus]

## **WORD2VEC**

In [78]:
from gensim.test.utils import common_texts

In [77]:
from gensim.models import Word2Vec

In [None]:
word2vec_model = Word2Vec()

In [80]:
model.train([["hello", "world"]], total_examples=1, epochs=1)

AttributeError: 'LdaModel' object has no attribute 'train'

## **Uncategorized Code**

In [25]:
from pythainlp.word_vector import WordVector

loading projection weights from C:\Users\pond\pythainlp-data\thai2vec.bin
KeyedVectors lifecycle event {'msg': 'loaded (51358, 300) matrix of type float32 from C:\\Users\\pond\\pythainlp-data\\thai2vec.bin', 'binary': True, 'encoding': 'utf8', 'datetime': '2022-03-25T19:16:31.125386', 'gensim': '4.1.2', 'python': '3.8.12 (default, Oct 12 2021, 03:01:40) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22543-SP0', 'event': 'load_word2vec_format'}


In [26]:
from gensim.models import Word2Vec

In [27]:
wv = WordVector()

loading projection weights from C:\Users\pond\pythainlp-data\thai2vec.bin
KeyedVectors lifecycle event {'msg': 'loaded (51358, 300) matrix of type float32 from C:\\Users\\pond\\pythainlp-data\\thai2vec.bin', 'binary': True, 'encoding': 'utf8', 'datetime': '2022-03-25T19:16:33.251466', 'gensim': '4.1.2', 'python': '3.8.12 (default, Oct 12 2021, 03:01:40) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22543-SP0', 'event': 'load_word2vec_format'}


In [28]:
sentence = 'ฉันรักประเทศไทย'
sentence_vector = wv.sentence_vectorizer(sentence)
print(sentence_vector)

[[ 1.71394671e-01 -2.43284330e-01 -1.41679967e-02  2.96576001e-01
  -2.22379004e-01 -7.44023304e-02 -5.33566376e-03  9.41766674e-03
  -1.74809662e-01 -1.44934667e-01 -3.31633329e-01 -1.30679997e-01
   1.81364005e-01  1.67169669e-01 -2.02283661e-01 -1.57379980e-02
  -7.14666670e-02  2.12028998e-01  4.89439977e-02 -2.99740005e-02
  -1.26108664e-01  2.74677332e-01  9.74936659e-02  6.31487002e-01
  -3.15139999e-01  4.49893996e-01  1.27577665e-01 -1.58133171e-03
  -2.14869662e-01 -5.12753278e-02 -1.92380051e-02 -2.24013329e-01
   2.40447673e-01 -2.18494669e-01 -1.98223218e-02 -2.72138665e-01
  -1.06574662e-01  1.65506682e-02  1.22765000e-01  8.32156638e-02
   1.25510022e-02  1.22485672e-01 -3.44349996e-02 -6.50096685e-02
  -3.66518664e-01 -3.11753343e-01  3.20426704e-02  2.93954653e-01
   3.07067662e-01  3.33423336e-01  1.35850003e-02  8.08779954e-02
   1.23463670e-01 -1.24875989e-01 -1.04736676e-01 -5.93169990e-02
   5.11276013e-01 -3.85653277e-02  1.80944003e-01 -1.93367337e-01
   9.31639

In [17]:
porter = PorterStemmer()
print(porter.stem("connection"))

connect


In [18]:
from nltk.tokenize import word_tokenize

In [26]:
sentence = "I love Thailand"
print(word_tokenize(sentence))

['I', 'love', 'Thailand']


In [23]:
import nltk

In [24]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [None]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

Creating C:\Users\pond/gensim-data




In [28]:
import pandas as pd

In [38]:
test = {
    "This is": [1],
    "is a ": [1],
    "a sentence": [1],
    "This": [1],
    "is": [1],
    "a": [1],
    "sentence": [1]
}

In [39]:
pd.DataFrame(test, index=["document"])

Unnamed: 0,This is,is a,a sentence,This,is,a,sentence
document,1,1,1,1,1,1,1


# **Modeling**

## **Latent Dirichlet Allocation**

In [25]:
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
clear_output()

In [27]:
from gensim.models import LdaModel

# Set training parameters.
num_topics = 10 # number of topic want to extract from the corpus
chunksize = 2000 # 
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus, # corpus, the set of documents
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations, # number of iteration
    num_topics=num_topics, # k topics to extract
)
clear_output()

In [28]:
top_topics = model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

CorpusAccumulator accumulated stats from 1000 documents
CorpusAccumulator accumulated stats from 2000 documents
CorpusAccumulator accumulated stats from 3000 documents
CorpusAccumulator accumulated stats from 4000 documents
CorpusAccumulator accumulated stats from 5000 documents
CorpusAccumulator accumulated stats from 6000 documents
CorpusAccumulator accumulated stats from 7000 documents
CorpusAccumulator accumulated stats from 8000 documents
CorpusAccumulator accumulated stats from 9000 documents
CorpusAccumulator accumulated stats from 10000 documents
CorpusAccumulator accumulated stats from 11000 documents
CorpusAccumulator accumulated stats from 12000 documents
CorpusAccumulator accumulated stats from 13000 documents
CorpusAccumulator accumulated stats from 14000 documents
CorpusAccumulator accumulated stats from 15000 documents
CorpusAccumulator accumulated stats from 16000 documents
CorpusAccumulator accumulated stats from 17000 documents
CorpusAccumulator accumulated stats from

Average topic coherence: -5.0547.
[([(0.026744228, 'gt'),
   (0.026690995, 'lt'),
   (0.013989052, 'reuters'),
   (0.013848806, 'company'),
   (0.013833823, 'said'),
   (0.01213395, 'inc'),
   (0.010857178, 'new'),
   (0.009798156, 'million'),
   (0.0097926045, 'corp'),
   (0.009592543, 'microsoft'),
   (0.008994899, 'com'),
   (0.00870745, 'software'),
   (0.008641773, 'billion'),
   (0.008235959, 'deal'),
   (0.007094845, 'fullquote'),
   (0.006827113, 'business'),
   (0.0064928657, 'sales'),
   (0.006482671, 'year'),
   (0.0055644177, 'target'),
   (0.005401946, 'buy')],
  -2.6898171940113476),
 ([(0.023934258, 'oil'),
   (0.01949743, 'reuters'),
   (0.014882142, 'us'),
   (0.013683643, 'prices'),
   (0.011415926, 'dollar'),
   (0.01103332, 'new'),
   (0.0086157005, 'stocks'),
   (0.007903513, 'friday'),
   (0.007734053, 'japan'),
   (0.0070846006, 'york'),
   (0.0063824137, 'market'),
   (0.0061792247, 'nuclear'),
   (0.006160749, 'record'),
   (0.006081711, 'percent'),
   (0.00599

In [29]:
model.print_topics()

topic #0 (0.313): 0.018*"new" + 0.013*"mobile" + 0.011*"wireless" + 0.011*"internet" + 0.010*"oracle" + 0.010*"phone" + 0.010*"peoplesoft" + 0.009*"december" + 0.009*"security" + 0.009*"sony"
topic #1 (0.383): 0.019*"game" + 0.014*"season" + 0.014*"ap" + 0.012*"night" + 0.011*"new" + 0.008*"red" + 0.008*"two" + 0.007*"sports" + 0.007*"first" + 0.007*"nasa"
topic #2 (0.330): 0.015*"world" + 0.015*"first" + 0.009*"one" + 0.009*"test" + 0.008*"time" + 0.007*"second" + 0.007*"win" + 0.007*"final" + 0.007*"christmas" + 0.007*"top"
topic #3 (0.277): 0.021*"people" + 0.013*"least" + 0.011*"drug" + 0.010*"desktop" + 0.009*"said" + 0.009*"bomb" + 0.008*"killed" + 0.008*"police" + 0.007*"two" + 0.007*"sun"
topic #4 (0.617): 0.027*"gt" + 0.027*"lt" + 0.014*"reuters" + 0.014*"company" + 0.014*"said" + 0.012*"inc" + 0.011*"new" + 0.010*"million" + 0.010*"corp" + 0.010*"microsoft"
topic #5 (0.375): 0.013*"said" + 0.011*"reuters" + 0.011*"united" + 0.010*"palestinian" + 0.008*"un" + 0.008*"talks" + 0

[(0,
  '0.018*"new" + 0.013*"mobile" + 0.011*"wireless" + 0.011*"internet" + 0.010*"oracle" + 0.010*"phone" + 0.010*"peoplesoft" + 0.009*"december" + 0.009*"security" + 0.009*"sony"'),
 (1,
  '0.019*"game" + 0.014*"season" + 0.014*"ap" + 0.012*"night" + 0.011*"new" + 0.008*"red" + 0.008*"two" + 0.007*"sports" + 0.007*"first" + 0.007*"nasa"'),
 (2,
  '0.015*"world" + 0.015*"first" + 0.009*"one" + 0.009*"test" + 0.008*"time" + 0.007*"second" + 0.007*"win" + 0.007*"final" + 0.007*"christmas" + 0.007*"top"'),
 (3,
  '0.021*"people" + 0.013*"least" + 0.011*"drug" + 0.010*"desktop" + 0.009*"said" + 0.009*"bomb" + 0.008*"killed" + 0.008*"police" + 0.007*"two" + 0.007*"sun"'),
 (4,
  '0.027*"gt" + 0.027*"lt" + 0.014*"reuters" + 0.014*"company" + 0.014*"said" + 0.012*"inc" + 0.011*"new" + 0.010*"million" + 0.010*"corp" + 0.010*"microsoft"'),
 (5,
  '0.013*"said" + 0.011*"reuters" + 0.011*"united" + 0.010*"palestinian" + 0.008*"un" + 0.008*"talks" + 0.007*"china" + 0.007*"apple" + 0.007*"peace" 

### **LDA visualization**

In [30]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

In [31]:
lda_viz = gensimvis.prepare(model, corpus, dictionary)
clear_output()

In [32]:
lda_viz

## **Embedded Topic Model**

In [75]:
pip install embedded_topic_model

Collecting embedded_topic_model
  Using cached embedded_topic_model-1.0.2-py3-none-any.whl (17 kB)
Collecting numpy==1.19.5
  Using cached numpy-1.19.5-cp38-cp38-win_amd64.whl (13.3 MB)
Note: you may need to restart the kernel to use updated packages.
Collecting scipy==1.5.2

ERROR: Cannot install embedded-topic-model==0.1.0, embedded-topic-model==0.1.1, embedded-topic-model==1.0.0, embedded-topic-model==1.0.1 and embedded-topic-model==1.0.2 because these package versions have conflicting dependencies.
ERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/user_guide/#fixing-conflicting-dependencies



  Downloading scipy-1.5.2-cp38-cp38-win_amd64.whl (31.4 MB)
Collecting gensim==3.8.3
  Downloading gensim-3.8.3-cp38-cp38-win_amd64.whl (24.2 MB)
Collecting scikit-learn==0.23.2
  Downloading scikit_learn-0.23.2-cp38-cp38-win_amd64.whl (6.8 MB)
Collecting nltk==3.5
  Downloading nltk-3.5.zip (1.4 MB)
Collecting embedded_topic_model
  Using cached embedded_topic_model-1.0.1-py3-none-any.whl (17 kB)
  Using cached embedded_topic_model-1.0.0-py3-none-any.whl (17 kB)
  Using cached embedded_topic_model-0.1.1-py3-none-any.whl (17 kB)
  Using cached embedded_topic_model-0.1.0-py3-none-any.whl (17 kB)
Collecting numpy==1.20.0
  Downloading numpy-1.20.0-cp38-cp38-win_amd64.whl (13.7 MB)

The conflict is caused by:
    embedded-topic-model 1.0.2 depends on torch==1.6.0
    embedded-topic-model 1.0.1 depends on torch==1.6.0
    embedded-topic-model 1.0.0 depends on torch==1.6.0
    embedded-topic-model 0.1.1 depends on torch==1.6.0
    embedded-topic-model 0.1.0 depends on torch==1.6.0

To fix 

In [76]:
from embedded_topic_model.utils import preprocessing
import json

# Loading a dataset in JSON format. As said, documents must be composed by string sentences
corpus_file = 'datasets/example_dataset.json'
documents_raw = json.load(open(dataset, 'r'))
documents = [document['body'] for document in documents_raw]

# Preprocessing the dataset
vocabulary, train_dataset, _, = preprocessing.create_etm_datasets(
    documents, 
    min_df=0.01, 
    max_df=0.75, 
    train_size=0.85, 
)

ModuleNotFoundError: No module named 'embedded_topic_model'

## **Evaluation**

In [25]:
from gensim.models.coherencemodel import CoherenceModel

In [39]:
cm = CoherenceModel(model=model, corpus=corpus, dictionary=dictionary, coherence='u_mass')
coherence = cm.get_coherence()
clear_output()

In [40]:
coherence

-4.152167124089409