In [1]:
import re
import string
import pandas as pd 
from tqdm import tqdm 
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import sys
if ".." not in sys.path:
    sys.path.append("..")
    
import torch 
from skorch import callbacks
import sktopic
from sktopic.utils import manual_seed
#from sktopic.models.base import ELBO
#from sktopic.trainers import Trainer

from sktopic.models import GaussianStickBreakingModel
import nltk 
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

tqdm.pandas()

from octis.evaluation_metrics.coherence_metrics import WECoherencePairwise
from octis.evaluation_metrics.diversity_metrics import TopicDiversity

import optuna
from sktopic.callbacks import TopicQualityScoring, WECoherenceScoring
from sktopic.metrics.npmi import NormalizedPointwiseMutualInformation as NPMI

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
#from sktopic import utils
from sktopic.datasets import fetch_20NewsGroups
from sklearn.model_selection import train_test_split

# Seed before any stochastic step (the recorded run printed "Using: SEED=950331").
manual_seed()
# Load the corpus; the returned mapping is read below for "X" and "id2word".
dataset = fetch_20NewsGroups()
# Split with train_test_split's defaults; no random_state is passed here.
# NOTE(review): presumably manual_seed() seeds the global RNG the split relies on,
# otherwise the split differs on every run -- confirm.
X_tr,X_te = train_test_split(dataset["X"])
id2word = dataset["id2word"]
# Topic-quality scorers that the training cell below reuses.
coherence = WECoherencePairwise()
diversity = TopicDiversity()
# The NPMI scorer is built from the full dataset["X"] (before the split), not only X_tr.
coherence_npmi = NPMI(dataset["X"], id2word=id2word)

Using: SEED=950331


In [13]:
# manual_seed()
# corpus_train = fetch_20newsgroups(subset="train", remove=["headers","footers","quotes"])
# corpus_test = fetch_20newsgroups(subset="test", remove=["headers","footers","quotes"])

# def clean_text(sentence):
#     # remove non alphabetic sequences
#     pattern = re.compile(r'[^a-z]+')
#     sentence = sentence.lower()
#     sentence = pattern.sub(' ', sentence).strip()
#     # Tokenize
#     word_list = word_tokenize(sentence)
    
#     # stop words
#     stopwords_list = set(stopwords.words('english'))
#     # punctuation
#     punct = set(string.punctuation)
    
#     # remove stop words
#     word_list = [word for word in word_list if word not in stopwords_list]
#     # remove very small words, length < 3
#     # they don't contribute any useful information
#     word_list = [word for word in word_list if len(word) > 2]
#     # remove punctuation
#     word_list = [word for word in word_list if word not in punct]
#     # remove number 
#     word_list = [word for word in word_list if not word.isdigit()]
#     # lemmatize
#     lemma = WordNetLemmatizer()
#     word_list = [lemma.lemmatize(word) for word in word_list]

#     # remove stop words
#     word_list = [word for word in word_list if word not in stopwords_list]
#     # list to sentence
#     sentence = ' '.join(word_list)
#     return sentence

# df_train = pd.DataFrame({'News': corpus_train.data,
#                        'Target': corpus_train.target})
# df_train['News'] = df_train['News'].apply(lambda x: clean_text(str(x)))

# df_test = pd.DataFrame({'News': corpus_test.data,
#                        'Target': corpus_test.target})
# df_test['News'] = df_test['News'].apply(lambda x: clean_text(str(x)))

# vectorizer = CountVectorizer(dtype=np.float32,lowercase=True, max_features=2000, max_df=0.5, min_df=10,stop_words="english")
# X_tr = vectorizer.fit_transform(df_train['News'].to_list())
# X_te = vectorizer.transform(df_test['News'].to_list())

# id2word = {k:v for k,v in enumerate(vectorizer.get_feature_names())}

# mask_tr = np.array(X_tr.sum(1) > 0).flatten()
# mask_te = np.array(X_te.sum(1) > 0).flatten()
# X_tr = X_tr[mask_tr]
# X_te = X_te[mask_te]

# print(f"{X_tr.shape=}")
# print(f"{X_te.shape=}")

# #coherence = WECoherencePairwise()
# diversity = TopicDiversity()
# coherence_npmi = NPMI(X_tr, id2word=id2word)

In [14]:
# Train a ProdLDA topic model on the training split, then score its topics with
# word-embedding coherence (wetc), topic diversity (td), NPMI, and tq = npmi * td.
import warnings

from torch import nn
from sparsemax import Sparsemax
from typing import Sequence, Optional
from sktopic.components.mmd_loss import MMDLoss
from sktopic.models import ProductOfExpertsLatentDirichletAllocation as ProdLDA
import sktopic.models as mm

manual_seed()
V = X_tr.shape[1]  # number of columns of the training matrix, passed as vocab_size
K = 20  # number of topics, passed as n_components

optimizer_cls = torch.optim.Adam#ASGD

model = mm.ProductOfExpertsLatentDirichletAllocation(
    vocab_size=V, n_components=K,
    optimizer=optimizer_cls,
    batch_size=1000, lr=0.001, max_epochs=10,
    device="cpu", verbose=1,
    callbacks=[
        # Adds the word-embedding coherence column to the per-epoch log.
        WECoherenceScoring(id2word, coherence_object=coherence),
        callbacks.EarlyStopping(patience=5),
        # NOTE(review): LRScheduler() is used with its defaults. The recorded log
        # shows lr starting at 0.0500 and decaying, so lr=0.001 above appears to be
        # overridden by the scheduler's default policy -- confirm this is intended.
        callbacks.LRScheduler(),
        callbacks.GradientNormClipping(gradient_clip_value=1.0)
        ],
    #criterion=MMDLoss,
    activation_hidden="Tanh",
    )
#mu, sigma = model.module.encoder(torch.from_numpy(X_te.toarray()))
#(sigma == 0.0).sum()


model.fit(X_tr)
model_output = model.get_model_outputs(X_tr,id2word=id2word)

try:
    wetc = coherence.score(model_output)
except Exception as exc:
    # Coherence is best-effort, so keep the 0.0 fallback -- but report the failure
    # instead of hiding it, and let KeyboardInterrupt/SystemExit propagate
    # (the previous bare `except:` swallowed those too).
    warnings.warn(f"WECoherencePairwise scoring failed ({exc!r}); using wetc=0.0")
    wetc = 0.0
td = diversity.score(model_output)
npmi = coherence_npmi(model_output["topics"])
tq = npmi * td  # topic quality = coherence (NPMI) x diversity

print(f"{tq=},{wetc=},{td=},{npmi=}")

Using: SEED=950331


  epoch    train_loss    train_ppl    valid_loss    valid_ppl    wetc_pw      lr     dur
-------  ------------  -----------  ------------  -----------  ---------  ------  ------
      1     [36m3809.7927[0m    [32m1488.4971[0m     [35m2345.2207[0m    [31m1159.8865[0m     [94m0.0887[0m  0.0500  0.8452
      2     [36m1857.0911[0m    [32m1181.4513[0m      [35m728.4693[0m    [31m1088.0944[0m     [94m0.1029[0m  0.0488  0.8595
      3      [36m967.9099[0m    [32m1127.3795[0m      [35m531.0765[0m    [31m1078.3788[0m     [94m0.1138[0m  0.0452  0.8666
      4      [36m560.6776[0m    [32m1108.0131[0m      [35m403.6926[0m    [31m1075.1239[0m     0.1072  0.0397  0.8662
      5      [36m415.8797[0m    [32m1098.7426[0m      [35m364.6209[0m    [31m1073.7463[0m     0.0990  0.0327  0.8547
      6      [36m364.9222[0m    [32m1095.9938[0m      [35m347.7520[0m    [31m1072.9577[0m     0.0965  0.0250  0.8541
      7      [36m346.9776[0m    [32m1093.0

100%|██████████| 2250/2250.0 [00:04<00:00, 523.12it/s]

tq=0.10285870801610035,wetc=0.092073,td=0.738,npmi=0.1393749431112471





In [None]:
#tq=0.10514308448896476,wetc=0.08344203,td=0.886,npmi=0.11867165292208211

In [9]:
# Transform the held-out split with the fitted model.
# NOTE(review): the recorded run of this cell raised ValueError because the `loc`
# parameter of a Normal distribution was all NaN. Its execution count (In [9]) is
# lower than the training cell's (In [14]), so the failure may come from an earlier
# `model` -- re-run after a fresh "Restart & Run All" to confirm.
model.transform(X_te)

ValueError: Expected parameter loc (Tensor of shape (100, 20)) of distribution Normal(loc: torch.Size([100, 20]), scale: torch.Size([100, 20])) to satisfy the constraint Real(), but found invalid values:
tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan]], grad_fn=<AddmmBackward0>)

In [None]:
from skorch.utils import to_tensor

In [None]:
# Tabulate the training history and show its last rows.
training_log = pd.DataFrame(model.history_)
training_log.tail()

Unnamed: 0,batches,epoch,train_batch_count,valid_batch_count,dur,valid_ppl,valid_ppl_best,train_ppl,train_ppl_best,train_loss,train_loss_best,valid_loss,valid_loss_best,event_lr
5,"[{'train_nll': 296.5760803222656, 'train_mmd':...",6,98,25,1.117952,1135.168092,True,1177.813497,True,331.976386,True,323.198382,True,0.025001
6,"[{'train_nll': 297.05474853515625, 'train_mmd'...",7,98,25,1.073643,1129.446305,True,1169.838853,True,331.719853,True,322.99052,True,0.017275
7,"[{'train_nll': 296.5000915527344, 'train_mmd':...",8,98,25,1.053726,1126.280294,True,1163.559574,True,331.364576,True,322.76826,True,0.010306
8,"[{'train_nll': 296.1229248046875, 'train_mmd':...",9,98,25,1.056663,1124.962236,True,1159.023742,True,331.145067,True,322.793836,False,0.004775
9,"[{'train_nll': 295.4415283203125, 'train_mmd':...",10,98,25,1.056243,1124.623689,True,1159.828309,False,331.22028,False,322.803825,False,0.001225


In [None]:
# Top words per topic, transposed so that each row is one topic.
top_words_by_topic = model.get_topic_top_words(id2word)
df = top_words_by_topic.T
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic_0,afford,sin,combine,trial,trade,universe,talk,science,critical,punishment
Topic_1,sleep,set,significantly,alive,hole,politic,affect,decent,insist,flow
Topic_2,side,option,graphic,style,heat,doesn,parallel,accident,deserve,minimum
Topic_3,simply,leg,god,kick,staff,surrender,ability,guy,ride,request
Topic_4,wheel,brake,yesterday,public,eat,acceptable,guess,box,campaign,doesn
Topic_5,location,north,primary,examine,principle,truck,debate,student,couple,suspect
Topic_6,decide,main,format,fine,draft,previously,oppose,argue,peace,automatically
Topic_7,threaten,movie,case,exercise,arm,church,implementation,suggestion,intent,method
Topic_8,apply,street,insert,component,role,possibly,recent,instruction,bother,restrict
Topic_9,surface,email,brain,pop,manufacture,refer,historical,reverse,observe,innocent


In [None]:
#!pip install googletrans==4.0.0-rc1

In [None]:
from googletrans import Translator
from time import sleep

In [None]:
# Translate each topic's top words from English to Japanese and print one line per topic.
trans = Translator()
topic_rows = df.to_numpy().tolist()
for ix, words in enumerate(topic_rows):
    # Send all of a topic's words as a single comma-separated request.
    ans = trans.translate(", ".join(words), src="en", dest="ja")
    print(f"{ix}>>", ans.text)
    #sleep(2)

0>> 余裕、罪、コンバイン、トライアル、貿易、宇宙、話、科学、批判的、罰
1>> 睡眠、設定、大幅に、生きている、穴、政治的、影響、まともな、主張、流れ
2>> 側面、オプション、グラフィック、スタイル、熱、しない、並列、事故、値、最小
3>> 単に足、神、キック、スタッフ、降伏、能力、男、乗る、要求
4>> ホイール、ブレーキ、昨日、一般、食べる、許容できる、推測、箱、キャンペーン、行い
5>> 場所、北、一次、検査、原理、トラック、議論、学生、カップル、疑わしい
6>> 決定、メイン、フォーマット、ファイン、ドラフト、以前は反対、議論、平和、自動的に
7>> 脅迫、映画、ケース、運動、腕、教会、実装、提案、意図、方法
8>> 適用、ストリート、インサート、コンポーネント、役割、おそらく最近の、命令、やはり、制限
9>> 表面、電子メール、脳、ポップ、製造、参照、歴史的、逆、観察、無邪気
10>> 広く、マウス、オリジナル、高度、有能、ヨーロッパ、出版、平和、一貫性、貢献
11>> 存在、レビュー、息子、プロの、マザーボード、反応、MEG、修正、無邪気、遅い
12>> 王、エネルギー、オリジナル、描画、座り、ピッチ、生成、警官、具体的には半分
13>> キャッチ、行動、結果、注意、マウント、スキャン、廃棄物、車両、貧弱な、同意する
14>> もともと、国、ベンダー、死、連邦、国境、サイズ、メジャー、週、wear
15>> クリア、チェック、話、管理、変更、共通、深い、キック、千、検査
16>> 時計、ブーツ、ユニバース、トップ、ガイド、キーボード、衛星、危害、翼、家
17>> タクシー、支払い、強く、追跡、実証、軽減、家、サービス、休息、爆弾
18>> 大学、クラップ、ネットワーク、行動、関連、手、バラエティ、構成、心配、イメージ
19>> サイト、ビット、チェック、段落、同意、賛成、うそ、アイテム、カット、分離
