# Step 1: Dataset preparation

In [1]:
import json
import re
import os

import matplotlib.pyplot as plt
import pickle
import pprint
import artm
from pymystem3 import Mystem
import string
import functools

%load_ext autoreload
%autoreload 2

In [2]:
%%time
DATA = 'data/ydistricts_new_data.json'
with open(DATA) as fh:
    posts = json.load(fh)
    
raw_sents = [sent for sent in list(posts['text'].values()) if sent]

CPU times: user 4.93 s, sys: 1.15 s, total: 6.07 s
Wall time: 11.4 s


In [3]:
from utils.data_process import process_text

[nltk_data] Downloading package stopwords to /home/robez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# the cell takes 2-3 minutes to run..
processed = process_text(raw_sents)

100%|██████████| 196326/196326 [02:35<00:00, 1260.25it/s]


removing short words...
done!


### Pickling preprocessed texts

In [5]:
def get_proccessed_sents_from_pickle(path="processed.pickle", raw_sents=None):
    if not os.path.exists(path) or not os.stat(path).st_size:
        with open(path, 'wb') as fh:
            processed = process_text(raw_sents)
            pickle.dump(processed, fh, protocol=pickle.HIGHEST_PROTOCOL)
    with open(path, 'rb') as fh:
        processed = pickle.load(fh)
    return processed

In [6]:
processed = get_proccessed_sents_from_pickle()
processed[0]

'маленький собачка замерзать припарковывать машина омский улица'

# Step 2: Batches preparation

In [12]:
vocab_path = "wabbit_on_cleaned_texts.txt"

def check_path(fn=None, *, PATH=vocab_path):
    
    if fn is None:
        return lambda fn: check_path(fn, PATH=PATH)

    @functools.wraps(fn)
    def func(*args, **kwargs):
        if not os.path.exists(PATH) or not os.stat(PATH).st_size:
            fn(*args, **kwargs)
        else:
            print(f'{PATH} already exists!')
    return func

In [13]:
@check_path
def vocabulary_prep(text, vocab):
     with open(vocab, 'w') as fh:
        for sent in text:
            fh.write(' |text ' + sent + '\n')

In [14]:
vocabulary_prep(processed, vocab_path)

##### Prepare batches with artm.BatchVectorizer

In [17]:
def batching(batch_path="batches/cleaned_texts", data_path=vocab_path, text_df="vowpal_wabbit", batches_df="batches", batch_size=1024):
    
    if not os.path.exists(batch_path):
        os.makedirs(batch_path)

    if not os.listdir(path=batch_path):
        batch_vectorizer = artm.BatchVectorizer(
                                            data_path=data_path,
                                            data_format=text_df, 
                                            target_folder=batch_path, 
                                            batch_size=batch_size)
    else:
        batch_vectorizer = artm.BatchVectorizer(data_path=batch_path,
                                            data_format=batches_df)
        
    return batch_vectorizer

In [18]:
batch_vectorizer = batching()

In [19]:
# creating a dictionary
dictionary = artm.Dictionary(name='dictionary')
dictionary.gather(batch_vectorizer.data_path)

In [20]:
batch_vectorizer.dictionary.save_text('vocab.txt')

In [21]:
!cat bigartm-book/junk/cooc_dictionary/launch.sh

#! /bin/bash

bigartm -c vw.txt -v vocab.txt --cooc-window 10 --cooc-min-tf 5 --write-cooc-tf cooc_tf_ --cooc-min-df 5 --write-cooc-df cooc_df_ --write-ppmi-tf ppmi_tf_ --write-ppmi-df ppmi_df_


In [22]:
# !cat bigartm-book/junk/cooc_dictionary/vw.txt

In [23]:
# !cd bigartm-book/junk/cooc_dictionary/
# !bash bigartm -c ../../../wabbit_on_cleaned_texts.txt -v vocab.txt --cooc-window 10 --cooc-min-tf 5 --write-cooc-tf cooc_tf_ --cooc-min-df 5 --write-cooc-df cooc_df_ --write-ppmi-tf ppmi_tf_ --write-ppmi-df ppmi_df_
# # Что данные ключи значат, по порядку:
# !cd -
# # -c − прочитать коллекцию;
# # -v − прочитать vocab;
# # --cooc-window − задать ширину окна (со-встречаемость токенов учитывается только внутри некоторого окна);
# # --cooc-min-tf − задать минимальное значение cooc TF (значение ниже данного порога не будут писаться в выходной файл);
# # --cooc-min-df − аналогично предыдущему, только для cooc DF;
# # --write-cooc-tf − записать файл с cooc TF по заданному пути, далее указывается путь;
# # --write-cooc-df, --write-ppmi-tf, --write-ppmi-df − аналогично для файлов cooc DF, PPMI TF, PPMI DF.

# Step 3: Topic modeling

In [24]:
from itertools import product

TOPICS = [5, 10, 25, 40]
TOKENS = 50
PASSES = [1, 3]
sparse_tau = [-5e-2, -1e-1, -1e1, -1e2, -1e3]
    
hyperparameters = product(TOPICS, PASSES)

evals = [
    artm.PerplexityScore(name='PerplexityScore', dictionary='dictionary'),
    artm.SparsityPhiScore(name='SparsityPhiScore', class_id="text"),
    artm.SparsityThetaScore(name='SparsityThetaScore')]

def scores_adder(model, evals=evals):
    for sc in evals:
        model.scores.add(sc, overwrite=True)

In [111]:
# from itertools import product

# TOPICS = [10, 40]
# TOKENS = 50
# PASSES = [1]
# sparse_tau = [-5e-2, -1e1]
    
# hyperparameters = product(TOPICS, PASSES)

# evals = [
#     artm.PerplexityScore(name='PerplexityScore', dictionary='dictionary'),
#     artm.SparsityPhiScore(name='SparsityPhiScore', class_id="text"),
#     artm.SparsityThetaScore(name='SparsityThetaScore')]

# def scores_adder(model, evals=evals):
#     for sc in evals:
#         model.scores.add(sc, overwrite=True)

https://nbviewer.jupyter.org/github/bigartm/bigartm-book/blob/master/junk/cooc_dictionary/example_of_gathering.ipynb

In [112]:
models = []
models_reg = []

def train(hypes=None, num_collection_passes=15):
    
    if not os.path.exists('models'):
        os.mkdir('models')
    
    best_phi_sparsity, best_theta_sparsity, best_model = None, None, None
    best_phi_sparsity_reg, best_theta_sparsity_reg, best_model_reg = None, None, None
    
    for top, passes in hypes:

        print(f"Params: # topics = {top}; # single doc passes = {passes}")

        # initializing the ARTM model
        model = artm.ARTM(
                num_topics=top, 
                topic_names=["topic"+str(i) for i in range(top)],
                class_ids={"text":1},
                reuse_theta=True, 
                cache_theta=True,
                num_document_passes=passes,
                seed=42)

        # adding scores
        scores_adder(model, evals)
        model.scores.add(artm.TopTokensScore(name="top_words", num_tokens=TOKENS, class_id="text"),
                        overwrite=True)

        # initialize the dictionary `dictionary` on the model `model`
        model.initialize('dictionary')

        # training
        model.fit_offline(batch_vectorizer=batch_vectorizer,
                          num_collection_passes=num_collection_passes)

        # choosing the best model by the following criteria
        cur_phi = model.score_tracker["SparsityPhiScore"].last_value
        cur_theta = model.score_tracker["SparsityThetaScore"].last_value

        print(round(cur_phi, 3), round(cur_theta, 3))
        model.dump_artm_model(f'models/model_{top}_{passes}')
        
        if (best_phi_sparsity is None and best_theta_sparsity is None) or \
            (cur_phi >= best_phi_sparsity and cur_theta >= best_theta_sparsity):

            best_phi_sparsity = cur_phi
            best_theta_sparsity = cur_theta
            best_model = model
            
        #####################################################################################################

        print("\nUsing regularizers...")
        model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=0, dictionary=dictionary),
                              overwrite=True)
        model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=0),
                              overwrite=True)

        for st in sparse_tau:
            print('- - - - - - - - - - - -')
            print(f"Tau of sparse phi/theta {st}")
            model.regularizers['SparsePhi'].tau = st
            model.regularizers["SparseTheta"].tau = st
            assert(model.regularizers['SparsePhi'].tau == st)

            model.fit_offline(batch_vectorizer=batch_vectorizer,
                              num_collection_passes=num_collection_passes)
            
            cur_phi_reg = model.score_tracker["SparsityPhiScore"].last_value
            cur_theta_reg = model.score_tracker["SparsityThetaScore"].last_value

            print(round(cur_phi_reg, 3), round(cur_theta_reg, 3))

            # make sure the matrices do not get nulled
            if cur_phi_reg == 1.0 or cur_theta_reg == 1.0:
                continue
            model.dump_artm_model(f'models/reg_model_{top}_{passes}_{st}')

            if (best_phi_sparsity_reg is None and best_theta_sparsity_reg is None) or \
                (cur_phi_reg >= best_phi_sparsity and cur_theta_reg >= best_theta_sparsity_reg):

                best_phi_sparsity_reg = cur_phi_reg
                best_theta_sparsity_reg = cur_theta_reg
                best_model_reg = model
            
        print('===========================================================================')
        
    return best_model, best_model_reg

In [113]:
# train and save every single model

%%time
train(hypes=hyperparameters)

Params: # topics = 10; # single doc passes = 1
0.223 0.0

Using regularizers...
- - - - - - - - - - - -
Tau of sparse phi/theta -0.05
0.571 0.695
- - - - - - - - - - - -
Tau of sparse phi/theta -10.0
0.723 0.958
Params: # topics = 40; # single doc passes = 1
0.433 0.001

Using regularizers...
- - - - - - - - - - - -
Tau of sparse phi/theta -0.05
0.788 0.916
- - - - - - - - - - - -
Tau of sparse phi/theta -10.0
0.895 0.991


TypeError: 'NoneType' object is not iterable

#############

In [28]:
def tokens_printer(model=None, tracker="top_words"):
    s = ''
    tokens = model.score_tracker[tracker].tokens[0]
#     print(model.score_tracker[tracker].last_tokens) #  empty dict
    for topic_name in model.topic_names:
        try:
            s += topic_name +':' + ",".join(tokens[topic_name]) + "\n"
        except KeyError:
            pass
    return s

In [29]:
def save_topics(model=None, tracker="top_words", file="topics.txt"):
    with open(file, 'w') as fh:
        fh.write(tokens_printer(model, tracker))

In [126]:
all_model_names = os.listdir('models') # list of all models (non-reg and reg)
all_models = []
if not os.path.exists('topics'):
    os.mkdir('topics')

for model_name in all_model_names:
    
    model = artm.load_artm_model(f'models/{model_name}')
    all_models.append(model)
    
    n_doc_passes = model.num_document_passes

    if not model_name.startswith('reg'):    
        topics_filename = f'topics/topics_{model.num_topics}_{n_doc_passes}passes.txt'

    else:
        phi = model.regularizers['SparsePhi'].tau
        theta = model.regularizers['SparseTheta'].tau

        topics_filename = f'topics/topics_{model.num_topics}_{n_doc_passes}passes_{phi}phi_{theta}theta.txt'
    save_topics(model, file=topics_filename)

In [None]:
all_models

#### инфер

In [132]:
import numpy as np
np.random.seed(42)

In [148]:
def infer(model=None, n_examples=25):
    r = np.random.randint(0, high=len(predict_t), size=n_examples)
    # get the most probable topic for a processed text
    # https://thispointer.com/pandas-find-maximum-values-position-in-columns-or-rows-of-a-dataframe/
    samples = dict(predict_t.iloc[r].idxmax(axis=1))
#     tokens = model.score_tracker["top_words"].last_tokens
    tokens = model.score_tracker['top_words'].tokens[0]
    for k, v in samples.items():
        s = ''
        try:
            n_topics = model.num_topics
            n_doc_passes = model.num_document_passes
            phi_sparse = model.score_tracker["SparsityPhiScore"].last_value
            theta_sparse = model.score_tracker["SparsityThetaScore"].last_value
            
            phi, theta = None, None
            if model.regularizers:
                phi = model.regularizers['SparsePhi'].tau
                theta = model.regularizers['SparseTheta'].tau
            
            s += f'Модель: {n_topics} тем; {n_doc_passes} проходов по док-у'
            try:
                s += f'; {phi:.2f} phi reg; {theta:.2f} theta reg\n'
            except:
                s += '\n'
                
            s += f'SparsePhi: {phi_sparse:.3f}; SparseTheta: {theta_sparse:.3f}\n'
            s += f'Текст: {processed[k]}\nТокены по теме: {tokens[v]}\n\n'
            return s
        except KeyError:
            print()

In [149]:
with open('examples.txt', 'w') as fh:
    for model in all_models:
        predict = model.transform(batch_vectorizer=batch_vectorizer)
        predict_t = predict.T # theta matrix
        fh.write(infer(model))

In [150]:
!cat examples.txt

Модель: 10 тем; 1 проходов по док-у
Текст: утерять документ паспорт водительский удостоверение имя самохин татьяна юрьевна находить просить вернуть вознаграждение написать лс спасибо
Токены по теме: ['это', 'сосед', 'год', 'человек', 'подсказывать', 'день', 'который', 'район', 'город', 'знать', 'наш', 'свой', 'хотеть', 'ребенок', 'очень', 'новый', 'весь', 'собака', 'мочь', 'привет', 'дом', 'хороший', 'пожалуйста', 'улица', 'время', 'работа', 'сегодня', 'добрый', 'вопрос', 'место', 'друг', 'купить', 'делать', 'находить', 'проходить', 'становиться', 'спасибо', 'магазин', 'просто', 'работать', 'помогать', 'самый', 'школа', 'сделать', 'фото', 'ваш', 'говорить', 'думать', 'вообще', 'житель']

Модель: 40 тем; 1 проходов по док-у
Текст: прием глава город начинаться благоустройство улица чехов парковый фотография
Токены по теме: ['сосед', 'район', 'дом', 'подсказывать', 'который', 'город', 'знать', 'день', 'новый', 'это', 'ребенок', 'привет', 'год', 'мочь', 'наш', 'ул', 'добрый', 'купить

In [None]:
predict = best_model.transform(batch_vectorizer=batch_vectorizer)
predict_t = predict.T # theta

In [None]:
predict_t.index.size, len(predict_t)

In [None]:
predict_t

In [None]:
r = np.random.randint(0, high=len(predict_t), size=20)
print(r)
print(predict_t.iloc[r].values.argmax(axis=1))
print(dict(predict_t.iloc[r].idxmax(axis=1)))

In [None]:
def infer(model=best_model, n_examples=25):
    r = np.random.randint(0, high=len(predict_t), size=n_examples)
    # get the most probable topic for a processed text
    # https://thispointer.com/pandas-find-maximum-values-position-in-columns-or-rows-of-a-dataframe/
    samples = dict(predict_t.iloc[r].idxmax(axis=1))
    tokens = model.score_tracker["top_words"].last_tokens
    for k, v in samples.items():
        print()
        try:
            print(f"Текст: {procced[k]}\nТокены по теме: {tokens[v]}")
            print()
        except KeyError:
            ...

In [None]:
infer()

In [None]:
infer(best_model_reg)