In [123]:
import re
import nltk
import unicodedata
import contractions
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words
from collections import defaultdict
from scipy import spatial


from sklearn.feature_extraction.text import CountVectorizer

from pathlib2 import Path
from datasets import Dataset, load_from_disk
from tqdm import tqdm
import artm
import numpy as np

import matplotlib.pyplot as plt

def plot_list(values):
    """Plot ``values`` as a line chart with pyplot and show the figure."""
    plt.plot(values)
    plt.show()

In [76]:
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('words')

In [77]:
def get_boundaries(labels):
    """Encode label changes as a string of boundary flags.

    The result has one character per label: position 0 is always '0', and
    position ``i`` is '1' when ``labels[i]`` differs from ``labels[i-1]``,
    otherwise '0'.

    Requires at least two labels (checked with ``assert``).
    """
    assert len(labels) > 1
    flags = (
        '1' if labels[pos] != labels[pos - 1] else '0'
        for pos in range(1, len(labels))
    )
    return '0' + ''.join(flags)


class WikiDataset:
    """Turns a directory of section-delimited text files into segmentation samples.

    Lines containing the separator ``========`` mark the start of a new
    section; every other non-empty line becomes one text unit labelled with
    the running section index.
    """

    def __init__(self, root):
        """Collect all files found recursively under ``root``."""
        self.textfiles = self._get_files(root)
        self.separator = '========'

    def _get_files(self, path):
        """Return the paths (as strings) of every regular file below ``path``.

        The order is whatever ``glob`` yields; it is not sorted here.

        Ref: https://github.com/koomri/text-segmentation
        """
        return [str(p) for p in Path(path).glob('**/*') if p.is_file()]

    def _get_sections(self, lines):
        """Split ``lines`` into text units and their section labels.

        Empty lines are dropped. A line containing the separator starts a new
        section (a run of consecutive separator lines counts once) and is not
        emitted itself. Every other line is appended to ``sections`` with the
        current section index appended to ``labels``.

        Returns:
            (sections, labels): two lists of equal length.
        """
        sections, labels = [], []
        last_is_sep = False
        topic_id = 0

        for line in lines:
            line = line.replace('\n', '')
            if not line:
                continue
            if self.separator in line:
                # Nested headers produce several separator lines in a row;
                # only the first one opens a new section.
                if not last_is_sep:
                    topic_id += 1
                    last_is_sep = True
            else:
                last_is_sep = False
                sections.append(line)
                labels.append(topic_id)

        return sections, labels

    def _get_sample(self):
        """Yield one dict per usable file.

        Each dict has the keys ``path``, ``sections``, ``labels`` and
        ``boundaries`` (the output of ``get_boundaries(labels)``). Files with
        no content after the first line, or with at most one text unit, are
        skipped.
        """
        for path in self.textfiles:
            # Explicit encoding: the default depends on the platform locale and
            # would make reading non-ASCII text machine-dependent.
            with open(path, 'r', encoding='utf-8') as f:
                lines = f.readlines()[1:]  # skip the first separator
            if not lines:
                continue

            sections, labels = self._get_sections(lines)
            if len(labels) <= 1:
                continue

            yield {'path': str(path),
                   'sections': sections,
                   'labels': labels,
                   'boundaries': get_boundaries(labels)}

    def get_generator(self):
        """Return the sample generator function itself (uncalled)."""
        return self._get_sample

In [78]:
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stop-word set for ``language``, reading the corpus only once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def clean_text_func(input_text):
    """Normalise raw text into a space-separated string of lowercase words.

    Steps, in order: strip ``<...>`` tags and ``http...`` URLs, lowercase,
    collapse whitespace, fold to ASCII via NFKD, expand contractions, drop
    every character that is not a letter or whitespace, tokenize with NLTK,
    and remove English stop words.

    Args:
        input_text: the text to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    clean_text = re.sub(r'<[^<]+?>', '', input_text)
    clean_text = re.sub(r'http\S+', '', clean_text)
    clean_text = clean_text.lower()
    clean_text = re.sub(r'\s+', ' ', clean_text)
    clean_text = unicodedata.normalize('NFKD', clean_text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    clean_text = contractions.fix(clean_text)
    clean_text = re.sub(r'[^a-zA-Z\s]', '', clean_text)
    # The stop-word set is loop-invariant across calls; fetch the cached copy
    # instead of re-reading the corpus for every sentence.
    stop_words = _get_stop_words('english')
    tokens = [token for token in word_tokenize(clean_text) if token not in stop_words]
    clean_text = ' '.join(tokens)
    clean_text = re.sub(r'[^\w\s]', '', clean_text)
    return clean_text

def lemmatize_text(text):
    """Tokenize ``text`` with NLTK and return the list of WordNet-lemmatized tokens."""
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in nltk.word_tokenize(text):
        lemmas.append(wordnet.lemmatize(token))
    return lemmas

def create_dictionary(corpus, min_token_length=3, min_count=31):
    """Build a token -> id vocabulary from a tokenized corpus.

    A token is kept when it is at least ``min_token_length`` characters long,
    occurs at least ``min_count`` times across the corpus, and appears in the
    NLTK ``words`` corpus. The defaults reproduce the previous hard-coded
    rules (length > 2, count > 30).

    Args:
        corpus: iterable of documents, each an iterable of string tokens.
        min_token_length: minimum token length to be counted.
        min_count: minimum total frequency for a token to be kept.

    Returns:
        dict mapping each kept token to a 1-based id, assigned in
        alphabetical order of the tokens.
    """
    token_counts = defaultdict(int)
    for document in tqdm(corpus):
        for token in document:
            if len(token) >= min_token_length:
                token_counts[token] += 1

    known_words = set(words.words())
    kept_tokens = sorted(
        token for token, count in token_counts.items()
        if count >= min_count and token in known_words
    )
    return {token: index for index, token in enumerate(kept_tokens, start=1)}

def convert_to_uci(corpus_tockenized, dictionary, collection_name='my_collection'):
    """Write a tokenized corpus as UCI bag-of-words files in the working directory.

    Two files are produced:
      * ``vocab.<collection_name>.txt`` - the dictionary tokens, one per
        line, in alphabetical order;
      * ``docword.<collection_name>.txt`` - three header lines (number of
        documents, dictionary size, number of triples) followed by one
        ``doc_id word_id count`` line per (document, token) pair.

    Document ids are 1-based positions in ``corpus_tockenized``; word ids are
    the values stored in ``dictionary``. Tokens missing from ``dictionary``
    are ignored.
    """
    # Vocabulary file (all words have the default class).
    vocab_lines = [f"{token}\n" for token in sorted(dictionary)]
    with open(f'vocab.{collection_name}.txt', 'w') as vocab_file:
        vocab_file.writelines(vocab_lines)

    n_docs = len(corpus_tockenized)
    n_words = len(dictionary)

    triples = []
    for doc_number, doc in enumerate(corpus_tockenized, start=1):
        counts = defaultdict(int)
        for word in doc:
            if word in dictionary:
                counts[word] += 1
        # Within a document, entries are emitted in alphabetical token order.
        for token in sorted(counts):
            triples.append((doc_number, dictionary[token], counts[token]))

    with open(f'docword.{collection_name}.txt', 'w') as docword_file:
        for header_value in (n_docs, n_words, len(triples)):
            docword_file.write(f"{header_value}\n")
        for doc_id, word_id, count in triples:
            docword_file.write(f"{doc_id} {word_id} {count}\n")

def process_corpus(corpus):
    """Clean and lemmatize every document in ``corpus``.

    Each document goes through ``clean_text_func`` and then
    ``lemmatize_text``; the per-document token lists are returned in input
    order. Progress is reported with tqdm.
    """
    return [lemmatize_text(clean_text_func(document)) for document in tqdm(corpus)]

In [79]:
def uci_to_vowpal(dict_path, file_path, output_path):
    """Convert UCI bag-of-words files into a Vowpal-Wabbit-style text file.

    Args:
        dict_path: vocabulary file; line ``i`` (1-based) holds the token with id ``i``.
        file_path: docword file with ``doc_id word_id count`` lines. Lines
            with fewer than three space-separated fields (the header) are skipped.
        output_path: destination file. One line per document that has at
            least one entry: ``doc<id> tok1 tok2:count ...``; the ``:count``
            suffix is omitted when the count is 1.
    """
    with open(dict_path, 'r') as vocab_file:
        ix_to_word = {ix: token.strip() for ix, token in enumerate(vocab_file, start=1)}

    docs = defaultdict(list)
    with open(file_path, 'r') as docword_file:
        for raw_line in docword_file:
            fields = raw_line.strip().split(' ')
            if len(fields) < 3:
                continue
            doc_id, word_ix, count = map(int, fields)
            token = ix_to_word[word_ix]
            docs[doc_id].append(token if count == 1 else f'{token}:{count}')

    with open(output_path, 'w') as out_file:
        for doc_id, tokens in docs.items():
            out_file.write(f"doc{doc_id} {' '.join(tokens)}\n")

# Block for creating a new dataset in ARTM format

In [6]:
# Vectorizer over the data stored under 'test_segments_batches', read in 'batches' format.
batch_vectorizer = artm.BatchVectorizer(data_path='test_segments_batches',
                                        data_format='batches')
# Dictionary loaded from the file saved next to the training batches.
my_dictionary = artm.Dictionary()
my_dictionary.load(dictionary_path='train_batches/my_dictionary.dict')


# NOTE(review): pickle.load can execute arbitrary code; only load pickles from a trusted source.
with open('top_tokens.pickle', 'rb') as file:
    topic_tokens = pickle.load(file)
# The model gets as many topics as there are entries in topic_tokens.
topics_qty = len(topic_tokens)
model = artm.ARTM(num_topics=topics_qty, dictionary=my_dictionary)
# NOTE(review): load() is called for two dumps in a row (n_wt, then p_wt) — presumably
# restoring both matrices; confirm against the BigARTM load() documentation.
model.load('model_dump/n_wt.bin')
model.load('model_dump/p_wt.bin')

In [8]:
# Apply the loaded model to the vectorized batches.
# presumably a table with one column per document id — the following cells read its columns.
topic_probs = model.transform(batch_vectorizer)

In [9]:
# Column labels of topic_probs as a plain list, sorted ascending.
columns = topic_probs.columns.values.tolist()
columns = sorted(columns)

In [15]:
# Set copy of the labels, used for membership tests when filling gaps.
columns_set = set(columns)

In [16]:
# Build one prediction vector for every integer id from 1 up to the largest column label.
# An id that has no column in topic_probs reuses the vector of the previous id.
# NOTE(review): assumes the column labels are integers — confirm.
# NOTE(review): if id 1 has no column, predicts is still empty and predicts[-1] raises IndexError.
predicts = []
for col in range(1, columns[-1] + 1):
    if col in columns_set:
        predicts.append(topic_probs[col].to_numpy())
    else:
        predicts.append(predicts[-1])

In [17]:
# Stack the per-id vectors into a single array, one row per id.
predicts_numpy = np.array(predicts)

In [20]:
# Split the stacked prediction rows back into per-document chunks.
# Documents are visited in dataset order and each one consumes as many rows as
# it has boundary flags.
# NOTE(review): the data path is an absolute, machine-specific location.
# NOTE(review): this relies on predicts_numpy rows being in the same order as the
# text units of the dataset — confirm against the code that produced the batches.
generator = WikiDataset('/home/dparinov/wiki_727/test/').get_generator()
ds = Dataset.from_generator(generator)
predicts_dict = dict()
start = 0
# doc_idx (not `id`) so the builtin id() is not shadowed in the notebook namespace.
for doc_idx, doc in tqdm(enumerate(ds), total=len(ds)):
    shift = len(doc['boundaries'])
    predicts_dict[doc_idx] = predicts_numpy[start:start+shift]
    start += shift

Found cached dataset generator (/home/dparinov/.cache/huggingface/datasets/generator/default-e076cdd982e0608b/0.0.0)
100%|███████████████████████████████████| 73232/73232 [00:11<00:00, 6575.84it/s]


In [30]:
# Save the per-document prediction slices to predicts.pickle.
with open('predicts.pickle', 'wb') as file:
    pickle.dump(predicts_dict, file)

In [46]:
from sklearn.metrics.pairwise import cosine_similarity

In [219]:
def left_window(input, k=7):
    """Running mean over the current row and up to ``k - 1`` preceding rows.

    For the first ``k`` rows the mean is taken over everything seen so far;
    afterwards the previous mean is updated incrementally by dropping the row
    that left the window and adding the new one.

    Args:
        input: numpy array; the window slides along axis 0.
        k: window length.

    Returns:
        Float array with the same shape as ``input``.
    """
    n_rows = input.shape[0]
    output = np.zeros(input.shape)
    warmup = max(0, min(k, n_rows))

    # Warm-up: the window is not full yet, average all rows seen so far.
    for ix in range(warmup):
        output[ix] = input[:ix + 1].sum(axis=0) / (ix + 1)

    # Steady state: slide the full window one row at a time.
    for ix in range(warmup, n_rows):
        output[ix] = output[ix - 1] - input[ix - k] / k + input[ix] / k

    return output

def right_window(input, k=7):
    """Running mean over the current row and up to ``k - 1`` following rows.

    Row 0 is the mean of the first ``k`` rows (or of all rows when there are
    fewer than ``k``). While a full window still fits, the previous mean is
    updated incrementally; for the trailing rows the mean is taken over
    whatever rows remain.

    Args:
        input: numpy array; the window slides along axis 0.
        k: window length.

    Returns:
        Float array with the same shape as ``input``.
    """
    n_rows = input.shape[0]
    output = np.zeros(input.shape)

    for ix in range(n_rows):
        if ix == 0:
            # Divide by the number of rows actually summed: with fewer than k
            # rows, dividing by k would not give a mean.
            width = min(k, n_rows)
            output[ix] = input[:width].sum(axis=0) / width
        elif ix <= n_rows - k:
            output[ix] = output[ix - 1] - input[ix - 1] / k + input[ix + k - 1] / k
        else:
            output[ix] = input[ix:].sum(axis=0) / (n_rows - ix)
    return output

def select_local_maxima(scores, threshold):
    """Mark strict local maxima of ``scores`` that reach ``threshold``.

    Position 0 is never marked. A position is marked '1' when its score is
    not below ``threshold``, is strictly greater than the previous score and,
    unless it is the last position, strictly greater than the next score.

    Returns:
        A string of '0'/'1' characters, one per score.
    """
    last = len(scores) - 1
    marks = ['0'] * len(scores)
    for ix in range(1, len(scores)):
        current = scores[ix]
        if current < threshold or not current > scores[ix - 1]:
            continue
        # The last position has no right neighbour to compare against.
        if ix == last or current > scores[ix + 1]:
            marks[ix] = '1'
    return ''.join(marks)

def _climb_peak(values, start, step):
    """Walk from ``start`` in direction ``step`` while values strictly increase; return the peak."""
    peak = values[start]
    pos = start + step
    while 0 <= pos < len(values) and values[pos] > peak:
        peak = values[pos]
        pos += step
    return peak


def boundary_score(input, threshold=None):
    """Turn a similarity curve into a string of predicted boundary flags.

    For every position a depth score is computed as the average of the peaks
    reached by climbing strictly upwards to the left and to the right, minus
    the value at the position itself. Positions whose depth is a local
    maximum and reaches ``threshold`` are marked via ``select_local_maxima``.

    Args:
        input: sequence of similarity values.
        threshold: minimum depth; when None, mean + one standard deviation
            of the depth scores is used.

    Returns:
        A string of '0'/'1' characters, one per position.
    """
    scores = [
        0.5 * (_climb_peak(input, ix, -1) + _climb_peak(input, ix, 1)) - input[ix]
        for ix in range(len(input))
    ]
    if threshold is None:
        threshold = np.mean(scores) + np.std(scores)

    return select_local_maxima(scores, threshold)


def get_similarity(lw, rw):
    """Return the cosine similarity of each pair of rows taken from ``lw`` and ``rw``.

    Rows are paired positionally with ``zip``; the result is a Python list
    with one value per pair.
    """
    similarities = []
    for left_vec, right_vec in zip(lw, rw):
        similarities.append(1 - spatial.distance.cosine(left_vec, right_vec))
    return similarities

In [220]:
# Evaluate window-similarity segmentation on the test documents.
# For each document: smooth its topic vectors with a left and a right running
# mean, compare the two per position with cosine similarity, turn the curve
# into boundary flags, and score them against the reference with Pk and WindowDiff.
# NOTE(review): the data path is an absolute, machine-specific location.
generator = WikiDataset('/home/dparinov/wiki_727/test/').get_generator()
ds = Dataset.from_generator(generator)
EVAL_K = 5  # window size shared by both segmentation metrics
pk = []
wd = []

# doc_idx (not `id`) so the builtin id() is not shadowed in the notebook namespace.
for doc_idx, doc in tqdm(enumerate(ds), total=len(ds)):
    ref = doc['boundaries']
    lw = left_window(predicts_dict[doc_idx])
    rw = right_window(predicts_dict[doc_idx])
    scores = get_similarity(lw, rw)
    preds = boundary_score(scores)
    pk.append(nltk.pk(ref, preds, k=EVAL_K))
    wd.append(nltk.windowdiff(ref, preds, EVAL_K))

Found cached dataset generator (/home/dparinov/.cache/huggingface/datasets/generator/default-e076cdd982e0608b/0.0.0)
100%|████████████████████████████████████| 73232/73232 [02:55<00:00, 416.25it/s]


In [221]:
sum(pk) / len(pk)

0.4355168432478926

In [222]:
sum(wd) / len(wd)

0.4938090940510369

In [None]:
# Baseline: assign every text unit its most probable topic and predict a
# boundary wherever the assigned topic changes.
# NOTE(review): `vectorizer` and `words_dict_artm` are not defined in the visible
# part of the notebook — they must exist before this cell runs.
# NOTE(review): this cell rebinds batch_vectorizer, topic_probs and predicts,
# which earlier cells also define.
result = []  # one {'real': ..., 'predict': ...} dict per document
for doc in tqdm(ds, total=len(ds)):
    doc_dict = dict()
    doc_dict['real'] = doc['boundaries']
    vectors = []
    for sentence in doc['sections']:
        sentence_cleaned = clean_text_func(sentence)
        tockens = lemmatize_text(sentence_cleaned)
        vector = vectorizer.transform([' '.join(tockens)])
        vectors.append(vector.toarray()[0])
    # n_wd is passed transposed: one column per text unit.
    batch_vectorizer = artm.BatchVectorizer(data_format='bow_n_wd',
                                    n_wd=np.array(vectors).T,
                                    vocabulary=words_dict_artm)
    topic_probs = model.transform(batch_vectorizer)
    predicts = topic_probs.to_numpy().argmax(axis=0).tolist()
    predicts_borders = [0]
    for ix in range(1, len(predicts)):
        if predicts[ix] != predicts[ix-1]:
            predicts_borders.append(1)
        else:
            predicts_borders.append(0)
    doc_dict['predict'] = predicts_borders
    # The original had a `break` before this line, which made the append
    # unreachable and left `result` undefined for the metric cell.
    result.append(doc_dict)

In [None]:
print(predicts)

In [None]:
# Score every entry of `result` with Pk and WindowDiff (window size 5).
wd = []
pk = []
for example in result:
    test = example['real']
    # Join the 0/1 predictions into a string of boundary flags.
    predict = ''.join(list(map(str, example['predict'])))
    pk.append(nltk.pk(test, predict, k=5))
    wd.append(nltk.windowdiff(test, predict, 5))

In [None]:
sum(pk) / len(pk)

In [None]:
wd

In [None]:
sum(wd) / len(wd)

# Block for loading the model and evaluating it

In [None]:
# Gather a dictionary from the test batches together with the co-occurrence
# file 'cooc_df_test' and the training vocabulary file.
cooc_dict = artm.Dictionary()
cooc_dict.gather(
    data_path='test_batches',
    cooc_file_path='cooc_df_test',
    vocab_file_path='vocab.train.txt',
    symmetric_cooc_values=True)

# NOTE(review): pickle.load can execute arbitrary code; only load pickles from a trusted source.
with open('top_tokens.pickle', 'rb') as file:
    topic_tokens = pickle.load(file)
# The model gets as many topics as there are entries in topic_tokens.
topics_qty = len(topic_tokens)
# NOTE(review): this rebinds `model`, this time built on cooc_dict.
model = artm.ARTM(num_topics=topics_qty, dictionary=cooc_dict)
model.load('model_dump/n_wt.bin')
model.load('model_dump/p_wt.bin')

In [None]:
# Top-tokens score over the 12 top tokens of '@default_class', given the
# co-occurrence dictionary gathered above.
coherence_score = artm.TopTokensScore(
                            name='TopTokensCoherenceScore',
                            class_id='@default_class',
                            num_tokens=12,
                            dictionary=cooc_dict)

In [None]:
# Register the score on the model.
model.scores.add(coherence_score)

In [None]:
# Vectorizer over the data stored under 'test_batches', read in 'batches' format.
# NOTE(review): rebinds batch_vectorizer, which earlier cells also define.
batch_vectorizer = artm.BatchVectorizer(data_path='test_batches',
                                        data_format='batches')

In [None]:
# Apply the model to the test batches.
p_cd_test = model.transform(batch_vectorizer=batch_vectorizer)

In [None]:
p_cd_test.head(50)

In [None]:
# does not work for some reason, will add manual calculation
model.score_tracker['TopTokensCoherenceScore'].coherence[-1]

In [None]:
# nice -n 5 bigartm -c vw_test.txt -v vocab.train.txt --cooc-window 10 --cooc-min-tf 5 --cooc-min-df 5 --write-cooc-tf cooc_tf_test --write-cooc-df cooc_df_test --write-ppmi-tf ppmi_tf_test --write-ppmi-df ppmi_df_test

In [None]:
# Load the per-topic tokens used for the manual coherence calculation.
# NOTE(review): pickle.load can execute arbitrary code; only load pickles from a trusted source.
with open('top_tokens.pickle', 'rb') as file:
    top_tockens = pickle.load(file)

In [None]:
def calc_coherence(ppmi_file, topic_tockens):
    """Compute the average pairwise co-occurrence value of each topic's tokens.

    Args:
        ppmi_file: path to a text file whose lines look like
            ``word other1:value1 other2:value2 ...`` (space separated).
        topic_tockens: dict mapping a topic name to its sequence of tokens.

    Returns:
        dict mapping each topic name to the mean of the values found for all
        token pairs ``(tokens[i], tokens[j])`` with ``i < j``; pairs missing
        from the file count as 0. Topics with fewer than two tokens have no
        pairs and get 0.0.

    NOTE(review): a pair is looked up only as (earlier token -> later token).
    If the file stores each pair in a single direction, some pairs may be
    missed — confirm against the format of the co-occurrence file.
    """
    from itertools import combinations

    ppmi_by_word = dict()
    with open(ppmi_file, 'r') as file:
        for line in file:
            word, *counts = line.strip().split(' ')
            if word not in ppmi_by_word:
                ppmi_by_word[word] = dict()
            else:
                # The same first token appeared on more than one line.
                print('something went wrong')
            for counter in counts:
                word_c = counter.split(':')
                ppmi_by_word[word][word_c[0]] = float(word_c[-1])

    result = dict()
    for topic_name, tockens in topic_tockens.items():
        n_ordered_pairs = len(tockens) * (len(tockens) - 1)
        if n_ordered_pairs == 0:
            # Fewer than two tokens: avoid dividing by zero.
            result[topic_name] = 0.0
            continue
        total = 0
        for first, second in combinations(tockens, 2):
            total += ppmi_by_word.get(first, dict()).get(second, 0)
        result[topic_name] = total * 2 / n_ordered_pairs
    return result

In [None]:
res = calc_coherence('ppmi_df_test', top_tockens)

In [None]:
# Mean coherence over all topics; divide by the actual number of topics in
# `res` rather than a hard-coded 50.
sum(res.values()) / len(res)