In [1]:
# imports
import nltk
import pandas as pd
import artm

from nltk.corpus import brown
from nltk.stem import SnowballStemmer
from nltk.util import ngrams

  """
  """
  apply_weight=None, decay_weight=None, async=None):


In [None]:
# work environment setup - one-time actions
# Fetch the NLTK resources used by this notebook: the stop-word lists and
# the 'punkt' tokenizer models.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

In [3]:
# MAKE STOP-WORDS DATASET
def make_stop_words_dataset():
    """Stem NLTK's Russian stop-words and save them as a one-column CSV.

    Writes 'stop-words-russian-nltk.csv' (no index, single column named
    'stop-word') into the current working directory. Returns None.
    """
    russian_stemmer = SnowballStemmer('russian')
    stemmed_words = [
        russian_stemmer.stem(word)
        for word in nltk.corpus.stopwords.words('russian')
    ]
    frame = pd.DataFrame(stemmed_words)
    frame.to_csv('stop-words-russian-nltk.csv', index=False, header=['stop-word'])


# (Re)generate 'stop-words-russian-nltk.csv' every time this cell runs.
make_stop_words_dataset()

# stop_word=" "
# for i in stop_words:
#         stop_word=  stop_word+" "+i
# print(stop_word)

In [None]:
# DATA PREPARATION
# load raw data
def load_data():
    """Read the raw 'vk.csv' file into a DataFrame.

    The 'question' and 'answer' columns are forced to string dtype.

    Returns:
        pandas.DataFrame with the contents of 'vk.csv'.
    """
    row_limit = None  # set to e.g. 1000 to load only the first rows
    column_types = {'question': 'str', 'answer': 'str'}
    return pd.read_csv('vk.csv', nrows=row_limit, low_memory=False, dtype=column_types)


# stemming
def stemming(df):
    """Add Snowball-stemmed copies of the question/answer columns to ``df``.

    Creates the columns 'question_stem' and 'answer_stem' in place: every
    whitespace-separated word of 'question' / 'answer' is replaced by its
    Russian Snowball stem and the words are re-joined with single spaces.

    Args:
        df: DataFrame with 'question' and 'answer' columns; modified in place.

    Returns:
        None.
    """
    # Build the stemmer here. The one created in make_stop_words_dataset() is
    # local to that function, so referring to it from here raised NameError.
    stemmer = SnowballStemmer('russian')

    def stem_text(text):
        # str() keeps non-string cells (e.g. a float NaN) from crashing on
        # .split(); NOTE: such a cell is then processed as the text 'nan'.
        return ' '.join(stemmer.stem(word) for word in str(text).split())

    df['question_stem'] = df['question'].apply(stem_text)
    df['answer_stem'] = df['answer'].apply(stem_text)


# Stemmed stop-words, read from the CSV on first use and reused afterwards.
# Re-running this cell resets the cache.
_stop_words_cache = None


def _load_stop_words():
    """Return the stop-words stored in 'stop-words-russian-nltk.csv' as a frozenset."""
    global _stop_words_cache
    if _stop_words_cache is None:
        _stop_words_cache = frozenset(
            pd.read_csv('stop-words-russian-nltk.csv')['stop-word']
        )
    return _stop_words_cache


def clean_text(s, stop_words=None):
    """Tokenize ``s`` and drop stop-words, non-alphabetic tokens and 1-char tokens.

    Args:
        s: Text to clean (already stemmed by the caller in this notebook).
        stop_words: Optional collection of tokens to remove. When omitted, the
            stop-words are loaded from 'stop-words-russian-nltk.csv'.
            Previously the function read an undefined global ``stop_words``
            and failed with NameError.

    Returns:
        The surviving tokens joined by single spaces ('' if none survive).
    """
    if stop_words is None:
        stop_words = _load_stop_words()
    elif not isinstance(stop_words, (set, frozenset)):
        # Set membership is O(1); a list would be scanned once per token.
        stop_words = frozenset(stop_words)

    tokens = nltk.word_tokenize(s)
    kept = [
        token for token in tokens
        if token not in stop_words      # remove stop-words
        and token.isalpha()             # remove numbers / punctuation
        and len(token) > 1              # remove single-character tokens
    ]
    return ' '.join(kept)


def data_preparation():
    """Run the load -> stem -> clean pipeline and save the result.

    Writes only the 'question_clean' and 'answer_clean' columns to
    'vk-cleaned.csv' (with header, without index). Returns None.
    """
    frame = load_data()
    stemming(frame)  # return value is not used; the *_stem columns are read from ``frame`` below

    column_pairs = (
        ('question_stem', 'question_clean'),
        ('answer_stem', 'answer_clean'),
    )
    for stemmed_column, cleaned_column in column_pairs:
        frame[cleaned_column] = frame[stemmed_column].apply(clean_text)

    frame.to_csv(
        'vk-cleaned.csv',
        index=False,
        header=True,
        columns=['question_clean', 'answer_clean'],
    )


# Run the whole preparation pipeline; its only result is the file 'vk-cleaned.csv'.
data_preparation()

In [6]:
# Reload the cleaned corpus written by data_preparation().
# keep_default_na=False keeps empty cells as '' and tokens such as 'nan',
# 'null' or 'NA' as plain text. With the pandas default they are parsed as
# NaN, which text2vw() would then serialise as the bogus token 'nan'.
df = pd.read_csv('vk-cleaned.csv', keep_default_na=False)
df.head()

0    кредитн карт моментум расплачива зарубежн инте...
Name: question_clean, dtype: object

In [30]:
def text2vw(text, label ):
    """Prefix ``text`` with a numbered label, e.g. 'q1 <text>'.

    Increments the module-level counter ``line_number`` on every call, so the
    caller must initialise (or reset) ``line_number`` before using it.

    Args:
        text: Value to serialise; converted with str().
        label: String prefix placed directly before the running number.

    Returns:
        '<label><line_number> <text>' as a single string.
    """
    global line_number
    line_number = line_number + 1
    document_id = label + str(line_number)
    return ' '.join((document_id, str(text)))

# Build the numbered 'q<N> ...' / 'a<N> ...' columns. The counter is reset
# before each column so that row N gets the ids qN and aN.
for vw_label, source_column, vw_column in (
    ('q', 'question_clean', 'question_vw'),
    ('a', 'answer_clean', 'answer_vw'),
):
    line_number = 0
    df[vw_column] = df[source_column].apply(
        lambda text, vw_label=vw_label: text2vw(text, vw_label)
    )

# One line per row: '<question_vw>|<answer_vw>', no header and no index.
# NOTE(review): the text right after '|' is the per-row 'a<N>' id - confirm
# this is the layout the downstream Vowpal Wabbit reader expects.
df[['question_vw', 'answer_vw']].to_csv('vk.vw', index=False, header=False, sep='|')

In [58]:
# create ARTM batches & dictionary - one-time action
# Read the Vowpal Wabbit file 'vk.vw' and use 'artm' as the target folder
# (presumably where the generated batches are stored - confirm in BigARTM docs).
batch_vectorizer = artm.BatchVectorizer(
    data_path = 'vk.vw',
    data_format='vowpal_wabbit',
    target_folder='artm')

# Gather a dictionary from the contents of the 'artm' folder, then save it
# twice: via save() and, as text, via save_text().
# NOTE: a later cell loads 'artm/dictionary.dict', so save() is presumably
# expected to append the '.dict' extension - TODO confirm.
dictionary = artm.Dictionary()
dictionary.gather(data_path='artm')
dictionary.save(dictionary_path='artm/dictionary')
dictionary.save_text(dictionary_path='artm/dictionary.txt')

In [59]:
# Reload the saved dictionary and create a 200-topic model with cache_theta
# enabled.
dictionary.load(dictionary_path='artm/dictionary.dict')
model = artm.ARTM(num_topics=200, dictionary=dictionary, cache_theta=True)

# Register the scores that are read back from model.score_tracker later on.
for tracked_score in (
    artm.PerplexityScore(name='perplexity_score', dictionary=dictionary),
    artm.SparsityPhiScore(name='sparsity_phi_score'),
    artm.SparsityThetaScore(name='sparsity_theta_score'),
    artm.TopTokensScore(name='top_tokens_score'),
):
    model.scores.add(tracked_score)

model.num_document_passes = 5

In [60]:
# Fit the model on the prepared batches with num_collection_passes=25.
model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=25)
#model.score_tracker['perplexity_score'].last_value


In [56]:
# Print the full history (.value; use .last_value for the final number only)
# of each tracked score, separated by blank lines.
for position, score_name in enumerate(
    ('perplexity_score', 'sparsity_phi_score', 'sparsity_theta_score')
):
    if position > 0:
        print()
    print(model.score_tracker[score_name].value)

[71341.4765625, 1401.7366943359375, 1058.3416748046875, 695.7772216796875, 489.915771484375, 396.8744201660156, 345.58721923828125, 311.3427734375, 287.19354248046875, 270.26739501953125, 257.99127197265625, 248.8828125, 242.0885467529297, 236.82847595214844, 232.6487579345703, 229.35733032226562, 226.8046112060547, 224.83163452148438, 223.2830047607422, 222.04257202148438, 221.00869750976562, 220.13475036621094, 219.37721252441406, 218.72799682617188, 218.174072265625]

[0.0, 1.1741341950255446e-05, 0.0004088670539204031, 0.028938917443156242, 0.15169109404087067, 0.3843666911125183, 0.5906117558479309, 0.7183569669723511, 0.7896561622619629, 0.8311086297035217, 0.8579050898551941, 0.8772653341293335, 0.8925049304962158, 0.9049971103668213, 0.9155004620552063, 0.9244206547737122, 0.9321203231811523, 0.9388247132301331, 0.9446538686752319, 0.9496938586235046, 0.9541117548942566, 0.9579960107803345, 0.9613600373268127, 0.9643058776855469, 0.9669113159179688]

[0.0, 8.72448083555355e-07,

ValueError: cache_theta == False. Set ARTM.cache_theta = True

In [None]:
# def process_ngrams(s):
#     tokens = nltk.word_tokenize(s)
    

# df['question_ngram'] = dfp['question_stem'].apply(
#     lambda x: process_ngrams(x)
# )