In [3]:
from selenium import webdriver
import time
import pymorphy2
import re
import pandas as pd
import numpy as np 
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 

In [2]:
wd = webdriver.Firefox(executable_path=r'D:\geckodriver.exe')

In [3]:
def month(j):
    if j < 10:
        return '0' + str(j)
    return str(j)

In [5]:
#загрузка по годам
for i in range(1900, 2000):
    source_code = ''
    for j in range(1, 13):
        wd.get('http://prozhito.org/notes?date="' + str(i) + '-' + month(j) + '-01"&dateTop="' + str(i) + '-' + month(j) + '-28"')
        elem = wd.find_element_by_xpath("//*")
        time.sleep(4)
        source_code = source_code + elem.get_attribute("outerHTML")
    with open('data_year/diaries' + str(i) + '.txt', 'w', encoding = 'utf-8') as f:
        f.write(source_code)

In [7]:
#парсинг веб-страниц, удаляем теги и т.д.
def diary_parsing(name):
    with open('data_year/' + name + '.txt', 'r', encoding = 'utf-8') as f:
        diary = f.read()
    if diary.find('span') == -1:
        return
    diary = diary.split('span class="note-date"')
    diary[0] = ''
    for i in range (0, len(diary)):
        if diary[i].find('...открыть') > -1:
            diary[i] = ''
        diary[i] = re.sub('[\da-zA-Z<>="&$/()!,_:;\?«»\*\—\.\[\]]', '', diary[i])
    diary = ' '.join(diary)
    diary = re.sub('-закрыть -', '', diary)
    diary = re.sub(' \-+', '', diary)
    a = diary.find('Хотите помочь?')
    with open('Prepared/pr_' + name + '.txt', 'w', encoding = 'utf-8') as f:
        f.write(diary[:a])
    return

In [23]:
#приводим слова к начальной форме
def diary_lemmatize(name):
    with open('Prepared/pr_' + name + '.txt', 'r', encoding = 'utf-8') as f:
        diary = f.read()
    diary = diary.split(' ')
    morph = pymorphy2.MorphAnalyzer()
    for i in range(0, len(diary)):
        if diary[i][1:].islower() and morph.parse(diary[i])[0].tag.POS == "NOUN":
            diary[i] = morph.parse(diary[i].lower())[0].normal_form
        else:
            diary[i] = ''
    with open('Prepared/pr_' + name + '.txt', 'w', encoding = 'utf-8') as f:
        f.write(' '.join(diary))
    return


invalid escape sequence \?


invalid escape sequence \?


invalid escape sequence \?


invalid escape sequence \?



In [9]:
#для просмотра самых частых слов
def word_counter(name):
    with open('Prepared/pr_' + name + '.txt', 'r', encoding = 'utf-8') as f:
        diary = f.read().split(' ')
    yeardict = {}
    with open('stop_words.txt', 'r', encoding = 'utf-8') as f:
        stop = f.read().split('\n')
    for d in diary:
        if len(d) > 2 and d not in stop:
            try:
                yeardict[d] += 1
            except:
                yeardict[d] = 1
    with open('Dicts/dict_' + name + '.txt', 'w', encoding = 'utf-8') as f:
        for w in sorted(yeardict, key=yeardict.get, reverse=True):
            if yeardict[w] < 10:
                break
            f.write(w + ' ' + str(yeardict[w]) + '\n')    
    return

In [29]:
#тексты дневников с удаленными стоп-словами, порядок остальных слов сохранен
def without_stopwords(name):
    with open('Prepared/pr_' + name + '.txt', 'r', encoding = 'utf-8') as f:
        diary = f.read().split(' ')
    with open('stop_words.txt', 'r', encoding = 'utf-8') as f:
        stop = f.read().split('\n')
    for i in range (0, len(diary)):
        if len(diary[i]) < 3 or diary[i] in stop:
            diary[i] = ''
    with open('Prepared/pr_' + name + '.txt', 'w', encoding = 'utf-8') as f:
        f.write(' '.join(diary))    
    return  

In [30]:
#предобработка
for i in range(1900, 2000):
    filename = 'diaries' + str(i)
    diary_parsing(filename)
    diary_lemmatize(filename)
    word_counter(filename)
    without_stopwords(filename)

In [4]:
#частые слова, tf-idf
def commonword(diaries):
    tf_idf = TfidfVectorizer()
    tf_idf.fit(diaries)
    idfs = tf_idf.idf_
    lower_thresh = 3
    often = idfs < lower_thresh
    commonwords = np.array(tf_idf.get_feature_names())[often]
    return commonwords

In [5]:
#очистка от частых слов
def clear_diaries(diaries):
    common = commonword(diaries)
    for i in range(0, len(diaries)):
        d = diaries[i].split(" ")
        for j in range(0, len(d)):
            if d[j] in common:
                d[j] = ''
        diaries[i] = " ".join(d)
    return diaries

In [10]:
#открываем дневники по десятилетиям
def get_diaries(year1, year2):
    diaries = []
    for i in range(year1, year2):
        filename = 'diaries' + str(i)
        with open('Prepared/pr_' + filename + '.txt', 'r', encoding = 'utf-8') as f:
            d = f.read().split('   лет     ')
            for d1 in d:
                diaries.append(d1)
    for j in range (0, len(diaries)):
        diary = diaries[j].split(' ')[5:]
        '''morph = pymorphy2.MorphAnalyzer()
        for i in range(0, len(diary)):
            if morph.parse(diary[i])[0].tag.POS == "NOUN":
                diary[i] = morph.parse(diary[i].lower())[0].normal_form
            else:
                diary[i] = ''
        '''
        diaries[j] = ' '.join(diary)
    return clear_diaries(diaries)

In [11]:
def display_topics(model, feature_names, no_top_words):
    topic_array = []
    for topic_idx, topic in enumerate(model.components_):
        outp = " ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]])
        topic_array.append(outp.split(' '))
    return topic_array

In [13]:
from sklearn.model_selection import GridSearchCV

In [15]:
#подбор оптимальных параметров
diaries = get_diaries(1900, 1910)
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000)
tf = tf_vectorizer.fit_transform(diaries)

search_params = {'n_components': [5, 10, 15, 20], 'learning_decay': [.5, .7, .9]}
lda = LatentDirichletAllocation()
model = GridSearchCV(lda, param_grid=search_params)
model.fit(tf)
GridSearchCV(cv=None, error_score='raise',
       estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1,
             n_topics=None, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_topics': [5, 10, 15, 20], 'learning_decay': [0.5, 0.7, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

best_lda_model = model.best_estimator_
print("Best Model's Params: ", model.best_params_)
print("Best Log Likelihood Score: ", model.best_score_)
print("Model Perplexity: ", best_lda_model.perplexity(tf))



Best Model's Params:  {'learning_decay': 0.9, 'n_components': 10}
Best Log Likelihood Score:  -152500.49285957788
Model Perplexity:  774.0404323008058


In [12]:
nmflist = []
ldalist = []
no_topics = 5
for i in range(0, 10):
    year1 = 1900 + 10*i
    diaries = get_diaries(year1, year1 + 10)
    
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=1000)
    tfidf = tfidf_vectorizer.fit_transform(diaries)
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
    
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000)
    tf = tf_vectorizer.fit_transform(diaries)
    tf_feature_names = tf_vectorizer.get_feature_names()
    lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

    nmflist.append(display_topics(nmf, tfidf_feature_names, 10))
    ldalist.append(display_topics(lda, tf_feature_names, 10))



In [None]:
def Jacc(lda_topic, nmf_topic):
    arr = []
    for i in range (0, len(lda_topic)):
        arr1 = []
        for j in range (0, len(nmf_topic)):
            c = len(set(lda_topic[i]) & set(nmf_topic[j]))
            k = round(c / (len(lda_topic[i]) + len(nmf_topic[j]) - c), 2)
            arr1.append(k)
        arr.append(arr1)
    return arr 

In [None]:
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, 10)