# Домашнее задание  № 5. Матричные разложения/Тематическое моделирование

### Задание № 1 (4 балла)

Попробуйте матричные разложения с 4 классификаторами - SGDClassifier, KNeighborsClassifier,  RandomForest, ExtraTreesClassifier (про него подробнее почитайте в документации, он похож на RF). Используйте и NMF и SVD. Сравните результаты на кросс-валидации и выберите лучшее сочетание.

В итоге у вас должно получиться, как минимум 8 моделей (два разложения на каждый классификатор). Используйте 1 и те же параметры кросс-валидации. Параметры векторизации, параметры K в матричных разложениях, параметры классификаторов могут быть разными между экспериментами.

Можете взять поменьше данных, если все будет обучаться слишком долго (не ставьте параметр K слишком большим в NMF, иначе точно будет слишком долго)

In [57]:
import pandas as pd
import numpy as np
import gensim
from string import punctuation
from razdel import tokenize as razdel_tokenize
from pymorphy2 import MorphAnalyzer
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD, NMF, PCA, LatentDirichletAllocation
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from IPython.display import Image
from IPython.core.display import HTML
from matplotlib import pyplot as plt
import seaborn as sns

In [58]:
morph = MorphAnalyzer()

In [59]:
# добавим лемматизацию
def normalize(text):
  normalized_text = [word.text.strip(punctuation) for word \
                     in razdel_tokenize(text)]
  normalized_text = [word.lower() for word in normalized_text if word and len(word) < 20]
  normalized_text = [morph.parse(word)[0].normal_form for word in normalized_text]
  return ' '.join(normalized_text)


data = pd.read_csv('avito_category_classification.csv')
data['description_norm'] = data['description'].apply(normalize)

In [60]:
data.shape

(9898, 3)

In [61]:
data = data.iloc[:2000]

In [62]:
data.shape

(2000, 3)

In [63]:
data.head()

Unnamed: 0,category_name,description,description_norm
0,Автомобили,"отличное состояние,обслужиание в салоне",отличный состояние обслужиание в салон
1,Детская одежда и обувь,В отличном состоянии. Фирма KIKO. Очень теплый...,в отличный состояние фирма kiko очень тёплый у...
2,Предложение услуг,"Изготовление ограждений, перил,качелей, турник...",изготовление ограждение перила качели турников...
3,Автомобили,Автомобиль в отличном техническом состоянии. О...,автомобиль в отличный технический состояние од...
4,Бытовая техника,"Продается газовая плита ""Гефест"" (Белоруссия) ...",продаваться газовый плита гефест белоруссия б ...


In [64]:
def eval_table(X, y, pipeline, N=6):
    # зафиксируем порядок классов
    labels = list(set(y))

    # метрики отдельных фолдов будет хранить в табличке
    fold_metrics = pd.DataFrame(index=labels)
    # дополнительно также соберем таблицу ошибок
    errors = np.zeros((len(labels), len(labels)))

    # создаем стратегию кросс-валидации
    # shuffle=True (перемешивание) - часто критично важно указать
    # т.к. данные могут быть упорядочены и модель на этом обучится
    kfold = StratifiedKFold(n_splits=N, shuffle=True, )

    for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
        # fit-predict как и раньше, но сразу пайплайном
        pipeline.fit(X[train_index], y[train_index])
        preds = pipeline.predict(X[test_index])

        # записываем метрику и индекс фолда
        fold_metrics[f'precision_{i}'] = precision_score(y[test_index], preds, labels=labels, average=None)
        fold_metrics[f'recall_{i}'] = recall_score(y[test_index], preds, labels=labels, average=None)
        fold_metrics[f'f1_{i}'] = f1_score(y[test_index], preds, labels=labels, average=None)
        errors += confusion_matrix(y[test_index], preds, labels=labels, normalize='true')

    # таблица для усредненных значений
    # тут мы берем колонки со значениями и усредняем их
    # часто также все метрики сразу суммируют и в конце просто делят на количество фолдов
    # но мы тут помимо среднего также хотим посмотреть на стандартное отклонение
    # чтобы понять как сильно варьируются оценки моделей
    result = pd.DataFrame(index=labels)
    result['precision'] = fold_metrics[[f'precision_{i}' for i in range(N)]].mean(axis=1).round(2)
    result['precision_std'] = fold_metrics[[f'precision_{i}' for i in range(N)]].std(axis=1).round(2)

    result['recall'] = fold_metrics[[f'recall_{i}' for i in range(N)]].mean(axis=1).round(2)
    result['recall_std'] = fold_metrics[[f'recall_{i}' for i in range(N)]].std(axis=1).round(2)

    result['f1'] = fold_metrics[[f'f1_{i}' for i in range(N)]].mean(axis=1).round(2)
    result['f1_std'] = fold_metrics[[f'f1_{i}' for i in range(N)]].std(axis=1).round(2)

    # добавим одну колонку со средним по всем классам
    result.loc['mean'] = result.mean().round(2)
    # проценты ошибок просто усредняем
    errors /= N

    return result, errors

In [65]:
# RandomForest
pipeline_rf_svd = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), ngram_range=(1,2), min_df=5, max_df=0.4)),
    ('svd', TruncatedSVD(500)),
    ('clf', RandomForestClassifier(n_estimators=100, max_depth=10))
])

pipeline_rf_nmf = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), min_df=3, max_df=0.3)),
    ('tfidf', TfidfTransformer()),
    ('decomposition', NMF(100)),
    ('clf', RandomForestClassifier(n_estimators=200, max_depth=6))
])

In [66]:
#SGDClassifier
pipeline_sc_svd = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), ngram_range=(1,2), min_df=5, max_df=0.4)),
    ('svd', TruncatedSVD(500)),
    ('clf', SGDClassifier())
])

pipeline_sc_nmf = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), min_df=3, max_df=0.3)),
    ('tfidf', TfidfTransformer()),
    ('decomposition', NMF(100)),
    ('clf', SGDClassifier())
])

In [67]:
#KNeighborsClassifier
pipeline_kc_svd = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), ngram_range=(1,2), min_df=5, max_df=0.4)),
    ('svd', TruncatedSVD(500)),
    ('clf', KNeighborsClassifier())
])

pipeline_kc_nmf = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), min_df=3, max_df=0.3)),
    ('tfidf', TfidfTransformer()),
    ('decomposition', NMF(100)),
    ('clf', KNeighborsClassifier())
])

In [68]:
#ExtraTreesClassifier
pipeline_ec_svd = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), ngram_range=(1,2), min_df=5, max_df=0.4)),
    ('svd', TruncatedSVD(500)),
    ('clf', ExtraTreesClassifier())
])

pipeline_ec_nmf = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), min_df=3, max_df=0.3)),
    ('tfidf', TfidfTransformer()),
    ('decomposition', NMF(100)),
    ('clf', ExtraTreesClassifier())
])

In [69]:
metrics_rf_svd, errors_rf_svd = eval_table(data['description_norm'], data['category_name'], pipeline_rf_svd)
metrics_rf_nmf, errors_rf_nmf = eval_table(data['description_norm'], data['category_name'], pipeline_rf_nmf)
metrics_sc_svd, errors_sc_svd = eval_table(data['description_norm'], data['category_name'], pipeline_sc_svd)
metrics_sc_nmf, errors_sc_nmf = eval_table(data['description_norm'], data['category_name'], pipeline_sc_nmf)
metrics_kc_svd, errors_kc_svd = eval_table(data['description_norm'], data['category_name'], pipeline_kc_svd)
metrics_kc_nmf, errors_kc_nmf = eval_table(data['description_norm'], data['category_name'], pipeline_kc_nmf)
metrics_ec_svd, errors_ec_svd = eval_table(data['description_norm'], data['category_name'], pipeline_ec_svd)
metrics_ec_nmf, errors_ec_nmf = eval_table(data['description_norm'], data['category_name'], pipeline_ec_nmf)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [70]:
metrics_rf_svd

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Мебель и интерьер,0.44,0.5,0.05,0.07,0.08,0.11
Ремонт и строительство,0.0,0.0,0.0,0.0,0.0,0.0
Детская одежда и обувь,0.39,0.02,0.64,0.07,0.48,0.03
Квартиры,0.84,0.07,0.83,0.09,0.83,0.06
"Одежда, обувь, аксессуары",0.42,0.04,0.75,0.03,0.53,0.03
Телефоны,0.83,0.41,0.1,0.07,0.17,0.12
Предложение услуг,0.65,0.14,0.27,0.07,0.38,0.08
Бытовая техника,0.33,0.52,0.03,0.04,0.05,0.08
Автомобили,0.79,0.13,0.3,0.09,0.42,0.09
Товары для детей и игрушки,0.61,0.15,0.11,0.04,0.18,0.06


In [71]:
metrics_rf_nmf

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Мебель и интерьер,0.58,0.49,0.08,0.08,0.12,0.12
Ремонт и строительство,0.0,0.0,0.0,0.0,0.0,0.0
Детская одежда и обувь,0.62,0.04,0.62,0.06,0.62,0.05
Квартиры,0.84,0.05,0.94,0.05,0.89,0.05
"Одежда, обувь, аксессуары",0.4,0.03,0.81,0.02,0.53,0.03
Телефоны,0.82,0.19,0.21,0.14,0.32,0.17
Предложение услуг,0.69,0.09,0.52,0.09,0.59,0.08
Бытовая техника,0.0,0.0,0.0,0.0,0.0,0.0
Автомобили,0.83,0.13,0.74,0.05,0.78,0.08
Товары для детей и игрушки,0.87,0.15,0.35,0.06,0.49,0.08


In [72]:
metrics_rf_nmf - metrics_rf_svd

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Мебель и интерьер,0.14,-0.01,0.03,0.01,0.04,0.01
Ремонт и строительство,0.0,0.0,0.0,0.0,0.0,0.0
Детская одежда и обувь,0.23,0.02,-0.02,-0.01,0.14,0.02
Квартиры,0.0,-0.02,0.11,-0.04,0.06,-0.01
"Одежда, обувь, аксессуары",-0.02,-0.01,0.06,-0.01,0.0,0.0
Телефоны,-0.01,-0.22,0.11,0.07,0.15,0.05
Предложение услуг,0.04,-0.05,0.25,0.02,0.21,0.0
Бытовая техника,-0.33,-0.52,-0.03,-0.04,-0.05,-0.08
Автомобили,0.04,0.0,0.44,-0.04,0.36,-0.01
Товары для детей и игрушки,0.26,0.0,0.24,0.02,0.31,0.02


In [73]:
metrics_sc_svd

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Мебель и интерьер,0.55,0.14,0.54,0.11,0.54,0.12
Ремонт и строительство,0.4,0.16,0.28,0.18,0.32,0.17
Детская одежда и обувь,0.64,0.06,0.73,0.06,0.68,0.02
Квартиры,0.95,0.04,0.92,0.06,0.93,0.04
"Одежда, обувь, аксессуары",0.67,0.07,0.67,0.07,0.66,0.04
Телефоны,0.7,0.14,0.66,0.12,0.67,0.12
Предложение услуг,0.74,0.1,0.65,0.12,0.69,0.1
Бытовая техника,0.25,0.1,0.25,0.13,0.24,0.1
Автомобили,0.77,0.05,0.78,0.06,0.77,0.04
Товары для детей и игрушки,0.59,0.1,0.52,0.1,0.55,0.09


In [74]:
metrics_sc_nmf

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Мебель и интерьер,0.71,0.2,0.36,0.08,0.46,0.08
Ремонт и строительство,0.09,0.2,0.02,0.03,0.03,0.05
Детская одежда и обувь,0.53,0.13,0.7,0.23,0.56,0.08
Квартиры,0.7,0.14,0.96,0.03,0.8,0.09
"Одежда, обувь, аксессуары",0.6,0.15,0.61,0.2,0.57,0.04
Телефоны,0.63,0.13,0.45,0.17,0.5,0.1
Предложение услуг,0.87,0.13,0.25,0.1,0.38,0.12
Бытовая техника,0.03,0.05,0.04,0.08,0.04,0.06
Автомобили,0.72,0.16,0.76,0.16,0.71,0.06
Товары для детей и игрушки,0.7,0.19,0.36,0.1,0.46,0.12


In [75]:
metrics_sc_nmf - metrics_sc_svd

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Мебель и интерьер,0.16,0.06,-0.18,-0.03,-0.08,-0.04
Ремонт и строительство,-0.31,0.04,-0.26,-0.15,-0.29,-0.12
Детская одежда и обувь,-0.11,0.07,-0.03,0.17,-0.12,0.06
Квартиры,-0.25,0.1,0.04,-0.03,-0.13,0.05
"Одежда, обувь, аксессуары",-0.07,0.08,-0.06,0.13,-0.09,0.0
Телефоны,-0.07,-0.01,-0.21,0.05,-0.17,-0.02
Предложение услуг,0.13,0.03,-0.4,-0.02,-0.31,0.02
Бытовая техника,-0.22,-0.05,-0.21,-0.05,-0.2,-0.04
Автомобили,-0.05,0.11,-0.02,0.1,-0.06,0.02
Товары для детей и игрушки,0.11,0.09,-0.16,0.0,-0.09,0.03


In [76]:
metrics_kc_svd

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Мебель и интерьер,0.3,0.12,0.25,0.08,0.27,0.09
Ремонт и строительство,0.1,0.11,0.05,0.05,0.07,0.06
Детская одежда и обувь,0.42,0.06,0.47,0.07,0.44,0.06
Квартиры,0.87,0.11,0.52,0.15,0.63,0.12
"Одежда, обувь, аксессуары",0.44,0.04,0.56,0.06,0.49,0.04
Телефоны,0.47,0.24,0.19,0.07,0.26,0.09
Предложение услуг,0.47,0.09,0.49,0.09,0.48,0.09
Бытовая техника,0.1,0.06,0.18,0.12,0.13,0.08
Автомобили,0.28,0.08,0.41,0.13,0.32,0.07
Товары для детей и игрушки,0.37,0.12,0.16,0.08,0.22,0.1


In [77]:
metrics_kc_nmf

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Мебель и интерьер,0.4,0.15,0.29,0.09,0.33,0.11
Ремонт и строительство,0.17,0.18,0.07,0.05,0.1,0.07
Детская одежда и обувь,0.49,0.05,0.49,0.06,0.49,0.04
Квартиры,0.8,0.09,0.77,0.13,0.78,0.09
"Одежда, обувь, аксессуары",0.52,0.06,0.53,0.05,0.52,0.04
Телефоны,0.42,0.15,0.25,0.07,0.31,0.09
Предложение услуг,0.38,0.09,0.64,0.09,0.47,0.07
Бытовая техника,0.09,0.06,0.16,0.09,0.11,0.07
Автомобили,0.4,0.14,0.5,0.21,0.43,0.16
Товары для детей и игрушки,0.38,0.08,0.19,0.06,0.25,0.06


In [78]:
metrics_kc_nmf - metrics_kc_svd

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Мебель и интерьер,0.1,0.03,0.04,0.01,0.06,0.02
Ремонт и строительство,0.07,0.07,0.02,0.0,0.03,0.01
Детская одежда и обувь,0.07,-0.01,0.02,-0.01,0.05,-0.02
Квартиры,-0.07,-0.02,0.25,-0.02,0.15,-0.03
"Одежда, обувь, аксессуары",0.08,0.02,-0.03,-0.01,0.03,0.0
Телефоны,-0.05,-0.09,0.06,0.0,0.05,0.0
Предложение услуг,-0.09,0.0,0.15,0.0,-0.01,-0.02
Бытовая техника,-0.01,0.0,-0.02,-0.03,-0.02,-0.01
Автомобили,0.12,0.06,0.09,0.08,0.11,0.09
Товары для детей и игрушки,0.01,-0.04,0.03,-0.02,0.03,-0.04


In [79]:
metrics_ec_svd

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Мебель и интерьер,0.71,0.27,0.09,0.03,0.15,0.05
Ремонт и строительство,0.17,0.26,0.02,0.03,0.04,0.06
Детская одежда и обувь,0.36,0.03,0.68,0.04,0.47,0.03
Квартиры,0.63,0.1,0.52,0.1,0.57,0.08
"Одежда, обувь, аксессуары",0.38,0.03,0.65,0.06,0.48,0.04
Телефоны,1.0,0.0,0.1,0.05,0.17,0.08
Предложение услуг,0.64,0.22,0.08,0.03,0.15,0.05
Бытовая техника,0.33,0.41,0.04,0.05,0.08,0.08
Автомобили,0.75,0.27,0.08,0.08,0.14,0.13
Товары для детей и игрушки,0.57,0.17,0.09,0.03,0.15,0.05


In [80]:
metrics_ec_nmf

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Мебель и интерьер,0.68,0.1,0.49,0.08,0.57,0.08
Ремонт и строительство,0.51,0.13,0.25,0.09,0.32,0.08
Детская одежда и обувь,0.69,0.04,0.72,0.05,0.7,0.05
Квартиры,0.92,0.04,0.96,0.04,0.94,0.03
"Одежда, обувь, аксессуары",0.61,0.03,0.75,0.03,0.68,0.03
Телефоны,0.64,0.15,0.58,0.19,0.6,0.17
Предложение услуг,0.65,0.09,0.7,0.06,0.67,0.07
Бытовая техника,0.35,0.18,0.17,0.07,0.23,0.1
Автомобили,0.74,0.09,0.79,0.06,0.76,0.06
Товары для детей и игрушки,0.66,0.09,0.54,0.09,0.59,0.07


In [81]:
metrics_ec_nmf - metrics_ec_svd

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Мебель и интерьер,-0.03,-0.17,0.4,0.05,0.42,0.03
Ремонт и строительство,0.34,-0.13,0.23,0.06,0.28,0.02
Детская одежда и обувь,0.33,0.01,0.04,0.01,0.23,0.02
Квартиры,0.29,-0.06,0.44,-0.06,0.37,-0.05
"Одежда, обувь, аксессуары",0.23,0.0,0.1,-0.03,0.2,-0.01
Телефоны,-0.36,0.15,0.48,0.14,0.43,0.09
Предложение услуг,0.01,-0.13,0.62,0.03,0.52,0.02
Бытовая техника,0.02,-0.23,0.13,0.02,0.15,0.02
Автомобили,-0.01,-0.18,0.71,-0.02,0.62,-0.07
Товары для детей и игрушки,0.09,-0.08,0.45,0.06,0.44,0.02


In [82]:
# assuming you have 8 dataframes df1, df2, ..., df8
dataframes = {'metrics_rf_svd': metrics_rf_svd, 'metrics_rf_nmf': metrics_rf_nmf, 'metrics_sc_svd': metrics_sc_svd, 'metrics_sc_nmf': metrics_sc_nmf, 'metrics_kc_svd': metrics_kc_svd, 'metrics_kc_nmf': metrics_kc_nmf, 'metrics_ec_svd': metrics_ec_svd, 'metrics_ec_nmf': metrics_ec_nmf}

column = 'f1'

means = {name: df[column].mean() for name, df in dataframes.items()}

best_name = max(means, key=means.get)

best_dataframe = dataframes[best_name]

print(f'The best dataframe is: {best_name}')

The best dataframe is: metrics_ec_nmf


In [83]:
best_dataframe

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Мебель и интерьер,0.68,0.1,0.49,0.08,0.57,0.08
Ремонт и строительство,0.51,0.13,0.25,0.09,0.32,0.08
Детская одежда и обувь,0.69,0.04,0.72,0.05,0.7,0.05
Квартиры,0.92,0.04,0.96,0.04,0.94,0.03
"Одежда, обувь, аксессуары",0.61,0.03,0.75,0.03,0.68,0.03
Телефоны,0.64,0.15,0.58,0.19,0.6,0.17
Предложение услуг,0.65,0.09,0.7,0.06,0.67,0.07
Бытовая техника,0.35,0.18,0.17,0.07,0.23,0.1
Автомобили,0.74,0.09,0.79,0.06,0.76,0.06
Товары для детей и игрушки,0.66,0.09,0.54,0.09,0.59,0.07


### Задание № 2 (6 баллов)

В Gensim тоже можно добавить нграммы и tfidf. Постройте 1 модель без них (как в семинаре) и еще 3 модели (1 с нграммами, 1 с tfidf и 1 с нграммами и с tfidf). Сранивте качество с помощью метрик (перплексия, когерентность) и на глаз. Определите лучшую модель. Для каждой модели выберите 1 самую красивую на ваш взгляд тему.

Используйте данные википедии из семинара. Можете взять поменьше данных, если все обучается долго.

Важное требование - получившиеся модели не должны быть совсем плохими. Если хороших тем не получается, попробуйте настроить гиперпараметры, отфильтровать словарь по-другому.

In [86]:
import gensim

In [87]:
def normalize(text):
  normalized_text = [word.text.strip(punctuation) for word \
                     in razdel_tokenize(text)]
  normalized_text = [word.lower() for word in normalized_text if word and len(word) < 20]
  normalized_text = [morph.parse(word)[0].normal_form for word in normalized_text]
  return ' '.join(normalized_text)

In [89]:
texts = open('wiki_data.txt', encoding='UTF8').read().splitlines()[:5000]
texts = ([normalize(text) for text in texts])

dictinary = gensim.corpora.Dictionary((text.split() for text in texts))
corpus = [dictinary.doc2bow(text.split()) for text in texts]

# без

In [90]:
lda = gensim.models.LdaModel(corpus, 200, id2word=dictinary, passes=5)

# нграммы

In [92]:
texts = [text.split() for text in texts]
ph = gensim.models.Phrases(texts, scoring='npmi', threshold=0.4)
p = gensim.models.phrases.Phraser(ph)
ngrammed_texts = p[texts]
print([text for text in ngrammed_texts[:3]])
ngrammed_dictinary = gensim.corpora.Dictionary(ngrammed_texts)
ngrammed_corpus = [ngrammed_dictinary.doc2bow(text) for text in ngrammed_texts]

[['новостройка', 'нижегородский_область', 'новостро́йка', '—', 'сельский', 'посёлок', 'в', 'дивеевский_район', 'нижегородский_область', 'входить', 'в', 'состав_сатисский', 'сельсовет', 'посёлок', 'расположить', 'в', '12,5', 'км', 'к_юг', 'от', 'село_дивеево', 'и', '1_км', 'к_запад', 'от', 'город', 'саров', 'на', 'право_берег', 'река', 'вичкинза', 'правый_приток', 'река', 'сатис', 'окружить', 'смешанный_лес', 'соединить', 'асфальтовый', 'дорога', 'с', 'посёлок', 'цыгановка', '1,5_км', 'и', 'грунтовый', 'просёлочный_дорога', 'с', 'посёлок_сатис', '3,5_км', 'название', 'новостройка', 'являться', 'сугубо', 'официальный', 'местный_население', 'использовать', 'исключительно', 'альтернативный', 'название', '—', 'хитрый', 'употребляться_языковой', 'оборот', '«', '…', 'на', 'хитрый', '»', 'ранее', 'использовать', 'название', '—', 'песчаный', 'известковый', 'основать', 'в', '1920-й', 'год', 'переселенец', 'из', 'соседний', 'село_аламасовый', 'и', 'нарышкино', 'расположить', 'соответственно', 'в'

In [93]:
lda_ngrams = gensim.models.LdaModel(ngrammed_corpus, 200, id2word=ngrammed_dictinary, passes=5)

# tf-idf

In [94]:
tfidf = gensim.models.TfidfModel(corpus, id2word=dictinary)
tfidf_corpus = tfidf[corpus]

In [95]:
lda_tfidf = gensim.models.LdaModel(tfidf_corpus, 200, id2word=dictinary, passes=5)

# tf-idf + нграммы

In [96]:
tfidf_ngrams = gensim.models.TfidfModel(ngrammed_corpus, id2word=ngrammed_dictinary)
tfidf_ngrams_corpus = tfidf_ngrams[ngrammed_corpus]

In [97]:
lda_ngrams_tfidf = gensim.models.LdaModel(tfidf_ngrams_corpus, 200, id2word=ngrammed_dictinary, passes=5)

# Проверяем

In [98]:
# calculate perplexity for each model
perplexity = {
    'lda': lda.log_perplexity(corpus),
    'lda_ngrams': lda_ngrams.log_perplexity(ngrammed_corpus),
    'lda_tfidf': lda_tfidf.log_perplexity(tfidf_corpus),
    'lda_ngrams_tfidf': lda_ngrams_tfidf.log_perplexity(tfidf_ngrams_corpus)
}

# print perplexity for each model
print('Perplexity:')
for model, score in perplexity.items():
    print(f'{model}: {score}')

# calculate coherence for each model
coherence = {
    'lda': gensim.models.CoherenceModel(lda, texts=texts, dictionary=dictinary, coherence='c_v').get_coherence(),
    'lda_ngrams': gensim.models.CoherenceModel(lda_ngrams, texts=ngrammed_texts, dictionary=ngrammed_dictinary, coherence='c_v').get_coherence(),
    'lda_tfidf': gensim.models.CoherenceModel(lda_tfidf, texts=texts, dictionary=dictinary, coherence='c_v').get_coherence(),
    'lda_ngrams_tfidf': gensim.models.CoherenceModel(lda_ngrams_tfidf, texts=ngrammed_texts, dictionary=ngrammed_dictinary, coherence='c_v').get_coherence()
}

# print coherence for each model
print('Coherence:')
for model, score in coherence.items():
    print(f'{model}: {score}')

Perplexity:
lda: -24.35600400761721
lda_ngrams: -27.376026929100693
lda_tfidf: -254.40484788375886
lda_ngrams_tfidf: -243.37492891486426
Coherence:
lda: 0.4074451002876195
lda_ngrams: 0.429019020431781
lda_tfidf: 0.33363131891992537
lda_ngrams_tfidf: 0.36268328668204786


In [100]:
# find the model with the lowest perplexity
best_perplexity_model = min(perplexity, key=perplexity.get)

# find the model with the highest coherence
best_coherence_model = max(coherence, key=coherence.get)

# print the best models
print(f'The model with the lowest perplexity is: {best_perplexity_model}')
print(f'The model with the highest coherence is: {best_coherence_model}')

The model with the lowest perplexity is: lda_tfidf
The model with the highest coherence is: lda_ngrams


# Топики

In [99]:
# number of topics to show
num_topics = 1

# show topics for each model
topics = {
    'lda': lda.show_topics(num_topics=num_topics),
    'lda_ngrams': lda_ngrams.show_topics(num_topics=num_topics),
    'lda_tfidf': lda_tfidf.show_topics(num_topics=num_topics),
    'lda_ngrams_tfidf': lda_ngrams_tfidf.show_topics(num_topics=num_topics)
}

# print topics for each model
for model, topic_list in topics.items():
    print(f'{model}:')
    for topic in topic_list:
        print(f'  {topic}')

lda:
  (125, '0.113*"китай" + 0.077*"китайский" + 0.045*"папа" + 0.037*"провинция" + 0.033*"римский" + 0.027*"чжан" + 0.026*"павел" + 0.020*"католический" + 0.019*"мученик" + 0.019*"вместе"')
lda_ngrams:
  (182, '0.053*"генри" + 0.049*"смит" + 0.035*"челленджер" + 0.022*"третий_раунд" + 0.019*"ovw" + 0.017*"нидерланды" + 0.011*"royal" + 0.011*"шотландия" + 0.011*"джон" + 0.011*"в"')
lda_tfidf:
  (149, '0.000*"нухрат" + 0.000*"нух\xadрат" + 0.000*"паскино" + 0.000*"отяк" + 0.000*"опыт\xadный" + 0.000*"опоя\xadсали" + 0.000*"ом\xadсиный" + 0.000*"нухратец" + 0.000*"нухрата" + 0.000*"пермец"')
lda_ngrams_tfidf:
  (186, '0.000*"503" + 0.000*"пономаренко" + 0.000*"гдол" + 0.000*"1/60" + 0.000*"1/40" + 0.000*"стара-болеслав" + 0.000*"коменский" + 0.000*"брандис-над-лабить" + 0.000*"богемский" + 0.000*"altbunzlau"')
