# Домашнее задание  № 5. Матричные разложения/Тематическое моделирование

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [76]:
import gensim
import pandas as pd
import numpy as np
from pymorphy3 import MorphAnalyzer
from collections import Counter
from string import punctuation
from razdel import tokenize as razdel_tokenize
from IPython.display import Image
from IPython.core.display import HTML 
from sklearn.decomposition import TruncatedSVD, NMF, PCA, LatentDirichletAllocation
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, StratifiedKFold
import pyLDAvis.gensim_models


In [52]:
from sklearn.linear_model import SGDClassifier

In [66]:
from sklearn.neighbors import KNeighborsClassifier

In [3]:
morph = MorphAnalyzer()

### Задание № 1 (8 баллов)

Попробуйте матричные разложения с 4 классификаторами - SGDClassifier, KNeighborsClassifier,  RandomForest, ExtraTreesClassifier (про него подробнее почитайте в документации, он похож на RF). Используйте и NMF, SVD и LDA. Сравните результаты на кросс-валидации и выберите лучшее сочетание.

В итоге у вас должно получиться, как минимум 12 моделей (три разложения на каждый классификатор). Используйте 1 и те же параметры кросс-валидации. Параметры векторизации, параметры K в матричных разложениях, параметры классификаторов могут быть разными между экспериментами.

Можете взять поменьше данных, если все будет обучаться слишком долго (не ставьте параметр K слишком большим в NMF и LDA, иначе точно будет слишком долго)

In [4]:
# добавим лемматизацию
def normalize(text):
    normalized_text = [word.text.strip(punctuation) for word \
                                                            in razdel_tokenize(text)]
    normalized_text = [word.lower() for word in normalized_text if word and len(word) < 20 ]
    normalized_text = [morph.parse(word)[0].normal_form for word in normalized_text]
    return ' '.join(normalized_text)

In [5]:
def eval_table(X, y, pipeline, N=6):
    # зафиксируем порядок классов
    labels = list(set(y))
    
    # метрики отдельных фолдов будет хранить в табличке
    fold_metrics = pd.DataFrame(index=labels)
    # дополнительно также соберем таблицу ошибок
    errors = np.zeros((len(labels), len(labels)))
    
    # создаем стратегию кросс-валидации
    # shuffle=True (перемешивание) - часто критично важно указать
    # т.к. данные могут быть упорядочены и модель на этом обучится
    kfold = StratifiedKFold(n_splits=N, shuffle=True, )
    
    for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
        # fit-predict как и раньше, но сразу пайплайном
        pipeline.fit(X[train_index], y[train_index])
        preds = pipeline.predict(X[test_index])
        
        # записываем метрику и индекс фолда
        fold_metrics[f'precision_{i}'] = precision_score(y[test_index], preds, labels=labels, average=None)
        fold_metrics[f'recall_{i}'] = recall_score(y[test_index], preds, labels=labels, average=None)
        fold_metrics[f'f1_{i}'] = f1_score(y[test_index], preds, labels=labels, average=None)
        errors += confusion_matrix(y[test_index], preds, labels=labels, normalize='true')
    
    # таблица для усредненных значений
    # тут мы берем колонки со значениями и усредняем их
    # часто также все метрики сразу суммируют и в конце просто делят на количество фолдов
    # но мы тут помимо среднего также хотим посмотреть на стандартное отклонение
    # чтобы понять как сильно варьируются оценки моделей
    result = pd.DataFrame(index=labels)
    result['precision'] = fold_metrics[[f'precision_{i}' for i in range(N)]].mean(axis=1).round(2)
    result['precision_std'] = fold_metrics[[f'precision_{i}' for i in range(N)]].std(axis=1).round(2)
    
    result['recall'] = fold_metrics[[f'recall_{i}' for i in range(N)]].mean(axis=1).round(2)
    result['recall_std'] = fold_metrics[[f'recall_{i}' for i in range(N)]].std(axis=1).round(2)
    
    result['f1'] = fold_metrics[[f'f1_{i}' for i in range(N)]].mean(axis=1).round(2)
    result['f1_std'] = fold_metrics[[f'f1_{i}' for i in range(N)]].std(axis=1).round(2)
    
    # добавим одну колонку со средним по всем классам
    result.loc['mean'] = result.mean().round(2)
    # проценты ошибок просто усредняем
    errors /= N
    
    return result, errors

In [47]:
data = pd.read_csv('avito_category_classification.csv')

In [48]:
data['description_norm'] = data['description'].apply(normalize)

In [8]:
data

Unnamed: 0,category_name,description,description_norm
0,Автомобили,"отличное состояние,обслужиание в салоне",отличный состояние обслужиание в салон
1,Детская одежда и обувь,В отличном состоянии. Фирма KIKO. Очень теплый...,в отличный состояние фирма kiko очень тёплый у...
2,Предложение услуг,"Изготовление ограждений, перил,качелей, турник...",изготовление ограждение перила качели турников...
3,Автомобили,Автомобиль в отличном техническом состоянии. О...,автомобиль в отличный технический состояние од...
4,Бытовая техника,"Продается газовая плита ""Гефест"" (Белоруссия) ...",продаваться газовый плита гефест белоруссия б ...
...,...,...,...
9893,Товары для детей и игрушки,Чтобы посмотреть весь ассортимент нашего магаз...,чтобы посмотреть весь ассортимент наш магазин ...
9894,Детская одежда и обувь,"Весна,осень.74-80.вопросы можно в вайбер,двухс...",весна осень 74-80 вопрос можно в вайбер двухст...
9895,"Одежда, обувь, аксессуары","Кимоно Green Hill. Состояние отличное, рост ...",кимоно green hill состояние отличный рост 160-...
9896,Детская одежда и обувь,Б/у кроссовки на девочку. Носили только в спор...,б у кроссовок на девочка носить только в спортзал


_________________________________________________________________________________________________________________________________________________

## SVD + RandomForest

In [11]:
pipeline_svd_rf = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), ngram_range=(1,2), min_df=5, max_df=0.4)),
    ('svd', TruncatedSVD(250)),
    ('clf', RandomForestClassifier(n_estimators=100, max_depth=10))
])

In [14]:
metrics_svd_rf, errors_svd_rf = eval_table(data['description_norm'], data['category_name'], pipeline_svd_rf)

In [15]:
metrics_svd_rf

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Мебель и интерьер,0.82,0.15,0.05,0.02,0.09,0.04
Квартиры,0.9,0.01,0.89,0.03,0.89,0.02
Предложение услуг,0.68,0.03,0.59,0.06,0.63,0.04
"Одежда, обувь, аксессуары",0.47,0.02,0.75,0.03,0.58,0.02
Телефоны,0.92,0.02,0.39,0.06,0.55,0.06
Детская одежда и обувь,0.47,0.01,0.68,0.02,0.56,0.01
Товары для детей и игрушки,0.8,0.07,0.22,0.04,0.34,0.05
Автомобили,0.83,0.06,0.63,0.05,0.72,0.04
Ремонт и строительство,0.73,0.09,0.12,0.03,0.2,0.04
Бытовая техника,0.86,0.16,0.06,0.02,0.12,0.04


## NMF + RandomForest

In [16]:
pipeline_nmf_rf = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), min_df=3, max_df=0.3)),
    ('tfidf', TfidfTransformer()),
    ('decomposition', NMF(100)),
    ('clf', RandomForestClassifier(n_estimators=200, max_depth=6))
])

In [18]:
metrics_nmf_rf, errors_nmf_rf = eval_table(data['description_norm'], data['category_name'], pipeline_nmf_rf)

In [90]:
metrics_nmf_rf

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Мебель и интерьер,0.81,0.11,0.12,0.08,0.19,0.11
Квартиры,0.84,0.04,0.96,0.02,0.9,0.03
Предложение услуг,0.7,0.04,0.47,0.07,0.55,0.04
"Одежда, обувь, аксессуары",0.46,0.05,0.81,0.06,0.59,0.02
Телефоны,0.94,0.06,0.35,0.08,0.5,0.08
Детская одежда и обувь,0.61,0.08,0.7,0.06,0.65,0.03
Товары для детей и игрушки,0.85,0.04,0.38,0.07,0.52,0.07
Автомобили,0.88,0.03,0.77,0.02,0.82,0.02
Ремонт и строительство,0.75,0.18,0.06,0.03,0.11,0.05
Бытовая техника,0.67,0.52,0.03,0.03,0.06,0.06


## LDA + RandomForest

In [49]:
pipeline_lda_rf = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), min_df=3, max_df=0.3)),
    ('tfidf', TfidfTransformer()),
    ('decomposition', LatentDirichletAllocation(50)),
    ('clf', RandomForestClassifier(n_estimators=200, max_depth=6))
])

In [50]:
metrics_lda_rf, errors_lda_rf = eval_table(data['description_norm'], data['category_name'], pipeline_lda_rf)

In [51]:
metrics_lda_rf

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Мебель и интерьер,0.1,0.24,0.01,0.01,0.01,0.02
Квартиры,0.76,0.06,0.92,0.03,0.83,0.04
Предложение услуг,0.51,0.05,0.24,0.11,0.31,0.11
"Одежда, обувь, аксессуары",0.39,0.02,0.63,0.07,0.48,0.03
Телефоны,0.7,0.08,0.49,0.11,0.57,0.08
Детская одежда и обувь,0.49,0.09,0.6,0.04,0.54,0.06
Товары для детей и игрушки,0.6,0.15,0.27,0.08,0.36,0.08
Автомобили,0.66,0.12,0.67,0.12,0.66,0.1
Ремонт и строительство,0.46,0.2,0.06,0.04,0.1,0.07
Бытовая техника,0.0,0.0,0.0,0.0,0.0,0.0


_________________________________________________________________________________________________________________________________________________

## SVD + SGDClassifier

In [55]:
pipeline_svd_sgd = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), ngram_range=(1,2), min_df=5, max_df=0.4)),
    ('svd', TruncatedSVD(250)),
    ('clf', SGDClassifier(tol=1e-3))
])

In [56]:
metrics_svd_sgd, errors_svd_sgd = eval_table(data['description_norm'], data['category_name'], pipeline_svd_sgd)

In [57]:
metrics_svd_sgd

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Мебель и интерьер,0.64,0.06,0.61,0.07,0.62,0.03
Квартиры,0.95,0.02,0.96,0.02,0.95,0.01
Предложение услуг,0.75,0.06,0.75,0.04,0.75,0.04
"Одежда, обувь, аксессуары",0.68,0.06,0.75,0.07,0.71,0.02
Телефоны,0.82,0.09,0.79,0.06,0.8,0.02
Детская одежда и обувь,0.72,0.06,0.75,0.07,0.73,0.02
Товары для детей и игрушки,0.77,0.07,0.6,0.05,0.67,0.03
Автомобили,0.87,0.03,0.88,0.05,0.88,0.01
Ремонт и строительство,0.58,0.05,0.47,0.09,0.51,0.07
Бытовая техника,0.68,0.07,0.47,0.04,0.55,0.02


## NMF + SGDClassifier

In [60]:
pipeline_nmf_sgd = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), min_df=3, max_df=0.3)),
    ('tfidf', TfidfTransformer()),
    ('decomposition', NMF(100)),
    ('clf', SGDClassifier(tol=1e-3))
])

In [61]:
metrics_nmf_sgd, errors_nmf_sgd = eval_table(data['description_norm'], data['category_name'], pipeline_nmf_sgd)

In [62]:
metrics_nmf_sgd

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Мебель и интерьер,0.66,0.18,0.34,0.07,0.43,0.02
Квартиры,0.77,0.17,0.94,0.05,0.83,0.1
Предложение услуг,0.72,0.15,0.34,0.16,0.42,0.14
"Одежда, обувь, аксессуары",0.61,0.09,0.71,0.11,0.64,0.03
Телефоны,0.69,0.2,0.53,0.21,0.54,0.08
Детская одежда и обувь,0.62,0.1,0.66,0.14,0.62,0.05
Товары для детей и игрушки,0.7,0.22,0.39,0.12,0.46,0.08
Автомобили,0.62,0.27,0.72,0.22,0.59,0.12
Ремонт и строительство,0.15,0.17,0.15,0.2,0.14,0.16
Бытовая техника,0.63,0.23,0.09,0.07,0.14,0.1


## LDA + SGDClassifier

In [63]:
pipeline_lda_sgd = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), min_df=3, max_df=0.3)),
    ('tfidf', TfidfTransformer()),
    ('decomposition', LatentDirichletAllocation(50)),
    ('clf', SGDClassifier(tol=1e-3))
])

In [64]:
metrics_lda_sgd, errors_lda_sgd = eval_table(data['description_norm'], data['category_name'], pipeline_lda_sgd)

In [65]:
metrics_lda_sgd

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Мебель и интерьер,0.48,0.14,0.2,0.1,0.26,0.09
Квартиры,0.68,0.11,0.96,0.01,0.79,0.07
Предложение услуг,0.47,0.14,0.45,0.09,0.45,0.08
"Одежда, обувь, аксессуары",0.53,0.05,0.61,0.07,0.57,0.04
Телефоны,0.55,0.13,0.63,0.07,0.58,0.1
Детская одежда и обувь,0.58,0.04,0.56,0.08,0.57,0.05
Товары для детей и игрушки,0.5,0.12,0.37,0.2,0.38,0.13
Автомобили,0.63,0.12,0.7,0.29,0.61,0.21
Ремонт и строительство,0.16,0.14,0.09,0.14,0.1,0.14
Бытовая техника,0.06,0.08,0.04,0.08,0.04,0.07


_________________________________________________________________________________________________________________________________________________

## SVD + KNeighborsClassifier

In [67]:
pipeline_svd_kn = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), ngram_range=(1,2), min_df=5, max_df=0.4)),
    ('svd', TruncatedSVD(250)),
    ('clf', KNeighborsClassifier(n_neighbors=10, metric='cosine'))
])

In [68]:
metrics_svd_kn, errors_svd_kn = eval_table(data['description_norm'], data['category_name'], pipeline_svd_kn)

In [69]:
metrics_svd_kn

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Мебель и интерьер,0.43,0.07,0.27,0.05,0.33,0.06
Квартиры,0.92,0.03,0.88,0.02,0.9,0.01
Предложение услуг,0.59,0.05,0.68,0.04,0.63,0.02
"Одежда, обувь, аксессуары",0.5,0.01,0.64,0.02,0.56,0.01
Телефоны,0.8,0.02,0.43,0.03,0.56,0.02
Детская одежда и обувь,0.52,0.01,0.66,0.03,0.58,0.02
Товары для детей и игрушки,0.66,0.08,0.32,0.04,0.43,0.05
Автомобили,0.7,0.04,0.71,0.04,0.7,0.03
Ремонт и строительство,0.43,0.08,0.18,0.05,0.25,0.06
Бытовая техника,0.42,0.04,0.26,0.09,0.32,0.07


## NMF + KNeighborsClassifier

In [70]:
pipeline_nmf_kn = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), min_df=3, max_df=0.3)),
    ('tfidf', TfidfTransformer()),
    ('decomposition', NMF(100)),
    ('clf', KNeighborsClassifier(n_neighbors=10, metric='cosine'))
])

In [71]:
metrics_nmf_kn, errors_nmf_kn = eval_table(data['description_norm'], data['category_name'], pipeline_nmf_kn)

In [72]:
metrics_nmf_kn

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Мебель и интерьер,0.4,0.03,0.29,0.07,0.33,0.05
Квартиры,0.84,0.04,0.8,0.08,0.82,0.05
Предложение услуг,0.57,0.04,0.59,0.05,0.58,0.04
"Одежда, обувь, аксессуары",0.51,0.04,0.59,0.03,0.55,0.03
Телефоны,0.61,0.08,0.46,0.07,0.52,0.07
Детская одежда и обувь,0.54,0.03,0.6,0.05,0.57,0.03
Товары для детей и игрушки,0.51,0.09,0.33,0.07,0.4,0.07
Автомобили,0.58,0.05,0.7,0.07,0.63,0.04
Ремонт и строительство,0.33,0.05,0.21,0.03,0.25,0.03
Бытовая техника,0.23,0.08,0.22,0.07,0.22,0.07


## LDA + KNeighborsClassifier

In [73]:
pipeline_lda_kn = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), min_df=3, max_df=0.3)),
    ('tfidf', TfidfTransformer()),
    ('decomposition', LatentDirichletAllocation(50)),
    ('clf', KNeighborsClassifier(n_neighbors=10, metric='cosine'))
])

In [74]:
metrics_lda_kn, errors_lda_kn = eval_table(data['description_norm'], data['category_name'], pipeline_lda_kn)

In [75]:
metrics_lda_kn

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Мебель и интерьер,0.35,0.08,0.25,0.16,0.28,0.13
Квартиры,0.74,0.06,0.96,0.02,0.83,0.04
Предложение услуг,0.45,0.08,0.4,0.13,0.42,0.08
"Одежда, обувь, аксессуары",0.49,0.04,0.48,0.06,0.48,0.04
Телефоны,0.6,0.08,0.54,0.09,0.57,0.06
Детская одежда и обувь,0.51,0.04,0.61,0.04,0.56,0.04
Товары для детей и игрушки,0.58,0.05,0.4,0.07,0.47,0.06
Автомобили,0.61,0.12,0.78,0.05,0.68,0.09
Ремонт и строительство,0.28,0.05,0.15,0.06,0.19,0.05
Бытовая техника,0.25,0.11,0.15,0.06,0.19,0.07


_________________________________________________________________________________________________________________________________________________

## SVD + ExtraTreesClassifier

In [77]:
pipeline_svd_et = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), ngram_range=(1,2), min_df=5, max_df=0.4)),
    ('svd', TruncatedSVD(250)),
    ('clf', ExtraTreesClassifier(n_estimators=100, random_state=0))
])

In [78]:
metrics_svd_et, errors_svd_et = eval_table(data['description_norm'], data['category_name'], pipeline_svd_et)

In [79]:
metrics_svd_et

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Мебель и интерьер,0.69,0.15,0.14,0.04,0.23,0.07
Квартиры,0.78,0.02,0.86,0.02,0.82,0.02
Предложение услуг,0.79,0.05,0.4,0.04,0.53,0.05
"Одежда, обувь, аксессуары",0.48,0.01,0.71,0.03,0.57,0.02
Телефоны,0.85,0.06,0.38,0.04,0.53,0.04
Детская одежда и обувь,0.46,0.01,0.71,0.02,0.56,0.01
Товары для детей и игрушки,0.66,0.07,0.23,0.02,0.34,0.03
Автомобили,0.83,0.03,0.59,0.07,0.69,0.05
Ремонт и строительство,0.56,0.11,0.11,0.03,0.19,0.05
Бытовая техника,0.54,0.13,0.15,0.07,0.23,0.1


## NMF + ExtraTreesClassifier

In [80]:
pipeline_nmf_et = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), min_df=3, max_df=0.3)),
    ('tfidf', TfidfTransformer()),
    ('decomposition', NMF(100)),
    ('clf', ExtraTreesClassifier(n_estimators=100, random_state=0))
])

In [81]:
metrics_nmf_et, errors_nmf_et = eval_table(data['description_norm'], data['category_name'], pipeline_nmf_et)

In [82]:
metrics_nmf_et

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Мебель и интерьер,0.72,0.04,0.57,0.03,0.64,0.03
Квартиры,0.92,0.03,0.96,0.01,0.94,0.01
Предложение услуг,0.7,0.03,0.74,0.03,0.72,0.02
"Одежда, обувь, аксессуары",0.69,0.01,0.79,0.01,0.74,0.01
Телефоны,0.78,0.05,0.78,0.03,0.78,0.04
Детская одежда и обувь,0.73,0.01,0.77,0.02,0.75,0.01
Товары для детей и игрушки,0.75,0.03,0.6,0.05,0.67,0.03
Автомобили,0.86,0.02,0.9,0.04,0.88,0.02
Ремонт и строительство,0.61,0.07,0.46,0.04,0.52,0.05
Бытовая техника,0.68,0.1,0.32,0.02,0.43,0.04


## LDA + ExtraTreesClassifier

In [84]:
pipeline_lda_et = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), min_df=3, max_df=0.3)),
    ('tfidf', TfidfTransformer()),
    ('decomposition', LatentDirichletAllocation(50)),
    ('clf', ExtraTreesClassifier(n_estimators=100, random_state=0))
])

In [85]:
metrics_lda_et, errors_lda_et = eval_table(data['description_norm'], data['category_name'], pipeline_lda_et)

In [86]:
metrics_lda_et

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Мебель и интерьер,0.44,0.08,0.29,0.07,0.34,0.08
Квартиры,0.79,0.04,0.92,0.03,0.85,0.03
Предложение услуг,0.48,0.06,0.51,0.09,0.49,0.05
"Одежда, обувь, аксессуары",0.52,0.01,0.54,0.02,0.53,0.01
Телефоны,0.58,0.06,0.54,0.09,0.56,0.07
Детская одежда и обувь,0.53,0.01,0.6,0.03,0.56,0.02
Товары для детей и игрушки,0.48,0.07,0.42,0.08,0.45,0.07
Автомобили,0.65,0.08,0.78,0.04,0.71,0.05
Ремонт и строительство,0.39,0.12,0.22,0.08,0.27,0.07
Бытовая техника,0.31,0.08,0.13,0.05,0.18,0.06


## Оценка
Оценим результаты по F1 (средней)

In [96]:
import pandas as pd

In [97]:
df = pd.DataFrame(index=['SVD', 'NMF', 'LDA'], columns=['RF', 'SGD', 'KN', 'ExT'])

In [107]:
df.loc['SVD'] = pd.Series({'RF':metrics_svd_rf['f1'][10], 'SGD':metrics_svd_sgd['f1'][10], 'KN':metrics_svd_kn['f1'][10], 'ExT':metrics_svd_et['f1'][10]})
df.loc['NMF'] = pd.Series({'RF':metrics_nmf_rf['f1'][10], 'SGD':metrics_nmf_sgd['f1'][10], 'KN':metrics_nmf_kn['f1'][10], 'ExT':metrics_nmf_et['f1'][10]})
df.loc['LDA'] = pd.Series({'RF':metrics_lda_rf['f1'][10], 'SGD':metrics_lda_sgd['f1'][10], 'KN':metrics_lda_kn['f1'][10], 'ExT':metrics_lda_et['f1'][10]})


In [122]:
def color(val):
    color = 'blue' if val > 0.7 else 'white'
    return 'color: % s' % color

In [123]:
df.style.applymap(color).format(precision=2, decimal=".")

Unnamed: 0,RF,SGD,KN,ExT
SVD,0.47,0.72,0.53,0.47
NMF,0.49,0.48,0.49,0.71
LDA,0.39,0.43,0.47,0.49


### Задание № 2 (2 балла)

С помощью gensim постройте 5 тематических моделей с разными параметрами. Как минимум попробуйте: разное количество тем, alpha и eta параметры (в альфа попробуйте "asymmetric" и "auto" помимо 'symmetric' по дефолту, в eta просто попробуйте разные значения), а также разные параметры фильтрации словаря. 

Оцените каждую из моделей с помощью метрик (перплексии и когерентности), а также визуально, просмотрев получаемые темы. Найдите самую хорошую тему для каждой из моделей.

In [125]:
texts = open('wiki_data.txt', encoding='utf-8').read().splitlines()[:5000]
texts = ([normalize(text) for text in texts])

In [126]:
dictinary = gensim.corpora.Dictionary((text.split() for text in texts))

In [127]:
dictinary.filter_extremes(no_above=0.1, no_below=10)
dictinary.compactify()

In [128]:
print(dictinary)

Dictionary<8101 unique tokens: ['1,2', '1,5', '12', '14', '16']...>


In [129]:
corpus = [dictinary.doc2bow(text.split()) for text in texts]

## 1 Модель

In [135]:
lda1 = gensim.models.LdaMulticore(corpus, 
                                 50, # колиество тем
                                 alpha='asymmetric',
                                 id2word=dictinary,
                                 eta=10,
                                 passes=10)

In [134]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(lda1, corpus, dictinary)

In [136]:
np.exp2(-lda1.log_perplexity(corpus[:1000]))

472.1753831202855

In [137]:
topics = []
for topic_id, topic in lda1.show_topics(num_topics=100, formatted=False):
    topic = [word for word, _ in topic]
    topics.append(topic)

coherence_model_lda1 = gensim.models.CoherenceModel(topics=topics, 
                                                   texts=[text.split() for text in texts], 
                                                   dictionary=dictinary, coherence='c_v')

coherence_model_lda1.get_coherence()

0.3108048824214959

## 2 Модель

In [141]:
lda2 = gensim.models.LdaMulticore(corpus, 
                                 20, # колиество тем
                                 alpha='asymmetric',
                                 id2word=dictinary,
                                 eta=50,
                                 passes=24)

In [142]:
pyLDAvis.gensim_models.prepare(lda2, corpus, dictinary)

In [143]:
np.exp2(-lda2.log_perplexity(corpus[:1000]))

563.8865034369379

In [144]:
topics = []
for topic_id, topic in lda2.show_topics(num_topics=100, formatted=False):
    topic = [word for word, _ in topic]
    topics.append(topic)

coherence_model_lda2 = gensim.models.CoherenceModel(topics=topics, 
                                                   texts=[text.split() for text in texts], 
                                                   dictionary=dictinary, coherence='c_v')

coherence_model_lda2.get_coherence()

0.33357512758842145

## 3 Модель

In [151]:
dictinary3 = gensim.corpora.Dictionary((text.split() for text in texts))
dictinary3.filter_extremes(no_above=0.3, no_below=5)

In [156]:
lda3 = gensim.models.LdaMulticore(corpus, 
                                 30, # колиество тем
                                 alpha='symmetric',
                                 id2word=dictinary3,
                                 eta=1,
                                 passes=5)

In [158]:
pyLDAvis.gensim_models.prepare(lda3, corpus, dictinary3)

In [159]:
np.exp2(-lda3.log_perplexity(corpus[:1000]))

752.6724200257557

In [160]:
topics = []
for topic_id, topic in lda3.show_topics(num_topics=100, formatted=False):
    topic = [word for word, _ in topic]
    topics.append(topic)

coherence_model_lda3 = gensim.models.CoherenceModel(topics=topics, 
                                                   texts=[text.split() for text in texts], 
                                                   dictionary=dictinary, coherence='c_v')

coherence_model_lda3.get_coherence()

0.37559361940053465

## 4 Модель

In [162]:
lda4 = gensim.models.LdaMulticore(corpus, 
                                 17, # колиество тем
                                 alpha='asymmetric',
                                 id2word=dictinary,
                                 eta=0.2,
                                 passes=17)

In [163]:
pyLDAvis.gensim_models.prepare(lda4, corpus, dictinary)

In [164]:
np.exp2(-lda4.log_perplexity(corpus[:1000]))

406.211005718709

In [165]:
topics = []
for topic_id, topic in lda3.show_topics(num_topics=100, formatted=False):
    topic = [word for word, _ in topic]
    topics.append(topic)

coherence_model_lda4 = gensim.models.CoherenceModel(topics=topics, 
                                                   texts=[text.split() for text in texts], 
                                                   dictionary=dictinary, coherence='c_v')

coherence_model_lda4.get_coherence()

0.37559361940053465

## 5 Модель

In [166]:
lda5 = gensim.models.LdaMulticore(corpus, 
                                 30, # колиество тем
                                 alpha='asymmetric',
                                 id2word=dictinary,
                                 eta=333,
                                 passes=3)

In [167]:
pyLDAvis.gensim_models.prepare(lda5, corpus, dictinary)

In [168]:
np.exp2(-lda5.log_perplexity(corpus[:1000]))

537.9745571468823

In [169]:
topics = []
for topic_id, topic in lda3.show_topics(num_topics=100, formatted=False):
    topic = [word for word, _ in topic]
    topics.append(topic)

coherence_model_lda5 = gensim.models.CoherenceModel(topics=topics, 
                                                   texts=[text.split() for text in texts], 
                                                   dictionary=dictinary, coherence='c_v')

coherence_model_lda5.get_coherence()

0.37559361940053465