### SVD в scikit-learn

Поработаем над датасетом с новостями (20 классов по темам). Обработаем и векторизуем данные

In [None]:
import re
import string
import pandas as pd
import nltk

from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [None]:
# Fetch the NLTK stop-word lists that stopwords.words(...) reads.
nltk.download('stopwords')

In [None]:
def rmv_emails_websites(string):
    """Strip e-mail addresses, ``.co*`` / ``.ed*`` host names and digits.

    Args:
        string: Raw document text. The parameter name shadows the ``string``
            module inside this function only; it is kept so existing callers
            keep working.

    Returns:
        The text with every matched span replaced by an empty string.
    """
    # Any whitespace-delimited token containing "@" is treated as an e-mail.
    new_str = re.sub(r"\S+@\S+", '', string)
    # The dot is escaped so that only real ".co..." / ".ed..." suffixes
    # (".com", ".edu", ...) match. An unescaped "." matches any character and
    # also removed ordinary words such as "welcome" or "needed".
    new_str = re.sub(r"\S+\.co\S+", '', new_str)
    new_str = re.sub(r"\S+\.ed\S+", '', new_str)
    # Drop every run of digits.
    new_str = re.sub(r"[0-9]+", '', new_str)
    return new_str

# Tokens are runs of three or more word characters.
tokenizer = RegexpTokenizer(r'\b\w{3,}\b')
# Stop list: de-duplicated English stop words, then every character of
# string.punctuation, then two underscore-only tokens.
stop_words = [
    *set(stopwords.words("english")),
    *string.punctuation,
    '__',
    '___',
]

In [None]:
# Load the 'train' and 'test' subsets of 20 Newsgroups; with return_X_y=True
# each call is unpacked into a (texts, labels) pair.
X_train, y_train = fetch_20newsgroups(subset='train', return_X_y=True)
X_test, y_test = fetch_20newsgroups(subset='test', return_X_y=True)

In [None]:
# Clean every raw document of both subsets with rmv_emails_websites.
X_train = [rmv_emails_websites(document) for document in X_train]
X_test = [rmv_emails_websites(document) for document in X_test]

In [None]:
# TF-IDF over the cleaned training texts, tokenized with the NLTK regexp
# tokenizer. max_df / min_df are document-frequency fractions: terms that
# occur in more than 20% or in fewer than 2% of the documents are dropped.
tfidf = TfidfVectorizer(lowercase=True, 
                        stop_words=stop_words, 
                        tokenizer=tokenizer.tokenize, 
                        max_df=0.2,
                        min_df=0.02
                       )
tfidf_train_sparse = tfidf.fit_transform(X_train)
# Densify into a DataFrame with one column per vocabulary term.
tfidf_train_df = pd.DataFrame(tfidf_train_sparse.toarray(), 
                        columns=tfidf.get_feature_names_out())
tfidf_train_df.head()

Будем использовать SVD. Оставим 20 компонент по числу тем. Sigma — это сингулярные числа, то есть диагональ той самой матрицы весов $\Sigma$, а V_T — это матрица, связывающая темы со словами словаря (связь документов с темами даёт результат `fit_transform`):

![Image](svd.png)

In [None]:
from sklearn.decomposition import TruncatedSVD

# Truncated SVD (LSA) of the document-term TF-IDF matrix, keeping 20 components.
lsa_obj = TruncatedSVD(n_components=20, n_iter=100, random_state=42)
# Training documents projected onto the 20 components.
tfidf_lsa_data = lsa_obj.fit_transform(tfidf_train_df)
# Singular values (the diagonal of Sigma), one per kept component.
Sigma = lsa_obj.singular_values_
# components_ already stores the right singular vectors as rows
# (n_components x n_features), i.e. V^T itself. Transposing it again would
# put V, not V^T, under the name V_T.
V_T = lsa_obj.components_

Визуализируем сингулярные числа (Sigma): первая компонента действительно самая важная, а остальные плавно убывают.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# One bar per retained component; bar height is its singular value.
sns.barplot(x=np.arange(len(Sigma)), y=Sigma)
plt.show()

In [None]:
# Compare logistic regression on the LSA projection against the full TF-IDF
# features. Both models are tuned over the same `tol` grid with 5-fold
# cross-validated accuracy.
logreg_param_grid = [{'tol': [0.0001, 0.0005, 0.001]}]
logreg_lsa = LogisticRegression()
logreg = LogisticRegression()

grid_lsa_log = GridSearchCV(logreg_lsa, logreg_param_grid,
                            scoring='accuracy', cv=5, n_jobs=-1)
grid_log = GridSearchCV(logreg, logreg_param_grid,
                        scoring='accuracy', cv=5, n_jobs=-1)

# Run each search, then keep the refitted best model.
grid_lsa_log.fit(tfidf_lsa_data, y_train)
best_lsa_logreg = grid_lsa_log.best_estimator_
grid_log.fit(tfidf_train_df, y_train)
best_reg_logreg = grid_log.best_estimator_

# Both scores below are measured on the training data itself.
print("Accuracy of Logistic Regression on LSA train data is :", best_lsa_logreg.score(tfidf_lsa_data, y_train))
print("Accuracy of Logistic Regression with standard train data is :", best_reg_logreg.score(tfidf_train_df, y_train))

### LSA в gensim

In [None]:
import pandas as pd

# Load the reviews; only the reviewer id and review text columns are read.
df = pd.read_csv('Musical_instruments_reviews.csv', usecols=['reviewerID', 'reviewText'])
df.head()

Обработаем данные: удалим стоп-слова, приведем к нижнему регистру, сделаем стемминг - для английского языка его будет достаточно

In [None]:
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation \
                                        , preprocess_string, strip_short, stem_text

def preprocess(text):
    """Clean one review and return the result of gensim's preprocess_string.

    The filters run in this order: lower-casing, stop-word removal,
    punctuation stripping, short-token stripping, stemming.
    """
    filters = (
        str.lower,
        remove_stopwords,
        strip_punctuation,
        strip_short,
        stem_text,
    )
    return preprocess_string(text, filters)

# Run preprocess over every review; values are cast to str beforehand.
df['Text (Clean)'] = df['reviewText'].astype(str).apply(preprocess)

In [None]:
# Preview the first rows of the frame.
df.head()

Создадим словарь и представим каждый документ в виде мешка слов: так мы получим ту самую исходную матрицу document-term для дальнейшего её разложения.

In [None]:
from gensim import corpora

# Build the token <-> id dictionary from the cleaned reviews.
corpus = df['Text (Clean)']
dictionary = corpora.Dictionary(corpus)

# Encode every review as a bag of words through that dictionary.
bow = list(map(dictionary.doc2bow, corpus))

[LSI](https://radimrehurek.com/gensim/models/lsimodel.html) в gensim - это реализация SVD. Но прежде чем раскладывать, нам нужно определиться с количеством тем. В отличие от датасета с новостями, тут мы не знаем его заранее и поэтому можем воспользоваться Coherence Model: она оценивает, насколько согласованы между собой слова внутри каждой темы. Переберём разное число тем и сравним полученные оценки.

In [None]:
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel

# Fit one LSI model per candidate topic count (2 through 10) and print its
# c_v coherence. After the loop `lsi` still refers to the last (10-topic) model.
for i in range(2,11):
    lsi = LsiModel(bow, num_topics=i, id2word=dictionary)
    coherence_model = CoherenceModel(model=lsi, texts=df['Text (Clean)'], dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    print('Coherence score with {} clusters: {}'.format(i, coherence_score))

Видимо, лучше всего 2 темы. 

In [None]:
# Fit the final LSI model with two topics.
lsi = LsiModel(bow, num_topics=2, id2word=dictionary)

Посмотрим на первые пять слов в каждой из получившихся тем.

In [None]:
# print_topics is unpacked as (topic number, description of its top words);
# num_words=5 limits the description to five words per topic.
for topic_num, words in lsi.print_topics(num_words=5):
    print('Words in {}: {}.'.format(topic_num, words))

Попробуем в тестовом режиме оценить тематику ревью.

In [None]:
# find the scores given between the review and each topic
corpus_lsi = lsi[bow]
score1 = []
score2 = []
for doc in corpus_lsi:
    # Each transformed document is a sparse list of (topic_id, score) pairs.
    # A topic with a (near-)zero score can be missing, and an empty review
    # yields an empty list, so scores are looked up by topic id rather than
    # by position; a missing topic counts as 0.0.
    topic_scores = dict(doc)
    score1.append(round(topic_scores.get(0, 0.0), 2))
    score2.append(round(topic_scores.get(1, 0.0), 2))

# create data frame that shows scores assigned for both topics for each review
df_topic = pd.DataFrame()
df_topic['Text'] = df['reviewText']
df_topic['Topic 0 score'] = score1
df_topic['Topic 1 score'] = score2
# Label each review with the position (0 or 1) of its larger score.
df_topic['Topic']= df_topic[['Topic 0 score', 'Topic 1 score']].apply(lambda x: x.argmax(), axis=1)
df_topic.head(1)

Выведем сэмплы для обеих тем

In [None]:
# Split the reviews by assigned topic and print one sample from each group;
# random_state=2 keeps the sampled rows reproducible.
df_topic0 = df_topic[df_topic['Topic'] == 0]
df_topic1 = df_topic[df_topic['Topic']==1]
print('Sample text from topic 0:\n {}'.format(df_topic0.sample(1, random_state=2)['Text'].values))
print('\nSample text from topic 1:\n {}'.format(df_topic1.sample(1, random_state=2)['Text'].values))

### LDA в sklearn

In [None]:
# Read the 'text' column of google.csv into a plain Python list of documents.
corpus = list(pd.read_csv('google.csv')['text'])

In [None]:
# Peek at the first document.
corpus[0]

In [None]:
!pip install pymorphy2

In [None]:
import re
import pymorphy2
from razdel import tokenize
from string import punctuation
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation as LDA

# Shared morphological analyzer; created once and reused for every token.
morph = pymorphy2.MorphAnalyzer()

In [None]:
# Remove shortened t.co links from every document, updating the list in place.
corpus[:] = [re.sub(r'https://t\.co/\w+\b', '', text) for text in corpus]

In [None]:
def lemmatize(string):
    """Tokenize Russian text and return the normal form of each kept word.

    Only tokens made entirely of Cyrillic letters, optionally joined by
    single hyphens, are kept; every other token is discarded.

    Args:
        string: Text to tokenize. The parameter name shadows the ``string``
            module inside this function only; it is kept so existing callers
            keep working.

    Returns:
        A list with the ``normal_form`` of the first pymorphy2 parse of each
        kept token, in text order.
    """
    # "ё" lies outside the "а-я" code-point range, so it is listed explicitly;
    # otherwise any word containing it would be silently dropped.
    tokenized = [t.text for t in tokenize(string) if re.fullmatch(r'(?i)[а-яё]+(-[а-яё]+)*', t.text)]
    return [morph.parse(token)[0].normal_form for token in tokenized]

In [None]:
# Bag-of-words counts over the corpus, tokenized by lemmatize() and filtered
# with NLTK's Russian stop-word list.
count_vect = CountVectorizer(tokenizer=lemmatize, stop_words=stopwords.words('russian'), lowercase=True)
x_counts = count_vect.fit_transform(corpus)
# Dense view for inspection only.
# NOTE(review): this materializes the full documents x vocabulary matrix and
# may be heavy for a large corpus.
x_counts.todense()

In [None]:
# Vocabulary terms of the fitted vectorizer as a list; show the first ten.
features = list(count_vect.get_feature_names_out())
features[:10]

In [None]:
# Re-weight the raw term counts with TF-IDF.
tfidf_transformer = TfidfTransformer()
x_tfidf = tfidf_transformer.fit_transform(x_counts)

In [None]:
# Fit LDA with 20 topics on the TF-IDF matrix and transform the corpus.
# NOTE(review): no random_state is passed, so the topics may differ between
# runs — confirm whether reproducibility matters here.
dimension = 20
lda = LDA(n_components = dimension)
lda_array = lda.fit_transform(x_tfidf)

In [None]:
# One weight vector over the vocabulary per topic.
components = list(lda.components_)
# Ten highest-weighted words per topic. Each weight is paired with its word
# up front so the sort key is O(1); calling features.index() inside the key
# made every topic quadratic in the vocabulary size. sorted() is stable even
# with reverse=True, so ties keep vocabulary order exactly as before.
important_words = [
    [word for _, word in sorted(zip(topic, features), key=lambda pair: pair[0], reverse=True)[:10]]
    for topic in components
]

In [None]:
# Show the top words selected for every topic.
important_words