In [None]:
# Enable IPython's autoreload extension so edits to imported project modules
# (e.g. `src.utils`) are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from flair.embeddings import TransformerDocumentEmbeddings, WordEmbeddings, DocumentLSTMEmbeddings, SentenceTransformerDocumentEmbeddings

import spacy
from torchtext.data import get_tokenizer

from src import utils

import warnings

In [None]:
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from
# sklearn/spaCy — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [None]:
# Load the preprocessed corpus; later cells read its "preprocessed_text"
# and "topic" columns.
preprocessed_csv = "./data/preprocessed_data_200.csv"
data = pd.read_csv(preprocessed_csv)

# Preview the first rows as the cell output.
data.head()

# Feature extraction

## TF-IDF

In [None]:
# Features are the preprocessed texts, targets are the topic labels.
X = data["preprocessed_text"]
y = data["topic"]

# Project helper returns one (features, labels) pair per subset.
# NOTE(review): shuffle=True is passed without a visible seed — check whether
# utils.split_data fixes one internally, otherwise the split is not reproducible.
# presumably validate=False skips creating a validation subset — TODO confirm.
train_part, test_part = utils.split_data(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    validate=False,
)
X_train, y_train = train_part
X_test, y_test = test_part

In [None]:
# Build dense TF-IDF features: vectorise the texts, then reduce dimensionality
# with truncated SVD.
#
# NOTE: this cell rebinds X_train / X_test from raw text to dense arrays, so it
# is not idempotent — re-run the train/test split cell before re-running it.

SVD_COMPONENTS = 300  # width of the reduced TF-IDF representation
SVD_SEED = 42         # TruncatedSVD's default solver is randomized; pin it

# Fit the vocabulary and IDF weights on the training texts only, so nothing
# from the test set leaks into the features.
tfidf = TfidfVectorizer().fit(X_train)
X_train = tfidf.transform(X_train)
X_test = tfidf.transform(X_test)

# Fit the projection on the training matrix only and apply it to both subsets.
# A fixed random_state makes the saved embeddings reproducible across runs.
svd = TruncatedSVD(n_components=SVD_COMPONENTS, random_state=SVD_SEED).fit(X_train)
X_train = svd.transform(X_train)
X_test = svd.transform(X_test)

In [None]:
# Stack the reduced train and test features back into one matrix
# (train rows first, then test rows).
tfidf_features = np.vstack((X_train, X_test))

# Labels in the same train-then-test order, shaped as a single column.
tfidf_labels = np.hstack((y_train, y_test)).reshape(-1, 1)

# Persist features with the label appended as the last column.
# Note: np.savetxt's default delimiter is a space, despite the .csv suffix.
# assumes the topic labels are numeric (default savetxt format) — TODO confirm
tfidf_table = np.hstack((tfidf_features, tfidf_labels))
np.savetxt('./data/data_tfidf_emb.csv', tfidf_table)

## spaCy

In [None]:
# Load the Russian spaCy pipeline by its installed package name rather than an
# absolute path into one developer's virtualenv, so the notebook runs on any
# machine where the model is installed
# (`python -m spacy download ru_core_news_md`).
nlp = spacy.load("ru_core_news_md")

# torchtext's get_tokenizer returns a callable argument unchanged, so
# `tokenizer` is the spaCy pipeline itself.
tokenizer = get_tokenizer(nlp)

In [None]:
# Compute one embedding row per document with the project helper.
# presumably emb_size=300 matches the vector width of the loaded spaCy model
# and chunk_size controls how many rows are processed at a time — TODO confirm
spacy_embs = utils.prepare_emb_array(
    df=data,
    text_column='preprocessed_text',
    tokenizer=tokenizer,
    emb_size=300,
    chunk_size=10000,
)

# Append the topic label as the last column and write the table to disk.
topic_column = data["topic"].values.reshape(-1, 1)
spacy_table = np.hstack((spacy_embs, topic_column))
np.savetxt("./data/data_spacy_emb.csv", spacy_table)