### Предобработка текста

Тексты нужно подготовить перед использованием:

Привести к нижнему регистру.

Удалить пунктуацию и стоп-слова.

Токенизировать текст (разбить на слова).

In [80]:
!pip install gensim scikit-learn nltk pandas
!python.exe -m pip install --upgrade pip



In [81]:
import pandas as pd
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\roman\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\roman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\roman\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [82]:
from nltk.corpus import stopwords


def df_fix(_df, _column_name):
    # Удаление строк с пропущенными значениями
    _df[_column_name] = _df[_column_name].str.lower().str.strip()
    # Нижний регистр, удаление лишних пробелов
    _df[_column_name] = _df[_column_name].str.replace('\n', '', regex=False)
    return _df

def preprocess_text(text):
    # Токенизация
    tokens = word_tokenize(text)
    # Удаление пунктуации и стоп-слов
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]
    # Лемматизация
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

# Загрузка из CSV
df = pd.read_csv('commonlit_texts.csv')

columns_to_keep = ['title', 'description', 'is_prose', 'genre', 'intro']
df = df[columns_to_keep]

df = df.dropna()
df_fix(df, 'title')
df_fix(df, 'description')
df_fix(df, 'genre')
df_fix(df, 'intro')

pd.set_option('display.max_colwidth', None)
print(df.sample(n=1)["intro"])

df['processed_intro'] = df['intro'].apply(preprocess_text)

696    when franklin delano roosevelt was elected president of the united states on march 4, 1933, the country was in the grip of the great depression. at his inauguration, roosevelt delivered the following famous speech, in which he addresses the growing fear that plagued a nation in crisis.as you read, identify roosevelt's purpose in the speech and take notes on the rhetorical techniques he uses to make his points.
Name: intro, dtype: object


### Построение TF-IDF

In [83]:
### Построение TF-IDF
# Создание TF-IDF векторизатора
# Ограничим до 5000 слов для простоты
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(df['processed_intro'])

# Посмотрим на матрицу TF-IDF
# (количество документов, количество уникальных слов)
print(tfidf_matrix.shape)

# Получение словаря TF-IDF
# print(vectorizer.get_feature_names_out())
# print(tfidf_matrix.toarray())

(2000, 5000)


 Построение Word2Vec

In [84]:
# Токенизация текста для Word2Vec
tokenized_intros = df['processed_intro'].apply(lambda x: x.split())

# Создание модели Word2Vec
w2v_model = Word2Vec(sentences=tokenized_intros, vector_size=100, window=5, min_count=1, workers=4)

# Получение вектора для слова
word_vector = w2v_model.wv['example']  # Вектор для слова 'example'
print(word_vector)

[-0.12865202  0.16068581 -0.00903208  0.01996073 -0.00204957 -0.35496235
  0.13322583  0.5026308  -0.13195437 -0.11418425 -0.09907986 -0.34955359
 -0.04566629  0.06991561  0.0721117  -0.19215833  0.09035921 -0.21170096
  0.01925341 -0.37455475  0.15248346  0.03460553  0.23816903 -0.09778078
 -0.0336967   0.03645394 -0.12793027 -0.01531199 -0.22777236  0.03337726
  0.2744209  -0.04598208  0.06480784 -0.15564622 -0.0332095   0.20020145
  0.01436602 -0.09464411 -0.12269801 -0.28855905  0.0778457  -0.16574652
 -0.12166671  0.0539104   0.16844968 -0.09749313 -0.17437445 -0.10189272
  0.06633928  0.18270305  0.0743015  -0.19529557 -0.02657109 -0.04388666
 -0.15187907  0.07686915  0.13252892 -0.02945541 -0.17664932  0.08361544
 -0.04563026 -0.01457122  0.08071989  0.05775671 -0.26300716  0.18690477
  0.0389752   0.17435873 -0.29269823  0.28706557 -0.11392553  0.17251365
  0.2680112  -0.06579765  0.21378314  0.04702346  0.12212237 -0.01939645
 -0.15129063  0.03197882 -0.2222559  -0.1613214  -0

Комбинирование TF-IDF и Word2Vec

In [85]:
# После того как у нас есть TF-IDF и Word2Vec, мы можем объединить их. Например:

# Для каждого документа можно вычислить взвешенную сумму Word2Vec-векторов слов, где веса — это значения TF-IDF.

def document_vector(doc, model, tfidf_vectorizer):
    # Получаем веса TF-IDF для всех слов документа
    tfidf_weights = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_vectorizer.transform([' '.join(doc)]).toarray()[0]))

    # Считаем вектор документа
    vector = np.zeros(model.vector_size)
    for word in doc:
        if word in model.wv and word in tfidf_weights:
            vector += model.wv[word] * tfidf_weights[word]
    return vector

df['document_vector'] = df['processed_intro'].apply(lambda x: document_vector(x.split(), w2v_model, vectorizer))

Операции со словами

In [86]:
def print_similar(_similar_words):
    # это список кортежей, где каждый кортеж содержит слово и его косинусное сходство
    for _word, _similarity in _similar_words:
        print(f"{_word}: {_similarity:.4f}")

def get_result_vector(model, add_words=[], substract_words=[]):
    result_vector = np.zeros(model.vector_size)
    for word in add_words:
        if word in model.wv:
            result_vector += model.wv[word]
    for word in substract_words:
        if word in model.wv:
            result_vector -= model.wv[word]

    return result_vector


# Пример: король - мужчина + женщина
# result_vector = get_result_vector(w2v_model, ["king", "woman"], ["man"])
# similar_words = w2v_model.wv.similar_by_vector(result_vector)
# print_similar(similar_words)

result_vector = get_result_vector(w2v_model, ["family", "male", "old"], [])
similar_words = w2v_model.wv.similar_by_vector(result_vector)
print_similar(similar_words)

family: 0.9999
student: 0.9998
made: 0.9998
called: 0.9998
day: 0.9998
play: 0.9998
group: 0.9998
human: 0.9998
new: 0.9998
school: 0.9998
