0. Импорт необходимых библиотек

In [1]:
import pandas as pd
import numpy as np
import statistics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

1. Подгрузим предобработанный корпус текстов

In [2]:
# Load the preprocessed transcription table; the file is semicolon-separated.
db = pd.read_csv('Manual_transcription with tokens.csv', sep=';')
# The working corpus is the single column holding the token text of each
# transcription (one whitespace-separated string per row, judging by the
# later `.split()` call -- TODO confirm against the CSV).
corpus = db.loc[:, 'Transcription tokens']
corpus

0      давать успокоиться начало мочь вестись ерунда ...
1      подождать подождать успокоиться пожалуйста под...
2      кристин привет слышать самолёт упасть зеландия...
3      кристин привет .. слушать .. бояться лететь ра...
4      почему подождать ... подождать давно планирова...
                             ...                        
530    готовить медовик приготовить медовик очень вре...
531    значит правило игра дурак обычный 36 карта кол...
532    третьяковский галерея продаваться такой игра п...
533    город челябинск уметь хотеть учиться банкир фи...
534    звать ленога екатерина 23 год родиться 93-м го...
Name: Transcription tokens, Length: 535, dtype: object

----
2.1. Подгрузим тональный словарь, создадим заготовку датафрейма для тональных векторов и разобьём тексты корпуса на токены для обработки тональным словарём

In [3]:
# Sentiment (tonal) dictionary; later cells read its 'Word' and 'Weight' columns.
tonal_dict = pd.read_csv('Dictionary_lemmatization_Rus.csv')

# Column-oriented accumulator for the tonal features: one empty list per
# feature, filled row by row during vectorization.
tonal_vectors = {
    feature: []
    for feature in (
        'Count positive',
        'Count negative',
        'Mean sentiment',
        'Max',
        'Min',
        'Median',
        'Sentiment',
    )
}

# Split every text of the corpus into a list of tokens on whitespace.
corpus_splitted = corpus.apply(str.split)
corpus_splitted

0      [давать, успокоиться, начало, мочь, вестись, е...
1      [подождать, подождать, успокоиться, пожалуйста...
2      [кристин, привет, слышать, самолёт, упасть, зе...
3      [кристин, привет, .., слушать, .., бояться, ле...
4      [почему, подождать, ..., подождать, давно, пла...
                             ...                        
530    [готовить, медовик, приготовить, медовик, очен...
531    [значит, правило, игра, дурак, обычный, 36, ка...
532    [третьяковский, галерея, продаваться, такой, и...
533    [город, челябинск, уметь, хотеть, учиться, бан...
534    [звать, ленога, екатерина, 23, год, родиться, ...
Name: Transcription tokens, Length: 535, dtype: object

2.2. Проведём тональную векторизацию

In [4]:
# Build the word -> weight lookup once instead of scanning the whole tonal
# dictionary for every token. `setdefault` keeps the FIRST weight listed for
# a word, matching a "first matching row" lookup when a word is duplicated.
weight_by_word = {}
for word, weight in zip(tonal_dict['Word'], tonal_dict['Weight']):
    weight_by_word.setdefault(word, weight)

# Walk the tokenised texts together with their sentiment labels positionally,
# so the loop does not depend on the frames having a default 0..n-1 index.
for tokens, sentiment in zip(corpus_splitted, db['Sentiment digits']):
    # Tokens missing from the tonal dictionary get a neutral weight of zero.
    # A text with no tokens at all is treated as a single neutral token so
    # that the mean / max / min / median below are defined instead of raising.
    vector = [weight_by_word.get(token, 0) for token in tokens] or [0]

    tonal_vectors['Count positive'].append(sum(1 for v in vector if v > 0))  # number of positive words
    tonal_vectors['Count negative'].append(sum(1 for v in vector if v < 0))  # number of negative words
    tonal_vectors['Mean sentiment'].append(sum(vector) / len(vector))  # mean weight over all tokens
    tonal_vectors['Max'].append(max(vector))  # highest token weight
    tonal_vectors['Min'].append(min(vector))  # lowest token weight
    tonal_vectors['Median'].append(statistics.median(vector))  # median token weight
    tonal_vectors['Sentiment'].append(sentiment)  # sentiment label of the text

2.3. Продемонстрируем значения тональных векторов

In [5]:
# Turn the accumulated feature lists into a table: one row per text,
# one column per tonal feature.
tonal_db = pd.DataFrame(data=tonal_vectors)
tonal_db

Unnamed: 0,Count positive,Count negative,Mean sentiment,Max,Min,Median,Sentiment
0,2,2,0.002604,1.000000,-1.0,0.0,0
1,9,3,0.102525,1.000000,-1.0,0.0,0
2,7,8,-0.020149,1.000000,-1.0,0.0,0
3,5,9,-0.035507,1.000000,-1.0,0.0,0
4,0,1,-0.111111,0.000000,-1.0,0.0,0
...,...,...,...,...,...,...,...
530,3,5,-0.001373,1.000000,-1.0,0.0,1
531,1,7,-0.049271,1.000000,-1.0,0.0,1
532,8,5,-0.003929,0.666667,-1.0,0.0,1
533,6,7,-0.010018,1.000000,-1.0,0.0,1


----
3.1. Векторизуем корпус с помощью BoW

In [6]:
# Fit a bag-of-words model on the corpus and vectorize it in one step.
vectorizer = CountVectorizer()
bow_transform = vectorizer.fit_transform(corpus)

# Dense rows (one list of term counts per text) and the matching vocabulary terms.
vectors = bow_transform.toarray().tolist()
columns = vectorizer.get_feature_names_out()

# Re-shape the row-wise vectors into a column-oriented mapping:
# term -> list of its counts across all texts.
bow_vectors = {
    column: [row[position] for row in vectors]
    for position, column in enumerate(columns)
}
# Attach the sentiment label of every text as the last column.
bow_vectors['Sentiment'] = [db.loc[i, 'Sentiment digits'] for i in range(len(vectors))]

3.2. Продемонстрируем результаты векторизации

In [7]:
# Table of BoW features: one row per text, one column per vocabulary term,
# plus the 'Sentiment' label column.
bow_db = pd.DataFrame(data=bow_vectors)
bow_db

Unnamed: 0,10,11,12,15,17,18,20,23,30,36,...,яковливич,ялта,январь,яндекс,ярмарка,ясно,яхта,яхточка,ёлка,Sentiment
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
531,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
532,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
533,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


----
4.1. Векторизуем корпус с помощью Tf-Idf

In [8]:
# Fit a TF-IDF model on the corpus and vectorize it in one step.
vectorizer = TfidfVectorizer()
tfidf_transform = vectorizer.fit_transform(corpus)

# Dense rows (one list of TF-IDF weights per text) and the matching vocabulary terms.
vectors = tfidf_transform.toarray().tolist()
columns = vectorizer.get_feature_names_out()

# Re-shape the row-wise vectors into a column-oriented mapping:
# term -> list of its TF-IDF weights across all texts.
tfidf_vectors = {
    column: [row[position] for row in vectors]
    for position, column in enumerate(columns)
}
# Attach the sentiment label of every text as the last column.
tfidf_vectors['Sentiment'] = [db.loc[i, 'Sentiment digits'] for i in range(len(vectors))]

4.2. Продемонстрируем результаты векторизации

In [9]:
# Table of TF-IDF features: one row per text, one column per vocabulary term,
# plus the 'Sentiment' label column.
tfidf_db = pd.DataFrame(data=tfidf_vectors)
tfidf_db

Unnamed: 0,10,11,12,15,17,18,20,23,30,36,...,яковливич,ялта,январь,яндекс,ярмарка,ясно,яхта,яхточка,ёлка,Sentiment
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,1
531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.101637,...,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,1
532,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,1
533,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,1


---
5. Сохраняем все получившиеся датафреймы в отдельные файлы

In [10]:
# Write each feature table to its own CSV file; the row index is not saved.
for frame, target in (
    (tonal_db, 'Manual_transcription tonal.csv'),
    (bow_db, 'Manual_transcription bow.csv'),
    (tfidf_db, 'Manual_transcription tfidf.csv'),
):
    frame.to_csv(target, index=False)