In [245]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import random
import nltk
import re
import inspect
import pymorphy2
from tqdm.notebook import trange, tqdm
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

In [182]:
# Load the negative and positive tweet tables from the working directory.
neg, pos = (pd.read_csv(csv_path) for csv_path in ('./negative.csv', './positive.csv'))

### Проверяем, нет ли дубликатов

In [183]:
# Show (rows, columns) for the negative frame, then the positive frame.
print(neg.shape, pos.shape, sep='\n')

(111923, 12)
(114911, 12)


In [184]:
# Number of distinct tweet texts per frame; fewer than the row count means duplicates exist.
pos['ttext'].nunique(), neg['ttext'].nunique()

(110396, 107044)

### Удаляем дубликаты

In [185]:
# Keep only the first occurrence of every tweet text in each frame,
# then report the resulting shapes (positive first, negative second).
pos, neg = (frame.drop_duplicates(subset=['ttext']) for frame in (pos, neg))
print(pos.shape, neg.shape, sep='\n')

(110396, 12)
(107044, 12)


In [186]:
# Flatten both frames into a list of {'text', 'sentiment'} records.
# Column-wise iteration avoids the per-row Series construction of `iterrows`.
combined = pd.concat([neg, pos])
data = [
    {'text': text, 'sentiment': sentiment}
    for text, sentiment in zip(combined['ttext'].tolist(), combined['ttype'].tolist())
]

# Shuffle with a dedicated, seeded generator so the record order (and therefore
# the label order derived from `data` later) is identical on every run. This
# matters because the lemmatized tweets are cached on disk in this same order.
random.Random(42).shuffle(data)

In [195]:
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# Tokenizer used by the preprocessing step.
word_tokenizer = nltk.WordPunctTokenizer()

# Russian stop words from NLTK, extended with month names.
stop_words = nltk.corpus.stopwords.words('russian')
months = [
    'январь', 'февраль', 'март', 'апрель', 'май', 'июнь',
    'июль', 'август', 'сентябрь', 'октябрь', 'ноябрь', 'декабрь',
]
all_stop_words = [*stop_words, *months]

# Matches runs of Cyrillic letters (including ё/Ё) and hyphens.
regex = re.compile(r'[А-Яа-яёЁ-]+')
def words_only(text, regex=regex):
    """Keep only the parts of `text` matched by `regex`, lowercased.

    Args:
        text: Raw text. Any non-string value (e.g. None or a float NaN
            coming from a missing cell) is treated as empty.
        regex: Compiled pattern whose matches are kept; defaults to the
            module-level Cyrillic pattern.

    Returns:
        The matches joined by single spaces and lowercased, or "" when
        `text` is not a string or contains no match.
    """
    # Explicit guard instead of a bare `except:` so that only the expected
    # "not a string" case is mapped to "", while real errors (and
    # KeyboardInterrupt) still propagate.
    if not isinstance(text, str):
        return ""
    return " ".join(regex.findall(text)).lower()

def process_data(data):
    """Clean and tokenize the 'text' field of every record.

    Each text is reduced by `words_only`, split with `word_tokenizer`, and
    filtered to drop stop words and purely numeric tokens.

    Args:
        data: Iterable of mappings that each provide a 'text' key.

    Returns:
        A list with one list of tokens per input record, in input order.
    """
    # Set membership is O(1); the list lookup was a linear scan per token.
    # Built on each call so later changes to `all_stop_words` are honored.
    stop_set = set(all_stop_words)

    texts = []
    for item in tqdm(data):
        text_lower = words_only(item['text'])
        tokens = word_tokenizer.tokenize(text_lower)
        texts.append(
            [word for word in tokens if word not in stop_set and not word.isnumeric()]
        )

    return texts

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/20223210/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Проводим предобработку

In [232]:
# Token lists and sentiment labels, both in the order of `data`.
texts = process_data(data)
y = [record['sentiment'] for record in data]

  0%|          | 0/217440 [00:00<?, ?it/s]

In [221]:
# Spot-check one preprocessed tweet together with its label.
print(texts[15000], y[15000], sep='\n')

['агагага', 'другими', 'пишешься']
-1


### Нормализуем и записываем в файл

In [202]:
# Create the pymorphy2 analyzer once; it is reused for every token when
# the tweets are lemmatized.
morph = pymorphy2.MorphAnalyzer()

In [213]:
# Lemmatize every tweet and store one tweet per line.
# Mode 'w' (not 'a') makes the cell idempotent: appending on a re-run would
# duplicate lines and break the one-to-one alignment with `y`.
# The encoding is explicit so it matches the utf-8 read of the same file.
with open('tweets_lemma.txt', 'w', encoding='utf-8') as file:
    for tokens in tqdm(texts):
        # Take the first parse returned by the analyzer for each token.
        tweet_lemma = [morph.parse(x)[0].normal_form for x in tokens]
        tweet = ' '.join(tweet_lemma)
        file.write(tweet + '\n')

  0%|          | 0/217440 [00:00<?, ?it/s]

In [217]:
# Read the lemmatized tweets back, one per line, without the trailing newline.
# The context manager closes the file handle, which the bare `open(...)` leaked.
with open('tweets_lemma.txt', encoding='utf-8') as lemma_file:
    tweets = [line.replace('\n', '') for line in lemma_file]

In [219]:
# Spot-check one lemmatized tweet as read back from the file.
tweets[15000]

'агагага другой писаться'

In [234]:
# Hold out 20% of the tweets for testing; the split is seeded and stratified
# on the labels.
train_texts, test_texts, train_y, test_y = train_test_split(
    tweets,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [240]:
# Word-level TF-IDF over 1- to 4-grams, limited to 300 features,
# with sublinear term-frequency scaling and no row normalization.
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 4),
    max_features=300,
    norm = None
)

# Fit on the training split only: fitting on all tweets would let test-set
# statistics leak into the vocabulary and IDF weights.
word_vectorizer.fit(train_texts)

In [241]:
# Turn both splits into TF-IDF matrices with the already fitted vectorizer.
tfidf_train, tfidf_test = (
    word_vectorizer.transform(split_texts)
    for split_texts in (train_texts, test_texts)
)

In [242]:
# Report the shapes of the train and test TF-IDF matrices.
print('Shape of tfidf_train:', tfidf_train.shape)
print('Shape of tfidf_test:', tfidf_test.shape)

Shape of tfidf_train: (173952, 300)
Shape of tfidf_test: (43488, 300)


In [243]:
# Peek at the first ten learned features. The notebook only ever defines
# `word_vectorizer`; the bare name `vectorizer` existed only as leftover
# kernel state and fails on a fresh run.
word_vectorizer.get_feature_names_out()[:10]

array(['ахи', 'блин', 'болеть', 'быть', 'весь', 'вечер', 'видеть',
       'вообще', 'время', 'всё'], dtype=object)

### Обучение

In [247]:
# Build the feature matrices with the vectorizer defined in this notebook
# (`vectorizer` was never defined here and only worked through stale kernel
# state). The vectorizer is fitted on the training split only.
train_X = word_vectorizer.fit_transform(train_texts)
test_X  = word_vectorizer.transform(test_texts)

# Fixed random_state makes the forest reproducible; n_jobs=-1 trains the
# trees in parallel on all cores.
clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
)
clf = clf.fit(train_X, train_y)

In [248]:
# Predict sentiment labels for the held-out test features.
pred = clf.predict(test_X)

In [249]:
# Compare the first 20 predicted labels with the first 20 true labels.
print('Предсказанные метки: ', pred[0:20], ".....")
print('Истинные метки: ', test_y[0:20], ".....")

Предсказанные метки:  [ 1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 -1  1  1] .....
Истинные метки:  [1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, 1, -1] .....
