In [23]:
import numpy as np
import pandas as pd
import sklearn
import torch
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pymorphy2

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

from sklearn.decomposition import LatentDirichletAllocation
import collections, re
from sklearn.feature_extraction import DictVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

from catboost import CatBoostClassifier

In [34]:
# Load the three CSV datasets, in order; each one becomes its own DataFrame.
data, picabu, task2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/pulse_stage1_patch.csv',
        '../datasets/labeled.csv',
        '../datasets/task2_data.csv',
    )
)

In [17]:
# Morphological analyzer; preprocess() takes the first result of
# morph.normal_forms(word) for every token it keeps.
morph = pymorphy2.MorphAnalyzer()

In [19]:
# Fetch the NLTK resources this notebook relies on.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()
# Russian stop words plus the two line-break characters, stored as a set.
stop_words = set(stopwords.words('russian')) | {'\n', '\r'}

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Timfex\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Timfex\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Timfex\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [32]:
def preprocess(text):
    """Normalize a raw text into a space-separated string of word base forms.

    Steps: lowercase and tokenize with ``word_tokenize``, keep purely
    alphabetic tokens, pass each token through ``lemmatizer.lemmatize`` and
    then take the first candidate of ``morph.normal_forms``, and finally drop
    tokens found in ``stop_words``.

    Args:
        text: Text to normalize. Anything that is not a ``str`` (for example
            ``None`` or the float ``NaN`` pandas uses for missing cells) is
            treated as empty text instead of raising ``AttributeError``.

    Returns:
        The surviving normalized tokens joined by single spaces; ``''`` when
        nothing survives or ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Missing values would otherwise crash on `.lower()`.
        return ''

    normalized = []
    for token in word_tokenize(text.lower()):
        if not token.isalpha():
            continue
        base_form = morph.normal_forms(lemmatizer.lemmatize(token))[0]
        # Stop words are matched against the normalized form, not the raw token.
        if base_form not in stop_words:
            normalized.append(base_form)
    return ' '.join(normalized)

In [49]:
# Read the raw dump in 20,000-row chunks and concatenate them into one frame.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` is its replacement and keeps the same outcome:
# rows with the wrong number of fields are skipped and reported as warnings.
# The context manager closes the underlying file once all chunks are read.
with pd.read_csv('../datasets/data.txt', on_bad_lines='warn', chunksize=20000) as reader:
    big_data = pd.concat(list(reader), axis=0)

b'Skipping line 37: expected 6 fields, saw 12\nSkipping line 54: expected 6 fields, saw 7\nSkipping line 56: expected 6 fields, saw 12\nSkipping line 62: expected 6 fields, saw 13\nSkipping line 67: expected 6 fields, saw 12\nSkipping line 73: expected 6 fields, saw 7\nSkipping line 85: expected 6 fields, saw 10\nSkipping line 291: expected 6 fields, saw 7\nSkipping line 306: expected 6 fields, saw 7\nSkipping line 481: expected 6 fields, saw 7\nSkipping line 483: expected 6 fields, saw 9\nSkipping line 499: expected 6 fields, saw 8\nSkipping line 508: expected 6 fields, saw 24\nSkipping line 584: expected 6 fields, saw 9\nSkipping line 601: expected 6 fields, saw 7\nSkipping line 609: expected 6 fields, saw 14\nSkipping line 636: expected 6 fields, saw 9\nSkipping line 717: expected 6 fields, saw 7\nSkipping line 763: expected 6 fields, saw 7\nSkipping line 774: expected 6 fields, saw 10\nSkipping line 846: expected 6 fields, saw 15\nSkipping line 866: expected 6 fields, saw 8\nSkippi

In [84]:
col_names = ['Id', 'label', 'text']

# Only the three listed columns are read; 'Id' becomes the index.
ok = pd.read_csv('../datasets/ok.csv', usecols=col_names, index_col='Id')

# Binary target: 0 when the label is exactly '__label__NORMAL', 1 otherwise.
ok['label'] = ok['label'].ne('__label__NORMAL').astype('int64')
ok

Unnamed: 0_level_0,label,text
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
41127,1,дворника надо тоже уничтожить!
6812,0,"моя старшая неделю шипела, не принимала подкид..."
6256,0,полностью с вами согласна!
189636,0,"хоть ногу вверх, ничего не изменится"
99053,0,а что значит - левого ребенка?
...,...,...
99767,0,"а вы думаете что полиция об этом не знает, про..."
87456,0,"ну а что вы тут возмущаетесь , все ведь точно ..."
185614,0,затерли до дыр данный ролик сколько лет ему уж...
86412,1,если проследят за пидором то накажут


In [87]:
# negative.csv is ';'-separated and is read with header=None, so pandas labels
# the columns with integers. Selecting columns by the string names in
# `col_names` ('Id', 'label', 'text') cannot match those integer labels and
# makes read_csv raise "Usecols do not match columns", so every column is
# loaded instead.
texts = pd.read_csv('../datasets/negative.csv', sep=';', header=None)
texts

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,408906762813579264,1386325944,dugarchikbellko,на работе был полный пиддес :| и так каждое за...,-1,0,0,0,8064,111,94,2
1,408906818262687744,1386325957,nugemycejela,"Коллеги сидят рубятся в Urban terror, а я из-з...",-1,0,0,0,26,42,39,0
2,408906858515398656,1386325966,4post21,@elina_4post как говорят обещаного три года жд...,-1,0,0,0,718,49,249,0
3,408906914437685248,1386325980,Poliwake,"Желаю хорошего полёта и удачной посадки,я буду...",-1,0,0,0,10628,207,200,0
4,408906914723295232,1386325980,capyvixowe,"Обновил за каким-то лешим surf, теперь не рабо...",-1,0,0,0,35,17,34,0
...,...,...,...,...,...,...,...,...,...,...,...,...
111918,425138243257253888,1390195830,Yanch_96,Но не каждый хочет что то исправлять:( http://...,-1,0,0,0,1138,32,46,0
111919,425138339503943682,1390195853,tkit_on,скучаю так :-( только @taaannyaaa вправляет мо...,-1,0,0,0,4822,38,32,0
111920,425138437684215808,1390195876,ckooker1,"Вот и в школу, в говно это идти уже надо(",-1,0,0,1,165,13,16,0
111921,425138490452344832,1390195889,LisaBeroud,"RT @_Them__: @LisaBeroud Тауриэль, не грусти :...",-1,0,1,0,2516,187,265,0


In [None]:
# NOTE(review): this loads positive.csv into a variable named `neg_texts`,
# while negative.csv is loaded into `texts` — the names look misleading or
# swapped; confirm the intent before using these frames as class labels.
neg_texts = pd.read_csv('../datasets/positive.csv',  sep=';',header=None)
neg_texts

In [89]:
import torch
from transformers import AutoModelForSequenceClassification, BertTokenizerFast

# Tokenizer and sequence-classification model are loaded from the same
# pretrained checkpoint.
tokenizer = BertTokenizerFast.from_pretrained(
    'blanchefort/rubert-base-cased-sentiment'
)
model = AutoModelForSequenceClassification.from_pretrained(
    'blanchefort/rubert-base-cased-sentiment',
    return_dict=True,
)


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1404983.0), HTML(value='')))








HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=112.0), HTML(value='')))

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=499.0), HTML(value='')))

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=943.0), HTML(value='')))

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=711509513.0), HTML(value='')))

In [110]:
@torch.no_grad()
def predict(text):
    """Return the predicted class index for each input text.

    Args:
        text: Input passed straight to ``tokenizer`` (a single string or a
            batch of strings). Inputs are padded to the longest item and
            truncated to 512 tokens.

    Returns:
        A NumPy array with one class index per input row, taken as the
        argmax over the model's logits.
    """
    inputs = tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors='pt')
    # Keep the inputs on the model's device so this also works after the model
    # has been moved off the CPU (e.g. model.to('cuda')).
    inputs = inputs.to(model.device)
    outputs = model(**inputs)
    # softmax is strictly monotonic, so the argmax over the raw logits selects
    # the same class without the extra normalization pass.
    predicted = torch.argmax(outputs.logits, dim=1)
    # .cpu() is a no-op on CPU tensors and is required before .numpy() otherwise.
    return predicted.cpu().numpy()

In [106]:
# `ok` is indexed by Id, so this selects the comment whose Id is 0.
t = predict(ok.loc[0, 'text'])

In [109]:
# Label-based lookup: `ok` is indexed by Id, so this shows the comment whose
# Id is 0, not the first row of the frame.
ok['text'][0]


'во блядь пердун старый, клоун недоношеный. гореть тебе в аду всеравно, хоть ты какой макинтош одень'