In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [None]:
# Both corpora share one layout: ';'-separated, with only the column at
# position 3 kept and exposed under the name 'text'.
csv_options = dict(sep=';', usecols=[3], names=['text'])

# Tag every row with the sentiment of the file it came from.
positive = pd.read_csv('positive.csv', **csv_options).assign(label='positive')
negative = pd.read_csv('negative.csv', **csv_options).assign(label='negative')

# Stack the two classes into a single labelled frame.
df = pd.concat([positive, negative])

In [None]:
# Preview the first rows of the combined frame.
df.head()

In [None]:
# (rows, columns) of the combined frame.
df.shape

In [None]:
# Hold out part of the data for evaluation. The split is seeded so that a
# Restart & Run All reproduces the same partition and the experiments below
# are all scored on the same test texts.
x_train, x_test, y_train, y_test = train_test_split(
    df.text, df.label, random_state=42
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
from nltk import ngrams

In [None]:
# Whitespace-split a sample sentence; `sent` is reused by the bigram demo below.
sent = 'Если б мне платили каждый раз'.split()
# n=1: every token wrapped in its own 1-tuple.
[*ngrams(sent, 1)]

In [None]:
# n=2: consecutive token pairs of the same sample sentence.
[*ngrams(sent, 2)]

In [None]:
# Unigram bag-of-words: the vocabulary is learned from the training texts only,
# and `bow` is the resulting document-term matrix for those texts.
vec = CountVectorizer(ngram_range=(1, 1))
bow = vec.fit_transform(x_train)

In [None]:
# Peek at the first ten entries of the fitted vocabulary without copying the
# whole mapping into a list first.
[entry for _, entry in zip(range(10), vec.vocabulary_.items())]

In [None]:
# Fit a logistic-regression classifier on the training document-term matrix.
clf = LogisticRegression(solver='liblinear', random_state=42)
clf.fit(bow, y_train)

In [None]:
# Vectorize the held-out texts with the vocabulary fitted on the training split.
pred = clf.predict(vec.transform(x_test))
# classification_report expects (y_true, y_pred). With the arguments reversed,
# precision and recall trade places and "support" counts predictions instead
# of true labels.
print(classification_report(y_test, pred))

In [None]:
# Same experiment with bigram and trigram count features.
vec = CountVectorizer(ngram_range=(2, 3))
bow = vec.fit_transform(x_train)
clf = LogisticRegression(solver='liblinear', random_state=42)
clf.fit(bow, y_train)
pred = clf.predict(vec.transform(x_test))
# classification_report expects (y_true, y_pred). With the arguments reversed,
# precision and recall trade places and "support" counts predictions instead
# of true labels.
print(classification_report(y_test, pred))

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Unigram features again, weighted by TF-IDF instead of raw counts.
vec = TfidfVectorizer(ngram_range=(1, 1))
bow = vec.fit_transform(x_train)
clf = LogisticRegression(solver='liblinear', random_state=42)
clf.fit(bow, y_train)
pred = clf.predict(vec.transform(x_test))
# classification_report expects (y_true, y_pred). With the arguments reversed,
# precision and recall trade places and "support" counts predictions instead
# of true labels.
print(classification_report(y_test, pred))

In [None]:
import nltk
from nltk.tokenize import word_tokenize
# word_tokenize needs the Punkt models. Older NLTK releases read them from the
# 'punkt' package, recent ones from 'punkt_tab'; fetching both keeps the cells
# below working on either.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Sample sentence ending in an emoticon; `example` is reused by the tokenizer
# and lemmatizer demos further down.
example = 'Это был плохой опыт:('
# Show how NLTK splits it into tokens.
word_tokenize(example)

### Стоп-слова и пунктуация

In [None]:
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus, then print the Russian list.
nltk.download('stopwords')
print(stopwords.words('russian'))

In [None]:
from string import punctuation
# Display the punctuation characters provided by the standard `string` module.
punctuation

In [None]:
# Tokens to drop during vectorization: Russian stop words followed by every
# single punctuation character.
noise = [*stopwords.words('russian'), *punctuation]

In [None]:
# Unigram counts with NLTK tokenization; tokens listed in `noise` are dropped.
vec = CountVectorizer(ngram_range=(1, 1), tokenizer=word_tokenize, stop_words=noise)
bow = vec.fit_transform(x_train)
clf = LogisticRegression(solver='liblinear', random_state=42)
clf.fit(bow, y_train)
pred = clf.predict(vec.transform(x_test))
# classification_report expects (y_true, y_pred). With the arguments reversed,
# precision and recall trade places and "support" counts predictions instead
# of true labels.
print(classification_report(y_test, pred))

In [None]:
# %pip installs into the environment of the running kernel; `!pip` runs
# whichever pip is first on the shell PATH, which may be a different one.
%pip install razdel

In [None]:
from razdel import tokenize

def tokenizer(string):
    """Split ``string`` with razdel and return the token texts as a list of str."""
    texts = []
    for piece in tokenize(string):
        # razdel yields token objects; only their text is kept.
        texts.append(piece.text)
    return texts

In [None]:
# Tokenize the sample sentence with the razdel-based tokenizer.
tokenizer(example)

In [None]:
# Unigram counts with razdel tokenization; tokens listed in `noise` are dropped.
vec = CountVectorizer(ngram_range=(1, 1), tokenizer=tokenizer, stop_words=noise)
bow = vec.fit_transform(x_train)
clf = LogisticRegression(solver='liblinear', random_state=42)
clf.fit(bow, y_train)
pred = clf.predict(vec.transform(x_test))
# classification_report expects (y_true, y_pred). With the arguments reversed,
# precision and recall trade places and "support" counts predictions instead
# of true labels.
print(classification_report(y_test, pred))

In [None]:
import pymorphy2

# Shared morphological analyzer; lemmatize() below reads it through this global.
# NOTE(review): pymorphy2 is reported to fail on Python 3.11+ — confirm the
# kernel version this notebook targets.
morph = pymorphy2.MorphAnalyzer()


In [None]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _normal_form(token):
    """Return the normal form of the first parse the analyzer gives for ``token``.

    Results are memoised per distinct token, so a word that occurs many times
    in the corpus is analysed only once. Call ``_normal_form.cache_clear()``
    if ``morph`` is ever replaced by a differently configured analyzer.
    """
    return morph.parse(token)[0].normal_form


def lemmatize(string):
    """Tokenize ``string`` and return the lemma of every token, in order.

    Args:
        string: text to process.

    Returns:
        list of str, one lemma per token produced by ``tokenizer``.
    """
    return [_normal_form(token) for token in tokenizer(string)]

# Sanity check: lemmas of the sample sentence.
print(lemmatize(example))

In [None]:
# TF-IDF over lemmas: the vectorizer calls lemmatize() on each text to get its tokens.
vec = TfidfVectorizer(ngram_range=(1, 1), tokenizer=lemmatize)
bow = vec.fit_transform(x_train)
# Same solver as the other experiments so the scores stay comparable; under the
# default solver the random_state argument has no effect.
clf = LogisticRegression(solver='liblinear', random_state=42)
clf.fit(bow, y_train)
pred = clf.predict(vec.transform(x_test))
# classification_report expects (y_true, y_pred). With the arguments reversed,
# precision and recall trade places and "support" counts predictions instead
# of true labels.
print(classification_report(y_test, pred))

In [None]:
# Pair every held-out text with the label predicted by the most recent model;
# the index is renumbered from zero so texts and predictions line up by position.
res = (
    x_test
    .reset_index(drop=True)
    .to_frame()
    .assign(pred=pred)
)
# Lift the column-width cap so long texts are displayed in full.
pd.set_option('display.max_colwidth', None)
# Inspect ten random rows.
res.sample(10)