In [1]:
import pandas as pd
import numpy as np

from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words
from string import punctuation
import re

from sklearn.model_selection import train_test_split

from tqdm.notebook import tqdm


In [2]:
max_words = 10000
max_len = 100
num_classes = 1

# Training
epochs = 20
batch_size = 512
print_batch_n = 100

In [3]:
data = pd.read_excel('отзывы за лето.xls')

In [4]:
data.head()

Unnamed: 0,Rating,Content,Date
0,5,It just works!,2017-08-14
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14
2,5,Отлично все,2017-08-14
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14
4,5,"Очень удобно, работает быстро.",2017-08-14


Препроцессинг

In [5]:
exclude = set(punctuation)
sw = set(get_stop_words("ru"))
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = txt.lower()
    txt = re.sub('https?://\S+|www\.\S+', ' ', txt)
    txt = re.sub(r'[^\w\s]',' ', txt)
    txt = re.sub(r'[0-9]+', ' ', txt)
    txt = re.sub('\n', ' ', txt)
    txt = re.sub("не\s", "не", txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

In [6]:
data['text'] = data['Content'].apply(preprocess_text)
data = data[data['Rating'] != 3]
data['target'] = data['Rating'] > 3

In [7]:
data['target'] = data['target'].astype(int)
data.head()

Unnamed: 0,Rating,Content,Date,text,target
0,5,It just works!,2017-08-14,it just works,1
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14,целое удобноной приложение минус хотеть большо...,1
2,5,Отлично все,2017-08-14,отлично,1
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14,зависать работа антивирус ранее пользоваться н...,1
4,5,"Очень удобно, работает быстро.",2017-08-14,удобно работать быстро,1


Разбиение на train и test

In [8]:
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['target'], test_size=0.2,
                                                    random_state=13, stratify=data['target'])

Создадим корпус слов

In [9]:
train_corpus = ' '.join(X_train.values)
train_corpus[:100]

'классно невозможно использовать рутованный телефон работать нарекание отлично немочь понять заблокир'

Токенизация

In [10]:
import nltk
from nltk.tokenize import word_tokenize
# nltk.download("punkt")

tokens = word_tokenize(train_corpus)

Отфильтруем данные

и соберём в корпус N наиболее частых токенов

In [11]:
tokens_filtered = [word for word in tokens if word.isalnum()]

In [12]:
from nltk.probability import FreqDist
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]

In [13]:
len(tokens_filtered_top)

7848

Padding

In [14]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}

In [15]:
import numpy as np
def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [16]:
x_train = np.asarray([text_to_sequence(text, max_len) for text in X_train], dtype=np.int32)
x_test = np.asarray([text_to_sequence(text, max_len) for text in X_test], dtype=np.int32)

In [17]:
x_train.shape

(15798, 100)

In [29]:
import numpy as np
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D, SimpleRNN, LSTM, GRU, Masking
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard 
from keras.objectives import categorical_crossentropy
from keras.callbacks import EarlyStopping  
import tensorflow as tf

In [19]:
num_classes = 2
y_train = keras.utils.to_categorical(y_train, num_classes)
y_val = keras.utils.to_categorical(y_test, num_classes)

In [20]:
y_train.shape

(15798, 2)

RNN

In [38]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=512, input_length=max_len))
model.add(Masking(mask_value=0.0))

model.add(SimpleRNN(64))
# model.add(Conv1D(128, 3))
# model.add(Activation("relu"))
# model.add(AveragePooling1D())
# model.add(Conv1D(256, 5))
# model.add(Activation('relu'))
# model.add(AveragePooling1D())
# model.add(Dropout(0.5))
# model.add(Conv1D(128, 3))
# model.add(Activation('relu'))
# model.add(GlobalAveragePooling1D())
# model.add(Dense(128, activation='relu', kernel_regularizer='l2'))
model.add(Dense(64))
model.add(Activation("relu"))
model.add(Dense(num_classes))

In [39]:
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [40]:
model.compile(loss=loss,
              optimizer='adam',
              metrics=['accuracy'])

In [41]:
tf.keras.utils.plot_model(model, show_shapes=True)

('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')


In [42]:
# model.summary()

In [43]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)  


history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=100,
                    verbose=1,
                    validation_data=(x_test, y_val),
                    callbacks=[tensorboard, early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100


In [27]:
score = model.evaluate(x_test, y_val, batch_size=batch_size, verbose=1)
score_train = model.evaluate(x_train, y_train, verbose=1)
print('\n')
print('Test score:', score[0], '\tTrain score: ', score_train[0])
print('Test accuracy:', score[1], '\tTrain accuracy: ', score_train[1])



Test score: 0.19525589048862457 	Train score:  0.08929198980331421
Test accuracy: 0.9344303607940674 	Train accuracy:  0.9754399061203003


In [44]:
score = model.evaluate(x_test, y_val, batch_size=batch_size, verbose=1)
score_train = model.evaluate(x_train, y_train, verbose=1)
print('\n')
print('Test score:', score[0], '\tTrain score: ', score_train[0])
print('Test accuracy:', score[1], '\tTrain accuracy: ', score_train[1])



Test score: 0.16840551793575287 	Train score:  0.08898252248764038
Test accuracy: 0.9283544421195984 	Train accuracy:  0.968793511390686


LSTM

In [45]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=512, input_length=max_len))
model.add(Masking(mask_value=0.0))

model.add(LSTM(64))
# model.add(Conv1D(128, 3))
# model.add(Activation("relu"))
# model.add(AveragePooling1D())
# model.add(Conv1D(256, 5))
# model.add(Activation('relu'))
# model.add(AveragePooling1D())
# model.add(Dropout(0.5))
# model.add(Conv1D(128, 3))
# model.add(Activation('relu'))
# model.add(GlobalAveragePooling1D())
# model.add(Dense(128, activation='relu', kernel_regularizer='l2'))
model.add(Dense(64))
model.add(Activation("relu"))
model.add(Dense(num_classes))

In [46]:
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [47]:
model.compile(loss=loss,
              optimizer='adam',
              metrics=['accuracy'])

In [48]:
tf.keras.utils.plot_model(model, show_shapes=True)

('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')


In [49]:
# model.summary()

In [50]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)  


history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=100,
                    verbose=1,
                    validation_data=(x_test, y_val),
                    callbacks=[tensorboard, early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100


In [51]:
score = model.evaluate(x_test, y_val, batch_size=batch_size, verbose=1)
score_train = model.evaluate(x_train, y_train, verbose=1)
print('\n')
print('Test score:', score[0], '\tTrain score: ', score_train[0])
print('Test accuracy:', score[1], '\tTrain accuracy: ', score_train[1])



Test score: 0.2009226381778717 	Train score:  0.05926620587706566
Test accuracy: 0.9298734068870544 	Train accuracy:  0.9815166592597961


GRU

In [52]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=512, input_length=max_len))
model.add(Masking(mask_value=0.0))

model.add(GRU(64))
# model.add(Conv1D(128, 3))
# model.add(Activation("relu"))
# model.add(AveragePooling1D())
# model.add(Conv1D(256, 5))
# model.add(Activation('relu'))
# model.add(AveragePooling1D())
# model.add(Dropout(0.5))
# model.add(Conv1D(128, 3))
# model.add(Activation('relu'))
# model.add(GlobalAveragePooling1D())
# model.add(Dense(128, activation='relu', kernel_regularizer='l2'))
model.add(Dense(64))
model.add(Activation("relu"))
model.add(Dense(num_classes))

In [53]:
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [54]:
model.compile(loss=loss,
              optimizer='adam',
              metrics=['accuracy'])

In [55]:
tf.keras.utils.plot_model(model, show_shapes=True)

('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')


In [56]:
# model.summary()

In [57]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)  


history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=100,
                    verbose=1,
                    validation_data=(x_test, y_val),
                    callbacks=[tensorboard, early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


In [58]:
score = model.evaluate(x_test, y_val, batch_size=batch_size, verbose=1)
score_train = model.evaluate(x_train, y_train, verbose=1)
print('\n')
print('Test score:', score[0], '\tTrain score: ', score_train[0])
print('Test accuracy:', score[1], '\tTrain accuracy: ', score_train[1])



Test score: 0.19778960943222046 	Train score:  0.07472924888134003
Test accuracy: 0.9245569705963135 	Train accuracy:  0.9729712605476379
