In [256]:
import pandas as pd
import numpy as np
import tensorflow as tf
import emoji
from nltk import word_tokenize
from string import punctuation, ascii_letters
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
from collections import Counter
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import regularizers
from gensim.models import Word2Vec

In [211]:
# Hyperparameters shared by the vocabulary, padding and embedding steps below.
MAX_WORDS = 2_000  # how many of the most frequent tokens get their own vocabulary id
MAX_LEN = 30       # every index sequence is padded/truncated to this length
EMB_SIZE = 100     # dimensionality of the word embeddings

In [212]:
# Russian morphological analyzer; preprocess() uses it to reduce tokens to their normal form.
morph = MorphAnalyzer(lang='ru')

In [213]:
# Russian stop-word list as a set, so the membership test in preprocess() is a hash lookup.
stop_words = set(get_stop_words('ru'))

In [214]:
# ASCII punctuation extended with a space and the typographic marks listed here.
extra_marks = {' ', '«', '»', '—', '–', '“', '”', '…'}
punctuation = set(punctuation) | extra_marks

In [215]:
# All Russian letters in both cases. 'ё'/'Ё' are added explicitly because they
# lie outside the contiguous 'а'..'я' / 'А'..'Я' code-point ranges.
lowercase_cyrillic = set(map(chr, range(ord('а'), ord('я') + 1)))
uppercase_cyrillic = set(map(chr, range(ord('А'), ord('Я') + 1)))
cyrillic_letters = lowercase_cyrillic | uppercase_cyrillic | {'ё', 'Ё'}

In [216]:
# Load the raw reviews and give the three columns short working names.
column_names = ['rating', 'content', 'date']
data = pd.read_excel('../data/summer_reviews.xls')
data.columns = column_names

In [217]:
# Binary label: 1 when the rating is strictly greater than 3, otherwise 0.
data['target'] = data['rating'].gt(3).astype(int)

Попробуем опять же учитывать эмодзи.

In [218]:
def preprocess(text):
    """Tokenize a review and return its normalized tokens.

    A token is kept when it is an emoji or contains at least one Cyrillic or
    ASCII letter. Word tokens are lower-cased, dropped if they are stop words
    or a single character long, and reduced to their normal form with
    ``morph``. Emoji tokens are kept as they are: they are usually a single
    code point, so the length filter that applies to words would otherwise
    discard every emoji that was just let through.

    Args:
        text: raw review text (``str``).

    Returns:
        list of normalized tokens in their original order.
    """
    # NOTE(review): assumes `emoji.UNICODE_EMOJI` is a flat emoji -> name
    # mapping, as the membership test implies - confirm against the installed
    # `emoji` version.
    ascii_chars = set(ascii_letters)  # built once instead of once per token
    result = []
    for token in word_tokenize(text):
        if token in emoji.UNICODE_EMOJI:
            # Emoji bypass the stop-word, length and lemmatization steps.
            result.append(token)
            continue
        chars = set(token)
        if not (chars & cyrillic_letters or chars & ascii_chars):
            continue  # no letters at all: punctuation, numbers, ...
        lowered = token.lower()
        if lowered in stop_words or len(token) <= 1:
            continue
        result.append(morph.parse(lowered)[0].normal_form)
    return result
        

In [219]:
# Reviews are cast to str before preprocessing, so non-string cells are tokenized too.
data['tokens'] = data['content'].map(lambda review: preprocess(str(review)))

In [220]:
# Count every token across all reviews and keep the MAX_WORDS most frequent, in rank order.
token_counts = Counter(token for review_tokens in data.tokens.tolist() for token in review_tokens)
tokens_freq = [token for token, _count in token_counts.most_common(MAX_WORDS)]

In [221]:
# Frequent tokens get ids 2, 3, ... in rank order; ids 1 and 0 are reserved
# for unknown tokens and padding respectively.
vocab = dict(zip(tokens_freq, range(2, len(tokens_freq) + 2)))
vocab.update(UNK=1, PAD=0)

In [222]:
def tokens_to_indices(tokens):
    """Map each token to its id in the global `vocab`, using id 1 for tokens not in it."""
    unknown_id = 1
    indices = []
    for token in tokens:
        indices.append(vocab.get(token, unknown_id))
    return indices

In [223]:
# Convert every token list into its list of vocabulary ids.
data['indices'] = data['tokens'].map(tokens_to_indices)

In [247]:
# Hold out 10% of the reviews; the fixed random_state keeps the split reproducible.
index_sequences = data['indices'].values
labels = data['target'].values
X_train, X_test, y_train, y_test = train_test_split(
    index_sequences,
    labels,
    test_size=0.1,
    random_state=42,
)

In [248]:
# Bring every sequence to exactly MAX_LEN ids: pad short ones and cut long ones, both at the end.
padding_options = dict(maxlen=MAX_LEN, padding='post', truncating='post')
X_train = pad_sequences(X_train, **padding_options)
X_test = pad_sequences(X_test, **padding_options)

Обучаем эмбеддинги внутри сети.

In [312]:
# CNN text classifier whose embedding table is learned together with the rest of the network.
inputs = tf.keras.layers.Input(shape=(MAX_LEN,))
# One embedding row per vocabulary id (including the reserved PAD and UNK ids).
embeddings = tf.keras.layers.Embedding(input_dim=len(vocab), output_dim=EMB_SIZE, activity_regularizer=regularizers.l2(1e-6))(inputs)
# Three Conv1D + MaxPooling1D stages over the token axis.
conv_1 = tf.keras.layers.Conv1D(kernel_size=3, filters=EMB_SIZE, strides=1, activation='relu')(embeddings)
pool_1 = tf.keras.layers.MaxPooling1D()(conv_1)
conv_2 = tf.keras.layers.Conv1D(kernel_size=2, filters=EMB_SIZE // 2, strides=1, activation='relu')(pool_1)
pool_2 = tf.keras.layers.MaxPooling1D()(conv_2)
conv_3 = tf.keras.layers.Conv1D(kernel_size=2, filters=EMB_SIZE // 2, strides=1, activation='relu')(pool_2)
pool_3 = tf.keras.layers.MaxPooling1D()(conv_3)
flat = tf.keras.layers.Flatten()(pool_3)
# Dense head: two regularized hidden layers, each followed by dropout.
dense_1 = tf.keras.layers.Dense(64, activation='relu', activity_regularizer=regularizers.l2(1e-4))(flat)
drop_1 = tf.keras.layers.Dropout(0.2)(dense_1)
dense_2 = tf.keras.layers.Dense(32, activation='relu', activity_regularizer=regularizers.l2(1e-4))(drop_1)
drop_2 = tf.keras.layers.Dropout(0.2)(dense_2)
# The output layer must read from drop_2. Wiring it to dense_2 left the second
# Dropout layer outside the model graph, so it was never applied.
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(drop_2)

In [313]:
# Wrap the functional graph built above (inputs -> outputs) into a trainable model.
model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [314]:
# Adam with a small step size, since the embedding table is trained from scratch here.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

In [315]:
# Binary classification setup: sigmoid output scored with binary cross-entropy, accuracy reported.
compile_options = {
    'optimizer': optimizer,
    'loss': 'binary_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)


In [319]:
# The held-out split is passed as validation data, so it is scored after every epoch.
holdout = (X_test, y_test)
model.fit(X_train, y_train, validation_data=holdout, batch_size=512, epochs=20)

Train on 18593 samples, validate on 2066 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fd638230d50>

Обучаем эмбеддинги отдельно.

In [320]:
# Skip-gram word2vec trained on the preprocessed reviews.
# `workers` must be a positive thread count: gensim starts exactly that many
# training threads, so `workers=-1` starts none and the vectors are left at
# their random initial values without any error being raised.
w2v = Word2Vec(sentences=data.tokens.values, size=EMB_SIZE, min_count=3, window=3, workers=4, sg=1)

In [337]:
# Rank every distinct token by corpus frequency (no MAX_WORDS cap this time).
token_counts = Counter(token for review_tokens in data.tokens.tolist() for token in review_tokens)
tokens_freq = [token for token, _count in token_counts.most_common()]

In [338]:
# Rebuild the vocabulary over all ranked tokens: ids start at 2, with 1 and 0
# reserved for unknown tokens and padding.
# NOTE(review): X_train / X_test were indexed with the earlier MAX_WORDS-capped
# vocab and are not re-indexed after this cell, so ids above that cap never
# occur in the training data - confirm whether re-indexing was intended.
vocab = dict(zip(tokens_freq, range(2, len(tokens_freq) + 2)))
vocab.update(UNK=1, PAD=0)

In [339]:
def get_weights(vocab, model):
    """Build an embedding matrix for `vocab` from a word-vector model.

    Args:
        vocab: mapping of token -> row index. The special keys 'PAD' and
            'UNK' are treated as described below.
        model: object exposing `vector_size` and `wv.get_vector(word)`, where
            `get_vector` raises KeyError for words it does not know.

    Returns:
        numpy array of shape (len(vocab), model.vector_size) where
        - the 'PAD' row is all zeros,
        - the 'UNK' row is drawn from N(0, 2) and is never looked up in `model`,
        - every other row is the model's vector, or a N(0, 2) draw when the
          model does not know the word.
    """
    dim = model.vector_size
    weights = np.zeros((len(vocab), dim))

    for word, i in vocab.items():
        if word == 'PAD':
            continue  # padding row stays all-zero
        if word != 'UNK':
            try:
                weights[i] = model.wv.get_vector(word)
                continue
            except KeyError:
                pass  # word unknown to the model: use the random init below
        # Reached for 'UNK' and for words missing from the model. Previously
        # 'UNK' fell through to the lookup, so its random draw was wasted and
        # a model entry for the literal key 'UNK' would have replaced it.
        weights[i] = np.random.normal(0, 2, dim)
    return weights

In [340]:
# Same CNN classifier, but the embedding table is initialized from the
# word2vec vectors and frozen (trainable=False).
inputs = tf.keras.layers.Input(shape=(MAX_LEN,))
embeddings = tf.keras.layers.Embedding(input_dim=len(vocab),
                                       output_dim=EMB_SIZE,
                                       weights=[get_weights(vocab, w2v)],
                                       trainable=False,
                                       activity_regularizer=regularizers.l2(1e-6))(inputs)
# Three Conv1D + MaxPooling1D stages over the token axis.
conv_1 = tf.keras.layers.Conv1D(kernel_size=3, filters=EMB_SIZE, strides=1, activation='relu')(embeddings)
pool_1 = tf.keras.layers.MaxPooling1D()(conv_1)
conv_2 = tf.keras.layers.Conv1D(kernel_size=2, filters=EMB_SIZE // 2, strides=1, activation='relu')(pool_1)
pool_2 = tf.keras.layers.MaxPooling1D()(conv_2)
conv_3 = tf.keras.layers.Conv1D(kernel_size=2, filters=EMB_SIZE // 2, strides=1, activation='relu')(pool_2)
pool_3 = tf.keras.layers.MaxPooling1D()(conv_3)
flat = tf.keras.layers.Flatten()(pool_3)
# Dense head: two regularized hidden layers, each followed by dropout.
dense_1 = tf.keras.layers.Dense(64, activation='relu', activity_regularizer=regularizers.l2(1e-4))(flat)
drop_1 = tf.keras.layers.Dropout(0.2)(dense_1)
dense_2 = tf.keras.layers.Dense(32, activation='relu', activity_regularizer=regularizers.l2(1e-4))(drop_1)
drop_2 = tf.keras.layers.Dropout(0.2)(dense_2)
# The output layer must read from drop_2. Wiring it to dense_2 left the second
# Dropout layer outside the model graph, so it was never applied.
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(drop_2)

In [341]:
# Wrap the functional graph built above (inputs -> outputs) into a trainable model;
# this rebinds `model`, replacing the previously trained network.
model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [342]:
# Adam with a 10x larger step size than the first experiment (1e-3 instead of 1e-4).
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

In [343]:
# Binary classification setup: sigmoid output scored with binary cross-entropy, accuracy reported.
compile_options = {
    'optimizer': optimizer,
    'loss': 'binary_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)


In [345]:
# The held-out split is passed as validation data, so it is scored after every epoch.
holdout = (X_test, y_test)
model.fit(X_train, y_train, validation_data=holdout, batch_size=256, epochs=30)

Train on 18593 samples, validate on 2066 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7fd608155950>

В принципе, результат примерно такой же или чуть хуже.