Задание из 2-х частей.
Берем отызывы за лето (из архива с материалами или предыдущего занятия)
1. Учим conv сеть для классификации - выбить auc выше 0.95
2. Предобучаем word2vec и его эмбединга инициализируем сетку, как влияет на качество?

In [1]:
import pandas as pd
import numpy as np

from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words
from string import punctuation
import re

from sklearn.model_selection import train_test_split

from tqdm.notebook import tqdm


In [2]:
max_words = 10000
max_len = 100
num_classes = 1

# Training
epochs = 20
batch_size = 512
print_batch_n = 100

In [3]:
data = pd.read_excel('отзывы за лето.xls')

In [4]:
data.head()

Unnamed: 0,Rating,Content,Date
0,5,It just works!,2017-08-14
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14
2,5,Отлично все,2017-08-14
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14
4,5,"Очень удобно, работает быстро.",2017-08-14


Препроцессинг

In [5]:
exclude = set(punctuation)
sw = set(get_stop_words("ru"))
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = txt.lower()
    txt = re.sub('https?://\S+|www\.\S+', ' ', txt)
    txt = re.sub(r'[^\w\s]',' ', txt)
    txt = re.sub(r'[0-9]+', ' ', txt)
    txt = re.sub('\n', ' ', txt)
    txt = re.sub("не\s", "не", txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

In [6]:
data['text'] = data['Content'].apply(preprocess_text)
data = data[data['Rating'] != 3]
data['target'] = data['Rating'] > 3

In [7]:
data['target'] = data['target'].astype(int)
data.head()

Unnamed: 0,Rating,Content,Date,text,target
0,5,It just works!,2017-08-14,it just works,1
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14,целое удобноной приложение минус хотеть большо...,1
2,5,Отлично все,2017-08-14,отлично,1
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14,зависать работа антивирус ранее пользоваться н...,1
4,5,"Очень удобно, работает быстро.",2017-08-14,удобно работать быстро,1


Разбиение на train и test

In [8]:
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['target'], test_size=0.2,
                                                    random_state=13, stratify=data['target'])

Создадим корпус слов

In [9]:
train_corpus = ' '.join(X_train.values)
train_corpus[:100]

'классно невозможно использовать рутованный телефон работать нарекание отлично немочь понять заблокир'

Токенизация

In [10]:
import nltk
from nltk.tokenize import word_tokenize
# nltk.download("punkt")

tokens = word_tokenize(train_corpus)

Отфильтруем данные

и соберём в корпус N наиболее частых токенов

In [11]:
tokens_filtered = [word for word in tokens if word.isalnum()]

In [12]:
from nltk.probability import FreqDist
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]

In [13]:
len(tokens_filtered_top)

7848

Padding

In [14]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}

In [15]:
import numpy as np
def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [16]:
x_train = np.asarray([text_to_sequence(text, max_len) for text in X_train], dtype=np.int32)
x_test = np.asarray([text_to_sequence(text, max_len) for text in X_test], dtype=np.int32)

In [17]:
x_train.shape

(15798, 100)

In [18]:
import numpy as np
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D, AveragePooling1D, GlobalAveragePooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard 
from keras.objectives import categorical_crossentropy
from keras.callbacks import EarlyStopping  
import tensorflow as tf

In [19]:
num_classes = 2
y_train = keras.utils.to_categorical(y_train, num_classes)
y_val = keras.utils.to_categorical(y_test, num_classes)

In [20]:
y_train.shape

(15798, 2)

In [21]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=512, input_length=max_len))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
# model.add(AveragePooling1D())
# model.add(Conv1D(256, 5))
# model.add(Activation('relu'))
# model.add(AveragePooling1D())
model.add(Dropout(0.5))
# model.add(Conv1D(128, 3))
# model.add(Activation('relu'))
model.add(GlobalAveragePooling1D())
model.add(Dense(128, activation='relu', kernel_regularizer='l2'))
model.add(Dense(64))
model.add(Activation("relu"))
model.add(Dense(num_classes))

In [22]:
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [23]:
model.compile(loss=loss,
              optimizer='adam',
              metrics=['accuracy'])

In [24]:
tf.keras.utils.plot_model(model, show_shapes=True)

('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')


In [25]:
# model.summary()

In [26]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)  


history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=100,
                    verbose=1,
                    validation_data=(x_test, y_val),
                    callbacks=[tensorboard, early_stopping])

Epoch 1/100
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100


In [27]:
score = model.evaluate(x_test, y_val, batch_size=batch_size, verbose=1)
score_train = model.evaluate(x_train, y_train, verbose=1)
print('\n')
print('Test score:', score[0], '\tTrain score: ', score_train[0])
print('Test accuracy:', score[1], '\tTrain accuracy: ', score_train[1])



Test score: 0.20582431554794312 	Train score:  0.0809110626578331
Test accuracy: 0.9326582551002502 	Train accuracy:  0.9776554107666016


In [28]:
%load_ext tensorboard

In [29]:
%tensorboard --logdir logs

### Теперь инициализируем веса эмбеддингами word2vec

In [30]:
from gensim.models.word2vec import Word2Vec
from multiprocessing import cpu_count

In [31]:
corpus = []
for i in ([[sentence] for sentence in data.text.tolist()]):
    corpus.append(i[0].split())

In [32]:
corpus[:5]

[['it', 'just', 'works'],
 ['целое',
  'удобноной',
  'приложение',
  'минус',
  'хотеть',
  'большой',
  'доступ',
  'персональный',
  'данные',
  'телефонеприходиться',
  'пользоваться',
  'ограниченный',
  'режим'],
 ['отлично'],
 ['зависать', 'работа', 'антивирус', 'ранее', 'пользоваться', 'нормальный'],
 ['удобно', 'работать', 'быстро']]

обучим модель

In [33]:
model = Word2Vec(corpus, min_count = 5, workers=cpu_count())

In [34]:
model.wv.similar_by_word('антивирус')

[('удалить', 0.9989750385284424),
 ('установить', 0.9987788200378418),
 ('вирус', 0.9987732172012329),
 ('стоить', 0.998687207698822),
 ('ругаться', 0.9986011981964111),
 ('встроить', 0.9985995292663574),
 ('открытый', 0.9985509514808655),
 ('какой', 0.9985404014587402),
 ('pro', 0.9984924793243408),
 ('дело', 0.9984890818595886)]

In [35]:
model.wv['антивирус'], model.wv['антивирус'].shape

(array([-2.22246423e-01, -6.75868213e-01,  6.79753944e-02, -4.37089503e-01,
         4.04053926e-01,  8.54307972e-03, -9.25544277e-02,  6.67677298e-02,
         1.72699198e-01,  7.84319818e-01,  2.29080513e-01,  1.50235966e-01,
         2.96838917e-02, -5.17108977e-01,  1.32999271e-02,  1.33911595e-01,
        -2.01028228e-01, -4.18809466e-02,  1.20318808e-01,  2.55722970e-01,
        -9.69982624e-01,  9.04140234e-01,  5.69673479e-01,  3.33797991e-01,
         1.01645088e+00, -9.95044596e-03,  2.90120929e-01,  2.86735892e-01,
        -8.83910060e-02,  5.19492209e-01,  6.14972293e-01, -1.98781118e-02,
         2.29031324e-01, -6.72626570e-02, -2.48865947e-01,  5.76577902e-01,
        -1.51012644e-01,  3.79531711e-01,  4.19659823e-01,  7.44466903e-04,
        -2.45349362e-01,  1.79774359e-01, -2.34059125e-01, -4.57110763e-01,
         1.17608368e-01, -4.57845449e-01,  1.33840278e-01, -7.85816252e-01,
         4.89292085e-01,  4.04952198e-01,  1.68620292e-02, -5.65115929e-01,
        -6.7

Итак, у нас есть эмбеддинги для каждого слова в корпусе размером (100,).  
Попробуем их сложить и получить матрицу, которую потом отправим в 

In [36]:
def summ_ebm(txt):
    summ_ = np.zeros(100)
    for word in txt.split():
        if word in model.wv:
            summ_ += model.wv[word]
    return summ_

In [37]:
X_train_emb = pd.DataFrame(X_train)
X_test_emb = pd.DataFrame(X_test)

In [38]:
X_train_emb['sum_emb'] = X_train_emb.text.apply(summ_ebm)
X_test_emb['sum_emb'] = X_test_emb.text.apply(summ_ebm)

In [39]:
X_train_emb.head(2)

Unnamed: 0,text,sum_emb
547,классно,"[-0.045559320598840714, -0.1767129898071289, 0..."
1863,невозможно использовать рутованный телефон,"[-0.5540715865790844, -1.8357923179864883, 0.2..."


In [40]:
xtrain_emb = np.zeros((X_train_emb.shape[0], 100))
xtest_emb = np.zeros((X_train_emb.shape[0], 100))

In [41]:
# for i in tqdm(range(X_train_emb.shape[0])):
#     xtrain_emb[i] = X_train_emb.iloc[i].sum_emb
for i in range(X_train_emb.shape[0]):
    xtrain_emb[i] = X_train_emb.iloc[i].sum_emb

In [42]:
xtrain_emb.shape

(15798, 100)

In [43]:
for i in range(X_test_emb.shape[0]):
    xtest_emb[i] = X_test_emb.iloc[i].sum_emb

Итак мы получили матрицы из сумм эмбеддингов каждого слова в каждом отзыве. Попробуем засунуть теперь это в нейросеть в качестве весов

In [44]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=100, input_length=max_len, weights=[xtrain_emb[:max_words]]))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
# model.add(AveragePooling1D())
# model.add(Conv1D(256, 5))
# model.add(Activation('relu'))
# model.add(AveragePooling1D())
model.add(Dropout(0.5))
# model.add(Conv1D(128, 3))
# model.add(Activation('relu'))
model.add(GlobalAveragePooling1D())
model.add(Dense(128, activation='relu', kernel_regularizer='l2'))
model.add(Dense(64))
model.add(Activation("relu"))
model.add(Dense(num_classes))

In [45]:
loss = tf.keras.losses.BinaryCrossentropy()

In [46]:
model.compile(loss=loss,
              optimizer='adam',
              metrics=['accuracy'])

In [47]:
tf.keras.utils.plot_model(model, show_shapes=True)

('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')


In [48]:
tensorboard=TensorBoard(log_dir='./logs_ext_weights', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)  


history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=100,
                    verbose=1,
                    validation_data=(x_test, y_val),
                    callbacks=[tensorboard, early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100


In [49]:
score = model.evaluate(x_test, y_val, batch_size=batch_size, verbose=1)
score_train = model.evaluate(x_train, y_train, verbose=1)
print('\n')
print('Test score:', score[0], '\tTrain score: ', score_train[0])
print('Test accuracy:', score[1], '\tTrain accuracy: ', score_train[1])



Test score: 0.39280107617378235 	Train score:  0.3182274103164673
Test accuracy: 0.9237974882125854 	Train accuracy:  0.9508798718452454
