[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sjut/HSE-Compling/blob/master/seminars/7_Sentiment.ipynb)

In [None]:
!pip install innvestigate

In [None]:
!pip install tensorflow==1.14.0

In [None]:
import re
import keras.backend
import keras.models
import matplotlib.pyplot as plt
from matplotlib import cm, transforms

import numpy as np
import pandas as pd
import os
import pickle
import time

In [None]:
!wget http://vectors.nlpl.eu/repository/11/180.zip
!unzip 180.zip

In [None]:
!wget https://raw.githubusercontent.com/sjut/HSE-Compling/master/seminars/data/reviews_tok.txt
!wget https://raw.githubusercontent.com/sjut/HSE-Compling/master/seminars/data/reviews_scores.txt

## Данные
Будем использовать кусочек данных с соревнования SentiRuEval.
Они уже предобработаны (лемматизированы и размечены POS-тегами). Каждый текст - строчка из токенов *лемма_тег*.
Оценки усреднены по трем аспектам, шкалированы от 1 до 10.
Мы будем строить бинарную классификацию, поэтому будем считать оценки выше 5 положительными, а 5 и ниже — отрицательными.

In [None]:
# Each line of reviews_tok.txt is one review, split on whitespace into tokens.
# The corpus is Russian text, so the encoding is pinned to UTF-8 instead of
# relying on the platform default (assumes the files are UTF-8 — TODO confirm).
with open("reviews_tok.txt", encoding="utf-8") as f:
    texts = [line.rstrip('\r\n').split() for line in f]

# One numeric score per line; presumably line i scores review i — verify
# against the data files.
with open("reviews_scores.txt", encoding="utf-8") as f:
    scores = [float(line.rstrip("\r\n")) for line in f]

In [None]:
scores = np.array(scores)
binary_scores = scores > 5.
print(binary_scores[0])

In [None]:
binary_scores = binary_scores.astype(int)

In [None]:
def max_length(texts):
    """Return the length of the longest item in ``texts``.

    ``texts`` is any iterable of sized items (token lists, strings, array
    rows). An empty iterable raises ``ValueError`` because there is no
    maximum to report.
    """
    lengths = map(len, texts)
    return max(lengths)

In [None]:
print(len(texts))
print(len(scores))

## Задание
Посчитайте $tf*idf$ для токенов и биграмм сначала на всем корпусе, а затем отдельно для положительных и отрицательных отзывов.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=5)

In [None]:
## YOUR CODE HERE

In [None]:
## Выведем топ по tf*idf

In [None]:
## Для положительных отзывов

In [None]:
## Для отрицательных отзывов

## Random Forest
Возьмем представление в виде $tf*idf$ и попробуем обучить на нем классификатор.
Будем использовать Random Forest, чтобы легко вытащить важность признаков.

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random Forest with default hyperparameters; no random_state is passed, so
# the fitted model (and the score below) can differ between runs.
clf = RandomForestClassifier()
# The tf-idf vocabulary is fitted on the first 2000 reviews only; each token
# list is joined back into a string because the vectorizer is fed strings.
# .toarray() converts the vectorizer output into a dense array.
X_train = vectorizer.fit_transform([" ".join(t) for t in texts[:2000]]).toarray()
y_train = binary_scores[:2000]
clf.fit(X_train, y_train)
# Everything after the first 2000 reviews is the test set; it is transformed
# with the vocabulary fitted above (transform, not fit_transform).
y_test = binary_scores[2000:]
X_test = vectorizer.transform([" ".join(t) for t in texts[2000:]]).toarray()
# Last expression of the cell, so the test-set score is what gets displayed.
clf.score(X_test, y_test)

In [None]:
# The most important features, sorted by descending importance.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back only on releases that lack it.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
sorted(zip(feature_names, clf.feature_importances_), key=lambda x: x[1], reverse=True)

**Выводы**: не очень утешительные

## Оценка тональности с помощью CNN

В качестве входных представлений будем использовать word2vec для лемм с POS-тегами UD.
Архитектура классификатора примерно воспроизводит описанную в [статье Arras et al. 2017](http://www.aclweb.org/anthology/W16-1601); а для визуализации воспользуемся библиотекой [iNNvestigate](https://github.com/albermax/innvestigate).

In [None]:
from gensim.models import KeyedVectors
w2v_model = KeyedVectors.load_word2vec_format('model.bin', binary=True)

In [None]:
from sklearn.model_selection import train_test_split
# Labels are sliced to the same first 2000 reviews as the texts:
# binary_scores covers the whole corpus, and train_test_split refuses inputs
# whose lengths differ.
scores_train, scores_val, texts_train, texts_val = train_test_split(
    binary_scores[:2000], texts[:2000], test_size=0.3)

In [None]:
from collections import Counter
from itertools import chain

MAX_LEN = max(max_length(texts_train), max_length(texts_val))

def load_dataset(lines, embedding_dim, num_examples=None):
    """Embed tokenised texts into a zero-padded tensor and count their tokens.

    Args:
        lines: sequence of texts, each a sequence of tokens that are looked up
            in the module-level ``w2v_model``.
        embedding_dim: length of a single embedding vector.
        num_examples: if given, only the first ``num_examples`` texts are used.

    Returns:
        A pair ``(x_tensor, vocab)``. ``x_tensor`` has shape
        ``(len(prep), MAX_LEN, embedding_dim)``; positions past the end of a
        text and tokens missing from ``w2v_model`` stay all-zero. ``vocab`` is
        a ``Counter`` over every token of the selected texts, whether or not
        it has an embedding.
    """
    prep = lines[:num_examples]
    vocab = Counter()
    x_tensor = np.zeros((len(prep), MAX_LEN, embedding_dim))
    for i, text in enumerate(prep):
        # Count all tokens of the text. The counter update used to sit after
        # the inner loop, so only the last token of each text was counted and
        # an empty first text raised NameError.
        vocab.update(text)
        # Tokens beyond MAX_LEN have no slot in the tensor; truncate instead
        # of failing with IndexError.
        for j, w in enumerate(text[:MAX_LEN]):
            try:
                x_tensor[i, j, :] = w2v_model[w]
            except KeyError:
                # No embedding for this token: keep the zero vector.
                pass
    return x_tensor, vocab

In [None]:
input_tensor_train, inp_vocab_train = load_dataset(texts_train, w2v_model.vector_size)
input_tensor_val, inp_vocab_val = load_dataset(texts_val, w2v_model.vector_size)

In [None]:
w2v_model.vector_size

In [None]:
input_tensor_train.shape

In [None]:
scores_train

In [None]:
embedding_dim = w2v_model.vector_size
inp_vocab = inp_vocab_train + inp_vocab_val
vocab_inp_size = len(inp_vocab) + 1

Картинка про классификатор и оценку значимости входных слов ([источник](https://doi.org/10.1371/journal.pone.0181142.g001)):
<img src="https://camo.githubusercontent.com/ba37f37fdbb90ccd76f1c4bf399e0cb8ddbc66f0/68747470733a2f2f692e696d6775722e636f6d2f595144665335502e706e67"/>

In [None]:
from innvestigate.utils.tests.networks import base as network_base
def build_network(max_len, voc_size, embedding_dim, output_n, activation=None, dense_unit=256, dropout_rate=0.25):
    """Assemble the layers of a single-convolution text classifier.

    The input has shape ``[1, max_len, embedding_dim]``. It goes through a
    100-filter convolution with a ``(1, 2)`` kernel, a max-pooling whose window
    is ``(1, max_len - 1)``, a flatten, a dense layer with ``output_n`` units
    and finally a softmax. (With Keras' channels-last layout the kernel would
    cover two neighbouring positions of the sequence — TODO confirm the
    configured data format.)

    ``voc_size``, ``dense_unit`` and ``dropout_rate`` are accepted but unused.

    Returns:
        A dict holding the tensors under ``"in"``, ``"conv"``, ``"pool"``,
        ``"out"`` (dense output before softmax) and ``"sm_out"`` (after
        softmax), plus the ``"input_shape"`` list and ``"output_n"``.
    """
    # NOTE(review): any truthy ``activation`` is replaced by "relu", while a
    # falsy one (the default None) is passed through as is — confirm intent.
    dense_activation = "relu" if activation else activation

    inputs = keras.Input(shape=[1, max_len, embedding_dim])
    convolved = keras.layers.Conv2D(
        filters=100,
        kernel_size=(1, 2),
        strides=(1, 1),
        padding='valid',
    )(inputs)
    pooled = keras.layers.MaxPooling2D(
        pool_size=(1, max_len - 1),
        strides=(1, 1),
    )(convolved)
    flattened = keras.layers.Flatten()(pooled)
    logits = network_base.dense_layer(flattened, units=output_n, activation=dense_activation)
    probabilities = network_base.softmax(logits)

    return {
        "in": inputs,
        "conv": convolved,
        "pool": pooled,
        "out": logits,
        "sm_out": probabilities,
        "input_shape": [1, max_len, embedding_dim],
        "output_n": output_n,
    }

In [None]:
net = build_network(MAX_LEN, vocab_inp_size, embedding_dim, 2)
model_without_softmax = keras.models.Model(inputs=net['in'], outputs=net['out'])
model_with_softmax = keras.models.Model(inputs=net['in'], outputs=net['sm_out'])

In [None]:
model_without_softmax.summary()

In [None]:
def to_one_hot(y, num_classes=2):
    """Convert integer class labels ``y`` into one-hot rows.

    Args:
        y: integer class labels.
        num_classes: width of each one-hot row; defaults to 2, the binary
            setting this notebook uses.

    Returns:
        Whatever ``keras.utils.to_categorical`` returns for ``y``.
    """
    return keras.utils.to_categorical(y, num_classes)

def train_model(model, epochs=20, batch_size=256):
    """Compile ``model`` and fit it on the module-level train/validation data.

    Reads ``input_tensor_train`` / ``scores_train`` and ``input_tensor_val`` /
    ``scores_val`` from module scope.

    Args:
        model: Keras model to compile and train in place.
        epochs: number of training epochs.
        batch_size: mini-batch size passed to ``model.fit``.

    Returns:
        The object returned by ``model.fit`` (the training history), so the
        caller can inspect it; previously it was discarded.
    """
    # Insert a singleton axis at position 1, in front of the per-review
    # (length, embedding) axes, to match the network's input shape.
    x_train = np.expand_dims(input_tensor_train, axis=1)
    y_train = to_one_hot(scores_train)

    x_val = np.expand_dims(input_tensor_val, axis=1)
    y_val = to_one_hot(scores_val)

    model.compile(loss='categorical_crossentropy',
                  optimizer=keras.optimizers.Adam(),
                  metrics=['accuracy'])

    return model.fit(x_train, y_train,
                     batch_size=batch_size,
                     epochs=epochs,
                     verbose=1,
                     validation_data=(x_val, y_val),
                     shuffle=True)

In [None]:
train_model(model_with_softmax, epochs=10)

In [None]:
model_without_softmax.set_weights(model_with_softmax.get_weights())

In [None]:
# Names of the attribution methods handed to innvestigate.create_analyzer and,
# at the same index, the extra keyword arguments for each of them.
methods = ['gradient', 'lrp.z', 'lrp.alpha_2_beta_1', 'pattern.attribution']
kwargs = [{}, {}, {}, {'pattern_type': 'relu'}]

In [None]:
import innvestigate
# One analyzer per entry of ``methods``, appended in the same order, so an
# analyzer's index identifies its method name.
analyzers = []

for method, kws in zip(methods, kwargs):
    # Analyzers are built on the model whose output is the pre-softmax layer.
    analyzer = innvestigate.create_analyzer(method, model_without_softmax, **kws)
    # fit() is called for every method on the training tensor (with the same
    # extra axis the network input has).
    # NOTE(review): presumably only the pattern-based method actually learns
    # anything here — confirm against the iNNvestigate documentation.
    analyzer.fit(np.expand_dims(input_tensor_train, axis=1), batch_size=256, verbose=1)
    analyzers.append(analyzer)

In [None]:
def analyze_scores(X, Y, ridx, verbose=1):
    """Run every analyzer in ``analyzers`` on one review and predict its class.

    Args:
        X: embedded reviews; ``X[ridx]`` is taken to be a
            ``(max_len, embedding_dim)`` array.
        Y: labels for ``X``. Kept for interface compatibility; not used.
        ridx: index of the review to analyse.
        verbose: when truthy, print the review index and the elapsed time.

    Returns:
        A pair ``(analysis, y_hat)``. ``analysis`` has shape
        ``(len(analyzers), 1, max_len)`` and holds, per analyzer, the
        analyzer output summed over the embedding axis. ``y_hat`` is the
        argmax of the softmax model's output for the review.
    """
    # Read the padded length and embedding size from the review itself rather
    # than from module-level training data, so any compatible tensor works.
    x = np.asarray(X[ridx])
    max_len, emb_dim = x.shape[-2:]

    analysis = np.zeros([len(analyzers), 1, max_len])
    t_start = time.time()
    # Batch of one review with the extra singleton axis the network expects.
    x = x.reshape((1, 1, max_len, emb_dim))
    prob = model_with_softmax.predict_on_batch(x)[0]  # forward pass with softmax
    y_hat = prob.argmax()

    for aidx, analyzer in enumerate(analyzers):
        # Drop the singleton axes, then collapse the embedding axis to get
        # one value per position.
        a = np.squeeze(analyzer.analyze(x))
        a = np.sum(a, axis=1)
        analysis[aidx] = a
    t_elapsed = time.time() - t_start
    if verbose:
        print('Review %d (%.4fs)' % (ridx, t_elapsed))
    return analysis, y_hat

In [None]:
analyze_scores(input_tensor_train, scores_train, 97)

In [None]:
def plot_text_heatmap(words, scores, title="", width=5, height=0.2, verbose=0, max_word_per_line=10):
    """Draw ``words`` as rows of boxes whose fill colour encodes ``scores``.

    Args:
        words: tokens to draw, in order.
        scores: numpy array of per-token scores; ``scores[i]`` colours
            ``words[i]``. It may be longer than ``words``; extra entries still
            take part in the normalisation but are not drawn.
        title: figure title, left-aligned.
        width, height: figure size passed to ``plt.figure``.
        verbose: ``> 0`` prints the input lengths, ``> 1`` also prints raw and
            normalised scores; ``0`` additionally hides the axes.
        max_word_per_line: number of tokens after which a new row starts.

    Scores are scaled by their largest absolute value into ``[0, 1]`` and
    mapped through matplotlib's ``bwr`` colormap. If every score is zero, all
    tokens get the midpoint colour instead of NaN colours.
    """
    plt.figure(figsize=(width, height))

    ax = plt.gca()

    ax.set_title(title, loc='left')
    tokens = words
    if verbose > 0:
        print('len words : %d | len scores : %d' % (len(words), len(scores)))

    cmap = plt.cm.ScalarMappable(cmap=cm.bwr)
    cmap.set_clim(0, 1)

    canvas = ax.figure.canvas
    t = ax.transData

    # Normalisation:
    # - negative scores map to [0, 0.5)
    # - positive scores map to (0.5, 1]
    score_array = np.asarray(scores, dtype=float)
    peak = np.max(np.abs(score_array)) if score_array.size else 0.0
    if peak > 0:
        normalized_scores = 0.5 * score_array / peak + 0.5
    else:
        # All-zero (or empty) scores: avoid 0/0 and use the neutral midpoint.
        normalized_scores = np.full(score_array.shape, 0.5)

    if verbose > 1:
        print('Raw score')
        print(scores)
        print('Normalized score')
        print(normalized_scores)

    loc_y = -0.2

    for i, token in enumerate(tokens):
        # to_rgba(..., bytes=True) yields 0-255 RGBA; alpha is discarded.
        *rgb, _ = cmap.to_rgba(normalized_scores[i], bytes=True)
        color = '#%02x%02x%02x' % tuple(rgb)

        text = ax.text(0.0, loc_y, token,
                       bbox={
                           'facecolor': color,
                           'pad': 5.0,
                           'linewidth': 1,
                           'boxstyle': 'round,pad=0.5'
                       }, transform=t)

        # Draw right away so the rendered width of this token can be measured
        # and used to place the next one.
        text.draw(canvas.get_renderer())
        ex = text.get_window_extent()

        # Start a new row once the current one holds enough words.
        if (i + 1) % max_word_per_line == 0:
            loc_y = loc_y - 2.5
            t = ax.transData
        else:
            # Shift the next token right by this token's width plus a gap.
            t = transforms.offset_copy(text.get_transform(), x=ex.width + 15, units='dots')

    if verbose == 0:
        ax.axis('off')

In [None]:
a, y_pred = analyze_scores(input_tensor_train, scores_train, 100, verbose=1)

In [None]:
print(" ".join(texts_train[100]))

In [None]:
a[0][0]

In [None]:
plot_text_heatmap(
    texts_train[100],
    a[0][0]
)

In [None]:
# Index of the validation review to explain.
idx = 0
words = texts_val[idx]

print('Review(id=%d): %s' % (idx, ' '.join(words)))
y_true = scores_val[idx]
# a[j] holds the scores produced by the analyzer built for methods[j].
a, y_pred = analyze_scores(input_tensor_val, scores_val, idx)

# Show the predicted class with a check mark, or a cross followed by the true
# class when the prediction is wrong.
print("Pred class : %d %s" %
      (y_pred, '✓' if y_pred == y_true else '✗ (%d)' % y_true)
      )

# One heatmap per attribution method, each shown as its own figure.
for j, method in enumerate(methods):
    plot_text_heatmap(words, a[j].reshape(-1), title='Method: %s' % method, verbose=0)
    plt.show()
    print()