# Решение задачи классификации текстов по сантименту при помощи нейронных сетей-4

### Эпиграф

Будучи глубоко неудовлетворен теми результатами, которых позволяла достичь линейная и логистическая регрессия, я решил отложить сдачу финального проекта до тех пор, пока не обучусь нейросетям. Т.к. в настоящей специализации они проходились, я считаю такое решение задачи нейросетями совершенно легитимным.


In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download("punkt")
nltk.download("wordnet")
stopwords = nltk.corpus.stopwords.words('english')
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
print(tf.__version__)
import keras
import keras.backend as K
import keras.layers as L
import tensorflow.compat.v1 as v1
from sklearn.model_selection import train_test_split

np.random.seed(42)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rookie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rookie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


2.3.1


In [2]:
dft = pd.read_csv("_data\products_sentiment_train.tsv", sep='\t', header=None)
dft.columns = ["text", "label"]

dfv = pd.read_csv("_data\products_sentiment_test.tsv", sep='\t')

X = [x[0] for x in dft[["text"]].values.tolist()]
Y = [y[0] for y in dft[["label"]].values.tolist()]
X_test = [x[0] for x in dfv[["text"]].values.tolist()]

def tokenize(text):
    return [lemmatizer.lemmatize(w) for w in text.lower().split() if w not in stopwords and w.isalnum()]

def tokenize_texts(texts):
    return [tokenize(text) for text in texts]

Xt = tokenize_texts(X)
Xtest_t = tokenize_texts(X_test)

PAD = "<PAD>"
START = "<START>"
END = "<END>"
UNK = "<UNK>"

all_words = [PAD, START, END, UNK] + list(set([item for sublist in Xt + Xtest_t for item in sublist]))
vocab = {word: idx for idx, word in enumerate(all_words)}

len(Xt), len(Y), len(vocab)

(2000, 2000, 3903)

In [5]:
import pickle

with open(r"_data\vocab.pckl", "wb") as f:
    pickle.dump(vocab, f)

In [6]:
#4.1 Функции для работы модели
from keras.preprocessing.sequence import pad_sequences

def to_sequence(text):
    return np.array([1] + [vocab[w] if w in vocab else vocab[UNK] for w in text] + [2])

def to_matrix(texts, maxlen=0):
    seqs = [to_sequence(text) for text in texts]
    if maxlen == 0:
        maxlen = min(9999, max(list(map(len, seqs))))
    return pad_sequences(seqs, maxlen=maxlen, dtype='int32', padding='post', truncating='post', value=0)

Xt[:3], to_matrix(Xt[:3])

([['2', 'take', 'around', '640x480', 'picture'],
  ['downloaded',
   'trial',
   'version',
   'computer',
   'associate',
   'ez',
   'firewall',
   'antivirus',
   'fell',
   'love',
   'computer',
   'security',
   'system'],
  ['wrt54g',
   'plus',
   'hga7t',
   'perfect',
   'solution',
   'need',
   'wireless',
   'coverage',
   'wider',
   'area',
   'house',
   'case']],
 array([[   1, 2935,  579,  841,  229, 1192,    2,    0,    0,    0,    0,
            0,    0,    0,    0],
        [   1,  350, 3448,  906,  712,  989, 2532, 2902, 3428, 3165, 2225,
          712, 1430,  954,    2],
        [   1,  283, 3187,  422,  869, 3397, 2791,  610,  870, 1295, 3028,
         1348, 2394,    2,    0]]))

In [12]:
#5.1. Модель 1 - дает точность 0.7666
N_LSTM = 64

def build_model():
    X = L.Input(batch_input_shape=(None, None))
    e = L.Embedding(len(vocab), 64, mask_zero=True)(X)
    l1 = L.Bidirectional(L.LSTM(units=N_LSTM, return_sequences=False, dropout=0.25))(e)    
    d1 = L.Dense(64, activation='relu')(l1)
    d1 = L.Dropout(0.5)(d1) 
    Y = L.Dense(1, activation='sigmoid')(d1)
    return keras.models.Model(inputs=X, outputs=Y)

model = build_model()
model.compile(optimizer=keras.optimizers.Adam(lr=0.001), loss="binary_crossentropy", metrics=['accuracy'])
model.summary()

Model: "functional_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_5 (Embedding)      (None, None, 64)          249792    
_________________________________________________________________
bidirectional_5 (Bidirection (None, 128)               66048     
_________________________________________________________________
dense_10 (Dense)             (None, 64)                8256      
_________________________________________________________________
dropout_5 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 65        
Total params: 324,161
Trainable params: 324,161
Non-trainable params: 0
_______________________________________________

In [14]:
X_t, X_v, Y_t, Y_v = train_test_split(Xt, Y, test_size=0.3, random_state=42)

model = build_model()
model.compile(optimizer=keras.optimizers.Adam(lr=0.001), loss="binary_crossentropy", metrics=['binary_accuracy'])
model.fit(to_matrix(X_t), np.array(Y_t), validation_data=(to_matrix(X_v), np.array(Y_v)), initial_epoch=0, epochs=5, batch_size=1,
         callbacks=[tf.keras.callbacks.ModelCheckpoint('./best_model2.hdf5', monitor='val_binary_accuracy', verbose=0, save_best_only=True)])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x22b2a6a3ca0>

In [15]:
Xtest_t[:3]


[['small',
  'digital',
  'elph',
  'rather',
  'one',
  'camera',
  'better',
  'resolution',
  'picture',
  'quality',
  'size',
  '2',
  'unless',
  'small',
  'cary',
  'around'],
 ['way',
  'first',
  'disk',
  'played',
  'naturally',
  '31',
  'day',
  'purchase',
  'dvd',
  'player',
  'froze'],
 ['better', 'zen', 'micro', 'outlook', 'compatibility']]

In [41]:
Y_pred = [0 if y < 0.5 else 1 for y in model.predict(to_matrix(Xtest_t))]

df = pd.DataFrame()
df["y"] = Y_pred
df.to_csv("kaggle_submission.csv", sep=',', index_label="Id")

In [None]:
from IPython.display import Image

Image(filename = 'screen.png')