In [None]:
import pandas as pd
import tensorflow as tf

# Load the training sentences together with their raw sentiment symbols.
train_data = pd.read_csv('train.csv')[['sentence', 'sentiment']]

# Translate the sentiment symbols into integer class ids.
# NOTE(review): a symbol missing from this mapping becomes NaN and is then
# encoded as -1 by ``cat.codes`` below - confirm the file holds only these three.
train_data['sentiment'] = train_data['sentiment'].map({'−': 0, '?': 1, '+': 2})

# Train Data Labels: categorical codes for the frame, one-hot targets for Keras.
train_data["sentiment"] = train_data["sentiment"].astype('category').cat.codes
train_features = train_data['sentence']
train_labels = tf.one_hot(train_data["sentiment"], 3)


In [None]:
import numpy as np

In [None]:
# Display the first rows of the prepared training frame (sentence + encoded sentiment).
train_data.head()

Unnamed: 0,sentence,sentiment
0,При этом всегда получал качественные услуги.,2
1,"Не вижу, за что хотя бы 2 поставить, сервис на 1!",0
2,"Вот так ""Мой любимый"" банк МКБ меня обманул.",0
3,Отвратительное отношение к клиентам.,0
4,"Всегда в любое время дня и ночи помогут, ответ...",2


In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Split every raw training sentence into its list of tokens.
tokenized_train_features = list(map(word_tokenize, train_features))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from gensim.models import word2vec

# Word2Vec configuration
vector_size = 300  # dimensionality of the learned word vectors

# Train the embeddings on the tokenized training sentences.
w2v_model = word2vec.Word2Vec(
    sentences=tokenized_train_features,
    sg=1,  # 1 selects skip-gram; any other value selects CBOW
    vector_size=vector_size,
    window=20,
    min_count=1,
)

In [None]:
# Every token known to the trained Word2Vec model, in index order.
vocab_list = list(w2v_model.wv.key_to_index)

def remove_OOV_vocab(sample: list, list_vocab):
    """Keep only the tokens of ``sample`` that occur in ``list_vocab``.

    Args:
        sample: A tokenized text, i.e. an iterable of tokens. A raw string
            would be iterated character by character, so tokenize first.
        list_vocab: The vocabulary; any container supporting ``in``.

    Returns:
        A new list with the in-vocabulary tokens, in their original order
        and with duplicates preserved.
    """
    return [token for token in sample if token in list_vocab]
  
# Drop out-of-vocabulary tokens from the *tokenized* sentences. Iterating the raw
# strings in ``train_features`` would hand remove_OOV_vocab one sentence string at
# a time, which it walks character by character, leaving only single-character tokens.
vocab_lookup = set(vocab_list)  # constant-time membership instead of scanning the list
tokenized_train_features = [
    remove_OOV_vocab(each_train_sample, vocab_lookup)
    for each_train_sample in tokenized_train_features
]


In [None]:
# Key view over every token in the trained Word2Vec vocabulary.
vocab = w2v_model.wv.key_to_index.keys()
# Look up the vectors of all vocabulary tokens in one call.
# NOTE(review): presumably yields one row per token in key_to_index order, so that
# row i matches token index i used by the Embedding layer - confirm for the installed gensim.
embedding_matrix = w2v_model.wv[vocab]

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Length every index sequence is padded or truncated to before it reaches the model.
max_seq_len = 20

def w2v_indexed_token_sequences(w2v_model, list_features):
    """Map token sequences to lists of Word2Vec vocabulary indices.

    Args:
        w2v_model: Object exposing ``wv.key_to_index`` (token -> integer index).
        list_features: Iterable of token sequences.

    Returns:
        One list of indices per input sequence. Tokens missing from the
        vocabulary are skipped, so an output list may be shorter than its input.
    """
    def token_indices(tokens):
        # Yield the index of each known token; unknown tokens are dropped.
        for token in tokens:
            key_to_index = w2v_model.wv.key_to_index
            if token in key_to_index:
                yield key_to_index[token]

    return [list(token_indices(tokens)) for tokens in list_features]

# Convert the filtered token lists into vocabulary-index lists.
indexed_train_features = w2v_indexed_token_sequences(
    w2v_model=w2v_model,
    list_features=tokenized_train_features,
)

# Pad / truncate at the end so every row has exactly max_seq_len entries.
# NOTE(review): pad_sequences pads with 0 by default, which is presumably also the
# index of a real vocabulary token - confirm this overlap is acceptable.
padded_train = pad_sequences(
    indexed_train_features,
    maxlen=max_seq_len,
    padding='post',
    truncating='post',
)

In [None]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, Dropout, LSTM

def get_model():
    """Build the (uncompiled) LSTM sentiment classifier.

    The network is an Embedding layer initialised from the module-level
    ``embedding_matrix``, followed by dropout, two stacked LSTM layers and a
    3-unit softmax output. It also reads the module-level ``vector_size`` and
    ``max_seq_len``.

    Returns:
        A ``Sequential`` model that still has to be compiled.
    """
    # Derive the vocabulary size from the embedding matrix instead of a
    # hard-coded count, so the layer always matches the weights it is given.
    vocab_size = embedding_matrix.shape[0]

    model = Sequential()
    model.add(
        Embedding(input_dim=vocab_size,
                  output_dim=vector_size,
                  weights=[embedding_matrix],
                  input_length=max_seq_len))
    model.add(Dropout(0.6))
    # The first LSTM returns the whole sequence so the second LSTM can consume it.
    model.add(LSTM(max_seq_len, return_sequences=True))
    model.add(LSTM(3))
    model.add(Dense(3, activation='softmax'))
    return model
 
# Training callbacks: stop early when validation loss stalls and
# checkpoint the best model seen so far.
callbacks = [
    keras.callbacks.EarlyStopping(
        monitor="val_loss",
        restore_best_weights=True,
        patience=2,
        verbose=1,
    ),
    keras.callbacks.ModelCheckpoint(
        filepath='models/lstm_with_w2v.hdf5',
        save_best_only=True,
        verbose=1,
    ),
]

# Build the network and configure the optimisation objective.
model = get_model()
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=[tf.keras.metrics.AUC()],
)

# NOTE(review): this forces eager execution of tf.functions; presumably a
# debugging aid or workaround - confirm it is still needed.
tf.config.run_functions_eagerly(True)

# Keep the per-epoch training details so they can be analysed later.
history = model.fit(
    x=padded_train,
    y=train_labels,
    epochs=10,
    validation_split=0.33,
    callbacks=callbacks,
)

Epoch 1/10
  2/406 [..............................] - ETA: 25s - loss: 1.0896 - auc_1: 0.5965



Epoch 1: val_loss improved from inf to 0.95814, saving model to models/lstm_with_w2v.hdf5
Epoch 2/10
Epoch 2: val_loss improved from 0.95814 to 0.91924, saving model to models/lstm_with_w2v.hdf5
Epoch 3/10
Epoch 3: val_loss improved from 0.91924 to 0.90966, saving model to models/lstm_with_w2v.hdf5
Epoch 4/10
Epoch 4: val_loss improved from 0.90966 to 0.89819, saving model to models/lstm_with_w2v.hdf5
Epoch 5/10
Epoch 5: val_loss improved from 0.89819 to 0.88773, saving model to models/lstm_with_w2v.hdf5
Epoch 6/10
Epoch 6: val_loss improved from 0.88773 to 0.87220, saving model to models/lstm_with_w2v.hdf5
Epoch 7/10
Epoch 7: val_loss improved from 0.87220 to 0.86131, saving model to models/lstm_with_w2v.hdf5
Epoch 8/10
Epoch 8: val_loss improved from 0.86131 to 0.84620, saving model to models/lstm_with_w2v.hdf5
Epoch 9/10
Epoch 9: val_loss improved from 0.84620 to 0.83832, saving model to models/lstm_with_w2v.hdf5
Epoch 10/10
Epoch 10: val_loss improved from 0.83832 to 0.83106, savin

In [None]:
# Inspect the one-hot encoded training targets.
print(train_labels)

tf.Tensor(
[[0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 ...
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]], shape=(19361, 3), dtype=float32)


In [None]:
# Reload the model from the path the ModelCheckpoint callback saves to.
model_with_w2v = keras.models.load_model('models/lstm_with_w2v.hdf5')

In [None]:
# Predicted class = index of the highest softmax probability. Thresholding at 0.5
# first would turn every row in which no class exceeds 0.5 into all zeros, which
# argmax then reports as class 0.
y_pred_proba_train = model_with_w2v.predict(padded_train)
y_pred_train = np.argmax(y_pred_proba_train, axis=1)
# One-hot form of the predictions, kept under its original name.
y_pred_one_hot_encoded = np.eye(y_pred_proba_train.shape[1], dtype="int32")[y_pred_train]



In [None]:
# Show the predicted class ids of the first 100 training samples.
print(y_pred_train[:100])

[0 0 0 0 2 2 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 0 0 0 0 0 2
 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 2
 0 0 0 0 0 0 0 2 0 1 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0]


In [None]:
from sklearn.metrics import classification_report
# Training set: per-class metrics of the predictions against the encoded labels
print(
    classification_report(
        y_true=train_data['sentiment'],
        y_pred=y_pred_train,
    )
)

In [None]:
test_data = pd.read_csv('''the file will be here''')

# Test Data Labels
# Apply the same symbol -> class-id mapping that was used for the training labels.
# Without it, ``cat.codes`` numbers the raw symbols in sorted order
# ('+' < '?' < '−'), which reverses the class ids relative to training.
# Labels that are already numeric are left untouched.
if not pd.api.types.is_numeric_dtype(test_data["sentiment"]):
    test_data["sentiment"] = test_data["sentiment"].map({'−': 0, '?': 1, '+': 2})
test_data["sentiment"] = test_data["sentiment"].astype('category')
test_data["sentiment"] = test_data["sentiment"].cat.codes
test_features, test_labels = test_data['sentence'], tf.one_hot(
    test_data["sentiment"], 3)

In [None]:
# Tokenize each test sentence, then keep only tokens the Word2Vec model knows.
tokenized_test_features = [
    remove_OOV_vocab(word_tokenize(each_test_text), vocab_list)
    for each_test_text in test_features
]

AttributeError: ignored

In [None]:
# Convert the filtered test tokens into vocabulary-index lists.
indexed_test_features = w2v_indexed_token_sequences(
    w2v_model=w2v_model,
    list_features=tokenized_test_features,
)
# Pad / truncate at the end, exactly as is done for the training sequences.
padded_test = pad_sequences(
    indexed_test_features,
    maxlen=max_seq_len,
    padding='post',
    truncating='post',
)

In [None]:
# Predicted class = index of the highest softmax probability. Thresholding at 0.5
# first would turn every row in which no class exceeds 0.5 into all zeros, which
# argmax then reports as class 0.
y_pred_proba_test = model_with_w2v.predict(padded_test)
y_pred_test = np.argmax(y_pred_proba_test, axis=1)
# One-hot form of the predictions, kept under its original name.
y_pred_one_hot_encoded = np.eye(y_pred_proba_test.shape[1], dtype="int32")[y_pred_test]

In [None]:
# Test Set: per-class metrics of the predictions against the encoded labels
print(
    classification_report(
        y_true=test_data['sentiment'],
        y_pred=y_pred_test,
    )
)