In [43]:
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook

from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU,SimpleRNN

# Preprocessing

In [2]:
pos = pd.read_csv("positive.csv", header=None, error_bad_lines=False, sep=';', names = [str(x) for x in (list(range(3)) + ['Text'] + list(range(4,12)))])
pos["emotion"] = 1

neg = pd.read_csv("negative.csv", header=None, error_bad_lines=False, sep=';', names = [str(x) for x in (list(range(3)) + ['Text'] + list(range(4,12)))])
neg["emotion"] = 0

df = pd.concat([neg, pos], ignore_index=True)

reg = re.compile('[^а-яА-ЯёЁ\s.():|!?]')
df["cleaned"] = df["Text"].apply(lambda x:re.sub(r'\s+', ' ', (reg.sub(' ', x))).strip())
df["cleaned"] = df["cleaned"].apply(lambda x: re.sub(r'([.():|])',r' \1 ',x))

uTags = {
        'A':       'ADJ',                                                                                                                                                                                                                                                                  
        'ADV':     'ADV',                                                                                                                                                                                                                                                                  
        'ADVPRO':  'ADV',                                                                                                                                                                                                                                                                  
        'ANUM':    'ADJ',                                                                                                                                                                                                                                                                    
        'APRO':    'DET',                                                                                                                                                                                                                                                                    
        'COM':     'ADJ',                                                                                                                                                                                                                                                                  
        'CONJ':    'SCONJ',                                                                                                                                                                                                                                                                 
        'INTJ':    'INTJ',                                                                                                                                                                                                                                                                 
        'NONLEX':  'X',                                                                                                                                                                                                                                                                      
        'NUM':     'NUM',                                                                                                                                                                                                                                                                   
        'PART':    'PART',                                                                                                                                                                                                                                                                   
        'PR':      'ADP',                                                                                                                                                                                                                                                                    
        'S':       'NOUN',                                                                                                                                                                                                                                                                   
        'SPRO':    'PRON',                                                                                                                                                                                                                                                                  
        'UNKN':    'X',                                                                                                                                                                                                                                                                      
        'V':       'VERB'
    }


def sentenceInLemmas(sentence, m=Mystem()):
    processed = m.analyze(sentence)
    tagged = []
    for w in processed:        
        try:
            lemma = w["analysis"][0]["lex"].lower().strip()
            pos = w["analysis"][0]["gr"].split(',')[0]
            pos = pos.split('=')[0].strip()
            tagged.append(lemma+'_'+uTags[pos])
        except:
            if 'text' in w:
                tagged.append(w['text'].strip())
            else:
                tagged.append(w)
    return re.sub('\s+',' ',' '.join(tagged)).split(' + ')

text = ' + '.join(df["cleaned"].values)
df["lemmed"] = sentenceInLemmas(text)

In [3]:
part_index = list(df.loc[df['emotion']==0].index[:15000])+list(df.loc[df['emotion']==1].index[:15000])

# GloVe

## Vocabulary

In [4]:
words_set = set()
for i in tqdm(df.iloc[part_index].index):
    sentence = df.iloc[i]['lemmed']
    for word in sentence.split(' '):
        words_set.add(word)

100%|██████████████████████████████████████████████████████████████████████████| 30000/30000 [00:09<00:00, 3195.85it/s]


In [5]:
vocab_len = len(words_set)

In [6]:
word_idx = {}
for n,word in enumerate(words_set):
    word_idx[word]=n

## Co-occurance matrix

In [7]:
P = np.zeros(shape=(vocab_len,vocab_len))
window_size = 5
for s in tqdm(df.iloc[part_index]['lemmed'].values):
    words = s.split(' ')
    for n, word in enumerate(words):
        i1 = word_idx[word]
        occuring_words_l = words[max([0,n-window_size]):n]
        occuring_words_r = words[n+1:min([len(words),n+1+window_size])]
        occuring_words = occuring_words_l + occuring_words_r
        distances = list(range(len(occuring_words_l),0,-1)) + list(range(1,len(occuring_words_r)+1,1))
        weights = [1/x for x in distances]
        for ow,w in zip(occuring_words,weights):
            i2 = word_idx[ow]
            if P[i1,i2] < 100:
                P[i1,i2] = min([P[i1,i2]+w,100])
P_max = P.max()

100%|██████████████████████████████████████████████████████████████████████████| 30000/30000 [00:10<00:00, 2811.04it/s]


## Training GloVe

In [9]:
word_dimensionality = 300
v_vectors = np.random.uniform(-1,1,(vocab_len, word_dimensionality))
u_vectors = np.random.uniform(-1,1,(vocab_len, word_dimensionality))
u_learning_rate = 2e-3
v_learning_rate = 2e-3

n_epochs = 8

for step in tqdm_notebook(range(n_epochs)):
    # Здесь будут градиенты
    u_changes = []
    v_changes = []
    
    loss = 0
    
    for i in tqdm_notebook(range(vocab_len)):
        # градиент для конкретного вектора
        u_change = None
        v_change = None
        
        for j in range(vocab_len):
            P_local = P[i,j]
            if P_local == 0:
                continue
            
            # Выражение в скобках, чтобы не пересчитывать несколько раз
            diff = np.matmul(u_vectors[i].T, v_vectors[j]) - math.log(P_local)
            
            loss += (P_local/P_max)**(3/4)*(diff)**2
            
            v_addition = (P_local/P_max)**(3/4)*diff*u_vectors[j]
            u_addition = (P_local/P_max)**(3/4)*diff*v_vectors[j]
            
            if u_change is not None:
                u_change += u_addition
                v_change += v_addition
            else:
                u_change = u_addition
                v_change = v_addition
        u_changes.append(u_change)
        v_changes.append(v_change)
        
    print('loss before epoch {}: {}'.format(step, loss))
        
    u_vectors -= u_learning_rate * np.array(u_changes)
    v_vectors -= v_learning_rate * np.array(v_changes)
    
    u_learning_rate *= 0.85
    v_learning_rate *= 0.85

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=28568.0), HTML(value='')))


loss before epoch 0: 1243006.1113330657


HBox(children=(FloatProgress(value=0.0, max=28568.0), HTML(value='')))


loss before epoch 1: 1008530.8014066451


HBox(children=(FloatProgress(value=0.0, max=28568.0), HTML(value='')))


loss before epoch 2: 929491.2084711127


HBox(children=(FloatProgress(value=0.0, max=28568.0), HTML(value='')))


loss before epoch 3: 884182.9707948067


HBox(children=(FloatProgress(value=0.0, max=28568.0), HTML(value='')))


loss before epoch 4: 854356.6528184398


HBox(children=(FloatProgress(value=0.0, max=28568.0), HTML(value='')))


loss before epoch 5: 833439.6178608071


HBox(children=(FloatProgress(value=0.0, max=28568.0), HTML(value='')))


loss before epoch 6: 818129.6436833286


HBox(children=(FloatProgress(value=0.0, max=28568.0), HTML(value='')))


loss before epoch 7: 806534.5097516429



In [10]:
final_loss = 0
for i in tqdm(range(vocab_len)):
    for j in range(vocab_len):
        if P[i,j] == 0:
            continue
        final_loss += (P[i,j]/P_max)**(3/4)*(np.matmul(u_vectors[i].T, v_vectors[j]) - math.log(P[i,j]))**2
print('finished at loss: {}'.format(final_loss))

100%|████████████████████████████████████████████████████████████████████████████| 28568/28568 [09:14<00:00, 51.49it/s]

finished at loss: 797510.1406268472





## Resulting vectors for words

In [11]:
word2vec = u_vectors + v_vectors

# Test

In [39]:
def to_matrix(frame, word2vec):
    frame.reset_index(inplace=True)
    matrixes = []
    tone = []
    for i in tqdm_notebook(frame.index):
        try:
            words = frame.iloc[i]['lemmed'].split(' ')
            word_vectors = []
            for w in words:
                try:
                    word_vectors.append(word2vec[word_idx[w]])
                except KeyError:
                    print(w)
            if len(word_vectors) > 2 and len(word_vectors) <= 32:
                matrixes.append(np.array(word_vectors))
                tone.append(frame.iloc[i]['emotion'])
        except Exception as e:
            print(frame.iloc[i]['lemmed'])
            raise e

    return matrixes, tone

In [15]:
vocabsize = 300
timesteps = 32
batch_size = 64
n_epochs = 200
n_parts = 5

In [40]:
matrices, tones = to_matrix(df.iloc[part_index], word2vec)

HBox(children=(FloatProgress(value=0.0, max=30000.0), HTML(value='')))




In [41]:
for i in tqdm_notebook(range(len(matrices))):
    matrices[i] = np.vstack(
        (np.random.normal(scale=0.005, size=(timesteps - matrices[i].shape[0], vocabsize)),matrices[i]))
matrices = np.array(matrices)

HBox(children=(FloatProgress(value=0.0, max=29753.0), HTML(value='')))




In [44]:
X_train, X_test, y_train, y_test = train_test_split(matrices, tones, test_size=0.2, random_state=42,stratify=tones)

In [45]:
y_train = np.array(y_train, ndmin=2).T
y_test = np.array(y_test, ndmin=2).T

In [46]:
def lstm(timesteps, vocabsize):
    model = Sequential()
    model.add(LSTM(1, return_sequences=False, input_shape=(timesteps, vocabsize), dropout=0.05, activation="sigmoid"))
    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [47]:
model3 = lstm(timesteps, vocabsize)

In [48]:
%%time
model3.fit(X_train, y_train, batch_size=batch_size, epochs=50, verbose=2,validation_data=(X_test,y_test))

Train on 23802 samples, validate on 5951 samples
Epoch 1/50
 - 19s - loss: 0.2666 - acc: 0.8600 - val_loss: 0.2156 - val_acc: 0.8704
Epoch 2/50
 - 17s - loss: 0.1997 - acc: 0.8816 - val_loss: 0.1908 - val_acc: 0.8852
Epoch 3/50
 - 17s - loss: 0.1808 - acc: 0.9006 - val_loss: 0.1831 - val_acc: 0.9020
Epoch 4/50
 - 18s - loss: 0.1719 - acc: 0.9190 - val_loss: 0.1769 - val_acc: 0.9027
Epoch 5/50
 - 18s - loss: 0.1663 - acc: 0.9324 - val_loss: 0.1684 - val_acc: 0.9361
Epoch 6/50
 - 18s - loss: 0.1604 - acc: 0.9444 - val_loss: 0.1651 - val_acc: 0.9366
Epoch 7/50
 - 17s - loss: 0.1564 - acc: 0.9484 - val_loss: 0.1609 - val_acc: 0.9456
Epoch 8/50
 - 17s - loss: 0.1533 - acc: 0.9517 - val_loss: 0.1589 - val_acc: 0.9504
Epoch 9/50
 - 18s - loss: 0.1503 - acc: 0.9576 - val_loss: 0.1531 - val_acc: 0.9524
Epoch 10/50
 - 17s - loss: 0.1466 - acc: 0.9605 - val_loss: 0.1501 - val_acc: 0.9580
Epoch 11/50
 - 16s - loss: 0.1433 - acc: 0.9637 - val_loss: 0.1474 - val_acc: 0.9578
Epoch 12/50
 - 18s - loss

<tensorflow.python.keras.callbacks.History at 0x211437412b0>