In [17]:
import sys
import time
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.metrics import accuracy_score
from __future__ import division

In [18]:
# Load previously saved gensim KeyedVectors from 'embs_train.kv' (relative path,
# so it is resolved against the current working directory). The resulting `wv`
# is a module-level global that sentence_embedding() reads for token lookups.
wv = KeyedVectors.load('embs_train.kv')

In [19]:
def sentence_embedding(sentence):
    """Return the mean word vector of a sentence.

    The sentence is split on whitespace and every token present in the
    global `wv` lookup contributes its vector; tokens that are not in `wv`
    are skipped.

    Args:
        sentence: String to embed.

    Returns:
        The element-wise mean of the found token vectors, or a zero vector of
        length `wv.vector_size` when no token of the sentence is in `wv`.
    """
    known_vectors = [wv[word] for word in sentence.split() if word in wv]
    # No in-vocabulary token at all: fall back to an all-zero embedding.
    if not known_vectors:
        return np.zeros(wv.vector_size)
    return np.mean(known_vectors, axis=0)

In [20]:
def read_from(dataframe):
    """Lazily yield (label, embedding) training pairs from a DataFrame.

    Iterates the 'Label' and 'Review_Embedding' columns in lockstep instead of
    using DataFrame.iterrows(): iterrows() builds a Series per row (slow) and
    coerces each row to a single common dtype, whereas column iteration hands
    back the stored values unchanged.

    Args:
        dataframe: DataFrame with a 'Label' column (the string '+' marks the
            positive class; anything else is treated as negative) and a
            'Review_Embedding' column holding one embedding per row.

    Yields:
        Tuples (label, embedding) where label is 1 for '+' and -1 otherwise.

    Raises:
        KeyError: if either required column is missing (raised on first
            iteration, since this is a generator).
    """
    for mark, embedding in zip(dataframe['Label'], dataframe['Review_Embedding']):
        yield (1 if mark == '+' else -1), embedding

In [21]:
def test(dev_data, model):
    """Compute the error rate of a linear classifier on a labelled set.

    An example counts as an error when label * (model . embedding) <= 0,
    i.e. the prediction has the wrong sign or lies exactly on the boundary.

    Args:
        dev_data: DataFrame consumed by read_from(); its length is used as
            the denominator of the error rate.
        model: Weight vector that is dotted with each embedding.

    Returns:
        Fraction of examples in dev_data that are misclassified.

    Raises:
        ZeroDivisionError: if dev_data has no rows.
    """
    errors = 0
    for label, emb in read_from(dev_data):
        # Parenthesised on purpose: the comparison result (0/1) is what gets added.
        errors += (label * np.dot(model, emb) <= 0)
    return errors / len(dev_data)

In [22]:
def train(train_data, dev_data, epochs=10):
    """Train a linear classifier with mistake-driven updates and report progress.

    For each epoch, every training example with label * (model . emb) <= 0
    triggers the update `model += label * emb`. After each epoch the dev error
    is measured with test() and a progress line is printed; a summary line
    with the best dev error and elapsed time is printed at the end.

    Args:
        train_data: DataFrame consumed by read_from(); must be non-empty, since
            the shape of its first 'Review_Embedding' entry sizes the model.
        dev_data: DataFrame evaluated with test() after every epoch.
        epochs: Number of full passes over train_data.

    Returns:
        A copy of the weight vector from the epoch with the lowest dev error
        (the initial all-zero vector if no epoch improved on 100% error).
        Previously nothing was returned, so the learned weights were lost.
    """
    t = time.time()
    best_err = 1.
    model = np.zeros(train_data['Review_Embedding'].iloc[0].shape)
    # Snapshot of the best weights; must be a copy because `model` is updated in place.
    best_model = model.copy()
    for it in range(1, epochs + 1):
        updates = 0
        for label, emb in read_from(train_data):
            if label * (np.dot(model, emb)) <= 0:
                updates += 1
                model += label * emb
        dev_err = test(dev_data, model)
        if dev_err < best_err:
            best_err = dev_err
            best_model = model.copy()
        print("epoch %d, update %.1f%%, dev %.1f%%" % (it, updates / len(train_data) * 100, dev_err * 100))
    print("best dev err %.1f%%, time: %.1f secs" % (best_err * 100, time.time() - t))
    return best_model

In [23]:
if __name__ == "__main__":
    # Each split is a headerless, tab-separated file with two fields per line:
    # the label and the review text. The sentence embedding of every review is
    # attached as a 'Review_Embedding' column right after loading.
    train_data = pd.read_csv(
        'C:/3rd term/ML/HW-4/hw4-data/train.txt',
        sep='\t',
        header=None,
        names=['Label', 'Review'],
    ).assign(Review_Embedding=lambda frame: frame['Review'].apply(sentence_embedding))

    dev_data = pd.read_csv(
        'C:/3rd term/ML/HW-4/hw4-data/dev.txt',
        sep='\t',
        header=None,
        names=['Label', 'Review'],
    ).assign(Review_Embedding=lambda frame: frame['Review'].apply(sentence_embedding))

    # Fit on the training split, reporting dev error after each of the 10 epochs.
    train(train_data, dev_data, epochs=10)

epoch 1, update 31.1%, dev 37.4%
epoch 2, update 29.5%, dev 35.4%
epoch 3, update 29.8%, dev 33.2%
epoch 4, update 29.1%, dev 40.0%
epoch 5, update 29.7%, dev 35.2%
epoch 6, update 29.4%, dev 40.4%
epoch 7, update 29.4%, dev 38.4%
epoch 8, update 29.4%, dev 42.5%
epoch 9, update 29.1%, dev 39.0%
epoch 10, update 29.1%, dev 39.2%
best dev err 33.2%, time: 12.2 secs
