In [1]:
import sys
import time
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.metrics import accuracy_score
from __future__ import division
from collections import Counter

In [2]:
# Load the word vectors stored in gensim's KeyedVectors format from
# 'embs_train.kv' (the path is relative to the current working directory).
# `wv` is read as a module-level global by sentence_embedding().
wv = KeyedVectors.load('embs_train.kv')

In [3]:
def sentence_embedding(sentence, word_counts):
    """Return the mean word vector of a whitespace-tokenised sentence.

    A token contributes only if it has a vector in the module-level ``wv``
    and ``word_counts[token]`` is greater than 1.

    Args:
        sentence: Text to embed; it is split with ``str.split()``.
        word_counts: Object indexed as ``word_counts[token]`` to obtain the
            count compared against 1.

    Returns:
        ``np.mean(vectors, axis=0)`` over the kept token vectors, or
        ``np.zeros(wv.vector_size)`` when no token is kept.
    """
    kept_vectors = [
        wv[word]
        for word in sentence.split()
        # Membership is checked first so the count is only looked up for
        # tokens that actually have a vector.
        if word in wv and word_counts[word] > 1
    ]
    if not kept_vectors:
        return np.zeros(wv.vector_size)
    return np.mean(kept_vectors, axis=0)

In [4]:
def read_from(dataframe):
    """Yield one ``(label, embedding)`` pair per row of ``dataframe``.

    The ``'Label'`` cell is mapped to ``1`` when it equals ``'+'`` and to
    ``-1`` for any other value; the ``'Review_Embedding'`` cell is passed
    through unchanged. Rows are visited in ``DataFrame.iterrows()`` order.
    """
    for _, record in dataframe.iterrows():
        if record['Label'] == '+':
            sign = 1
        else:
            sign = -1
        yield sign, record['Review_Embedding']

In [5]:
def test(dev_data, model):
    """Return the fraction of rows in ``dev_data`` that ``model`` gets wrong.

    A row is an error when ``label * np.dot(model, emb) <= 0``, so a score of
    exactly zero counts as an error for either label.

    Args:
        dev_data: Frame iterated with ``read_from`` and measured with ``len``.
        model: Weight vector passed to ``np.dot`` with each row's embedding.

    Returns:
        Number of errors divided by ``len(dev_data)``.
    """
    mistakes = sum(
        label * np.dot(model, emb) <= 0
        for label, emb in read_from(dev_data)
    )
    return mistakes / len(dev_data)

In [6]:
def averaged_perceptron(train_data, dev_data, epochs=10):
    """Train an averaged perceptron and return the best averaged weights.

    After every epoch the running average is scored on ``dev_data`` with
    ``test``; the average with the lowest dev error seen so far is kept.

    Args:
        train_data: Frame with a ``'Review_Embedding'`` column; rows are read
            through ``read_from`` (which also uses the ``'Label'`` column).
        dev_data: Frame in the same layout, used only for evaluation.
        epochs: Number of full passes over ``train_data``.

    Returns:
        The averaged weight vector (sum of the per-example weights divided by
        the number of examples processed) from the epoch with the lowest dev
        error. If no epoch improves on an error of 1.0 (including
        ``epochs=0``), the average at the end of training is returned
        instead of failing on an unassigned variable.
    """
    start = time.time()
    n_train = len(train_data)
    dim = train_data['Review_Embedding'].iloc[0].shape

    model = np.zeros(dim)      # current perceptron weights
    model_sum = np.zeros(dim)  # sum of `model` after every training example

    best_err = 1.
    best_model = None

    for it in range(1, epochs + 1):
        updates = 0
        for label, emb in read_from(train_data):
            # Mistake-driven update: only move the weights on an error
            # (a zero margin counts as an error).
            if label * (np.dot(model, emb)) <= 0:
                updates += 1
                model += label * emb
            model_sum += model

        # Scoring the raw sum is equivalent to scoring the average: dividing
        # by a positive constant never changes the sign of a dot product.
        dev_err = test(dev_data, model_sum)
        if dev_err < best_err:
            best_err = dev_err
            # Normalise at snapshot time so the returned vector really is an
            # average over the `it * n_train` examples seen so far.
            best_model = model_sum / (it * n_train)
        print("epoch %d, update %.1f%%, dev %.1f%%" % (it, updates / len(train_data) * 100, dev_err * 100))

    if best_model is None:
        # No epoch beat the initial error (or no epoch ran); fall back to the
        # final average. max(..., 1) avoids dividing by zero when epochs == 0.
        best_model = model_sum / max(epochs * n_train, 1)

    print("best dev err %.1f%%, time: %.1f secs" % (best_err * 100, time.time() - start))

    return best_model

In [7]:
if __name__ == "__main__":
    # Both files are read as tab-separated, header-less tables with the
    # columns 'Label' and 'Review'.
    train_data = pd.read_csv(
        'C:/3rd term/ML/HW-4/hw4-data/train.txt',
        sep='\t', header=None, names=['Label', 'Review'],
    )
    dev_data = pd.read_csv(
        'C:/3rd term/ML/HW-4/hw4-data/dev.txt',
        sep='\t', header=None, names=['Label', 'Review'],
    )

    # Token frequencies over the training reviews only; sentence_embedding()
    # uses them to skip tokens whose count is not greater than 1.
    word_counts = Counter(
        token for review in train_data['Review'] for token in review.split()
    )

    # Attach one sentence embedding per review to each frame.
    for frame in (train_data, dev_data):
        frame['Review_Embedding'] = frame['Review'].apply(
            lambda text: sentence_embedding(text, word_counts)
        )

    best_model = averaged_perceptron(train_data, dev_data, epochs=10)

    # Score every dev review once; the (label, score) pairs feed both the
    # list of correct examples and the predicted labels below.
    dev_scores = [
        (label, np.dot(best_model, emb)) for label, emb in read_from(dev_data)
    ]

    # A score > 0 is treated as '+', anything else as '-'.
    correct_examples = [
        (index, score)
        for index, (label, score) in enumerate(dev_scores)
        if (score > 0 and label == 1) or (score <= 0 and label == -1)
    ]

    print(f"Number of correct examples found: {len(correct_examples)}")
    for idx, pred in correct_examples:
        print(f"Example Index: {idx}, Predicted Label: {'+' if pred > 0 else '-'}")
        print(f"Review: {dev_data.iloc[idx]['Review']}")
        print("------------------------------------------------------------")

    predicted_labels = ['+' if score > 0 else '-' for _, score in dev_scores]

    # Compare against the raw label strings from the dev file.
    actual_labels = dev_data['Label'].tolist()
    accuracy = accuracy_score(actual_labels, predicted_labels)
    print("Accuracy on dev_data:", accuracy)


epoch 1, update 32.0%, dev 25.0%
epoch 2, update 30.4%, dev 25.1%
epoch 3, update 30.3%, dev 24.4%
epoch 4, update 29.9%, dev 24.2%
epoch 5, update 30.4%, dev 24.2%
epoch 6, update 30.7%, dev 24.5%
epoch 7, update 30.4%, dev 24.3%
epoch 8, update 30.1%, dev 24.5%
epoch 9, update 29.9%, dev 24.7%
epoch 10, update 30.2%, dev 24.7%
best dev err 24.2%, time: 6.8 secs
Number of correct examples found: 761
Example Index: 0, Predicted Label: -
Review: you could easily mistake it for a sketchy work in progress that was inexplicably rushed to the megaplexes before its time
------------------------------------------------------------
Example Index: 2, Predicted Label: -
Review: the film is so packed with subplots involving the various silbersteins that it feels more like the pilot episode of a tv series than a feature film
------------------------------------------------------------
Example Index: 5, Predicted Label: +
Review: escapism in its purest form
-----------------------------------------