### problem 3.1

In [1]:
from csv import DictReader

In [2]:
# Shared notebook state, filled in by the data-loading cells below.
vocab, examples = {}, []  # word -> integer id; list of (label, word_ids) pairs
vocab_size = training_size = test_size = 0  # running counters

In [3]:
# Load the training reviews, assigning a fresh integer id to every new word.
#
# Everything derived from the CSV files is reset first so that re-running this
# cell rebuilds the training set from scratch instead of appending a second
# copy (which would also shift the train/test boundary used later).  After
# re-running this cell, re-run the test-loading cell as well.
vocab = {}
vocab_size = 0
examples = []
training_size = 0
test_size = 0

# NOTE(review): the path below is machine-specific; point it at your local
# copy of reviews_tr.csv.  The file is read with the platform default encoding.
# newline='' is what the csv module documents for files handed to a reader, so
# that line endings inside quoted fields are interpreted correctly.
with open('C:/Users/28562/Desktop/reviews_tr.csv', 'r', newline='') as f:
    reader = DictReader(f)
    for row in reader:
        training_size += 1
        # A rating of '1' is the positive class; anything else is negative.
        label = row['rating'] == '1'
        # Splitting on a single space keeps empty strings for repeated spaces.
        words = row['text'].split(' ')
        for word in words:
            if word not in vocab:
                vocab[word] = vocab_size
                vocab_size += 1
        examples.append((label, [vocab[word] for word in words]))

In [4]:
# Load the test reviews after the training ones, extending the vocabulary with
# any words that only occur in the test file.
#
# Test examples are stored behind the first `training_size` entries of
# `examples`.  Dropping anything past that boundary (and zeroing the counter)
# makes this cell safe to re-run without duplicating the test set.
del examples[training_size:]
test_size = 0

# NOTE(review): the path below is machine-specific; point it at your local
# copy of reviews_te.csv.  The file is read with the platform default encoding.
# newline='' is what the csv module documents for files handed to a reader, so
# that line endings inside quoted fields are interpreted correctly.
with open('C:/Users/28562/Desktop/reviews_te.csv', 'r', newline='') as f:
    reader = DictReader(f)
    for row in reader:
        test_size += 1
        # A rating of '1' is the positive class; anything else is negative.
        label = row['rating'] == '1'
        # Splitting on a single space keeps empty strings for repeated spaces.
        words = row['text'].split(' ')
        for word in words:
            if word not in vocab:
                vocab[word] = vocab_size
                vocab_size += 1
        examples.append((label, [vocab[word] for word in words]))

In [5]:
# The training file was loaded first, so its rows occupy the first
# `training_size` slots of `examples`; everything after that is test data.
training_data, test_data = examples[:training_size], examples[training_size:]

In [6]:
from numpy import zeros

def bag_of_words_rep(word_ids, dim):
    """Build a dense bag-of-words count vector.

    Args:
        word_ids: iterable of integer word ids (repeats allowed).
        dim: length of the returned vector.

    Returns:
        A numpy.ndarray of shape (dim,) whose entry ``j`` is the number of
        times id ``j`` occurs in ``word_ids``.
    """
    counts = zeros(dim)
    for idx in word_ids:
        counts[idx] = counts[idx] + 1
    return counts

# Dense bag-of-words vector (length vocab_size) for the first loaded example.
first_bow_vector = bag_of_words_rep(examples[0][1], vocab_size)

In [7]:
from collections import defaultdict
def new_bag_of_words_rep(word_ids):
    """Build a sparse bag-of-words representation.

    Args:
        word_ids: iterable of hashable word ids (repeats allowed).

    Returns:
        A ``defaultdict(int)`` mapping each id that occurs in ``word_ids`` to
        its number of occurrences; ids that never occur are simply absent.
    """
    counts = defaultdict(int)
    for wid in word_ids:
        counts[wid] = counts[wid] + 1
    return counts

In [8]:
def dot(w, x):
    """Inner product of a dense vector with a sparse one.

    Args:
        w: indexable weights (e.g. a numpy array) addressed by the keys of x.
        x: mapping from index to value; only these indices are visited.

    Returns:
        The sum of ``w[i] * x[i]`` over the keys of ``x`` (0 when x is empty).
    """
    total = 0
    for idx, value in x.items():
        total = total + w[idx] * value
    return total

In [9]:
def add(w, x):
    """Add the sparse vector ``x`` to ``w`` in place.

    Args:
        w: mutable indexable weights; modified by this call.
        x: mapping from index to the amount to add at that index.

    Returns:
        The same ``w`` object, after the update.
    """
    for idx, amount in x.items():
        w[idx] = w[idx] + amount
    return w

In [10]:
def sub(w, x):
    """Subtract the sparse vector ``x`` from ``w`` in place.

    Args:
        w: mutable indexable weights; modified by this call.
        x: mapping from index to the amount to subtract at that index.

    Returns:
        The same ``w`` object, after the update.
    """
    for idx, amount in x.items():
        w[idx] = w[idx] - amount
    return w

In [11]:
# Dense-storage estimate: one entry per (training example, vocabulary word)
# pair, divided by 8.
# NOTE(review): dividing by 8 charges one bit per entry, whereas the dense
# vectors built by bag_of_words_rep use numpy's default float64 (8 bytes per
# entry) -- confirm which entry size the problem intends.
print(f'The amount of memory is estimated as {round(training_size * vocab_size / 8)} bytes.')

The amount of memory is estimated as 29997125000 bytes.


### problem 3.2

In [12]:
import numpy as np

def online_perceptron(training):
    """Run one pass of the online perceptron over ``training``.

    Args:
        training: sequence of (label, word_ids) pairs, where a truthy label
            marks a positive example.

    Returns:
        numpy array of length ``vocab_size`` holding the final weights.
    """
    weights = zeros(vocab_size)
    for example in training:
        is_positive = example[0]
        features = new_bag_of_words_rep(example[1])
        score = dot(weights, features)
        # Positive examples need a strictly positive score; negatives need a
        # score of zero or below.
        misclassified = (score <= 0) if is_positive else (score > 0)
        if not misclassified:
            continue
        # Move the weights towards positives and away from negatives.
        update = add if is_positive else sub
        weights = update(weights, features)
    return weights

In [13]:
# Train the single-pass online perceptron on the training split.
weight = online_perceptron(training_data)

In [14]:
def error_rate(w, data):
    """Fraction of examples in ``data`` that ``w`` misclassifies.

    An example is predicted positive when its score ``dot(w, x)`` is strictly
    greater than zero, and negative otherwise.

    Args:
        w: weight vector indexed by word id.
        data: sequence of (label, word_ids) pairs; a truthy label is positive.

    Returns:
        Number of mistakes divided by ``len(data)``.

    Raises:
        ZeroDivisionError: if ``data`` is empty.
    """
    mistakes = 0
    for example in data:
        is_positive = example[0]
        score = dot(w, new_bag_of_words_rep(example[1]))
        if is_positive:
            wrong = score <= 0
        else:
            wrong = score > 0
        if wrong:
            mistakes += 1
    return mistakes / len(data)

In [15]:
# Score the single-pass perceptron weights on both splits and report them.
train_err_rate, test_err_rate = (
    error_rate(weight, training_data),
    error_rate(weight, test_data),
)
print(
    f'training error rate is {train_err_rate}',
    f'test error rate is {test_err_rate}',
    sep='\n',
)

training error rate is 0.133159
test error rate is 0.13582946501646248


### problem 3.3

In [24]:
import heapq

class new_tuple(tuple):
    """Tuple whose ``<`` compares the second item in reverse.

    ``a < b`` holds when ``a[1]`` is greater than ``b[1]``, so a heapq
    min-heap of these tuples pops the entry with the largest second item first.
    """

    def __lt__(self, other):
        mine, theirs = self[1], other[1]
        return mine > theirs

def top_ten(w, best=True, k=10):
    """Return the vocabulary words with the most extreme weights.

    Args:
        w: numpy array of weights indexed by word id.
        best: if True, pick the highest-weighted words; otherwise pick the
            lowest-weighted ones.
        k: maximum number of words to return (ten by default).

    Returns:
        A list of at most ``k`` words, most extreme first.  Fewer words are
        returned when ``w`` has fewer than ``k`` entries.
    """
    # Negating the weights turns "lowest" into "highest".
    scores = w if best else -w
    # new_tuple compares on the weight in reverse, so heapq's min-heap pops the
    # largest score first.  Entries are pushed one at a time, in id order.
    heap = []
    for pair in enumerate(scores):
        heapq.heappush(heap, new_tuple(pair))
    # Ids were handed out in insertion order when vocab was built, so a word's
    # position in this list equals its id.  Build the list once, not per word.
    words = list(vocab)
    return [words[heapq.heappop(heap)[0]] for _ in range(min(k, len(heap)))]

In [25]:
# Words with the largest and smallest weights under the single-pass perceptron.
print(f'highest weights: {top_ten(weight)}')
print(f'lowest weights: {top_ten(weight, best=False)}')

highest weights: ['perfection', 'gem', 'incredible', 'heaven', 'superb', 'phenomenal', 'amazing', 'worried', 'heavenly', 'perfect']
lowest weights: ['mediocre', 'worst', 'meh', 'disappointing', 'lacked', 'underwhelmed', 'flavorless', 'bland', 'poisoning', 'disgusting']


### problem 3.5

In [20]:
def avg_online_perceptron(training):
    """Run one pass of the averaged perceptron over ``training``.

    Args:
        training: sequence of (label, word_ids) pairs, where a truthy label
            marks a positive example.

    Returns:
        numpy array of length ``vocab_size`` holding the *sum* of the weight
        vector as it stood after each example.  The sum is not divided by the
        number of examples; scaling by a positive constant does not change the
        sign of a score, so the decisions made in error_rate are unaffected.
    """
    n = len(training)
    w = zeros(vocab_size)
    avg_w = zeros(vocab_size)
    for i in range(n):
        y_i = training[i][0]
        x_i = new_bag_of_words_rep(training[i][1])
        score = dot(w, x_i)
        if (y_i and score <= 0) or (not y_i and score > 0):
            sign = 1 if y_i else -1
            # An update made at step i is part of the weight vector at steps
            # i, i+1, ..., n-1, so it enters the running sum (n - i) times.
            # Adding that contribution here touches only the words of this
            # example instead of summing a full dense vector on every step.
            remaining = n - i
            for key, count in x_i.items():
                delta = sign * count
                w[key] += delta
                avg_w[key] += delta * remaining
    return avg_w

In [21]:
# Train the averaged perceptron (single pass) on the training split.
avg_weight = avg_online_perceptron(training_data)

In [22]:
# Score the averaged perceptron weights on both splits and report them.
train_err_rate, test_err_rate = (
    error_rate(avg_weight, training_data),
    error_rate(avg_weight, test_data),
)
print(
    f'training error rate is {train_err_rate}',
    f'test error rate is {test_err_rate}',
    sep='\n',
)

training error rate is 0.104548
test error rate is 0.10685613609811259


### problem 3.6

In [26]:
# Words with the largest and smallest weights under the averaged perceptron.
print(f'highest weights: {top_ten(avg_weight)}')
print(f'lowest weights: {top_ten(avg_weight, best=False)}')

highest weights: ['perfection', 'perfect', 'incredible', 'perfectly', 'gem', 'fantastic', 'delicious', 'amazing', 'excellent', 'disappoint']
lowest weights: ['worst', 'mediocre', 'bland', 'meh', 'disappointing', 'awful', 'horrible', 'terrible', 'lacked', 'flavorless']


### problem 3.7

In [34]:
def avg_mulpass_online_perceptron(training):
    """Averaged perceptron that visits consecutive examples in pairs.

    For every index ``i`` from 0 to ``len(training) - 2`` the perceptron takes
    a step on example ``i`` and then on example ``i + 1``, after which the
    current weights are added to the running sum.  Interior examples are
    therefore visited twice; with fewer than two examples no step is taken.

    NOTE(review): this interleaved schedule differs from making two full
    sequential passes over the data -- confirm it is what the problem asks for.

    Args:
        training: sequence of (label, word_ids) pairs, where a truthy label
            marks a positive example.

    Returns:
        numpy array of length ``vocab_size`` holding the *sum* (not the mean)
        of the weight vector as it stood at the end of each iteration.
    """
    n = len(training)
    w = zeros(vocab_size)
    avg_w = zeros(vocab_size)

    def step(example, times_in_sum):
        """Take one perceptron step; mirror any update into the running sum."""
        label = example[0]
        x = new_bag_of_words_rep(example[1])
        score = dot(w, x)
        if (label and score <= 0) or (not label and score > 0):
            sign = 1 if label else -1
            for key, count in x.items():
                delta = sign * count
                w[key] += delta
                avg_w[key] += delta * times_in_sum

    for i in range(n - 1):
        # An update made during iteration i is part of the weights summed at
        # iterations i, ..., n-2, i.e. (n - 1 - i) times.  Crediting it now
        # avoids adding a full dense vector on every iteration.
        remaining = n - 1 - i
        step(training[i], remaining)
        step(training[i + 1], remaining)
    return avg_w

In [35]:
# Train the paired-step averaged perceptron on the training split.
avg_mulpass_weight = avg_mulpass_online_perceptron(training_data)

In [36]:
# Score the paired-step averaged perceptron weights on both splits.
train_err_rate, test_err_rate = (
    error_rate(avg_mulpass_weight, training_data),
    error_rate(avg_mulpass_weight, test_data),
)
print(
    f'training error rate is {train_err_rate}',
    f'test error rate is {test_err_rate}',
    sep='\n',
)

training error rate is 0.103399
test error rate is 0.10578154578566921
