# 1. Dataset Loader

In [15]:
import os
import pickle
import sys

import numpy
import torch.utils.data as D

data_path = '../../dataset/sentiment_analysis/'

# Put the IBC dataset folder on the import path (presumably so that unpickling
# ibcData.pkl can import the tree classes shipped with the dataset).
sys.path.insert(0, os.path.abspath('{}full_ibc'.format(data_path)))

## 1.1 IBC

In [3]:
# Load the three IBC tree lists (liberal, conservative, neutral).
# NOTE(review): pickle.load can execute arbitrary code; only load ibcData.pkl from a trusted source.
# The file is opened in a `with` block so the handle is closed as soon as the data is loaded.
with open('{}full_ibc/ibcData.pkl'.format(data_path), 'rb') as ibc_file:
    [lib, con, neutral] = pickle.load(ibc_file)

# Turn every tree into a (token list, label) pair; tokens come from splitting get_words() on whitespace.
ibc_lib = [(tree.get_words().split(), "LIBERAL") for tree in lib]
ibc_con = [(tree.get_words().split(), "CONSERVATIVE") for tree in con]
# The neutral trees are loaded but not used below.

ibc_full_data = ibc_lib + ibc_con
ibc_full_size = len(ibc_full_data)

# 80/20 split; the test size is the remainder so the two sizes always sum to the full size.
ibc_train_size = int(ibc_full_size*0.8)
ibc_test_size = ibc_full_size - ibc_train_size

# D is torch.utils.data (imported in the setup cell); the split is random on every run.
ibc_train_data, ibc_test_data = D.random_split(ibc_full_data, [ibc_train_size, ibc_test_size])

ibc_label_to_id = {"LIBERAL": 0, "CONSERVATIVE": 1}

# The vocabulary is built from the training split only: each new word gets the next free id.
ibc_word_to_id = {}
for sent, _ in ibc_train_data:
    for word in sent:
        if word not in ibc_word_to_id:
            ibc_word_to_id[word] = len(ibc_word_to_id)

ibc_vocab_size = len(ibc_word_to_id)
# Derived from the label map so the two cannot drift apart.
ibc_num_labels = len(ibc_label_to_id)

print('data size:', len(ibc_full_data))
print('dict size:', len(ibc_word_to_id))

data size: 3726
dict size: 13955


## 1.2 Convote

In [2]:
# Directory with the stage-one training speeches. It is built from data_path so that listing
# the directory and opening its files always refer to the same location.
convote_train_dir = os.path.join(data_path, "convote_v1.1", "data_stage_one", "training_set")

convote_train_lib_lines = []
convote_train_con_lines = []

# os.listdir returns entries in arbitrary order; sorting keeps the data order stable across runs.
for filename in sorted(os.listdir(convote_train_dir)):
    # The party code is the first character of the last underscore-separated field of the name.
    party = filename.split("_")[-1][:1]
    if party == 'D':
        party_lines = convote_train_lib_lines
    elif party == 'R':
        party_lines = convote_train_con_lines
    else:
        # Files with any other party code are skipped.
        continue
    with open(os.path.join(convote_train_dir, filename), "r") as f:
        party_lines.extend(f.readlines())

# One (token list, label) pair per line of text: 'D' files are LIBERAL, 'R' files CONSERVATIVE.
convote_train_lib = [(line.split(), "LIBERAL") for line in convote_train_lib_lines]
convote_train_con = [(line.split(), "CONSERVATIVE") for line in convote_train_con_lines]
convote_train_data = convote_train_lib + convote_train_con

print(len(convote_train_data))

65015


In [3]:
# Directory with the stage-one test speeches. It is built from data_path so that listing
# the directory and opening its files always refer to the same location.
convote_test_dir = os.path.join(data_path, "convote_v1.1", "data_stage_one", "test_set")

convote_test_lib_lines = []
convote_test_con_lines = []

# os.listdir returns entries in arbitrary order; sorting keeps the data order stable across runs.
for filename in sorted(os.listdir(convote_test_dir)):
    # The party code is the first character of the last underscore-separated field of the name.
    party = filename.split("_")[-1][:1]
    if party == 'D':
        party_lines = convote_test_lib_lines
    elif party == 'R':
        party_lines = convote_test_con_lines
    else:
        # Files with any other party code are skipped.
        continue
    with open(os.path.join(convote_test_dir, filename), "r") as f:
        party_lines.extend(f.readlines())

# One (token list, label) pair per line of text: 'D' files are LIBERAL, 'R' files CONSERVATIVE.
convote_test_lib = [(line.split(), "LIBERAL") for line in convote_test_lib_lines]
convote_test_con = [(line.split(), "CONSERVATIVE") for line in convote_test_con_lines]
convote_test_data = convote_test_lib + convote_test_con

print(len(convote_test_data))

22098


In [4]:
# Label ids used for both Convote splits.
convote_label_to_id = {"LIBERAL": 0, "CONSERVATIVE": 1}

# Number every distinct training token in order of first appearance.
# setdefault only inserts when the token is new, so existing ids are never changed.
convote_word_to_id = {}
for tokens, _ in convote_train_data:
    for token in tokens:
        convote_word_to_id.setdefault(token, len(convote_word_to_id))
print(len(convote_word_to_id))

convote_vocab_size = len(convote_word_to_id)
convote_num_labels = 2

26804


# 2. Models

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [21]:
def make_bow_vec(sentence, word_to_id):
    """Build a bag-of-words count vector for one tokenised sentence.

    Args:
        sentence: iterable of tokens.
        word_to_id: dict mapping each known token to its column index.

    Returns:
        Tensor of shape (1, len(word_to_id)) where entry i counts how often the
        token with id i occurs in ``sentence``. Tokens missing from
        ``word_to_id`` are ignored.
    """
    counts = torch.zeros(len(word_to_id))
    for token in sentence:
        column = word_to_id.get(token)
        if column is not None:
            counts[column] += 1
    # Add a leading batch dimension of size 1.
    return counts.view(1, -1)

def make_target(label, label_to_id):
    """Wrap the id of ``label`` in a one-element LongTensor.

    Raises:
        KeyError: if ``label`` is not a key of ``label_to_id``.
    """
    label_id = label_to_id[label]
    return torch.LongTensor([label_id])

## 2.1 Logistic Regression

In [42]:
class BoWClassifier(nn.Module):
    """Bag-of-words logistic regression: one linear layer followed by log-softmax.

    Args:
        vocab_size: length of the bag-of-words input vectors.
        num_labels: number of output classes.
    """

    def __init__(self, vocab_size, num_labels):
        super().__init__()
        self.linear = nn.Linear(in_features=vocab_size, out_features=num_labels)

    def forward(self, bow_vec):
        """Return log-probabilities over the labels, normalised along the last dimension."""
        scores = self.linear(bow_vec)
        return F.log_softmax(scores, dim=-1)

### 2.1.1 IBC Logistic Regression

In [20]:
# Pre-compute (bag-of-words vector, target tensor) pairs for both IBC splits.
# Both splits use the IBC vocabulary, so test words absent from it are ignored by make_bow_vec.
ibc_train_data_lr = [
    (make_bow_vec(tokens, ibc_word_to_id), make_target(tag, ibc_label_to_id))
    for tokens, tag in ibc_train_data
]
ibc_test_data_lr = [
    (make_bow_vec(tokens, ibc_word_to_id), make_target(tag, ibc_label_to_id))
    for tokens, tag in ibc_test_data
]

In [30]:
# first run
model = BoWClassifier(ibc_vocab_size, ibc_num_labels)

# Baseline: accuracy of the untrained classifier on the IBC test split.
num_predictions = 0
num_correct = 0
with torch.no_grad():
    for bow_vec, target in ibc_test_data_lr:
        probs = model(bow_vec)
        num_predictions += 1
        # The comparison yields a one-element tensor; int() turns it into 0 or 1.
        num_correct += int(torch.argmax(probs) == target)
print("before training:", num_correct/num_predictions*100)

# The model outputs log-probabilities, so negative log-likelihood is the matching loss.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Plain SGD, one sentence per step, 100 passes over the training split.
for epoch in range(1, 101):
    for bow_vec, target in ibc_train_data_lr:
        # Clear gradients left over from the previous step before accumulating new ones.
        model.zero_grad()

        probs = model(bow_vec)

        loss = loss_function(probs, target)
        loss.backward()
        optimizer.step()
    # Progress marker every tenth epoch.
    if epoch % 10 == 0:
        print(epoch)

# Accuracy on the same test split after training.
num_predictions = 0
num_correct = 0
with torch.no_grad():
    for bow_vec, target in ibc_test_data_lr:
        probs = model(bow_vec)
        num_predictions += 1
        num_correct += int(torch.argmax(probs) == target)
print("after training:", num_correct/num_predictions*100)

before training: 52.54691689008043
10
20
30
40
50
60
70
80
90
100
after training: 62.86863270777479


### 2.1.2 Convote Logistic Regression

In [7]:
# Pre-compute (bag-of-words vector, target tensor) pairs for both Convote splits.
# Every sentence becomes one dense vector of length len(convote_word_to_id), so these lists are large.
convote_train_data_lr = [
    (make_bow_vec(tokens, convote_word_to_id), make_target(tag, convote_label_to_id))
    for tokens, tag in convote_train_data
]
convote_test_data_lr = [
    (make_bow_vec(tokens, convote_word_to_id), make_target(tag, convote_label_to_id))
    for tokens, tag in convote_test_data
]

In [None]:
# first run
model = BoWClassifier(convote_vocab_size, convote_num_labels)

# Baseline: accuracy of the untrained classifier on the Convote test split.
print("before training")
num_predictions = 0
num_correct = 0
with torch.no_grad():
    for bow_vec, target in convote_test_data_lr:
        probs = model(bow_vec)
        num_predictions += 1
        if (torch.argmax(probs) == target):
            num_correct += 1
print(num_correct/num_predictions*100)

# The model outputs log-probabilities, so negative log-likelihood is the matching loss.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Plain SGD, one sentence per step, 50 passes over the training split.
for epoch in range(50):
    for bow_vec, target in convote_train_data_lr:
        # Clear gradients left over from the previous step before accumulating new ones.
        model.zero_grad()

        probs = model(bow_vec)

        loss = loss_function(probs, target)
        loss.backward()
        optimizer.step()
    print(epoch)

# Accuracy on the same test split after training.
print("after training")
num_predictions = 0
num_correct = 0
with torch.no_grad():
    for bow_vec, target in convote_test_data_lr:
        # Score the pre-computed vector paired with this target. It must not be rebuilt
        # from any other variable, or every target would be compared against the
        # prediction for an unrelated sentence.
        probs = model(bow_vec)
        num_predictions += 1
        if (torch.argmax(probs) == target):
            num_correct += 1
print(num_correct/num_predictions*100)

## 2.2 RNN

In [50]:
class RNN(nn.Module):
    """Linear recurrent sentence encoder followed by a label classifier.

    Token embeddings are folded left to right with
    ``encoding = left(encoding) + right(embedding[i])`` (no non-linearity),
    starting from the first token's embedding. The final encoding is projected
    to label log-probabilities.

    Args:
        vocab_size: number of rows in the embedding table.
        label_size: number of output classes.
        hidden_size: width of the embeddings and of the running encoding.
    """

    def __init__(self, vocab_size, label_size, hidden_size):
        super(RNN, self).__init__()

        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.left = nn.Linear(hidden_size, hidden_size)
        self.right = nn.Linear(hidden_size, hidden_size)
        self.linear = nn.Linear(hidden_size, label_size)

    def forward(self, sentence):
        """Encode one sentence and return label log-probabilities.

        Args:
            sentence: tensor of token ids (the callers in this notebook pass 1-D tensors).

        Returns:
            Tensor reshaped to one row via ``view(1, -1)``; for a 1-D ``sentence``
            this is shape (1, label_size).
        """
        if sentence.numel() == 0:
            # A sentence with no known tokens has no first embedding to start from
            # (and an empty tensor may not even be integer-typed), so skip the
            # embedding lookup and encode it as a zero vector instead of raising.
            encoding = self.embedding.weight.new_zeros(self.embedding.embedding_dim)
        else:
            embedding = self.embedding(sentence)
            # The first token's embedding seeds the running encoding.
            encoding = embedding[0]
            for i in range(1, len(embedding)):
                encoding = torch.add(self.left(encoding), self.right(embedding[i]))
        return F.log_softmax(self.linear(encoding), dim=-1).view(1, -1)

### 2.2.1 IBC RNN

In [48]:
# Convert each sentence into a tensor of vocabulary ids paired with its target tensor.
# Words missing from the IBC vocabulary are dropped. dtype is pinned to long because
# torch.tensor([]) would otherwise infer a float dtype for a sentence with no known words.
ibc_train_data_rnn = [
    (torch.tensor([ibc_word_to_id[word] for word in sentence if word in ibc_word_to_id], dtype=torch.long),
     make_target(label, ibc_label_to_id))
    for sentence, label in ibc_train_data
]
ibc_test_data_rnn = [
    (torch.tensor([ibc_word_to_id[word] for word in sentence if word in ibc_word_to_id], dtype=torch.long),
     make_target(label, ibc_label_to_id))
    for sentence, label in ibc_test_data
]

In [None]:
# Recurrent model with a hidden size of 128.
model = RNN(ibc_vocab_size, ibc_num_labels, 128)

# Baseline: accuracy of the untrained model on the IBC test split.
num_predictions = 0
num_correct = 0
with torch.no_grad():
    for sentence, target in ibc_test_data_rnn:
        probs = model(sentence)
        num_predictions += 1
        # The comparison yields a one-element tensor; int() turns it into 0 or 1.
        num_correct += int(torch.argmax(probs) == target)
print("before training:", num_correct/num_predictions*100)

# The model outputs log-probabilities, so negative log-likelihood is the matching loss.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Plain SGD, one sentence per step, 50 passes over the training split.
for epoch in range(1, 51):
    for sentence, target in ibc_train_data_rnn:
        # Clear gradients left over from the previous step before accumulating new ones.
        model.zero_grad()

        probs = model(sentence)

        loss = loss_function(probs, target)
        loss.backward()
        optimizer.step()
    # Progress marker every tenth epoch.
    if epoch % 10 == 0:
        print(epoch)

# Accuracy on the same test split after training.
num_predictions = 0
num_correct = 0
with torch.no_grad():
    for sentence, target in ibc_test_data_rnn:
        probs = model(sentence)
        num_predictions += 1
        num_correct += int(torch.argmax(probs) == target)
print("after training:", num_correct/num_predictions*100)