# 1. Dataset Loader

In [54]:
data_path = '../../dataset/'

import os
import sys
sys.path.insert(0, os.path.abspath('{}full_ibc'.format(data_path)))
import pickle

import numpy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
print(stopwords.words('english'))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [53]:
def make_bow_vec(sentence, word_to_id):
    vec = torch.zeros(len(word_to_id), device=device)
    for word in sentence:
        word_lower = word.lower()
        if word_lower in word_to_id.keys():
            vec[word_to_id[word_lower]] += 1
    return vec.view(1, -1)

def make_word_index(sentence, word_to_id):
    return torch.tensor([word_to_id[word.lower()] for word in sentence if word.lower() in word_to_id]).to(device)

def make_target(label, label_to_id):
    return torch.LongTensor([label_to_id[label]]).to(device)

## 1.1 IBC

In [40]:
# for lr model
[lib, con, neutral] = pickle.load(open('{}full_ibc/ibcData.pkl'.format(data_path), 'rb'))

ibc_lib = [(tree.get_words().split(), "LIBERAL") for tree in lib]
ibc_con = [(tree.get_words().split(), "CONSERVATIVE") for tree in con]
# ibc_neutral = [(tree.get_words().split(), "NEUTRAL") for tree in neutral]

ibc_full_data = ibc_lib + ibc_con
ibc_full_size = len(ibc_full_data)

ibc_train_size = int(ibc_full_size*0.9)
ibc_test_size = ibc_full_size - ibc_train_size

ibc_train_data, ibc_test_data = torch.utils.data.random_split(ibc_full_data, [ibc_train_size, ibc_test_size])

ibc_label_to_id = {"LIBERAL": 0, "CONSERVATIVE": 1}
ibc_word_to_id = {}
for sent, _ in ibc_train_data:
    for word in sent:
        word_lower = word.lower()
#         if word_lower not in ibc_word_to_id:
        if word_lower not in ibc_word_to_id and word_lower not in stopwords.words('english'):
            ibc_word_to_id[word_lower] = len(ibc_word_to_id)

ibc_vocab_size = len(ibc_word_to_id)
ibc_num_labels = 2

print('data size:', len(ibc_full_data))
print('dict size:', len(ibc_word_to_id))

data size: 3726
dict size: 13589


In [52]:
def generate_data_rnn():
    # for rnn model: remove infrequent words from dictionary
    [lib, con, neutral] = pickle.load(open('{}full_ibc/ibcData.pkl'.format(data_path), 'rb'))

    tokenizer = nltk.RegexpTokenizer(r"\w+")
    ibc_lib = [(tokenizer.tokenize(tree.get_words()), "LIBERAL") for tree in lib]
    ibc_con = [(tokenizer.tokenize(tree.get_words()), "CONSERVATIVE") for tree in con]
    # ibc_neutral = [(tree.get_words().split(), "NEUTRAL") for tree in neutral]
    ibc_full_data = ibc_lib + ibc_con

    ibc_lib_size = len(ibc_lib)
    ibc_lib_train_size = int(ibc_lib_size*0.9)
    ibc_lib_train_data, ibc_lib_test_data = torch.utils.data.random_split(ibc_lib, [ibc_lib_train_size, ibc_lib_size - ibc_lib_train_size])
    ibc_con_size = len(ibc_con)
    ibc_con_train_size = int(ibc_con_size*0.9)
    ibc_con_train_data, ibc_con_test_data = torch.utils.data.random_split(ibc_con, [ibc_con_train_size, ibc_con_size - ibc_con_train_size])

    ibc_train_data = ibc_lib_train_data + ibc_con_train_data
    ibc_test_data = ibc_lib_test_data + ibc_con_test_data

    ibc_label_to_id = {"LIBERAL": 0, "CONSERVATIVE": 1}
    ibc_word_count = {}
    for sent, _ in ibc_train_data:
        for word in sent:
            word_lower = word.lower()
            if word_lower in stopwords.words('english'):
                continue
            if word_lower not in ibc_word_count:
                ibc_word_count[word_lower] = 1
            else:
                ibc_word_count[word_lower] += 1

    ibc_word_to_id = {}
    for word, count in ibc_word_count.items():
        if count > 0:
            ibc_word_to_id[word] = len(ibc_word_to_id)

    ibc_vocab_size = len(ibc_word_to_id)
    ibc_num_labels = 2

    print('data size:', len(ibc_full_data))
    print('dict size:', len(ibc_word_to_id))
    ibc_train_data_loader = torch.utils.data.DataLoader([(make_word_index(sentence, ibc_word_to_id),
                                                       make_target(label, ibc_label_to_id))
                                                      for sentence, label in ibc_train_data], shuffle=True)
    ibc_test_data_loader = torch.utils.data.DataLoader([(make_word_index(sentence, ibc_word_to_id),
                                                      make_target(label, ibc_label_to_id))
                                                     for sentence, label in ibc_test_data])
    return ibc_train_data_loader, ibc_test_data_loader

## 1.2 Convote

In [2]:
convote_train_lib = []
convote_train_con = []
convote_train_data = []

for filename in os.listdir("../../dataset/sentiment_analysis/convote_v1.1/data_stage_one/training_set"):
    filename_split = filename.split("_")
    party = filename_split[-1][:1]
    if party == 'D':
        with open("{}convote_v1.1/data_stage_one/training_set/{}".format(data_path, filename), "r") as f:
            convote_train_lib += f.readlines()
    if party == 'R':
        with open("{}convote_v1.1/data_stage_one/training_set/{}".format(data_path, filename), "r") as f:
            convote_train_con += f.readlines()

convote_train_lib = [(line.split(), "LIBERAL") for line in convote_train_lib]
convote_train_con = [(line.split(), "CONSERVATIVE") for line in convote_train_con]
convote_train_data = convote_train_lib + convote_train_con

print(len(convote_train_data))

65015


In [3]:
convote_test_lib = []
convote_test_con = []
convote_test_data = []

for filename in os.listdir("../../dataset/sentiment_analysis/convote_v1.1/data_stage_one/test_set"):
    filename_split = filename.split("_")
    party = filename_split[-1][:1]
    if party == 'D':
        with open("{}convote_v1.1/data_stage_one/test_set/{}".format(data_path, filename), "r") as f:
            convote_test_lib += f.readlines()
    if party == 'R':
        with open("{}convote_v1.1/data_stage_one/test_set/{}".format(data_path, filename), "r") as f:
            convote_test_con += f.readlines()

convote_test_lib = [(line.split(), "LIBERAL") for line in convote_test_lib]
convote_test_con = [(line.split(), "CONSERVATIVE") for line in convote_test_con]
convote_test_data = convote_test_lib + convote_test_con

print(len(convote_test_data))

22098


In [4]:
convote_label_to_id = {"LIBERAL": 0, "CONSERVATIVE": 1}
convote_word_to_id = {}
for sent, _ in convote_train_data:
    for word in sent:
        if word not in convote_word_to_id:
            convote_word_to_id[word] = len(convote_word_to_id)
print(len(convote_word_to_id))

convote_vocab_size = len(convote_word_to_id)
convote_num_labels = 2

26804


# 2 Models

In [21]:
def train(model, data, num_epochs, loss_function, optimizer):
    for epoch in range(num_epochs):
        iters = 0
        total_loss = 0
        for bow_vec, target in data:
            model.zero_grad()

            probs = model(bow_vec[0])
            loss = loss_function(probs, target[0])
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            iters += 1

        print('[{}] loss: {}'.format(epoch+1, total_loss / iters))

def eval(model, data):
    num_predictions = 0
    num_correct = 0
    with torch.no_grad():
        for sentence, target in data:
            probs = model(sentence[0])
            num_predictions += 1
            if (torch.argmax(probs) == target[0]):
                num_correct += 1
    return num_correct/num_predictions*100

## 2.1 Logistic Regression

In [17]:
class BoWClassifier(nn.Module):
    def __init__(self, vocab, output_size):
        super(BoWClassifier, self).__init__()
        self.vocab = vocab
        self.input_size = len(vocab)
        self.output_size = output_size
        self.linear = nn.Linear(self.input_size, self.output_size).to(device)
        
    def forward(self, bow_vec):
        return F.log_softmax(self.linear(bow_vec), dim=-1)

### 2.1.1 IBC Logistic Regression

In [41]:
ibc_train_data_lr = [(make_bow_vec(sentence, ibc_word_to_id), make_target(label, ibc_label_to_id)) for sentence, label in ibc_train_data]
ibc_test_data_lr = [(make_bow_vec(sentence, ibc_word_to_id), make_target(label, ibc_label_to_id)) for sentence, label in ibc_test_data]

In [18]:
# first run
model = BoWClassifier(ibc_word_to_id, ibc_num_labels)

num_predictions = 0
num_correct = 0
with torch.no_grad():
    for bow_vec, target in ibc_test_data_lr:
        probs = model(bow_vec)
        num_predictions += 1
        if (torch.argmax(probs) == target):
            num_correct += 1
print("before training:", num_correct/num_predictions*100)
        
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

for epoch in range(100):
    for bow_vec, target in ibc_train_data_lr:
        model.zero_grad()
        
        probs = model(bow_vec)
        
        loss = loss_function(probs, target)
        loss.backward()
        optimizer.step()
    if (epoch+1) % 10 == 0:
        print(epoch+1)

num_predictions = 0
num_correct = 0
with torch.no_grad():
    for bow_vec, target in ibc_test_data_lr:
        probs = model(bow_vec)
        num_predictions += 1
        if (torch.argmax(probs) == target):
            num_correct += 1
print("after training:", num_correct/num_predictions*100)

before training: 47.45308310991957
10
20
30
40
50
60
70
80
90
100
after training: 62.466487935656836


### 2.1.2 Convote Logistic Regression

In [6]:
convote_train_data_lr = [(make_bow_vec(sentence, convote_word_to_id), make_target(label, convote_label_to_id)) for sentence, label in convote_train_data]
convote_test_data_lr = [(make_bow_vec(sentence, convote_word_to_id), make_target(label, convote_label_to_id)) for sentence, label in convote_test_data]

NameError: name 'convote_train_data' is not defined

In [None]:
# first run
model = BoWClassifier(convote_vocab_size, convote_num_labels)

print("before training")
num_predictions = 0
num_correct = 0
with torch.no_grad():
    for bow_vec, target in convote_test_data_lr:
        probs = model(bow_vec)
        num_predictions += 1
        if (torch.argmax(probs) == target):
            num_correct += 1
print(num_correct/num_predictions*100)
        
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

for epoch in range(50):
    for bow_vec, target in convote_train_data_lr:
        model.zero_grad()
        
        probs = model(bow_vec)
        
        loss = loss_function(probs, target)
        loss.backward()
        optimizer.step()
    print(epoch)

print("after training")
num_predictions = 0
num_correct = 0
with torch.no_grad():
    for bow_vec, target in convote_test_data_lr:
        bow_vec = make_bow_vec(sentence, convote_word_to_id)
        probs = model(bow_vec)
        num_predictions += 1
        if (torch.argmax(probs) == target):
            num_correct += 1
print(num_correct/num_predictions*100)

## 2.2 RNN

In [7]:
class RNN(nn.Module):
    def __init__(self, vocab, output_size, hidden_size):
        super(RNN, self).__init__()

        self.vocab = vocab
        self.input_size = len(vocab)
        self.output_size = output_size
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(self.input_size, self.hidden_size).to(device)
        self.left = nn.Linear(self.hidden_size, self.hidden_size).to(device)
        self.right = nn.Linear(self.hidden_size, self.hidden_size).to(device)
        self.cat = nn.Linear(self.hidden_size, self.output_size).to(device)

    def forward(self, sentence):
        embedding = self.embedding(sentence)
        hidden = embedding[0]
        for i in range(1, len(embedding)):
            hidden = torch.tanh(torch.add(self.left(hidden), self.right(embedding[i])))
        return F.log_softmax(self.cat(hidden), dim=-1).view(1, -1)

In [48]:
class RNN_v2(nn.Module):
    def __init__(self, vocab, output_size, hidden_size):
        super(RNN_v2, self).__init__()

        self.vocab = vocab
        self.input_size = len(vocab)
        self.output_size = output_size
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(self.input_size, self.hidden_size).to(device)
        self.rnn = nn.RNN(self.hidden_size, self.hidden_size).to(device)
        self.cat = nn.Linear(self.hidden_size, self.output_size).to(device)

    def forward(self, sentence):
        embedding = self.embedding(sentence).unsqueeze(1)
        output, hidden = self.rnn(embedding)
        output = output.squeeze()[-1]
        return F.log_softmax(self.cat(output), dim=-1).view(1, -1)

In [9]:
class RNN_v3(nn.Module):
    def __init__(self, vocab, output_size, hidden_size):
        super(RNN_v3, self).__init__()

        self.vocab = vocab
        self.input_size = len(vocab)
        self.output_size = output_size
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(self.input_size, self.hidden_size).to(device)
        self.linear = nn.Linear(2 * self.hidden_size, self.hidden_size).to(device)
        self.cat = nn.Linear(self.hidden_size, self.output_size).to(device)

    def forward(self, sentence):
        embedding = self.embedding(sentence)
        hidden = embedding[0]
        for i in range(1, len(embedding)):
            hidden = torch.tanh(self.linear(torch.cat((hidden, embedding[i]))))
        return F.log_softmax(self.cat(hidden), dim=-1).view(1, -1)

### 2.2.1 IBC RNN

In [55]:
ibc_train_data_rnn, ibc_test_data_rnn = generate_data_rnn()

data size: 3726
dict size: 12721


In [56]:
model = RNN(ibc_word_to_id, ibc_num_labels, 300)
# model.load_state_dict(torch.load('./rnn_50epochs_v2.model'))

print(eval(model, ibc_test_data_rnn))
        
num_epochs = 50
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

train(model, ibc_train_data_rnn, num_epochs, loss_function, optimizer)

print(eval(model, ibc_test_data_rnn))

RuntimeError: CUDA error: device-side assert triggered

In [50]:
print(eval(model, ibc_train_data_rnn))
print(eval(model, ibc_test_data_rnn))

99.46300715990454
57.48663101604278


In [39]:
# torch.save(model.state_dict(), 'rnn_50epochs_v2.model')

new_model = RNN_v2(ibc_word_to_id, ibc_num_labels, 300)
new_model.load_state_dict(torch.load('./rnn_50epochs_v2.model'))

print(eval(new_model, ibc_test_data_rnn))

RuntimeError: Error(s) in loading state_dict for RNN_v2:
	size mismatch for embedding.weight: copying a param with shape torch.Size([12670, 300]) from checkpoint, the shape in current model is torch.Size([6664, 300]).

In [28]:
torch.save(model.state_dict(), 'rnn_50epochs.model')

new_model = RNN(ibc_vocab_size, ibc_num_labels, 300)
new_model.load_state_dict(torch.load('./rnn_50epochs.model'))

num_predictions = 0
num_correct = 0
with torch.no_grad():
    for bow_vec, target in ibc_test_data_rnn:
        probs = new_model(bow_vec)
        num_predictions += 1
        if (torch.argmax(probs) == target):
            num_correct += 1
print("after training:", num_correct/num_predictions*100)

after training: 57.9088471849866
