# 1. Dataset Loader

In [1]:
data_path = '../../dataset/'

import os
import sys
sys.path.insert(0, os.path.abspath('{}full_ibc'.format(data_path)))
import pickle
import numpy
import random
import json

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
print(stopwords.words('english'))

from gensim.models import Word2Vec

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [2]:
def make_bow_vec(sentence, word_to_id):
    vec = torch.zeros(len(word_to_id), device=device)
    for word in sentence:
        word_lower = word.lower()
        if word_lower in word_to_id.keys():
            vec[word_to_id[word_lower]] += 1
    return vec.view(1, -1)

def make_word_index(sentence, word_to_id):
    return torch.tensor([word_to_id[word] for word in sentence if word in word_to_id]).to(device)

def make_word2vec_embed(sentence, word_to_id):
    return torch.tensor([word_to_id[word] for word in sentence if word in word_to_id]).to(device)

def make_target(label, label_to_id):
    return torch.LongTensor([label_to_id[label]]).to(device)

def random_swap(sent):
    sent_len = len(sent)
    pos1 = random.randint(0, sent_len - 1)
    pos2 = random.randint(0, sent_len - 1)
    while pos1 == pos2:
        pos2 = random.randint(0, sent_len - 1)

    tmp = sent[pos1]
    sent[pos1] = sent[pos2]
    sent[pos2] = tmp
    return sent

## 1.1 IBC

In [27]:
# for lr model
[lib, con, neutral] = pickle.load(open('{}full_ibc/ibcData.pkl'.format(data_path), 'rb'))

ibc_lib = [(tree.get_words().split(), "LIBERAL") for tree in lib]
ibc_con = [(tree.get_words().split(), "CONSERVATIVE") for tree in con]
# ibc_neutral = [(tree.get_words().split(), "NEUTRAL") for tree in neutral]

ibc_full_data = ibc_lib + ibc_con
ibc_full_size = len(ibc_full_data)

ibc_train_size = int(ibc_full_size*0.9)
ibc_test_size = ibc_full_size - ibc_train_size

ibc_train_data, ibc_test_data = torch.utils.data.random_split(ibc_full_data, [ibc_train_size, ibc_test_size])

ibc_label_to_id = {"LIBERAL": 0, "CONSERVATIVE": 1}
ibc_word_to_id = {}
for sent, _ in ibc_train_data:
    for word in sent:
        word_lower = word.lower()
#         if word_lower not in ibc_word_to_id:
        if word_lower not in ibc_word_to_id and word_lower not in stopwords.words('english'):
            ibc_word_to_id[word_lower] = len(ibc_word_to_id)

ibc_vocab_size = len(ibc_word_to_id)
ibc_num_labels = 2

print('data size:', len(ibc_full_data))
print('dict size:', len(ibc_word_to_id))

data size: 3726
dict size: 13552


In [None]:
with open('vocab.json', 'w') as f:
    json.dump(ibc_word_to_id, f)

In [22]:
with open('vocab.json', 'r') as f:
    vocab = json.load(f)
    print(len(vocab))

13586


In [94]:
def generate_data_rnn(cutoff_freq=1, use_word2vec=False, embed_size=100):
    torch.manual_seed(0)
    
    # for rnn model: remove infrequent words from dictionary
    [lib, con, neutral] = pickle.load(open('{}full_ibc/ibcData.pkl'.format(data_path), 'rb'))

    tokenizer = nltk.RegexpTokenizer(r"\w+")
    ibc_lib = [(tokenizer.tokenize(tree.get_words().lower()), "LIBERAL") for tree in lib]
    ibc_con = [(tokenizer.tokenize(tree.get_words().lower()), "CONSERVATIVE") for tree in con]
    # ibc_neutral = [(tree.get_words().split(), "NEUTRAL") for tree in neutral]
    ibc_full_data = ibc_lib + ibc_con

    ibc_lib_size = len(ibc_lib)
    ibc_lib_train_size = int(ibc_lib_size*0.8)
    ibc_lib_val_size = int(ibc_lib_size*0.1)
    ibc_lib_test_size = ibc_lib_size - ibc_lib_train_size - ibc_lib_val_size
    ibc_lib_train_data, ibc_lib_val_data, ibc_lib_test_data = torch.utils.data.random_split(ibc_lib, [ibc_lib_train_size, ibc_lib_val_size, ibc_lib_test_size])
    
    ibc_con_size = len(ibc_con)
    ibc_con_train_size = int(ibc_con_size*0.8)
    ibc_con_val_size = int(ibc_con_size*0.1)
    ibc_con_test_size = ibc_con_size - ibc_con_train_size - ibc_con_val_size
    ibc_con_train_data, ibc_con_val_data, ibc_con_test_data = torch.utils.data.random_split(ibc_con, [ibc_con_train_size, ibc_con_val_size, ibc_con_test_size])

    ibc_train_data = ibc_lib_train_data + ibc_con_train_data
#     # data aug
#     ibc_aug_data = []
#     for sent, label in ibc_train_data:
#         ibc_aug_data.append((random_swap(sent), label))
#     ibc_train_data += ibc_aug_data
    
    ibc_val_data = ibc_lib_val_data + ibc_con_val_data
    ibc_test_data = ibc_lib_test_data + ibc_con_test_data
    ibc_labels = {"LIBERAL": 0, "CONSERVATIVE": 1}
    print('training data size:', len(ibc_train_data))
    print('validation data size:', len(ibc_val_data))
    print('test data size:', len(ibc_test_data))
    
    if use_word2vec:
        ibc_vocab = Word2Vec([sent for sent, _ in ibc_train_data], min_count=2, size=embed_size).wv
        print('vocab size:', len(ibc_vocab.vocab))
        
        ibc_train_data_loader = torch.utils.data.DataLoader([(make_word2vec_embed(sentence, ibc_vocab),
                                                              make_target(label, ibc_labels))
                                                             for sentence, label in ibc_train_data], shuffle=True)
        ibc_val_data_loader = torch.utils.data.DataLoader([(make_word2vec_embed(sentence, ibc_vocab),
                                                            make_target(label, ibc_labels))
                                                           for sentence, label in ibc_val_data])
        ibc_test_data_loader = torch.utils.data.DataLoader([(make_word2vec_embed(sentence, ibc_vocab),
                                                             make_target(label, ibc_labels))
                                                            for sentence, label in ibc_test_data])
    else:
        ibc_word_count = {}
        for sent, _ in ibc_train_data:
            for word in sent:
                if word in stopwords.words('english'):
                    continue
                if word not in ibc_word_count:
                    ibc_word_count[word] = 1
                else:
                    ibc_word_count[word] += 1

        ibc_vocab = {}
        for word, count in ibc_word_count.items():
            if count > cutoff_freq:
                ibc_vocab[word] = len(ibc_vocab)
        print('vocab size:', len(ibc_vocab))

        ibc_train_data_loader = torch.utils.data.DataLoader([(make_word_index(sentence, ibc_vocab),
                                                              make_target(label, ibc_labels))
                                                             for sentence, label in ibc_train_data], shuffle=True, worker_init_fn=0)
        ibc_val_data_loader = torch.utils.data.DataLoader([(make_word_index(sentence, ibc_vocab),
                                                            make_target(label, ibc_labels))
                                                           for sentence, label in ibc_val_data])
        ibc_test_data_loader = torch.utils.data.DataLoader([(make_word_index(sentence, ibc_vocab),
                                                             make_target(label, ibc_labels))
                                                            for sentence, label in ibc_test_data])
    return ibc_vocab, ibc_train_data_loader, ibc_val_data_loader, ibc_test_data_loader

## 1.2 Convote

In [2]:
convote_train_lib = []
convote_train_con = []
convote_train_data = []

for filename in os.listdir("../../dataset/sentiment_analysis/convote_v1.1/data_stage_one/training_set"):
    filename_split = filename.split("_")
    party = filename_split[-1][:1]
    if party == 'D':
        with open("{}convote_v1.1/data_stage_one/training_set/{}".format(data_path, filename), "r") as f:
            convote_train_lib += f.readlines()
    if party == 'R':
        with open("{}convote_v1.1/data_stage_one/training_set/{}".format(data_path, filename), "r") as f:
            convote_train_con += f.readlines()

convote_train_lib = [(line.split(), "LIBERAL") for line in convote_train_lib]
convote_train_con = [(line.split(), "CONSERVATIVE") for line in convote_train_con]
convote_train_data = convote_train_lib + convote_train_con

print(len(convote_train_data))

65015


In [3]:
convote_test_lib = []
convote_test_con = []
convote_test_data = []

for filename in os.listdir("../../dataset/sentiment_analysis/convote_v1.1/data_stage_one/test_set"):
    filename_split = filename.split("_")
    party = filename_split[-1][:1]
    if party == 'D':
        with open("{}convote_v1.1/data_stage_one/test_set/{}".format(data_path, filename), "r") as f:
            convote_test_lib += f.readlines()
    if party == 'R':
        with open("{}convote_v1.1/data_stage_one/test_set/{}".format(data_path, filename), "r") as f:
            convote_test_con += f.readlines()

convote_test_lib = [(line.split(), "LIBERAL") for line in convote_test_lib]
convote_test_con = [(line.split(), "CONSERVATIVE") for line in convote_test_con]
convote_test_data = convote_test_lib + convote_test_con

print(len(convote_test_data))

22098


In [4]:
convote_label_to_id = {"LIBERAL": 0, "CONSERVATIVE": 1}
convote_word_to_id = {}
for sent, _ in convote_train_data:
    for word in sent:
        if word not in convote_word_to_id:
            convote_word_to_id[word] = len(convote_word_to_id)
print(len(convote_word_to_id))

convote_vocab_size = len(convote_word_to_id)
convote_num_labels = 2

26804


# 2 Models

In [68]:
def train(model, train_data, valid_data, test_data, loss_function, optimizer, num_epochs=50, max_patience=5, lr_decay=0.5):
    torch.manual_seed(0)
    
    print('Before training:', eval(model, test_data, loss_function))
    best_valid_accuracy = 0
    patience = 0
    
    for epoch in range(num_epochs):
        iters = 0
        total_loss = 0
        
        for bow_vec, target in train_data:
            model.zero_grad()

            probs = model(bow_vec[0])
            loss = loss_function(probs, target[0])
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            iters += 1

        # begin validation
        valid_loss, valid_accuracy = eval(model, valid_data, loss_function)
        if valid_accuracy > best_valid_accuracy:
            patience = 0
            best_valid_accuracy = valid_accuracy
            torch.save(model, 'rnn_best_valid.model')
            print('[{}] Saved best model'.format(epoch + 1))
#         else:
#             patience += 1
#             print('[{}] Hit patience {}'.format(epoch + 1, patience))
            
#             if patience == max_patience:
#                 optimizer.param_groups[0]['lr'] *= lr_decay
# #                 model = torch.load('rnn_best_valid.model')
#                 patience = 0
#                 print('[{}] Hit max paitence, learning rate decayed to {}'.format(epoch + 1, optimizer.param_groups[0]['lr']))

        print('[{}] training loss: {}, validation loss: {}, validation accuracy: {}'.format(epoch + 1, total_loss / iters, valid_loss, valid_accuracy))
    
    print('After training:', eval(model, test_data, loss_function))
        

def eval(model, data, loss_function):
    model.eval()
    num_predictions = 0
    num_correct = 0
    iters = 0
    total_loss = 0
    for sentence, target in data:
        with torch.no_grad():
            probs = model(sentence[0])
            loss = loss_function(probs, target[0])
        iters += 1
        total_loss += loss.item()
        num_predictions += 1
#         print(torch.argmax(probs))
        if (torch.argmax(probs) == target[0]):
            num_correct += 1
    model.train()
    return total_loss/iters, num_correct/num_predictions*100

## 2.1 Logistic Regression

In [9]:
class BoWClassifier(nn.Module):
    def __init__(self, vocab, output_size):
        super(BoWClassifier, self).__init__()
        self.vocab = vocab
        self.input_size = len(vocab)
        self.output_size = output_size
        self.linear = nn.Linear(self.input_size, self.output_size).to(device)
        
    def forward(self, bow_vec):
        return F.log_softmax(self.linear(bow_vec), dim=-1)

### 2.1.1 IBC Logistic Regression

In [28]:
ibc_train_data_lr = [(make_bow_vec(sentence, ibc_word_to_id), make_target(label, ibc_label_to_id)) for sentence, label in ibc_train_data]
ibc_test_data_lr = [(make_bow_vec(sentence, ibc_word_to_id), make_target(label, ibc_label_to_id)) for sentence, label in ibc_test_data]

In [29]:
# first run
model = BoWClassifier(ibc_word_to_id, ibc_num_labels)

num_predictions = 0
num_correct = 0
with torch.no_grad():
    for bow_vec, target in ibc_test_data_lr:
        probs = model(bow_vec)
        num_predictions += 1
        if (torch.argmax(probs) == target):
            num_correct += 1
print("before training:", num_correct/num_predictions*100)
        
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

for epoch in range(100):
    for bow_vec, target in ibc_train_data_lr:
        model.zero_grad()
        
        probs = model(bow_vec)
        
        loss = loss_function(probs, target)
        loss.backward()
        optimizer.step()
    if (epoch+1) % 10 == 0:
        print(epoch+1)

num_predictions = 0
num_correct = 0
with torch.no_grad():
    for bow_vec, target in ibc_test_data_lr:
        probs = model(bow_vec)
        num_predictions += 1
        if (torch.argmax(probs) == target):
            num_correct += 1
print("after training:", num_correct/num_predictions*100)

before training: 50.67024128686327
10
20
30
40
50
60
70
80
90
100
after training: 65.14745308310992


In [30]:
torch.save(model.state_dict(), 'lr.model')

### 2.1.2 Convote Logistic Regression

In [6]:
convote_train_data_lr = [(make_bow_vec(sentence, convote_word_to_id), make_target(label, convote_label_to_id)) for sentence, label in convote_train_data]
convote_test_data_lr = [(make_bow_vec(sentence, convote_word_to_id), make_target(label, convote_label_to_id)) for sentence, label in convote_test_data]

NameError: name 'convote_train_data' is not defined

In [None]:
# first run
model = BoWClassifier(convote_vocab_size, convote_num_labels)

print("before training")
num_predictions = 0
num_correct = 0
with torch.no_grad():
    for bow_vec, target in convote_test_data_lr:
        probs = model(bow_vec)
        num_predictions += 1
        if (torch.argmax(probs) == target):
            num_correct += 1
print(num_correct/num_predictions*100)
        
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

for epoch in range(50):
    for bow_vec, target in convote_train_data_lr:
        model.zero_grad()
        
        probs = model(bow_vec)
        
        loss = loss_function(probs, target)
        loss.backward()
        optimizer.step()
    print(epoch)

print("after training")
num_predictions = 0
num_correct = 0
with torch.no_grad():
    for bow_vec, target in convote_test_data_lr:
        bow_vec = make_bow_vec(sentence, convote_word_to_id)
        probs = model(bow_vec)
        num_predictions += 1
        if (torch.argmax(probs) == target):
            num_correct += 1
print(num_correct/num_predictions*100)

## 2.2 RNN

In [5]:
class RNN(nn.Module):
    def __init__(self, vocab, output_size, hidden_size):
        super(RNN, self).__init__()

        self.vocab = vocab
        self.input_size = len(vocab)
        self.output_size = output_size
        self.hidden_size = hidden_size
        
        self.dropout = nn.Dropout()
        self.embedding = nn.Embedding(self.input_size, self.hidden_size).to(device)
        self.left = nn.Linear(self.hidden_size, self.hidden_size).to(device)
        self.right = nn.Linear(self.hidden_size, self.hidden_size).to(device)
        self.cat = nn.Linear(self.hidden_size, self.output_size).to(device)

    def forward(self, sentence):
        embedding = self.dropout(self.embedding(sentence))
        hidden = embedding[0]
        for i in range(1, len(embedding)):
            hidden = torch.tanh(torch.add(self.left(hidden), self.right(embedding[i])))
        return F.log_softmax(self.cat(self.dropout(hidden)), dim=-1).view(1, -1)

In [134]:
from torchnlp.nn import LockedDropout

class RNN_v2(nn.Module):
    def __init__(self, vocab, output_size, hidden_size):
        torch.manual_seed(0)
        super(RNN_v2, self).__init__()

        self.vocab = vocab
        self.input_size = len(vocab)
        self.output_size = output_size
        self.hidden_size = hidden_size
        
        # dropout
#         self.dropout = nn.Dropout()
#         self.dropout = LockedDropout()
        self.embedding = nn.Embedding(self.input_size, self.hidden_size).to(device)
        self.rnn = nn.RNN(self.hidden_size, self.hidden_size).to(device)
        self.cat = nn.Linear(self.hidden_size, self.output_size).to(device)
        
        # custome init
        torch.nn.init.zeros_(self.rnn.weight_ih_l0)
        torch.nn.init.zeros_(self.rnn.weight_hh_l0)
        torch.nn.init.zeros_(self.rnn.bias_ih_l0)
        torch.nn.init.zeros_(self.rnn.bias_hh_l0)
        torch.nn.init.zeros_(self.cat.weight)
        torch.nn.init.zeros_(self.cat.bias)

    def forward(self, sentence):
        embedding = self.embedding(sentence).unsqueeze(1)
#         embedding = self.dropout(embedding)
        output, hidden = self.rnn(embedding)
        output = output
#         output = self.dropout(output)
        return F.log_softmax(self.cat(output.squeeze()[-1]), dim=-1).view(1, -1)

In [7]:
class RNN_v3(nn.Module):
    def __init__(self, vocab, output_size, hidden_size):
        super(RNN_v3, self).__init__()

        self.vocab = vocab
        self.input_size = len(vocab)
        self.output_size = output_size
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(self.input_size, self.hidden_size).to(device)
        self.linear = nn.Linear(2 * self.hidden_size, self.hidden_size).to(device)
        self.cat = nn.Linear(self.hidden_size, self.output_size).to(device)

    def forward(self, sentence):
        embedding = self.embedding(sentence)
        hidden = embedding[0]
        for i in range(1, len(embedding)):
            hidden = torch.tanh(self.linear(torch.cat((hidden, embedding[i]))))
        return F.log_softmax(self.cat(hidden), dim=-1).view(1, -1)

In [None]:
class RNN_Word2Vec(nn.Module):
    def __init__(self, vocab, output_size, hidden_size):
        super(RNN_Word2Vec, self).__init__()
        
        self.vocab = vocab
        self.input_size = len(vocab.vocab)
        self.output_size = output_size
        self.hidden_size = hidden_size
        
        self.dropout = nn.Dropout()
        self.rnn = nn.RNN(self.hidden_size, self.hidden_size).to(device)
        self.cat = nn.Linear(self.hidden_size, self.output_size).to(device)
        
    def forward(self, sentence):
        embedding = sentence.unsqueeze(1)
        embedding = self.dropout(embedding)
        output, hidden = self.rnn(embedding)
        output = output.squeeze()[-1]
        output = self.dropout(output)
        return F.log_softmax(self.cat(output), dim=-1).view(1, -1)

### 2.2.1 IBC RNN

In [95]:
ibc_vocab, ibc_train_data_rnn, ibc_val_data_rnn, ibc_test_data_rnn = generate_data_rnn(cutoff_freq=0)

training data size: 2980
validation data size: 372
test data size: 374
vocab size: 11952


In [136]:
ibc_num_labels = 2
model = RNN_v2(ibc_vocab, ibc_num_labels, 150)
# model.load_state_dict(torch.load('./rnn_50epochs_v2.model'))

loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)
num_epochs = 50

train(model, ibc_train_data_rnn, ibc_val_data_rnn, ibc_test_data_rnn, loss_function, optimizer, num_epochs)

Before training: (0.6931471824645996, 45.72192513368984)
[1] Saved best model
[1] training loss: 0.6908004702537652, validation loss: 0.6896023843237149, validation accuracy: 54.3010752688172
[2] training loss: 0.6894991918298222, validation loss: 0.6894729521966749, validation accuracy: 54.3010752688172
[3] training loss: 0.6895190922805927, validation loss: 0.6894463204568432, validation accuracy: 54.3010752688172
[4] training loss: 0.6895136769585961, validation loss: 0.6894527289175219, validation accuracy: 54.3010752688172
[5] training loss: 0.6894779929178673, validation loss: 0.6894502453906561, validation accuracy: 54.3010752688172
[6] training loss: 0.6895704201603896, validation loss: 0.6894519008615966, validation accuracy: 54.3010752688172
[7] training loss: 0.689555600245527, validation loss: 0.6894432447289908, validation accuracy: 54.3010752688172
[8] training loss: 0.6895415153279401, validation loss: 0.6894592001873959, validation accuracy: 54.3010752688172
[9] trainin

In [104]:
print('training loss and accuracy:', eval(model, ibc_train_data_rnn, loss_function))
print('testing loss and accuracy', eval(model, ibc_test_data_rnn, loss_function))

training loss and accuracy: (0.3147436384536676, 87.28187919463087)
testing loss and accuracy (0.8275716159353281, 59.35828877005348)


In [105]:
new_model = torch.load('./rnn_best_valid.model')
print(eval(new_model, ibc_test_data_rnn, loss_function))

(0.8393489906893057, 59.35828877005348)


### IBC RNN Word2Vec

In [88]:
ibc_vocab, ibc_train_data_rnn, ibc_val_data_rnn, ibc_test_data_rnn = generate_data_rnn(use_word2vec=True, embed_size=100)

training data size: 2980
validation data size: 372
test data size: 374
vocab size: 6400


In [91]:
ibc_num_labels = 2
model = RNN_Word2Vec(ibc_vocab, ibc_num_labels, 100)

loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
num_epochs = 50

train(model, ibc_train_data_rnn, ibc_val_data_rnn, ibc_test_data_rnn, loss_function, optimizer, num_epochs)

Before training: (0.6927840488160996, 53.475935828877006)
[1] Saved best model
[1] training loss: 0.6960849377732949, validation loss: 0.6893330195578196, validation accuracy: 54.3010752688172
[2] training loss: 0.6917633099843992, validation loss: 0.6905470794887953, validation accuracy: 54.3010752688172
[3] Saved best model
[3] training loss: 0.6918588072861601, validation loss: 0.6869211326683721, validation accuracy: 54.83870967741935
[4] training loss: 0.6917449307321702, validation loss: 0.6906568300659939, validation accuracy: 54.3010752688172


KeyboardInterrupt: 

In [87]:
print('training loss and accuracy:', eval(model, ibc_train_data_rnn, loss_function))
print('testing loss and accuracy', eval(model, ibc_test_data_rnn, loss_function))

tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
t

In [57]:
new_model = torch.load('./rnn_best_valid.model')
print(eval(new_model, ibc_test_data_rnn, loss_function))

(0.6905160562239866, 54.01069518716578)


In [None]:
for sent, _ in ibc_train_data_rnn:
    print(sent[0].unsqueeze(1).shape)

## 3 Decision tree

In [3]:
from sklearn.tree import DecisionTreeClassifier as DCT

In [38]:
def make_bow_vec_dt(sentence, word_to_id):
    vec = [0] * len(word_to_id)
    for word in sentence:
        word_lower = word.lower()
        if word_lower in word_to_id.keys():
            vec[word_to_id[word_lower]] += 1
    return vec

def make_target_dt(label, label_to_id):
    return label_to_id[label]

def generate_data_dt(cutoff_freq=0):
    torch.manual_seed(0)
    
    [lib, con, neutral] = pickle.load(open('{}full_ibc/ibcData.pkl'.format(data_path), 'rb'))

    ibc_lib = [(tree.get_words().split(), "LIBERAL") for tree in lib]
    ibc_con = [(tree.get_words().split(), "CONSERVATIVE") for tree in con]
    # ibc_neutral = [(tree.get_words().split(), "NEUTRAL") for tree in neutral]

    ibc_full_data = ibc_lib + ibc_con
    ibc_full_size = len(ibc_full_data)

    ibc_train_size = int(ibc_full_size*0.9)
    ibc_test_size = ibc_full_size - ibc_train_size

    ibc_train_data, ibc_test_data = torch.utils.data.random_split(ibc_full_data, [ibc_train_size, ibc_test_size])

    ibc_label_to_id = {"LIBERAL": 0, "CONSERVATIVE": 1}
    ibc_word_to_id = {}
    for sent, _ in ibc_train_data:
        for word in sent:
            word_lower = word.lower()
    #         if word_lower not in ibc_word_to_id:
            if word_lower not in ibc_word_to_id and word_lower not in stopwords.words('english'):
                ibc_word_to_id[word_lower] = len(ibc_word_to_id)
                
    ibc_vocab_size = len(ibc_word_to_id)
    ibc_num_labels = 2

    ibc_train_data = [(make_bow_vec_knn(sentence, ibc_word_to_id), make_target_knn(target, ibc_label_to_id)) for sentence, target in ibc_train_data]
    ibc_test_data = [(make_bow_vec_knn(sentence, ibc_word_to_id), make_target_knn(target, ibc_label_to_id)) for sentence, target in ibc_test_data]

    print('data size:', len(ibc_full_data))
    print('dict size:', ibc_vocab_size)
    
    return ibc_word_to_id, ibc_train_data, ibc_test_data

In [39]:
ibc_vocab, ibc_train_data, ibc_test_data = generate_data_dt()

data size: 3726
dict size: 13491


In [58]:
def train(model, train_data):
    x = []
    y = []
    
    for sentence, label in train_data:
        x.append(sentence)
        y.append(label)

#     print(y[0])
    model.fit(x, y)
    
def eval(model, test_data):
    x = []
    y = []
    
    for sentence, label in test_data:
        x.append(sentence)
        y.append(label)
        
    predictions = model.predict(x)
    
    correct_prediction = 0
    idx = 0
    while idx < len(x):
        if predictions[idx] == y[idx]:
            correct_prediction += 1
        idx += 1
        
    return correct_prediction / idx

In [47]:
dt = DCT()
train(dt, ibc_train_data)

In [59]:
print(eval(dt, ibc_test_data))

0.6193029490616622


# KNN

In [60]:
from sklearn.neighbors import KNeighborsClassifier

In [61]:
def make_bow_vec_knn(sentence, word_to_id):
    vec = [0] * len(word_to_id)
    for word in sentence:
        word_lower = word.lower()
        if word_lower in word_to_id.keys():
            vec[word_to_id[word_lower]] += 1
    return vec

def make_target_knn(label, label_to_id):
    return label_to_id[label]

def generate_data_knn(cutoff_freq=0):
    torch.manual_seed(0)
    
    [lib, con, neutral] = pickle.load(open('{}full_ibc/ibcData.pkl'.format(data_path), 'rb'))

    ibc_lib = [(tree.get_words().split(), "LIBERAL") for tree in lib]
    ibc_con = [(tree.get_words().split(), "CONSERVATIVE") for tree in con]
    # ibc_neutral = [(tree.get_words().split(), "NEUTRAL") for tree in neutral]

    ibc_full_data = ibc_lib + ibc_con
    ibc_full_size = len(ibc_full_data)

    ibc_train_size = int(ibc_full_size*0.9)
    ibc_test_size = ibc_full_size - ibc_train_size

    ibc_train_data, ibc_test_data = torch.utils.data.random_split(ibc_full_data, [ibc_train_size, ibc_test_size])

    ibc_label_to_id = {"LIBERAL": 0, "CONSERVATIVE": 1}
    ibc_word_to_id = {}
    for sent, _ in ibc_train_data:
        for word in sent:
            word_lower = word.lower()
    #         if word_lower not in ibc_word_to_id:
            if word_lower not in ibc_word_to_id and word_lower not in stopwords.words('english'):
                ibc_word_to_id[word_lower] = len(ibc_word_to_id)
                
    ibc_vocab_size = len(ibc_word_to_id)
    ibc_num_labels = 2

    ibc_train_data = [(make_bow_vec_knn(sentence, ibc_word_to_id), make_target_knn(target, ibc_label_to_id)) for sentence, target in ibc_train_data]
    ibc_test_data = [(make_bow_vec_knn(sentence, ibc_word_to_id), make_target_knn(target, ibc_label_to_id)) for sentence, target in ibc_test_data]

    print('data size:', len(ibc_full_data))
    print('dict size:', ibc_vocab_size)
    
    return ibc_word_to_id, ibc_train_data, ibc_test_data

In [63]:
ibc_vocab, ibc_train_data, ibc_test_data = generate_data_knn()

data size: 3726
dict size: 13491


In [64]:
def train(model, train_data):
    x = []
    y = []
    
    for sentence, label in train_data:
        x.append(sentence)
        y.append(label)

    model.fit(x, y)
    
def eval(model, test_data):
    x = []
    y = []
    
    for sentence, label in test_data:
        x.append(sentence)
        y.append(label)
        
    predictions = model.predict(x)
    
    correct_prediction = 0
    idx = 0
    while idx < len(x):
        if predictions[idx] == y[idx]:
            correct_prediction += 1
        idx += 1
        
    return correct_prediction / idx

In [72]:
knn = KNeighborsClassifier(n_neighbors=2)
train(knn, ibc_train_data)

In [73]:
print(eval(knn, ibc_test_data))

0.5764075067024129
