# 1. Dataset Loader

In [82]:
# Root directory of the sentiment-analysis datasets, relative to this notebook.
data_path = '../../dataset/sentiment_analysis/'

import os
import sys
# Put the IBC dataset folder at the front of sys.path before anything is unpickled.
# NOTE(review): presumably ibcData.pkl references a module that lives in that folder — confirm.
sys.path.insert(0, os.path.abspath('{}full_ibc'.format(data_path)))
import pickle

import numpy
import torch.utils.data as D

## 1.1 IBC

In [121]:
# Load the IBC corpus. The pickle unpacks into three collections of tree
# objects, one per label.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load a copy of ibcData.pkl that comes from a trusted source.
with open('{}full_ibc/ibcData.pkl'.format(data_path), 'rb') as ibc_file:
    # The context manager closes the handle; the bare open() previously leaked it.
    [lib, con, neutral] = pickle.load(ibc_file)

# Each example is a (list of whitespace-separated tokens, label string) pair.
ibc_lib = [(tree.get_words().split(), "LIBERAL") for tree in lib]
ibc_con = [(tree.get_words().split(), "CONSERVATIVE") for tree in con]
ibc_neutral = [(tree.get_words().split(), "NEUTRAL") for tree in neutral]

ibc_full_data = ibc_lib + ibc_con + ibc_neutral
ibc_full_size = len(ibc_full_data)

# 80/20 train/test split; the test split takes the remainder so the two sizes
# always add up to the full size.
ibc_train_size = int(ibc_full_size*0.8)
ibc_test_size = ibc_full_size - ibc_train_size

# NOTE(review): the split is not seeded, so train/test membership (and every
# number derived from it below) changes from run to run.
ibc_train_data, ibc_test_data = D.random_split(ibc_full_data, [ibc_train_size, ibc_test_size])

# Summary of the split sizes (full, train, test).
print(ibc_full_size, ibc_train_size, ibc_test_size)

4326 3460 866


In [123]:
# Label string -> class index used as the classification target.
ibc_label_to_id = {"LIBERAL": 0, "CONSERVATIVE": 1, "NEUTRAL": 2}

# Word -> index vocabulary, numbered in first-seen order over the training
# split only (words that appear only in the test split get no index).
ibc_word_to_id = {}
for tokens, _ in ibc_train_data:
    for token in tokens:
        # setdefault leaves the index of an already-known word untouched.
        ibc_word_to_id.setdefault(token, len(ibc_word_to_id))
print(len(ibc_word_to_id))

ibc_vocab_size = len(ibc_word_to_id)
ibc_num_labels = 3

15514


## 1.2 Convote

In [4]:
# Collect summary statistics over the Convote stage-one files.
# A file name is split on "_": field 0 is counted as the bill, field 1 as the
# speaker, and the first character of the last field is used as the party code.
convote_dir = os.path.join(data_path, "convote_v1.1", "data_stage_one")

bills = {}      # bill -> number of files
speakers = {}   # speaker -> number of files
parties = {'D': set(), 'R': set(), 'I': set()}  # party code -> set of speakers
sent_len = []    # token count of every line, all parties
d_sent_len = []  # token counts of lines from party 'D' files
r_sent_len = []  # token counts of lines from party 'R' files

# Per-file text is gathered in lists and joined once at the end, instead of
# growing two large strings with += inside the loop.
d_chunks = []
r_chunks = []

for directory in os.listdir(convote_dir):
    split_dir = os.path.join(convote_dir, directory)
    for filename in os.listdir(split_dir):
        filename_split = filename.split("_")
        bill = filename_split[0]
        speaker = filename_split[1]
        party = filename_split[-1][:1]

        bills[bill] = bills.get(bill, 0) + 1
        speakers[speaker] = speakers.get(speaker, 0) + 1
        # Raises KeyError for a party code other than D/R/I, as before.
        parties[party].add(speaker)

        # Read each file exactly once; both the flattened text and the
        # per-line lengths are derived from the same list of lines.
        with open(os.path.join(split_dir, filename), "r") as f:
            lines = f.readlines()

        # Lines keep their trailing newline, so the last token of a line
        # still counts as one token when splitting on single spaces.
        line_lengths = [len(line.split(' ')) for line in lines]
        sent_len.extend(line_lengths)

        if party == 'D':
            d_chunks.append("".join(lines).replace("\n", " "))
            d_sent_len.extend(line_lengths)
        elif party == 'R':
            r_chunks.append("".join(lines).replace("\n", " "))
            r_sent_len.extend(line_lengths)

# Concatenated text of all party 'D' / party 'R' files, newlines replaced by spaces.
d_sent = "".join(d_chunks)
r_sent = "".join(r_chunks)

print(len(bills), len(speakers), len(parties['R']), len(parties['D']), len(parties['I']))

# stopwords = set(STOPWORDS)
# stopwords.add("mr speaker")
# stopwords.add("bill")
# stopwords.add("mr chairman")
# stopwords.add("will")
# stopwords.add("one")
# stopwords.add("need")
# wc = WordCloud(stopwords=stopwords)
# r_wc = WordCloud(stopwords=stopwords).generate(r_sent)
# d_wc = WordCloud(stopwords=stopwords).generate(d_sent)

# print(r_wc.words_)
# print(d_wc.words_)

# print(len(sent_len))
# print(len(r_sent_len))
# print(len(d_sent_len))

53 401 212 188 1


# 2. Models

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## 2.1 Logistic Regression

In [106]:
# load data
# Toy Spanish/English examples; each item is a (list of tokens, label string) pair.
train_data = [("me gusta comer en la cafeteria".split(), "SPANISH"),
              ("Give it to me".split(), "ENGLISH"),
              ("No creo que sea una buena idea".split(), "SPANISH"),
              ("No it is not a good idea to get lost at sea".split(), "ENGLISH")]

test_data = [("Yo creo que si".split(), "SPANISH"),
             ("it is lost on me".split(), "ENGLISH")]

label_to_id = {"SPANISH": 0, "ENGLISH": 1}

# Word -> index vocabulary, numbered in first-seen order over the training
# examples followed by the test examples.
word_to_id = {}
# Iterate over train_data: the previous name `data` is not defined anywhere in
# this notebook and raised NameError on a fresh kernel.
for sent, _ in train_data + test_data:
    for word in sent:
        if word not in word_to_id:
            word_to_id[word] = len(word_to_id)
print(word_to_id)

VOCAB_SIZE = len(word_to_id)
NUM_LABELS = len(label_to_id)

{'me': 0, 'gusta': 1, 'comer': 2, 'en': 3, 'la': 4, 'cafeteria': 5, 'Give': 6, 'it': 7, 'to': 8, 'No': 9, 'creo': 10, 'que': 11, 'sea': 12, 'una': 13, 'buena': 14, 'idea': 15, 'is': 16, 'not': 17, 'a': 18, 'good': 19, 'get': 20, 'lost': 21, 'at': 22, 'Yo': 23, 'si': 24, 'on': 25}


In [99]:
class BoWClassifier(nn.Module):
    """Bag-of-words classifier: one linear layer followed by log-softmax."""

    def __init__(self, vocab_size, num_labels):
        """Create the linear layer mapping `vocab_size` inputs to `num_labels` scores."""
        super().__init__()
        self.linear = nn.Linear(vocab_size, num_labels)

    def forward(self, bow_vec):
        """Return log-probabilities over the labels, normalised along the last dimension."""
        scores = self.linear(bow_vec)
        return F.log_softmax(scores, dim=-1)

def make_bow_vec(sentence, word_to_id):
    """Build a bag-of-words count vector for `sentence`.

    Args:
        sentence: iterable of word tokens.
        word_to_id: mapping from word to its index in the vector.

    Returns:
        A tensor of shape (1, len(word_to_id)) holding per-word counts.
        Words missing from `word_to_id` are ignored.
    """
    counts = torch.zeros(len(word_to_id))
    for token in sentence:
        index = word_to_id.get(token)
        if index is None:
            # Out-of-vocabulary token: contributes nothing.
            continue
        counts[index] += 1
    return counts.view(1, -1)

def make_target(label, label_to_id):
    """Return the class index of `label` as a one-element int64 tensor.

    Raises KeyError when `label` is not a key of `label_to_id`.
    """
    class_index = label_to_id[label]
    return torch.tensor([class_index], dtype=torch.long)

### 2.1.1 IBC Logistic Regression

In [131]:
# first run
NUM_EPOCHS = 100      # full passes over the training split
LEARNING_RATE = 0.1   # SGD step size


def evaluate_accuracy(model, dataset, word_to_id, label_to_id):
    """Return the accuracy of `model` on `dataset` as a percentage.

    Args:
        model: callable mapping a bag-of-words vector to per-label scores.
        dataset: iterable of (sentence, label) pairs.
        word_to_id: vocabulary passed to make_bow_vec.
        label_to_id: mapping from label string to class index.

    Returns:
        100 * correct / total, or NaN when `dataset` yields no examples
        (instead of raising ZeroDivisionError).
    """
    num_predictions = 0
    num_correct = 0
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for sentence, label in dataset:
            bow_vec = make_bow_vec(sentence, word_to_id)
            log_probs = model(bow_vec)
            num_predictions += 1
            # The predicted class is the index of the highest score.
            if torch.argmax(log_probs).item() == label_to_id[label]:
                num_correct += 1
    if num_predictions == 0:
        return float("nan")
    return num_correct / num_predictions * 100


model = BoWClassifier(ibc_vocab_size, ibc_num_labels)

print("before training")
print(evaluate_accuracy(model, ibc_test_data, ibc_word_to_id, ibc_label_to_id))

# The model outputs log-probabilities, which is the input NLLLoss expects.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

for epoch in range(NUM_EPOCHS):
    # One parameter update per training example.
    for sentence, label in ibc_train_data:
        # Clear gradients accumulated by the previous example.
        model.zero_grad()

        bow_vec = make_bow_vec(sentence, ibc_word_to_id)
        target = make_target(label, ibc_label_to_id)

        probs = model(bow_vec)

        loss = loss_function(probs, target)
        loss.backward()
        optimizer.step()
    print(epoch)

print("after training")
print(evaluate_accuracy(model, ibc_test_data, ibc_word_to_id, ibc_label_to_id))

before training
29.330254041570434
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
after training
51.03926096997691


### 2.1.2 Convote Logistic Regression

## 2.2 RNN