In [1]:
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rifkiaputri/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/rifkiaputri/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
stop_words = set(stopwords.words('english')).union(set(ENGLISH_STOP_WORDS))

In [3]:
from torch.utils.data import Dataset, DataLoader
import csv
import json
import re
import os

class Mydataset(Dataset):
    def __init__(self, train=True):
        def clean_str(string):
            return " ".join(re.findall(r'\w+', string, flags=re.UNICODE)).lower()
        if train:
            print('loading training dataset')
            bf = 'train_bodies.csv'
            sf = 'train_stances.csv'
        else:
            print('loading testing dataset')
            bf = 'competition_test_bodies.csv'
            sf = 'competition_test_stances.csv'
            
        with open(os.path.join('fnc-1', sf), 'r', newline='', encoding='utf-8') as myFile:  
            rdr = csv.reader(myFile)
            next(rdr)
            temp = list(rdr)
            self.stances = [[clean_str(a[0]), a[1], a[2]] for a in temp]
            print(len(self.stances), 'stances')

        with open(os.path.join('fnc-1', bf), 'r', newline='', encoding='utf-8') as myFile:  
            rdr = csv.reader(myFile)
            next(rdr)
            temp = list(rdr)
            self.bodies = dict([[a[0], clean_str(a[1])]for a in temp])
            print(len(self.bodies), 'bodies')

        self.len = len(self.stances)
        self.labels = list(sorted(set([t[2] for t in self.stances])))
        
    def __getitem__(self, index):
        return self.stances[index][0], self.bodies[self.stances[index][1]], self.stances[index][2]
    
    def __len__(self):
        return self.len
        
    def get_labels(self):
        return self.labels
    
    def get_label(self, id):
        return self.labels[id]
    
    def get_label_id(self, label):
        return self.labels.index(label)

In [4]:
train_dataset = Mydataset()
test_dataset = Mydataset(train=False)

loading training dataset
49972 stances
1683 bodies
loading testing dataset
25413 stances
904 bodies


In [6]:
BATCH_SIZE = 128

train_loader = DataLoader(dataset=train_dataset,
                          batch_size=BATCH_SIZE, shuffle=True)

test_loader = DataLoader(dataset=test_dataset,
                         batch_size=BATCH_SIZE, shuffle=False)

N_LABELS = len(train_dataset.get_labels())

In [7]:
import word2vec.wordvector as w2v

class args:
    pass

args = args()

# Assign embedding param in different cell because it takes some time to load
args.embed, args.embeding_num, args.embeding_dim = w2v.get_embedding(vec_file='./word2vec/data/vec_50.txt', dim=50)

Read vector file...
Initialize word vector array...
Convert word vector to tensor...


In [8]:
args.class_num = N_LABELS
args.kernel_num = 16
args.kernel_sizes = [3,4,5]
args.dropout = 0.5
args.static = True

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [10]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [11]:
word_dic = {}
with open(os.path.join('word2vec', 'data', 'word_to_id.txt'), 'r', encoding='utf-8') as f:
    i = 1
    for line in f:
        word = line.strip()
        word_dic[word] = i
        i += 1

def get_id(word):
    return word_dic.get(word, 0)

_wnl = nltk.WordNetLemmatizer()

def normalize_word(w):
    return _wnl.lemmatize(w).lower()

def get_tokenized_lemmas(s):
    return [normalize_word(t) for t in nltk.word_tokenize(s)]

def build_tensor(titles, bodys, labels):
    label_t = torch.tensor([train_dataset.get_label_id(l) for l in labels], dtype=torch.long, device=device)
    
    title_t = [torch.tensor([get_id(w) for w in get_tokenized_lemmas(title) if w not in stop_words], dtype=torch.long, device=device) for title in titles]
    title_l = [a.shape[0] for a in title_t]
    title_max = max(title_l)
    title_p = [title_max - a for a in title_l]
    title_t = [F.pad(a.view(1,1,1,-1), (0, title_p[i], 0, 0)).view(1,-1) for i, a in enumerate(title_t)]
    
    body_t = [torch.tensor([get_id(w) for w in get_tokenized_lemmas(body) if w not in stop_words], dtype=torch.long, device=device) for body in bodys]
    body_l = [a.shape[0] for a in body_t]
    body_max = max(body_l)
    body_p = [body_max - a for a in body_l]
    body_t = [F.pad(a.view(1,1,1,-1), (0, body_p[i], 0, 0)).view(1,-1) for i, a in enumerate(body_t)]
    return torch.cat(title_t, 0), torch.cat(body_t, 0), label_t 

In [12]:
def eval(test_loader, model, args):
    model.eval()
    corrects, avg_loss, total = 0, 0, 0
    for title, body, label in test_loader:
        title_t, body_t, label_t = build_tensor(title, body, label)
        logit = model(title_t, body_t)
        loss = F.cross_entropy(logit, label_t, size_average=False)
        avg_loss += loss.data[0]
        corrects += (torch.max(logit, 1)[1].view(label_t.size()).data == label_t.data).sum()
        total += len(title)

    size = len(test_loader.dataset)
    avg_loss /= size
    accuracy = 100.0 * corrects/size
    print('Evaluation - loss: {:.6f}  acc: {:.4f}%({}/{})'.format(avg_loss, 
                                                                       accuracy, 
                                                                       corrects, 
                                                                       size))
    return accuracy

In [13]:
def save(model, save_dir, save_prefix, steps):
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    save_prefix = os.path.join(save_dir, save_prefix)
    save_path = '{}_steps_{}.pt'.format(save_prefix, steps)
    torch.save(model.state_dict(), save_path)

In [28]:
class CNN_Text(nn.Module):
    
    def __init__(self, args):
        super(CNN_Text, self).__init__()
        self.args = args
        
        C = args.class_num
        Ci = 1
        Co = args.kernel_num
        Ks = args.kernel_sizes
        V = args.embeding_num
        D = args.embeding_dim
        self.embed = args.embed

        self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks])
        
        self.convs2 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks])

        self.dropout = nn.Dropout(args.dropout)

        self.fc1 = nn.Linear(len(Ks) * Co, len(Ks) * Co)
        
        self.fc2 = nn.Linear(len(Ks) * Co, len(Ks) * Co)
        
        self.fc_final = nn.Linear(len(Ks) * Co, C)

    def forward(self, x, y):
        # Title
        x = self.embed(x)  # (N, W, D)

        if self.args.static:
            x = Variable(x)

        x = x.unsqueeze(1)  # (N, Ci, W, D)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # [(N, Co, W), ...]*len(Ks)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(N, Co), ...]*len(Ks)
        x = torch.cat(x, 1)
        
        # Body text
        y = self.embed(y)  # (N, W, D)

        if self.args.static:
            y = Variable(y)

        y = y.unsqueeze(1)  # (N, Ci, W, D)
        y = [F.relu(conv(y)).squeeze(3) for conv in self.convs2]  # [(N, Co, W), ...]*len(Ks)
        y = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in y]  # [(N, Co), ...]*len(Ks)
        y = torch.cat(y, 1)
        
        # Calculate distance between x and y
        out = torch.mul(x, y)
        
        # Fully connected nn
        out = self.fc1(out)
        out = self.fc2(out)
        out = self.dropout(out)  # (N, len(Ks)*Co)
        out = self.fc_final(out)  # (N, C)
        
        return out
    
    
model = CNN_Text(args)
model.double()
model.to(device)

CNN_Text(
  (embed): Embedding(3000001, 50)
  (convs1): ModuleList(
    (0): Conv2d(1, 16, kernel_size=(3, 50), stride=(1, 1))
    (1): Conv2d(1, 16, kernel_size=(4, 50), stride=(1, 1))
    (2): Conv2d(1, 16, kernel_size=(5, 50), stride=(1, 1))
  )
  (convs2): ModuleList(
    (0): Conv2d(1, 16, kernel_size=(3, 50), stride=(1, 1))
    (1): Conv2d(1, 16, kernel_size=(4, 50), stride=(1, 1))
    (2): Conv2d(1, 16, kernel_size=(5, 50), stride=(1, 1))
  )
  (dropout): Dropout(p=0.5)
  (fc1): Linear(in_features=48, out_features=48, bias=True)
  (fc2): Linear(in_features=48, out_features=48, bias=True)
  (fc_final): Linear(in_features=48, out_features=4, bias=True)
)

In [29]:
args.lr = 0.001
args.epochs = 10
args.log_interval = 10
args.test_interval = 400
args.save_interval = 1000
args.save_dir = 'models'  # model save path
if not os.path.exists(args.save_dir):
    os.mkdir(args.save_dir)

In [30]:
def train(model, train_loader, test_loader, args):
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    model.train()
    
    steps = 0
    best_acc = 0
    
    for epoch in range(1, args.epochs+1):
        print('\nStart epoch', epoch, '....')
        for title, body, label in train_loader:
            title_t, body_t, label_t = build_tensor(title, body, label)
            optimizer.zero_grad()
            logit = model(title_t, body_t)
            loss = F.cross_entropy(logit, label_t)
            loss.backward()
            optimizer.step()
            steps += 1
            if steps % args.log_interval == 0:
                corrects = (torch.max(logit, 1)[1].view(label_t.size()).data == label_t.data).sum()
                accuracy = 100.0 * corrects/label_t.shape[0]
                print(
                    '\rBatch[{}] - loss: {:.6f}  acc: {:.4f}%({}/{})'.format(steps,
                                                                             loss.data[0], 
                                                                             accuracy,
                                                                             corrects,
                                                                             label_t.shape[0]))
        
            # Evaluate & save model after every interval
#             if steps % args.save_interval == 0 and steps > 1500:
#                 corrects = (torch.max(logit, 1)[1].view(label_t.size()).data == label_t.data).sum()
#                 accuracy = 100.0 * corrects/label_t.shape[0]
#                 print(
#                     '\rBatch[{}] - loss: {:.6f}  acc: {:.4f}%({}/{})'.format(steps,
#                                                                              loss.data[0], 
#                                                                              accuracy,
#                                                                              corrects,
#                                                                              label_t.shape[0]))
#                 print('Saving snapshot result ...')
#                 acc_str = '{:.0f}'.format(accuracy)
#                 save(model, args.save_dir, 'snapshot_acc_' + acc_str, steps)
                
    # save final model
    corrects = (torch.max(logit, 1)[1].view(label_t.size()).data == label_t.data).sum()
    accuracy = 100.0 * corrects/label_t.shape[0]
    acc_str = '{:.0f}'.format(accuracy)
    print('Final training accuracy: ' + acc_str)
    save(model, args.save_dir, 'final_epoch_' + str(args.epochs) + '_hl', 3)


try:
    train(model, train_loader, test_loader, args)
except KeyboardInterrupt:
    print('\n' + '-' * 89)
    print('Exiting from training early')


Start epoch 1 ....




Batch[10] - loss: 1.234055  acc: 70.0000%(90/128)
Batch[20] - loss: 1.111588  acc: 67.0000%(86/128)
Batch[30] - loss: 0.850963  acc: 74.0000%(95/128)
Batch[40] - loss: 0.727852  acc: 78.0000%(101/128)
Batch[50] - loss: 0.874618  acc: 72.0000%(93/128)
Batch[60] - loss: 0.781356  acc: 73.0000%(94/128)
Batch[70] - loss: 0.790848  acc: 73.0000%(94/128)
Batch[80] - loss: 0.756115  acc: 76.0000%(98/128)
Batch[90] - loss: 0.847405  acc: 72.0000%(93/128)
Batch[100] - loss: 0.766590  acc: 73.0000%(94/128)
Batch[110] - loss: 0.858218  acc: 68.0000%(88/128)
Batch[120] - loss: 0.792750  acc: 75.0000%(96/128)
Batch[130] - loss: 0.850287  acc: 72.0000%(93/128)
Batch[140] - loss: 0.899479  acc: 71.0000%(91/128)
Batch[150] - loss: 0.870599  acc: 67.0000%(86/128)
Batch[160] - loss: 0.893462  acc: 65.0000%(84/128)
Batch[170] - loss: 0.713164  acc: 76.0000%(98/128)
Batch[180] - loss: 0.731306  acc: 75.0000%(97/128)
Batch[190] - loss: 0.985043  acc: 67.0000%(86/128)
Batch[200] - loss: 0.736531  acc: 76.00

Batch[1580] - loss: 0.261080  acc: 89.0000%(114/128)
Batch[1590] - loss: 0.276141  acc: 89.0000%(115/128)
Batch[1600] - loss: 0.277655  acc: 91.0000%(117/128)
Batch[1610] - loss: 0.350498  acc: 86.0000%(111/128)
Batch[1620] - loss: 0.264771  acc: 91.0000%(117/128)
Batch[1630] - loss: 0.230887  acc: 91.0000%(117/128)
Batch[1640] - loss: 0.264432  acc: 91.0000%(117/128)
Batch[1650] - loss: 0.342060  acc: 86.0000%(111/128)
Batch[1660] - loss: 0.280505  acc: 90.0000%(116/128)
Batch[1670] - loss: 0.256573  acc: 92.0000%(118/128)
Batch[1680] - loss: 0.269033  acc: 89.0000%(115/128)
Batch[1690] - loss: 0.231784  acc: 91.0000%(117/128)
Batch[1700] - loss: 0.192448  acc: 95.0000%(122/128)
Batch[1710] - loss: 0.271623  acc: 91.0000%(117/128)
Batch[1720] - loss: 0.212034  acc: 91.0000%(117/128)
Batch[1730] - loss: 0.254088  acc: 90.0000%(116/128)
Batch[1740] - loss: 0.287040  acc: 89.0000%(115/128)
Batch[1750] - loss: 0.218452  acc: 92.0000%(118/128)
Batch[1760] - loss: 0.174430  acc: 93.0000%(12

Batch[3120] - loss: 0.250714  acc: 92.0000%(118/128)

Start epoch 9 ....
Batch[3130] - loss: 0.227372  acc: 89.0000%(114/128)
Batch[3140] - loss: 0.177092  acc: 91.0000%(117/128)
Batch[3150] - loss: 0.114697  acc: 95.0000%(122/128)
Batch[3160] - loss: 0.141702  acc: 95.0000%(122/128)
Batch[3170] - loss: 0.171061  acc: 92.0000%(118/128)
Batch[3180] - loss: 0.109795  acc: 97.0000%(125/128)
Batch[3190] - loss: 0.088957  acc: 99.0000%(127/128)
Batch[3200] - loss: 0.132319  acc: 93.0000%(120/128)
Batch[3210] - loss: 0.089116  acc: 98.0000%(126/128)
Batch[3220] - loss: 0.109611  acc: 96.0000%(124/128)
Batch[3230] - loss: 0.192636  acc: 92.0000%(118/128)
Batch[3240] - loss: 0.305806  acc: 88.0000%(113/128)
Batch[3250] - loss: 0.119055  acc: 96.0000%(124/128)
Batch[3260] - loss: 0.127486  acc: 95.0000%(122/128)
Batch[3270] - loss: 0.157948  acc: 93.0000%(120/128)
Batch[3280] - loss: 0.220308  acc: 91.0000%(117/128)
Batch[3290] - loss: 0.158623  acc: 93.0000%(120/128)
Batch[3300] - loss: 0.1158

AttributeError: 'args' object has no attribute 'epoch'

In [32]:
def predict_test(test_loader, model, args):
    pred = []
    gold = []
    
    # restore the best parameters
#     model_file = os.path.join(args.save_dir, 'final_acc_84_steps_40000.pt')
#     model.load_state_dict(torch.load(model_file, map_location=lambda storage, loc: storage))
    
    model.eval()
    corrects, avg_loss, total = 0, 0, 0
    for title, body, label in test_loader:
        title_t, body_t, label_t = build_tensor(title, body, label)
        logit = model(title_t, body_t)
        loss = F.cross_entropy(logit, label_t, size_average=False)
        avg_loss += loss.data[0]
        corrects += (torch.max(logit, 1)[1].view(label_t.size()).data == label_t.data).sum()
        total += len(title)
        tmp = torch.max(logit, 1)[1]
        pred = pred + tmp.cpu().numpy().tolist()
        gold = gold + label_t.cpu().numpy().tolist()

    size = len(test_loader.dataset)
    avg_loss /= size
    accuracy = 100.0 * corrects/size
    print('\nEvaluation - loss: {:.6f}  acc: {:.4f}%({}/{}) \n'.format(avg_loss, 
                                                                       accuracy, 
                                                                       corrects, 
                                                                       size))
    return pred, gold

In [33]:
pred, gold = predict_test(test_loader, model, args)

  from ipykernel import kernelapp as app



Evaluation - loss: 1.600614  acc: 64.0000%(16325/25413) 



In [34]:
import utils.score as sc

In [35]:
sc.report_score([sc.LABELS[e] for e in gold], [sc.LABELS[e] for e in pred])

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    794    |     3     |    400    |    706    |
-------------------------------------------------------------
| disagree  |    304    |     2     |    133    |    258    |
-------------------------------------------------------------
|  discuss  |   1042    |    20     |   2167    |   1235    |
-------------------------------------------------------------
| unrelated |   2411    |     5     |   2571    |   13362   |
-------------------------------------------------------------
Score: 6779.0 out of 11651.25	(58.18259843364446%)


58.18259843364446