In [1]:
stop_words = [
        "a", "about", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along",
        "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
        "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be",
        "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
        "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "co",
        "con", "could", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight",
        "either", "eleven", "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
        "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill", "find", "fire", "first", "five", "for",
        "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had",
        "has", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself",
        "him", "himself", "his", "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed", "interest",
        "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made",
        "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much",
        "must", "my", "myself", "name", "namely", "neither", "nevertheless", "next", "nine", "nobody", "now", "nowhere",
        "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours",
        "ourselves", "out", "over", "own", "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see",
        "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some",
        "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take",
        "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby",
        "therefore", "therein", "thereupon", "these", "they", "thick", "thin", "third", "this", "those", "though",
        "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve",
        "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what",
        "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon",
        "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will",
        "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves"
        ]

In [2]:
from torch.utils.data import Dataset, DataLoader
import csv
import json
import re
import os

class Mydataset(Dataset):
    def __init__(self, train=True):
        def clean_str(string):
            string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
            string = re.sub(r"\'s", " \'s", string)
            string = re.sub(r"\'ve", " \'ve", string)
            string = re.sub(r"n\'t", " n\'t", string)
            string = re.sub(r"\'re", " \'re", string)
            string = re.sub(r"\'d", " \'d", string)
            string = re.sub(r"\'ll", " \'ll", string)
            string = re.sub(r",", " , ", string)
            string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \( ", string)
            string = re.sub(r"\)", " \) ", string)
            string = re.sub(r"\?", " \? ", string)
            string = re.sub(r"\s{2,}", " ", string)
            return string.strip().lower()
        if train:
            print('loading trainig dataset')
            bf = 'train_bodies.csv'
            sf = 'train_stances.csv'
        else:
            print('loading testing dataset')
            bf = 'competition_test_bodies.csv'
            sf = 'competition_test_stances.csv'
            
        with open(os.path.join('fnc-1', sf), 'r', newline='', encoding='utf-8') as myFile:  
            rdr = csv.reader(myFile)
            next(rdr)
            temp = list(rdr)
            self.stances = [[clean_str(a[0]), a[1], a[2]] for a in temp]
            print(len(self.stances), 'stances')

        with open(os.path.join('fnc-1', bf), 'r', newline='', encoding='utf-8') as myFile:  
            rdr = csv.reader(myFile)
            next(rdr)
            temp = list(rdr)
            self.bodies = dict([[a[0], clean_str(a[1])]for a in temp])
            print(len(self.bodies), 'bodies')

        self.len = len(self.stances)
        self.labels = list(sorted(set([t[2] for t in self.stances])))
        
    def __getitem__(self, index):
        return self.stances[index][0], self.bodies[self.stances[index][1]], self.stances[index][2]
    
    def __len__(self):
        return self.len
        
    def get_labels(self):
        return self.labels
    
    def get_label(self, id):
        return self.labels[id]
    
    def get_label_id(self, label):
        return self.labels.index(label)

In [3]:
train_dataset = Mydataset()
test_dataset = Mydataset(train=False)

loading trainig dataset
49972 stances
1683 bodies
loading testing dataset
25413 stances
904 bodies


In [4]:
BATCH_SIZE = 128

train_loader = DataLoader(dataset=train_dataset,
                          batch_size=BATCH_SIZE, shuffle=True)

test_loader = DataLoader(dataset=test_dataset,
                         batch_size=BATCH_SIZE, shuffle=False)

N_LABELS = len(train_dataset.get_labels())

In [5]:
import word2vec.wordvector as w2v

class args:
    pass

args = args()

args.class_num = N_LABELS
args.kernel_num = 100
args.kernel_sizes = [3,4,5]
args.dropout = 0.5
args.static = True
args.lr = 0.001
args.epochs = 256
args.embed, args.embeding_num, args.embeding_dim = w2v.get_embedding()



Initialize word vector array...
Convert word vector to tensor...


In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable


class CNN_Text(nn.Module):
    
    def __init__(self, args):
        super(CNN_Text, self).__init__()
        self.args = args
        
        C = args.class_num
        Ci = 1
        Co = args.kernel_num
        Ks = args.kernel_sizes
        V = args.embeding_num
        D = args.embeding_dim
        self.embed = args.embed

        self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks])

        self.dropout = nn.Dropout(args.dropout)

        self.fc1 = nn.Linear(len(Ks)*Co, C)

    def forward(self, x):
        x = self.embed(x)  # (N, W, D)

        if self.args.static:
            x = Variable(x)

        x = x.unsqueeze(1)  # (N, Ci, W, D)

        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # [(N, Co, W), ...]*len(Ks)

        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(N, Co), ...]*len(Ks)

        x = torch.cat(x, 1)

        x = self.dropout(x)  # (N, len(Ks)*Co)

        logit = self.fc1(x)  # (N, C)
        return logit
model = CNN_Text(args)
model.double()

CNN_Text(
  (embed): Embedding(3000001, 50)
  (convs1): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 50), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(4, 50), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(5, 50), stride=(1, 1))
  )
  (dropout): Dropout(p=0.5)
  (fc1): Linear(in_features=300, out_features=4, bias=True)
)

In [7]:
word_dic = {}
with open(os.path.join('word2vec', 'data', 'word_to_id.txt'), 'r', encoding='utf-8') as f:
    i = 1
    for line in f:
        word = line.strip()
        word_dic[word] = i
        i += 1

def get_id(word):
    return word_dic.get(word, 0)

def build_tensor(titles, bodys, labels):
    label_t = torch.tensor([train_dataset.get_label_id(l) for l in labels], dtype=torch.long)
    
    title_t = [torch.tensor([get_id(w) for w in title.split(' ') if w not in stop_words], dtype=torch.long) for title in titles]
    title_l = [a.shape[0] for a in title_t]
    title_max = max(title_l)
    title_p = [title_max - a for a in title_l]
    title_t = [F.pad(a.view(1,1,1,-1), (0, title_p[i], 0, 0)).view(1,-1) for i, a in enumerate(title_t)]
    
    body_t = [torch.tensor([get_id(w) for w in body.split(' ') if w not in stop_words], dtype=torch.long) for body in bodys]
    body_l = [a.shape[0] for a in body_t]
    body_max = max(body_l)
    body_p = [body_max - a for a in body_l]
    body_t = [F.pad(a.view(1,1,1,-1), (0, body_p[i], 0, 0)).view(1,-1) for i, a in enumerate(body_t)]
    return torch.cat(title_t, 0), torch.cat(body_t, 0), label_t 

In [8]:
args.log_interval = 1
args.test_interval = 1

def train(model, train_loader, test_loader, args):
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    model.train()
    
    steps = 0
    
    for epoch in range(1, args.epochs+1):
        batch = 0
        for title, body, label in train_loader:
            title_t, body_t, label_t = build_tensor(title, body, label)
            optimizer.zero_grad()
            feature = torch.cat([title_t, body_t], 1)
            logit = model(feature)
            loss = F.cross_entropy(logit, label_t)
            loss.backward()
            optimizer.step()
            steps += 1
            if steps % args.log_interval == 0:
                corrects = (torch.max(logit, 1)[1].view(label_t.size()).data == label_t.data).sum()
                accuracy = 100.0 * corrects/label_t.shape[0]
                print(
                    '\rBatch[{}] - loss: {:.6f}  acc: {:.4f}%({}/{})'.format(steps,
                                                                             loss.data[0], 
                                                                             accuracy,
                                                                             corrects,
                                                                             label_t.shape[0]))
            if steps % args.test_interval == 0:
                acc = eval(test_loader, model, args)
                
def eval(test_loader, model, args):
    model.eval()
    corrects, avg_loss, total = 0, 0, 0
    for title, body, label in train_loader:
        title_t, body_t, label_t = build_tensor(title, body, label)
        logit = model(torch.cat([title_t, body_t], 1))
        loss = F.cross_entropy(logit, label_t, size_average=False)
        avg_loss += loss.data[0]
        corrects += (torch.max(logit, 1)[1].view(label_t.size()).data == label_t.data).sum()
        total += len(title)
    size = len(test_loader.dataset)
    avg_loss /= size
    accuracy = 100.0 * corrects/size
    print(size, total, corrects)
#     print('\nEvaluation - loss: {:.6f}  acc: {:.4f}%({}/{}) \n'.format(avg_loss, 
#                                                                        accuracy, 
#                                                                        corrects, 
#                                                                        size))
    return accuracy


try:
    train(model, train_loader, test_loader, args)
except KeyboardInterrupt:
    print('\n' + '-' * 89)
    print('Exiting from training early')



Batch[1] - loss: 1.393164  acc: 21.0000%(27/128)




RuntimeError: $ Torch: not enough memory: you tried to allocate 0GB. Buy new RAM! at ..\src\TH\THGeneral.c:218