In [56]:
# import libraries
%matplotlib inline 
import os
import random
import numpy as np
import pandas as pd
import spacy
from bs4 import BeautifulSoup
import re
import string
import unicodedata
import contractions
import inflect
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
punctuations = string.punctuation
from nltk.util import ngrams
nlp = spacy.load('en_core_web_sm')
import torch
import torch.nn as nn
from torch.autograd import Variable

import matplotlib.pyplot as plt
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [26]:
hp = pd.read_csv('training_set_20k.csv', sep = '|')
hp_test = pd.read_csv('test_set_2k.csv', sep = '|')

In [17]:
def article_preprocessing(st):
    
    st = re.sub(r"&quot;", r" ", st)
    st = re.sub(r"&apos;", r" ", st)
    st = re.sub(r"([.!?])", r" ", st)
    st = re.sub(r"[^a-zA-Z0-9.!?]+", r" ", st)
    st = ' '.join(word for word in st.split(' ') if not word.startswith('httpwww'))
    st = ' '.join(word for word in st.split(' ') if not word.startswith('OrderedD'))
    st = st.replace('type external text','').replace('  ','').replace('In', ' In').replace('externaltype','')
    st = st.replace('externalSource','').replace('type internal text','').replace('external Source','').replace('external external text Source','').replace('internal text','').replace('type internal','')
    st = st.lower()
    
    return st    

In [28]:
# apply the preprocessing on the 'Article' column
hp['Article'] = hp['Article'].apply(article_preprocessing)
hp_test['Article'] = hp_test['Article'].apply(article_preprocessing)

In [29]:
# process the labels
hp['Hyperpartisan'] = [0 if hp['Hyperpartisan'][i] == False else 1 for i in range(len(hp['Hyperpartisan']))]
hp_test['Hyperpartisan'] = [0 if hp_test['Hyperpartisan'][i] == False else 1 for i in range(len(hp_test['Hyperpartisan']))]

In [30]:
# Split train data into actual train and validation sets 0.7/0.3

from sklearn.model_selection import train_test_split

train_data, val_data = train_test_split(hp, test_size=0.3, random_state=42)

train_index = train_data['ID']
train_article = train_data['Article']
train_label = train_data['Hyperpartisan']

val_index = val_data['ID']
val_article = val_data['Article']
val_label = val_data['Hyperpartisan']

test_index = hp_test['ID']
test_article = hp_test['Article']
test_label = hp_test['Hyperpartisan']

print ("Train dataset size is {}".format(len(train_index)))
print ("Val dataset size is {}".format(len(val_index)))
print ("Test dataset size is {}".format(len(test_index)))

Train dataset size is 14000
Val dataset size is 6000
Test dataset size is 2000


In [5]:
# calcualte mean article length

articles = train_article.values
length_of_article = [len(article) for article in articles]
MAX_LEN = np.mean(length_of_article)

In [3]:
# detailed preprocesing, lemmatization, removing stopwords, punctuations and tokenization
def preproc(train_article, n):
    
    token_dataset = []
    all_tokens = []
    
    for article in train_article.values:
        article = nlp(article, disable=['tagger', 'parser', 'ner'])
        tokenso = [tok.lemma_.strip() for tok in article if tok.lemma_ != '-PRON-']
        
        tokenso = [tok for tok in tokenso if tok not in stopwords and tok not in punctuations]
     
        # and get a list of all the n-grams,list of n-word tuples 
        grams = list(ngrams(tokenso, n))
        
        # token_dataset is a list of list of tuples
        token_dataset.append(grams)
        
        all_tokens += grams

    return token_dataset, all_tokens

    

In [59]:
# tokenize the training set
train_article_token_dataset, train_all_tokens = preproc(train_article,1)

In [4]:
val_article_token_dataset, val_all_tokens = preproc(val_article,1)
test_article_token_dataset, test_all_tokens = preproc(test_article,1)

In [25]:
train_all_tokens[0]

('Maison',)

In [39]:
import collections

max_vocab_size = 20000
# save index 0 for unk and 1 for pad
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens):
    # Returns:
    # id2token: list of tokens, where id2token[i] returns token that corresponds to token i
    # token2id: dictionary where keys represent tokens and corresponding values represent indices
    token_counter = collections.Counter(all_tokens)
    vocab, count = zip(*token_counter.most_common(max_vocab_size))
    id2token = list(vocab)
    token2id = dict(zip(vocab, range(2,2+len(vocab)))) 
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

token2id, id2token = build_vocab(train_all_tokens)

In [40]:
print(len(token2id))
print(len(id2token))

20002
20002


In [154]:
# Lets check the dictionary by loading random token from it

random_token_id = random.randint(0, len(id2token)-1)
random_token = id2token[random_token_id]

print ("Token id {} ; token {}".format(random_token_id, id2token[random_token_id]))
print ("Token {}; token id {}".format(random_token, token2id[random_token]))

In [49]:
# convert token to id in the dataset
def token2index_dataset(tokens_data):
    indices_data = []
    for tokens in tokens_data:
        index_list = [token2id[token] if token in token2id else UNK_IDX for token in tokens]
        indices_data.append(index_list)
    return indices_data

# get list of list of idxes
train_data_indices = token2index_dataset(train_article_token_dataset)
val_data_indices = token2index_dataset(val_article_token_dataset)
test_data_indices = token2index_dataset(test_article_token_dataset)

# double checking
print ("Train dataset size is {}".format(len(train_data_indices)))
print ("Val dataset size is {}".format(len(val_data_indices)))
print ("Test dataset size is {}".format(len(val_data_indices)))

Train dataset size is 3281
Val dataset size is 547
Test dataset size is 547


In [20]:
MAX_ARTICLE_LENGTH = MAX_LEN

import numpy as np
import torch
from torch.utils.data import Dataset

class GroupDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, article_list, label_list):
        """
        @param data_list: list of review tokens indexes
        @param target_list: list of review targets 

        """
        self.article_list = article_list
        self.label_list = label_list
        assert (len(self.article_list) == len(self.label_list))

    def __len__(self):
        return len(self.article_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        
        token_idx = self.article_list[key][:MAX_ARTICLE_LENGTH]
        label = self.label_list[key]
        return [token_idx, len(token_idx), label]

def group_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    article_list = []
    label_list = []
    length_list = []
    

    
    for datum in batch:
        label_list.append(datum[2])
        length_list.append(datum[1])
    # padding
    for datum in batch:
        padded_vec = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[1])), 
                                mode="constant", constant_values=0)
        article_list.append(padded_vec)
    return [torch.from_numpy(np.array(article_list)), torch.LongTensor(length_list), torch.LongTensor(label_list)]


# batchsize better be power of 2
BATCH_SIZE = 128

train_dataset = GroupDataset(train_data_indices, train_label.values.tolist())
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=group_collate_func,
                                           shuffle=True)

val_dataset = GroupDataset(val_data_indices, val_label.values.tolist())
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=group_collate_func,
                                           shuffle=True)



In [None]:
# First import torch related libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import pandas as pd
import pickle
import pdb
import numpy as np 
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt


MAX_SENTENCE_LENGTH = MAX_LEN

class BagOfWords(nn.Module):
    """
    BagOfWords classification model
    """
    def __init__(self, vocab_size, emb_dim):
        """
        @param vocab_size: size of the vocabulary. 
        @param emb_dim: size of the word embedding
        # Binary calssification
        """
        super(BagOfWords, self).__init__()
        # pay attention to padding_idx 
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim,2)
    
    def forward(self, data, length):
        """
        
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a 
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        """
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        # view basically reshapes it, so this averages it out. 
        out /= length.view(length.size()[0],1).expand_as(out).float()
     
        # return logits
        out = self.linear(out.float())
        return out



# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

def test_model_routine(train_loader, val_loader, model, criterion, optimizer, num_epochs, learning_rate, scheduler=None):
    acc_per_step_val = []
    acc_per_step_train = []
    for epoch in range(num_epochs):
        acc_per_epoch = []
        acc_per_epoch_val = []
        acc = []
        for i, (data, lengths, labels) in enumerate(train_loader):
            model.train()
            data_batch, length_batch, label_batch = data, lengths, labels
            optimizer.zero_grad()
            outputs = model(data_batch, length_batch)
            loss = criterion(outputs, label_batch)
            loss.backward(retain_graph=True)
            optimizer.step()
            # validate every 100 iterations
            if i > 0 and i % 10 == 0:
                # validate
                val_acc = test_model(val_loader, model) 
                train_acc = test_model(train_loader, model)
                acc.append(val_acc)
                acc_per_epoch_val.append(val_acc)
                acc_per_epoch.append(train_acc)
                print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}, Train Acc: {}'.format( 
                           epoch+1, num_epochs, i+1, len(train_loader), val_acc, train_acc))
        #scheduler.step(loss)
        print("Average accuracy is"+ str(np.mean(acc)))
        acc_per_step_val.append(acc_per_epoch_val)
        acc_per_step_train.append(acc_per_epoch)
    print("total average accuarcies validation")
    print(acc_per_step_val)
    print("total accuracies train")
    print(acc_per_step_train)
    save_model(model, acc_per_step_val, acc_per_step_train, "Bag-of-words Deep Learning Model Performance on HyperPartisan Task")
    return acc_per_step_val, acc_per_step_train, model

def save_model(model, val_accs, train_accs, title):
    pdb.set_trace()
    val_accs = np.array(val_accs)
    max_val = val_accs.max()
    train_accs = np.array(train_accs)
    link =  ""
    torch.save(model.state_dict(), link + "model_states")
    pickle.dump(val_accs, open(link + "val_accuracies", "wb"))
    pickle.dump(train_accs, open(link + "train_accuracies", "wb"))
    pickle.dump(max_val, open(link + "maxvalaccis"+str(max_val), "wb"))
    # this is when you want to overlay
    num_in_epoch = np.shape(train_accs)[1]
    num_epochs = np.shape(train_accs)[0]
    x_vals = np.arange(0, num_epochs, 1.0/float(num_in_epoch))
    fig = plt.figure()
    plt.title(title)
    plt.plot(x_vals, train_accs.flatten(), label="Training Accuracy")
    plt.plot(x_vals, val_accs.flatten(), label="Validation Accuracy")
    plt.legend(loc="lower right")
    plt.ylabel("Accuracy of Model With Given Parameter")
    plt.xlabel("Epochs (Batch Size 32)")
    plt.ylim(0,100)
    plt.xlim(0, num_epochs)
    plt.yticks([0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
    plt.xticks(np.arange(num_epochs + 1))
    fig.savefig(link+"graph.png")

learning_rate = 0.001
num_epochs = 10 # number epoch to train
BATCH_SIZE = 32
max_vocab_size = 20002
# Criterion and Optimizer
model = BagOfWords(max_vocab_size, 100)
# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

train_data_indices = pickle.load(open('data/train_data_indexed', "rb"))
val_data_indices = pickle.load(open('data/val_data_indexed', "rb"))

train_labels = pd.read_pickle('data/train_labels').tolist()
val_labels = pd.read_pickle('data/val_labels').tolist()


convert_to_binary = {True: 1, False: 0}
train_labeldf = [convert_to_binary[x] for x in train_labels]
val_labeldf = [convert_to_binary[x] for x in val_labels]

train_dataset = HyperPartGroupDataset(train_data_indices, train_labeldf)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=hype_collate_func,
                                           shuffle=True)

val_dataset = HyperPartGroupDataset(val_data_indices, val_labeldf)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=hype_collate_func,
                                           shuffle=True)


test_model_routine(train_loader, val_loader, model, criterion, optimizer, num_epochs, learning_rate)       



In [None]:
class RNN_encoder(nn.Module):
    def __init__(self, weight, hidden_size, num_layers, fc_hidden_size, num_classes):
        # RNN Accepts the following hyperparams:
        # weight: pretrained weight matrix
        # hidden_size: Hidden Size of layer in the RNN encoder
        # num_layers: number of layers in RNN
        # fc_hidden_size: hidden size of fully connected layer
        # num_classes: number of output classes
        
        super(RNN_encoder, self).__init__()
        
        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.rnn = nn.GRU(emb_dim, hidden_size, num_layers, batch_first=True, bidirectional = True) # the batch is actaully in a descending order
        self.fc_model_concat = nn.Sequential(
            nn.Linear(hidden_size, fc_hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(fc_hidden_size, num_classes)
                )
    
        
        
        
    def init_hidden(self, batch_size):
        # Function initializes the activation of recurrent neural net at timestep 0, it is actually h0
        # Needs to be in format (num_layers, batch_size, hidden_size)
        # this step is used to initialize the initial hidden state, for each sentence
        # because this evolves differently with each sentence, it can be initilized for each sentence
        k = self.num_layers*2
        hidden = torch.randn(k, batch_size, self.hidden_size)

        return Variable(hidden)

    
    
    def forward(self, data, length):
        
        # reset hidden state
        batch_size, seq_len = data.size()
        hidden = self.init_hidden(batch_size).to(device)
        sorted_lengths, idx_sort = torch.sort(length, dim=0, descending=True)
        _, idx_unsort = torch.sort(idx_sort, dim=0)
        idx_sort = Variable(idx_sort)
        idx_unsort = Variable(idx_unsort).to(device)
        data = data.index_select(0, idx_sort1).to(device)
              
        # get embedding of characters
        embed = self.embedding(data)
        # pack padded sequence
        # input: batch*length*dim
        embed = torch.nn.utils.rnn.pack_padded_sequence(embed, sorted_lengths.numpy(), batch_first=True)
        # fprop though RNN
        # input: batch seq feature
        _, h_n = self.rnn(embed,hidden)
        h_n.to(device)
        h_n = h_n.index_select(1, idx_unsort)

        encoded_out = torch.sum(h_n, dim=0)
        
        
        out = self.fc_model_concat(encoded_out.float())
        

        return out





In [None]:
def test_model_rnn(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    val_loss = 0
    model.eval()   ### this is the testing part
    with torch.no_grad():

        for data, lengths, labels in loader: 
            data_batch, length_batch, label_batch = data, lengths, labels
            outputs = F.softmax(model(data_batch,length_batch ), dim=1) ### after softmax, it becomes probability
            predicted = outputs.max(1, keepdim=True)[1].to(device)

            total += label_batch.size(0)
            correct += predicted.eq(label_batch.view_as(predicted)).sum().item()
            val_acc = 100 * correct / total
            
            val_loss_s = rnn_criterion(outputs, label_batch)
            val_loss_s = val_loss_s.item() * label_batch.size(0) 
            
            val_loss += val_loss_s / len(loader.dataset)
            
    rnn_optimizer.zero_grad() 
    
     
    return val_acc, val_loss


model_rnn = RNN_encoder(weight, 450, 1, 300,3)

# print the number of trained parameters in the model

model_parameters = filter(lambda p: p.requires_grad, model_rnn.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
print(params)
      
rnn_learning_rate = 0.001
rnn_num_epochs = 10 # number epoch to train

# Criterion and Optimizer
rnn_criterion = torch.nn.CrossEntropyLoss()
rnn_optimizer = torch.optim.Adam(model_rnn.parameters(), lr=rnn_learning_rate)

# Train the model
rnn_total_step = len(snli_train_loader)

rnn_training_loss = []
rnn_training_acc = []
rnn_validation_loss = []
rnn_validation_acc = []
            
for epoch in range(rnn_num_epochs):
    for i, (data, lengths, labels) in enumerate(train_loader):
        
        rnn_train_total = 0
        rnn_train_correct = 0
        model_rnn.to(device)
        model_rnn.train()
        rnn_optimizer.zero_grad()
        
        # Forward pass
        data_batch, length_batch, label_batch = data, lengths, labels
        rnn_outputs = model_rnn(data_batch, length_batch).to(device)
       
      
        # calculate the training loss and append it
        rnn_training_loss_s = rnn_criterion(rnn_outputs, label_batch).to(device)
        
        
        # Backward and optimize
        rnn_training_loss_s.backward()
        rnn_optimizer.step()
        
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            # validate
            val_acc, val_loss = test_model_rnn(val_loader, model_rnn)
            
            rnn_validation_loss.append(val_loss)
            rnn_validation_acc.append(val_acc)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {},Validation loss: {}'.format(
                       epoch+1, rnn_num_epochs, i+1, len(_train_loader), val_acc, val_loss))

            
    tra_acc, tra_loss = test_model_rnn(train_loader, model_rnn)    
    rnn_training_loss.append(tra_loss)
    rnn_training_acc.append(tra_acc)
    print('Epoch:{},Training Acc: {}, Training loss: {}'.format(epoch, tra_acc, tra_loss))