In [145]:
from torch.utils.data import Dataset, TensorDataset
import torch.nn.utils
import torch.nn as nn
import numpy as np
import pandas as pd
import os
import sys
import time
import shutil

# let's first load the data

In [22]:
class Vocabulary:
    """
    Class for converting words to indexes and back out again
    """

    def __init__(self, series_list):
        """
        Stores list to map an index number to a word and a dictionary to map a word to an index number

        @param series_list (list(pd.Series)): list of pandas series to be used to form vocabulary
        """
        
        vocab_dict = dict()
        def add_entry_if_necessary(entry):
            words = entry.split()
            for word in words:
                if word.lower() not in vocab_dict:
                    vocab_dict[word.lower()] = True

        for series in series_list:
            series.apply(add_entry_if_necessary)

        self.index_to_word = list(vocab_dict.keys())
        self.index_to_word.insert(0, "<pad>")
        self.index_to_word.insert(1, "<unk>")
        self.index_to_word.insert(2, "<s>")
        self.index_to_word.insert(3, "</s>")

        self.word_to_index = dict()
        for idx in range(len(self.index_to_word)):
            self.word_to_index[self.index_to_word[idx]] = idx


    def __getitem__(self, item):
        """
        Allows Vocabulary object to be subscripted

        @param item (int or string): if int, gets the word found at that index
                                     if string, gets the index associated with that word

        """
        if type(item) == type("string"):
            return self.word_to_index[item]
        else:
            return self.index_to_word[item]

    def __len__(self):
        return len(self.index_to_word)

    def get_index_list_from_sentence(self, sentence):
        """
        Converts a sentence into a list of indices

        @param sentence (str): a string of words that is to be converted

        @returns idx_list (List(int)): a list of integers where each element corresponds to the index of a word in the input sentence
        """
        idx_list = list()
        idx_list.append(self["<s>"])
        for word in sentence.split():
            if word.lower() not in self.word_to_index:
                idx_list.append(self["<unk>"])
            else:
                idx_list.append(self[word.lower()])
        idx_list.append(self["</s>"])
        return idx_list

    def get_tensor_from_sentences(self, sentences, device: torch.device):
        """
        Makes a torch tensor from a batch of sentences

        @param sentences (List(List(int))): the sentences that will comprise the tensor
        @param device (torch.device): device code is being run on 

        @returns tensor (torch.tensor): padded tensor of input sentences 
        """
        return torch.t(torch.tensor(self.pad_sentences(sentences), dtype=torch.long, device=device))
    
    def pad_sentences(self, sentences):
        max_length = max(len(sentence) for sentence in sentences)
        word_idxs = np.zeros((len(sentences), max_length), dtype=np.dtype(int)) # pad id == 0
        for i, s in enumerate(sentences):
            word_idxs[i,:len(s)] = s
        return word_idxs


class WikiDataset(Dataset):
    """
    Class for storing input data from Wikipedia dataset
    """

    def __init__(self, comment_df: pd.DataFrame, annotation_df: pd.DataFrame, vocab: Vocabulary):
        """
        @param comment_df (pd.DataFrame): pandas DataFrame with "comments" section that is used as the input
        @param annotation_df (pd.DataFrame): pandas DataFrame that stores the labels
        @param vocab (Vocabulary): vocabulary to be used 
        """

        super().__init__()

        self.vocab = vocab
        
        cleaned_comment_df = comment_df.copy()
        cleaned_comment_df["comment"] = cleaned_comment_df["comment"].apply(lambda x: x.replace("NEWLINE_TOKEN", " "))
        cleaned_comment_df["comment"] = cleaned_comment_df["comment"].apply(lambda x: x.replace("TAB_TOKEN", " "))

        self.x = cleaned_comment_df["comment"].apply(self.vocab.get_index_list_from_sentence).values
        self.x = self.vocab.pad_sentences(self.x)
        self.y = (annotation_df[annotation_df["rev_id"].isin(cleaned_comment_df["rev_id"])].groupby("rev_id")["attack"].mean() > 0.5).values
        self.y = np.array([int(i) for i in self.y])
        self._num_labels = np.max(self.y) + 1
        
    def num_labels(self):
        return self._num_labels
        
    def __getitem__(self, index):
        """
        @returns (tuple(List(int), bool)): first term is a list of the indices of the words of the input sentence at the specified index
                                           second term is boolean corresponding to whether it is an attack (True) or not (False)
        """
        return (self.x[index], self.y[index])

    def __len__(self):
        return len(self.x)


class FakeNewsDataset(Dataset):

    def __init__(self, body_df: pd.DataFrame, stance_df: pd.DataFrame, vocab: Vocabulary):

        super().__init__()
        self.vocab = vocab

        stance_to_idx = {}
        stances = stance_df["Stance"].drop_duplicates().values
        for i, stance in enumerate(stances):
            stance_to_idx[stance] = i
        num_stances = len(stance_to_idx)

        body_df["sentence_as_idx"] = body_df["articleBody"].apply(self.vocab.get_index_list_from_sentence)

        x_list = []
        y_list = []
        idx_to_id = {body_id:i for (i, body_id) in enumerate(body_df['Body ID'])}

        for body_id, headline, stance in zip(stance_df["Body ID"], stance_df["Headline"], stance_df["Stance"]):
            head = vocab.get_index_list_from_sentence(headline)
            body = body_df.iloc[idx_to_id[body_id]]["sentence_as_idx"]
            x_list.append(head + body)
            y_list.append(stance_to_idx[stance])

        self.x = self.vocab.pad_sentences(x_list)
        self.y = np.array(y_list)
        self._num_labels = np.max(self.y) + 1

    def num_labels(self):
        return self._num_labels

    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def __len__(self):
        return len(self.x)

In [15]:
comment_df = pd.read_csv("../data/attack_annotated_comments.tsv", sep ='\t')
body_df = pd.read_csv("../data/fake_news_bodies.csv")
stance_df = pd.read_csv("../data/fake_news_stances.csv")
annotation_df = pd.read_csv("../data/attack_annotations.tsv",  sep='\t')

In [23]:
vocab = Vocabulary([comment_df["comment"], body_df["articleBody"], stance_df["Headline"]])

In [29]:
# takes 426.58s with equal length torch tensors and 27.5s with equal length numpy arrays!
t_start = time.time()
wiki_dataset = WikiDataset(comment_df, annotation_df, vocab)
fake_news_dataset = FakeNewsDataset(body_df, stance_df, vocab)
print("time: ", time.time() - t_start)

time:  28.7069571018219


# looking good so far! let's import some models...

In [60]:
""" models """
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from torch.nn import TransformerEncoder, TransformerEncoderLayer


""" Credit: this code was inspired by pytorch.org/tutorials/beginner/transformer_tutorial.html """

# helper module for our classifier
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        """ d_model = word embedding dimension of the transformer inputs """
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0) # add batch dimension
        self.register_buffer('pe', pe) # do not perform gradient descent on positional embeddings!!

    def forward(self, x): # BERT also adds positional encodings directly to embeddings
        x = x + self.pe[:, :x.size(1), :] # only include sentence-length many points
        return self.dropout(x)


# implement classification transformer specific for our application
class TransformerClassifier(nn.Module):

    def __init__(self, vocab_size, labels, embedding_dim, nhead, feedforward_dim, nlayers, dropout=0.5):
        """
        Args:
            vocab_size: number of words/max index of our vocabulary
            labels: number of labels in our predictions
            embedding_dim: word embedding dimension
            nhead: number of attention heads
            feedforward_dim: dimension of feedforward layers
            nlayers: number of attention layers to stack
            dropout: dropout rate
        """
        super(TransformerClassifier, self).__init__()
        self.embedding_dim = embedding_dim
        self.labels = labels
        self.embedding = nn.Embedding(vocab_size, embedding_dim) # word embedding layer
        self.pos_encoder = PositionalEncoding(embedding_dim, dropout) # positional embedding
        encoder_layers = TransformerEncoderLayer(embedding_dim, nhead, feedforward_dim, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers) # transformer
        self.linear_classifier = nn.Linear(embedding_dim, labels) # transformer output to class scores
        self.softmax = nn.Softmax(dim=1) # softmax over scores
        self.init_weights()

    def init_weights(self):
        initrange = 0.1 # small variance has been shown to lead to better embedding initialization
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.linear_classifier.bias.data.zero_()
        initrange = np.sqrt(6) / np.sqrt(self.embedding_dim + self.labels) # glorot initialization
        self.linear_classifier.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        """ src must be formatted with dims (batch_size, sentence_length, one_hot_labels) """
        word_embedding = self.embedding(src)
        word_pos_embed = self.pos_encoder(word_embedding)
        encoder_output = (self.transformer_encoder(word_pos_embed)[:,0,:]).squeeze(1) # use only the first word's embedding
        scores = self.linear_classifier(encoder_output)
        softmax_scores = self.softmax(scores) # softmax over scores
        return softmax_scores

# ... and now train them

In [71]:
import torch.nn as nn
from torch.utils.data import DataLoader

In [150]:
# create a model with provided arguments
def model_from_dataset(vocab_size, num_labels, args):
    model = TransformerClassifier(vocab_size, num_labels, args['embedding_dim'], args['nheads'], \
        args['layers'], args['feedforward_dim'])
    return model

# evaluate model accuracy and loss on given dataset
def evaluate(model, dataset, num_labels, batch_size=32):
    n = len(dataset)
    predictions = np.zeros(n) # used for confusion matrix
    truth = np.zeros(n) 
    total_loss = 0
    dataloader = DataLoader(dataset, batch_size=batch_size)
    criterion = nn.CrossEntropyLoss()
    curr = 0
    with torch.no_grad():
        for (x, y) in dataloader:
            print(curr)
            pred = model(x)
            predictions[curr:min(n,curr+batch_size)] = torch.argmax(pred, axis=1)
            truth[curr:min(n,curr+batch_size)] = y
            total_loss += criterion(pred.view(-1, num_labels), y).item()
            curr += batch_size
    mean_loss = total_loss / n
    mean_accuracy = np.mean(predictions == truth)
    return mean_loss, mean_accuracy, predictions, truth

# split dataset into train, validation, and test
def split_dataset(dataset, train_size, val_size, test_size):
    return torch.utils.data.random_split(dataset, [train_size, val_size, test_size])

# create/empty directory to save models in
def new_dir(dir_path):
    if os.path.isdir(dir_path): 
        print("deleting existing directory {}".format(dir_path))
        shutil.rmtree(dir_path)
    os.makedirs(dir_path) # check valid directory

# method for training transformer model on given dataset
def train(vocab_size, num_labels, train_dataset, val_dataset, save_dir, args):
    new_dir(save_dir)
    
    """ create model and prepare optimizer """
    model = model_from_dataset(vocab_size, num_labels, args)
    train_dataloader = DataLoader(train_dataset, batch_size=args['batch_size'], shuffle=True)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=args['lr'])
    #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.95) # note: 0.95^13 is approx 0.5
    model.train() # turn on train mode
    
    """ train model """
    print("starting training")
    for epoch in range(args['epochs']):
        total_loss = 0.
        start_time = time.time()
        for (x_batch, y_batch) in train_dataloader: # different shuffle each time
            optimizer.zero_grad()
            output = model(x_batch)
            loss = criterion(output.view(-1, num_labels), y_batch)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args['clip_grad_norm'])
            optimizer.step()
            #scheduler.step()
            total_loss += loss.item()
        elapsed = time.time() - start_time
        
        # print info from this training epoch
        print('| epoch {:3d} | {:5d} samples | '
              'lr {:02.2f} | time {:5.2f} | loss {:5.2f}'.format(epoch, len(train_dataset), 
                args['lr'], elapsed, total_loss)) # or scheduler.get_lr()[0]
        
        # save model to new directory
        if (epoch+1) % args['save_frequency'] == 0 and epoch+1 != args['epochs']:
            path = os.path.join(save_dir, "epoch-{}".format(epoch))
            new_dir(path)
            torch.save(model.state_dict(), path) # store trained model
            
    """ save final model """
    path = os.path.join(save_dir, "final")
    new_dir(path)
    torch.save(model.state_dict(), path) # store trained model
    """ 
    Note: see https://stackoverflow.com/questions/42703500/
    best-way-to-save-a-trained-model-in-pytorch — the best way to save a model
    is to save the state, then to load using
    new_model = TheModelClass(*args, **kwargs)
    new_model.load_state_dict(torch.load(path))
    """

In [107]:
args = { # very small values for testing purposes
    'epochs': 2,
    'batch_size': 16,
    'embedding_dim': 32,
    'nheads': 1,
    'layers': 1,
    'feedforward_dim': 32,
    'lr': 0.01,
    'clip_grad_norm': 0.5,
    'save_frequency': 1,
}

In [None]:
#device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
base_dir = os.getcwd()
wiki_dir = os.path.join(base_dir, "wiki") 

In [124]:
vocab_length = len(vocab)
wiki_num_labels = wiki_dataset.num_labels()

In [132]:
n = len(wiki_dataset)
#n_train, n_val, n_test = int(0.7*n), int(0.15*n), n - (int(0.7*n) + int(0.15*n))
n_train, n_val, n_test = 50, int(0.15*n), n - (50 + int(0.15*n))
wiki_train, wiki_val, wiki_test = split_dataset(wiki_dataset, n_train, n_val, n_test)

In [152]:
train(vocab_length, wiki_num_labels, wiki_train, wiki_val, wiki_dir, args) # no validation dataset

deleting existing directory /Users/johnhallman/mlcourse/senior_ml/nlp-final-project/src/wiki
starting training


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



| epoch   0 |    50 samples | lr 0.01 | time 17.31 | loss  2.52
Traceback (most recent call last):
  File "/Users/johnhallman/mlcourse/mlenv/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3291, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-152-e5c120d39415>", line 1, in <module>
    train(vocab_length, wiki_num_labels, wiki_train, wiki_val, wiki_dir, args) # no validation dataset
  File "<ipython-input-150-eab85c1350ea>", line 76, in train
    torch.save(model.state_dict(), path) # store trained model
  File "/Users/johnhallman/mlcourse/mlenv/lib/python3.6/site-packages/torch/serialization.py", line 260, in save
    return _with_file_like(f, "wb", lambda f: _save(obj, f, pickle_module, pickle_protocol))
  File "/Users/johnhallman/mlcourse/mlenv/lib/python3.6/site-packages/torch/serialization.py", line 183, in _with_file_like
    f = open(f, mode)
IsADirectoryError: [Errno 21] Is a directory: '/Users/johnhallman/mlcourse/se

IsADirectoryError: [Errno 21] Is a directory: '/Users/johnhallman/mlcourse/senior_ml/nlp-final-project/src/wiki/epoch-0'