In [None]:
'''
DS-GA 1022 Homework 2
Code throughout adapted from DS-GA 1011 2018 Lab Sessions and previously submitted HW1
'''

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
import time
import os
import csv

'''
First we'll load our pre-trained embedding.
We'll prefix our embeddings with a PAD and OOV
'''

# Pick the embedding directory: use the Linux laptop path when it exists,
# otherwise fall back to the Windows desktop path.
embedding_home = '/home/gandalf/NYUCDS/DS-GA1011/lab5/'
if not os.path.isdir(embedding_home):
    embedding_home = 'C:\\development\\NYUCDS\\DSGA1011\\'

#pre-trained embeddings from fasttext
embedding_source = 'wiki-news-300d-1M.vec' 
words_to_load = 50000 #limit our vocab to something reasonable
embedding_dim = 300
num_embeddings = words_to_load +2 #leave room for our pad and OOV

words = {}          # token -> row index in loaded_embeddings
idx2words = {}      # row index -> token (inverse of `words`)
ordered_words = []  # tokens in the order they were read from the file (no PAD/OOV)
# Row 0 (PAD) stays all zeros; row 1 (OOV) is filled with random values below.
loaded_embeddings = np.zeros((words_to_load+2, embedding_dim))

#prefix with PAD and OOV
PAD_IDX = 0
OOV_IDX = 1
words['<PAD>'] = PAD_IDX
idx2words[PAD_IDX] = '<PAD>'
words['<OOV>'] = OOV_IDX
idx2words[OOV_IDX] = '<OOV>'
# Unseeded draw: the OOV vector differs on every run of this cell.
loaded_embeddings[OOV_IDX,:] = np.random.normal(size=embedding_dim)

# Read the first `words_to_load` lines of the file; line i is stored at row i+2
# so that the PAD and OOV rows come first.
# NOTE(review): fastText .vec files normally start with a "<count> <dim>" header
# line. If this file has one, it is read here as a token whose vector is the
# single value broadcast across the row - confirm against the file. Skipping it
# would shift every index, so any saved model that stores this embedding matrix
# would have to be regenerated.
with open(embedding_home + embedding_source, "r", encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= words_to_load: 
            break
        s = line.split()
        # s[0] is the token, s[1:] its vector components (strings cast to float on assignment)
        loaded_embeddings[i+2, :] = np.asarray(s[1:])
        words[s[0]] = i+2
        idx2words[i+2] = s[0]
        ordered_words.append(s[0])

# np.zeros defaults to float64, so this is a float64 tensor.
loaded_embeddings = torch.tensor(loaded_embeddings)


In [None]:
'''
Let's load our data now
'''

# Pick the data directory: use the Linux laptop path when it exists,
# otherwise fall back to the Windows desktop path.
data_src = '/home/gandalf/Dropbox/NYU CDS/DS-GA 1011 NLP/HW2/hw2_data/'
if not os.path.isdir(data_src):
    data_src = 'E:\\cloudstation\\Dropbox\\NYU CDS\\DS-GA 1011 NLP\\HW2\\hw2_data\\'

# File names (relative to data_src) of our pre-processed SNLI splits
train_snli_name = 'snli_train.tsv' 
val_snli_name = 'snli_val.tsv' 

def index_it(sentence):
    """
    Convert a whitespace-separated sentence into a list of vocabulary indices.

    @param sentence: string of tokens separated by whitespace
    @return: list with one index per token, taken from the module-level
             `words` mapping; tokens missing from it map to `OOV_IDX`
    """
    return [words[token] if token in words else OOV_IDX
            for token in sentence.split()]


def loadsnli(filename):
    """
    Load a pre-processed SNLI file into an (N, 3) object array.

    The file is tab delimited with one header row; each following row holds
    premise, hypothesis and label. Sentences are already pre-processed, so they
    are only split on whitespace and indexed with index_it().

    @param filename: path of the TSV file to read
    @return: numpy array of dtype object and shape (N, 3). Column 0 is the
             premise as a list of indices, column 1 the hypothesis as a list of
             indices, column 2 the target as an int (0 neutral, 1 entailment,
             2 contradiction, 3 for any other label - shouldn't ever happen).
    """
    label_ids = {'neutral': 0, 'entailment': 1, 'contradiction': 2}

    rows = []
    # Explicit utf-8 (instead of the platform default) matches how the rest of
    # this notebook reads the same files; newline='' is what the csv module asks for.
    with open(filename, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader, None)  # skip the header (tolerates a completely empty file)
        for line in reader:
            rows.append((index_it(line[0]), index_it(line[1]), label_ids.get(line[2], 3)))

    # Fill an object array cell by cell. np.array(rows) would have to guess the
    # shape from ragged lists, which recent NumPy versions reject with ValueError.
    data = np.empty((len(rows), 3), dtype=object)
    for i, (prem, hypo, target) in enumerate(rows):
        data[i, 0] = prem
        data[i, 1] = hypo
        data[i, 2] = target
    return data
            
# Index both SNLI splits up front; each call returns one
# (premise, hypothesis, target) row per example.
train_data = loadsnli(data_src+train_snli_name)
val_data = loadsnli(data_src+val_snli_name)
            
        

In [None]:
#let's inspect some things about our data before continuing

from scipy import stats

# Class balance, sentence-length statistics, and how many premises survive the
# proposed truncation length untouched.
proposed_max_cutoff = 35

cls = {}      # target id -> number of training examples
p_lens = []   # premise lengths in tokens
h_lens = []   # hypothesis lengths in tokens
cl = 0        # premises that fit within proposed_max_cutoff
for val in train_data:
    v = val[2]
    cls[v] = cls.get(v, 0) + 1
    p_lens.append(len(val[0]))
    h_lens.append(len(val[1]))
    # <= rather than <: the dataset keeps the first `cutoff` tokens, so a
    # sentence of exactly that length is not truncated.
    if len(val[0]) <= proposed_max_cutoff:
        cl = cl + 1

p_lens = np.array(p_lens)
h_lens = np.array(h_lens)

print ("our target class:",cls)
print(stats.describe(p_lens))
print(stats.describe(h_lens))
# Derive the percentage from the actual number of examples; the previous
# hard-coded "/1000" was only a percentage for exactly 100,000 rows.
print("% fit in cutoff:", 100 * cl / max(len(train_data), 1))


In [None]:
#define our dataset for our investigations

class SNLIDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset

    Each item is a premise/hypothesis pair, truncated and zero-padded to a
    fixed length, together with the original (post-truncation) lengths and the label.
    """
    
    def __init__(self, prem_list, hypo_list, target_list, max_sentenance_length):
        """
        @param prem_list: sequence of premises, each a list of token indices
        @param hypo_list: sequence of hypotheses, each a list of token indices
        @param target_list: sequence of targets, one per premise/hypothesis pair
        @param max_sentenance_length: every sentence is cut and padded to this many tokens

        """
        self.prem_list = prem_list
        self.hypo_list = hypo_list
        self.target_list = target_list
        self.max_sentenance_length = max_sentenance_length
        assert (len(self.prem_list) == len(self.target_list))
        assert (len(self.hypo_list) == len(self.target_list))


    def __len__(self):
        return len(self.prem_list)#they should all be the same

    def _truncate_and_pad(self, token_idx):
        """
        Cut token_idx to max_sentenance_length and right-pad it with zeros.

        @return: (int64 array of length max_sentenance_length, number of real tokens kept)
        """
        tokens = list(token_idx[:self.max_sentenance_length])
        token_len = len(tokens)
        # Allocate as int64 explicitly: np.array([]) would be float64, so an empty
        # sentence used to yield a float item that clashes with the integer items
        # when a batch is stacked. The pad value 0 is the same as before.
        padded = np.zeros(self.max_sentenance_length, dtype=np.int64)
        if token_len:
            padded[:token_len] = tokens
        return padded, token_len
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]

        @return: [premise indices, premise length, hypothesis indices, hypothesis length, label]
        """
        prem_token_idx, prem_token_len = self._truncate_and_pad(self.prem_list[key])
        hypo_token_idx, hypo_token_len = self._truncate_and_pad(self.hypo_list[key])
        label = self.target_list[key]
        return [prem_token_idx, prem_token_len, hypo_token_idx, hypo_token_len,label]

        

BATCH_SIZE = 32 #chosen based on previous experience
MAX_SENTENANCE_LENGTH = 35 #chosen from the length inspection above (same value as proposed_max_cutoff)

# Columns of the loaded arrays are premise, hypothesis, target.
train_dataset = SNLIDataset(train_data[:,0], train_data[:,1], train_data[:,2], MAX_SENTENANCE_LENGTH)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# NOTE(review): the validation loader is shuffled too; accuracy over the full
# set does not depend on order, so shuffle=True looks unnecessary here.
val_dataset = SNLIDataset(val_data[:,0], val_data[:,1], val_data[:,2], MAX_SENTENANCE_LENGTH)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=True)



In [None]:
class CNN(nn.Module):
    '''
    Our CNN model, with 2 1d convolutions, and a non-linear relu after each.
    Maxpooling is done across the seq len
    Both sentences are run through the same convolution model, and concatenated 
    before going through another linear and non-linear transformation

    The lengths passed to forward() are not used: max-pooling runs over every
    position, padded ones included.
    '''
    def __init__(self, emb, hidden_size, num_classes):
        """
        @param emb: 2-D tensor of pre-trained embeddings, one row per vocabulary entry
        @param hidden_size: number of channels of both convolutions and width of the hidden linear layer
        @param num_classes: number of output logits
        """

        super(CNN, self).__init__()

        self.hidden_size =  hidden_size
        
        #use our previous loaded embedding matrix; sizes are taken from the
        #matrix itself so they can never disagree with it
        vocab_size, emb_dim = emb.shape
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=PAD_IDX)
        self.embedding.load_state_dict({'weight': emb})
        self.embedding.weight.requires_grad = False #keep the pre-trained vectors frozen
        
        self.conv1 = nn.Conv1d(emb_dim, hidden_size, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1)

        #input is the two sentence encodings concatenated
        self.linear1 = nn.Linear(hidden_size*2, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_classes)

    def _encode(self, x):
        """Encode a (batch, seq_len) batch of token indices into (batch, hidden_size)."""
        embed = self.embedding(x.long())                    # (batch, seq_len, emb_dim)
        #Conv1d wants channels before the sequence axis
        hidden = F.relu(self.conv1(embed.transpose(1,2)))   # (batch, hidden_size, seq_len)
        hidden = F.relu(self.conv2(hidden))
        #max-pool over the whole sequence axis, then drop that axis
        return F.max_pool1d(hidden, kernel_size=hidden.size(2)).squeeze(2)

    def forward(self, x1, lengths1, x2, lengths2 ): 
        """
        @param x1: (batch, seq_len) token indices of the premises
        @param lengths1: unused, kept so both models share one call signature
        @param x2: (batch, seq_len) token indices of the hypotheses
        @param lengths2: unused, see lengths1
        @return: (batch, num_classes) logits
        """
        #other ways of combining (e.g. torch.mul) were examined, but produced
        #consistently worse results.
        full = torch.cat((self._encode(x1), self._encode(x2)), dim=1)
        
        full = self.linear1(full)
        full = F.relu(full)
        
        logits = self.linear2(full)
        
        return logits
    
    
    

In [None]:

#For the RNN, a single-layer, bi-directional GRU will suffice.
#We can take the last hidden state as the encoder output. 

class RNN(nn.Module):
    """
    Sentence-pair classifier built on one shared single-layer bi-directional GRU.

    Each sentence is encoded as the sum of the final hidden states of the two
    directions; the two encodings are concatenated and passed through a hidden
    linear layer with relu and an output linear layer.

    The lengths passed to forward() are not used: sequences are not packed, so
    the GRU also reads the padding positions.
    """
    def __init__(self, emb, hidden_size, num_classes):
        """
        @param emb: 2-D tensor of pre-trained embeddings, one row per vocabulary entry
        @param hidden_size: GRU hidden size and width of the hidden linear layer
        @param num_classes: number of output logits
        """

        super(RNN, self).__init__()

        self.hidden_size =  hidden_size
        self.num_layers = 1
        
        #use our previous loaded embedding matrix; sizes are taken from the
        #matrix itself so they can never disagree with it
        vocab_size, emb_dim = emb.shape
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=PAD_IDX)
        self.embedding.load_state_dict({'weight': emb})
        self.embedding.weight.requires_grad = False #keep the pre-trained vectors frozen
          
        self.gru = nn.GRU(emb_dim, hidden_size, self.num_layers, batch_first=True, bidirectional=True)
        self.linear1 = nn.Linear(hidden_size*2, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_classes)

    def _initial_hidden(self, batch_size, like):
        """
        Build the initial GRU state on the same device/dtype as `like`.

        Training keeps the random initial state this model has always used.
        Evaluation uses zeros so the same input always gives the same prediction.
        """
        shape = (self.num_layers*2, batch_size, self.hidden_size)
        if self.training:
            return torch.randn(*shape, dtype=like.dtype, device=like.device)
        return torch.zeros(*shape, dtype=like.dtype, device=like.device)

    def _encode(self, x):
        """Encode a (batch, seq_len) batch of token indices into (batch, hidden_size)."""
        embed = self.embedding(x.long())
        
        #before working to sort, pack, unsort, we attempted to train un-altered,
        #which returned satisfactory results in a reasonable time.
        
        #run the RNN; only the final hidden state is used
        _, hidden = self.gru(embed, self._initial_hidden(embed.size(0), embed))
        
        #hidden is (num_layers*2, batch, hidden_size): sum across the two directions
        return torch.sum(hidden, dim=0)

    def forward(self, x1, lengths1, x2, lengths2 ): 
        """
        @param x1: (batch, seq_len) token indices of the premises
        @param lengths1: unused, kept so both models share one call signature
        @param x2: (batch, seq_len) token indices of the hypotheses
        @param lengths2: unused, see lengths1
        @return: (batch, num_classes) logits
        """
        full = torch.cat((self._encode(x1), self._encode(x2)), dim=1)
        
        full = self.linear1(full)
        full = F.relu(full)
        logits = self.linear2(full)
        
        return logits
    
    
    

In [None]:
import time

def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data1, lengths1, data2, lengths2, labels in loader:
        data_batch1, lengths_batch1, data_batch2, lengths_batch2, label_batch = data1, lengths1, data2, lengths2, labels
        outputs = F.softmax(model(data_batch1, lengths_batch1, data_batch2, lengths_batch2), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]

        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

#The code below was used to experiment and to confirm that everything runs correctly.
#The actual search of hyperparameters was done using SNLIModel.py
#(an adapted script version of this notebook) on HPC/prince


model = CNN(emb=loaded_embeddings, hidden_size=200, num_classes=3)

learning_rate = 1e-3
num_epochs = 4 # number epoch to train

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
total_step = len(train_loader)

print("training.....")

# When True, also validate (and possibly cut the epoch short) every 1000 batches.
report_progress = False

time_start = time.time()
max_val = 0  # best validation accuracy seen so far
for epoch in range(num_epochs):
    for i, (data1, lengths1, data2, lengths2, labels) in enumerate(train_loader):
        # test_model() leaves the model in eval mode, so re-enter train mode every step
        model.train()
        optimizer.zero_grad()
        # Forward pass
        outputs = model(data1, lengths1, data2, lengths2)
        loss = criterion(outputs, labels)

        # Backward and optimize
        loss.backward()
        optimizer.step()
        # validate every 1000 iterations
        if i > 0 and i % 1000 == 0 and report_progress:
            # validate
            val_acc = test_model(val_loader, model)
            if (val_acc > max_val):
                max_val = val_acc
            
            # stop this epoch once accuracy falls more than 10% (relative) below the best
            if (val_acc + (0.1 * max_val) < max_val ):
                break
                
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                       epoch+1, num_epochs, i+1, len(train_loader), val_acc))
            
    train_acc = test_model(train_loader, model)
    val_acc = test_model(val_loader, model)
    # Track the best accuracy here as well. Before, max_val only changed inside
    # the report_progress branch, so with report_progress = False it stayed 0
    # and the early stop below could never fire.
    if (val_acc > max_val):
        max_val = val_acc
    print('Epoch: [{}/{}], Train Acc, {} Validation Acc: {}'.format(
                   epoch+1, num_epochs, train_acc, val_acc))

    if (val_acc + (0.1 * max_val) < max_val ):
        break
            
time_end = time.time()
print("Done in ", time_end- time_start)
print("Best Val ACC seen", max_val)
print("Final Val ACC:", test_model(val_loader, model))

##The actual search of hyperparameters was done using SNLIModel.py (an adapted script version of this notebook) on HPC/prince



In [None]:
#load our two best models, saved from our exploration on HPC/prince

# pickle is only imported further down in this notebook; import it here so this
# cell also works on a fresh top-to-bottom run.
import pickle

#switch between linux laptop and windows desktop (same convention as the cells above)
model_home = '/home/gandalf/NYUCDS/DS-GA1011/hw2/'
if not os.path.isdir(model_home):
    model_home = 'c:\\development\\NYUCDS\\DSGA1011\\hw2\\'

# NOTE(review): pickle.load can execute arbitrary code; only open files we wrote ourselves.
with open(model_home + 'CNN2_models.pkl', 'rb') as f:
    res_run = pickle.load(f)

print(res_run.keys())
    
best_cnn_model = CNN(emb=loaded_embeddings, hidden_size=200, num_classes=3)
# Actually copy the saved weights into the model. Assigning to `.state_dict`
# only replaced the method with a dict and left the model randomly initialised.
# Assumes res_run['CNN-1'] is a state dict for this architecture - TODO confirm.
best_cnn_model.load_state_dict(res_run['CNN-1'])

with open(model_home + 'RNN1_models.pkl', 'rb') as f:
    res_run = pickle.load(f)

print(res_run.keys())
    
best_rnn_model = RNN(emb=loaded_embeddings, hidden_size=200, num_classes=3)
# Same as above: assumes res_run['RNN-200'] is a matching state dict - TODO confirm.
best_rnn_model.load_state_dict(res_run['RNN-200'])

In [None]:
#Print out the first 20 with predictions to examine some correct/incorrect cases

def lookup_label(val):
    """
    Translate a numeric class id into its label name.

    @param val: class id; compared with ==, so a single-element tensor works too
    @return: 'neutral' for 0, 'entailment' for 1, 'contradiction' for 2,
             'unknown' for anything else
    """
    for class_id, name in enumerate(('neutral', 'entailment', 'contradiction')):
        if val == class_id:
            return name
    return 'unknown'

# Unshuffled, one example per batch, so batch k lines up with data row k of the file.
val_fixed_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=1)

examples_to_show = 20

best_cnn_model.eval()
# `with` closes the file even if a prediction fails; no gradients are needed here.
with open(data_src+val_snli_name, 'r', encoding='utf-8' ) as f, torch.no_grad():
    f.readline()# skip header

    for seen, (data1, lengths1, data2, lengths2, labels) in enumerate(val_fixed_loader):
        # stop after exactly examples_to_show rows (the old `seen > 20` check printed 21)
        if seen >= examples_to_show:
            break

        # Score with the loaded best CNN. This used to call `model`, the
        # experiment model from the training cell, by mistake.
        outputs = F.softmax(best_cnn_model(data1, lengths1, data2, lengths2), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]

        # the raw line already ends with its gold label; append our prediction
        print(f.readline().strip(),':','vs',lookup_label(predicted.item()))
        print()





In [None]:
#Load the MNLI data and run each genre through both "best" CNN and RNN models

# Same values as the SNLI loaders above; repeated so this cell states its own settings.
BATCH_SIZE = 32
MAX_SENTENANCE_LENGTH = 35
        
def loadmnli(filename):
    """
    Load a pre-processed MNLI file, grouping the examples by genre.

    The file is tab delimited with one header row; each following row holds
    premise, hypothesis, label and genre. Sentences are already pre-processed,
    so they are only split on whitespace and indexed with index_it().

    @param filename: path of the TSV file to read
    @return: dict mapping genre -> list of (premise indices, hypothesis indices,
             target) tuples in file order; target is 0 neutral, 1 entailment,
             2 contradiction, 3 for any other label (shouldn't ever happen)
    """
    label_ids = {'neutral': 0, 'entailment': 1, 'contradiction': 2}
    by_genre = {}
    with open(filename, "r", encoding='utf-8') as f:
        rows = csv.reader(f,delimiter='\t')
        next(rows)#skip the header
        for row in rows:
            example = (index_it(row[0]), index_it(row[1]), label_ids.get(row[2], 3))
            by_genre.setdefault(row[3], []).append(example)
    return by_genre
            
val_mnli_name = 'mnli_val.tsv'
val_mnli_data = loadmnli(data_src+val_mnli_name)

#this assumes we've previously loaded our two best models

print("genre\tCNN Acc\tRNN Acc" )

for genre in val_mnli_data.keys():
    
    # Split the (premise, hypothesis, target) tuples into three parallel
    # sequences. np.array() over these ragged lists is rejected by recent
    # NumPy versions, and the dataset only needs len() and integer indexing.
    prems, hypos, targets = zip(*val_mnli_data[genre])
    
    val_m_dataset = SNLIDataset(prems, hypos, targets, MAX_SENTENANCE_LENGTH)
    val_m_loader = torch.utils.data.DataLoader(dataset=val_m_dataset, batch_size=BATCH_SIZE)

    val_CNN_acc = test_model(val_m_loader, best_cnn_model)
    val_RNN_acc = test_model(val_m_loader, best_rnn_model)
    print(genre, val_CNN_acc, val_RNN_acc, sep='\t')
    

In [None]:
#plotting method reused from HW1

import matplotlib.pylab as plt
import matplotlib.ticker as ticker
import numpy as np
def show_chart(plot_label, res_run, cols):
    """
    Display a chart of train/validation accuracy per epoch and save it to disk.

    @param plot_label: used both as the chart title and as the file name handed to savefig
    @param res_run: runs to plot, in the form
                    {parameters1: [[E1-trainacc, E1-valacc], [E2-trainacc, E2-valacc], ...]};
                    keys are concatenated with ' train' / ' validate' for the legend
    @param cols: number of columns used to lay out the legend
    """

    fig, ax = plt.subplots(figsize=(12, 6))

    for key in res_run:
        arun = np.array(res_run[key])
        epochs = range(1, len(arun)+1)
        # column 0 is train accuracy (dashed), column 1 validation accuracy (solid)
        ax.plot(epochs, arun[:,0], label = key + ' train', linestyle="--")
        ax.plot(epochs, arun[:,1], label = key + ' validate', linestyle="-")


    ax.xaxis.set_major_locator(ticker.MultipleLocator(1)) # one tick per epoch
    # pass a real boolean: a string such as 'on' only works because it is truthy
    ax.grid(True)
    ax.set_title(plot_label)
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Percent Accuracy")
    # place the legend below the axes; it must be listed as an extra artist so
    # the tight bounding box does not crop it from the saved file
    handles, labels = ax.get_legend_handles_labels()
    lgd = ax.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5,-0.2), ncol=cols)
    fig.savefig(plot_label, bbox_extra_artists=(lgd,), bbox_inches='tight')
    plt.show()
    plt.close(fig) # release the figure so repeated calls do not pile up open figures

In [None]:
#save our data after each test, useful if we need to redo graphs or tables, etc.
import pickle

def save_data(run_name, res_run):
    """
    Pickle the results of a run so graphs or tables can be rebuilt later.

    @param run_name: output path without extension; '.pkl' is appended
    @param res_run: object to pickle
    """
    target_path = run_name + '.pkl'
    with open(target_path, 'wb') as sink:
        pickle.dump(res_run, sink)


In [None]:
import pickle
#load and build a new chart
def graph_pickle(run_name, slots):
    """
    Load previously pickled results and chart them with show_chart().

    @param run_name: path without extension; '<run_name>.pkl' is read and
                     run_name is passed on as the chart label
    @param slots: number of legend columns passed on to show_chart()
    """
    pickle_path = run_name + '.pkl'
    with open(pickle_path, 'rb') as source:
        history = pickle.load(source)
    show_chart(run_name, history, slots)
    
#switch between linux laptop and windows desktop (same convention as the cells above)
results_home = '/home/gandalf/NYUCDS/DS-GA1011/hw2/'
if not os.path.isdir(results_home):
    results_home = 'c:\\development\\NYUCDS\\DSGA1011\\hw2\\'

graph_pickle(results_home + 'RNN2_results', 5)