In [1]:
import re
import pandas as pd
from numpy.random import RandomState

import spacy

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import matplotlib.pyplot as plt

In [2]:
import json
## load data from json
with open('original_labelled_data.json', 'r') as jf:
    data=json.load(jf)

In [3]:
sentences,labels=data['sentences'], data['labels']

In [4]:
import sklearn
from sklearn.model_selection import train_test_split
from collections import Counter

In [5]:
## split data into train and test
X_train, X_test, y_train, y_test = train_test_split(
    sentences, labels, test_size=0.2, random_state=2020, stratify=labels)

In [6]:
import pandas as pd

In [48]:
train,val=pd.DataFrame(columns=['sentences', 'labels']),pd.DataFrame(columns=['sentences', 'labels'])

In [49]:
train['sentences'], train['labels']=X_train,y_train
val['sentences'], val['labels']=X_test,y_test

In [6]:
  
# set up fields
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = string.lower()    ## lower-wise case
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()

def process_document(sentence):
    """
    process a document 
    :params[in]: sentence, a string or list of strings
    :params[out]: tokens, list of tokens
    """
    if type(sentence)==str:
        clean = clean_str(sentence)  ## cleaned sentence
        tokens = clean.split()
        return tokens
    elif type(sentence)==list:
        res = []
        for it in sentence:
            res.append(clean_str(it).split())
        return res

In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


Using TensorFlow backend.


In [9]:
tk_sentences = process_document(sentences)

In [10]:
max_words = 10000
tk = Tokenizer(lower = True, num_words=max_words, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', 
               split=' ', char_level=False, oov_token='<unk>')
tk.fit_on_texts(tk_sentences)

In [67]:
print('Average word length of questions in train is {0:.0f}.'.format(np.mean(train['sentences'].apply(lambda x: len(clean_str(x).split())))))
print('Average word length of questions in test is {0:.0f}.'.format(np.std(train['sentences'].apply(lambda x: len(clean_str(x).split())))))


Average word length of questions in train is 65.
Average word length of questions in test is 96.


In [14]:
sentences[1]

'Thank you both so much, I hope the very best for both you and your families. Tom '

In [8]:
import numpy as np

In [9]:
w2v_file='../glove.6B/glove.6B.100d.txt'
def loadData_Tokenizer(X_train, X_test, MAX_NB_WORDS=75000,
                       MAX_SEQUENCE_LENGTH=500, w2v_file=w2v_file):
    """
    use glove embedding
    """
    np.random.seed(7)
    text = np.concatenate((X_train, X_test), axis=0)
    text = np.array(text)
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH,padding='post', truncating='post')
    print('Found %s unique tokens.' % len(word_index))
    indices = np.arange(text.shape[0])
    # np.random.shuffle(indices)
    text = text[indices]
    print(text.shape)
    X_train = text[0:len(X_train), ]
    X_test = text[len(X_train):, ]
    embeddings_index = {}
    f = open(w2v_file, encoding="utf8") ## GloVe file which could be download https://nlp.stanford.edu/projects/glove/
    for line in f:
        values = line.split()
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except:
            pass
        embeddings_index[word] = coefs
    f.close()
    print('Total %s word vectors.' % len(embeddings_index))
    return (X_train, X_test, word_index,embeddings_index)


In [10]:
## construct the initialized embedding matrix based on pretrained matrix
def init_embed(word_index, embeddings_index, emb_dim=100):
    embedding_matrix = np.random.random((len(word_index) + 1, emb_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            if len(embedding_matrix[i]) !=len(embedding_vector):
                print("could not broadcast input array from shape",str(len(embedding_matrix[i])),
                                 "into shape",str(len(embedding_vector))," Please make sure your"
                                 " EMBEDDING_DIM is equal to embedding_vector file ,GloVe,")
                exit(1)

            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [11]:
train_ids,test_ids,wd_ind,emb_ind=loadData_Tokenizer(X_train, X_test, MAX_NB_WORDS=10000,
                       MAX_SEQUENCE_LENGTH=256, w2v_file=w2v_file)
embedding_matrix=init_embed(wd_ind, emb_ind, emb_dim=100)


Found 7623 unique tokens.
(1756, 256)
Total 400000 word vectors.


In [15]:
train_ids

array([[  18,    7,    0, ...,    0,    0,    0],
       [  95,   60,   27, ...,    0,    0,    0],
       [ 324,    4,  180, ...,    0,    0,    0],
       ...,
       [1423,  923,   11, ...,    0,    0,    0],
       [  18,    7,    0, ...,    0,    0,    0],
       [   1,   12,    6, ...,    0,    0,    0]], dtype=int32)

### model class

In [19]:
class KIMCNN2D(nn.Module):
    
    def __init__(self, label_size, max_seq_len,
                 kernel_sizes, kernel_nums, keep_dropout, embedding_matrix):
        super(KIMCNN2D,self).__init__()
        self.embedding_dim = embedding_matrix.shape[1]
        self.label_size = label_size
        self.kernel_sizes = kernel_sizes
        self.kernel_nums = kernel_nums        
        self.keep_dropout = keep_dropout
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.max_seq_len = max_seq_len
        #self.convs1 = [nn.Conv2d(Ci, Co, (K, D)) for K in Ks] -- Ci, Co -- channels in or out
        #self.convs1 = nn.ModuleList([nn.Conv2d(1, num, (size, self.embedding_dim)) for size,num in
        #                             zip(self.kernel_sizes,self.kernel_nums)])
        self.convs = nn.ModuleList([nn.Conv1d(1, num, self.embedding_dim * size, 
                                              stride=self.embedding_dim) for size,num in 
                                    zip(self.kernel_sizes, self.kernel_nums)])

        '''
        self.conv13 = nn.Conv2d(Ci, Co, (3, D))
        self.conv14 = nn.Conv2d(Ci, Co, (4, D))
        self.conv15 = nn.Conv2d(Ci, Co, (5, D))
        '''
        self.dropout = nn.Dropout(self.keep_dropout)
        self.fc = nn.Linear(sum(self.kernel_nums), self.label_size)

    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3) #(N,Co,W)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, inp):
        x = self.embedding(inp).view(-1, 1, self.embedding_dim * self.max_seq_len)
#         if self.embedding_type == "multichannel":
#             x2 = self.embedding2(inp).view(-1, 1, self.embedding_dim * self.max_seq_len)
#             x = torch.cat((x, x2), 1)
        conv_results = [
            F.max_pool1d(F.relu(self.convs[i](x)), self.max_seq_len - self.kernel_sizes[i] + 1)
                .view(-1, self.kernel_nums[i])
            for i in range(len(self.convs))]

        x = torch.cat(conv_results, 1)
        x = F.dropout(x, p=self.keep_dropout)
        x = self.fc(x)
        return x


    def forward2(self, x):
        '''
        x1 = self.conv_and_pool(x,self.conv13) #(N,Co)
        x2 = self.conv_and_pool(x,self.conv14) #(N,Co)
        x3 = self.conv_and_pool(x,self.conv15) #(N,Co)
        x = torch.cat((x1, x2, x3), 1) # (N,len(Ks)*Co)
        '''
        x = self.embedding(x) # (N,W,D)
        x = x.unsqueeze(1) # (N,Ci,W,D)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1] #[(N,Co,W), ...]*len(Ks)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] #[(N,Co), ...]*len(Ks)
        x = torch.cat(x, 1)
        x = self.dropout(x) # (N,len(Ks)*Co)
        logit = self.fc(x) # (N,C)
        return logit

### LSTM with Attention

In [12]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import time,datetime
from sklearn.metrics import classification_report
import random
from torch.optim.lr_scheduler import StepLR
import pdb

In [13]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))


In [14]:
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


No GPU available, using the CPU instead.


In [15]:
### evaluate the performance of current model
def evaluate_model(clf_model, validation_dataloader, save_dir):
    """
    :params[in]: clf_model, the pre-trained classifier
    :params[in]: validation_dataloader, the validation dataset
    :params[in]: save_dir, the directory name to save the fine-tuned model
    
    """
    t0 = time.time()
    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    clf_model.eval()
    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    true_labels,pred_labels=[],[]
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from our dataloader
        b_input_ids, b_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        
            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have
            # not provided labels.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = clf_model(b_input_ids)
        
        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        logits = outputs

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        ## pred_labels/true_labels in a batch flatten
        pred_flat = np.argmax(logits, axis=1).flatten()
        true_flat = label_ids.flatten()

        # true labels and predicted labels
        true_labels += true_flat.tolist()
        pred_labels += pred_flat.tolist()
        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy
        # Track the number of batches
        nb_eval_steps += 1
    ## pdb check
    #pdb.set_trace()
    # Report the final accuracy for this validation run
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    #clf_model.save_pretrained(save_dir)  ## save model
    print(classification_report(true_labels, pred_labels,digits=3))
    print(classification_report(true_labels, pred_labels,digits=3),
      file=open(save_dir+'/result.txt','w'))
    print("  Accuracy: {0:.3f}".format(eval_accuracy/nb_eval_steps),
          file=open(save_dir+'/result.txt','w'))


In [27]:
def train_eval(clf_model, train_dataloader, validation_dataloader, base_dir,
               lr, epochs=4, eval_every_num_iters=40, seed_val = 42, weights= None):
    """train and evaluate a deep learning model
    :params[in]: clf_model, a classifier
    :params[in]: train_dataloader, training data
    :params[in]: validation_dataloader, validation data
    :params[in]: base_dir, output directory to create the directory to save results
    :params[in]: lr, the learning rate
    :params[in]: epochs, the number of training epochs
    :params[in]: eval_every_num_iters, the number of iterations to evaluate
    :params[in]: seed_val, set a random seed
    """
    optimizer = torch.optim.Adam(clf_model.parameters(),
                      lr = lr)
    ## cross entropy loss
    criterion = nn.CrossEntropyLoss()

    # Total number of training steps is number of batches * number of epochs.
    total_steps = len(train_dataloader) * epochs
    # Create the learning rate scheduler. # gamma = decaying factor
    scheduler = StepLR(optimizer, step_size=2, gamma=0.5)

    # see if weights is None:
    if weights != None:
        weights = torch.FloatTensor(weights)
    # Set the seed value all over the place to make this reproducible.
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)
    
    # Store the average loss after each epoch so we can plot them.
    loss_values = []
    
    # For each epoch...
    for epoch_i in range(0, epochs):
        
        # ========================================
        #               Training
        # ========================================
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')
        
        # Measure how long the training epoch takes.
        t0 = time.time()
    
        # Reset the total loss for this epoch.
        total_loss = 0
        ## print the learning rate
        for param_group in optimizer.param_groups:
            print(' learning rate is: ', param_group['lr'])
        
        # Put the model into training mode. Don't be mislead--the call to 
        # `train` just changes the *mode*, it doesn't *perform* the training.
        # `dropout` and `batchnorm` layers behave differently during training
        # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    
        # For each batch of training data...
        for iters, batch in enumerate(train_dataloader):
            clf_model.train()  ## model training mode
            # Unpack this training batch from our dataloader. 
            #
            # As we unpack the batch, we'll also copy each tensor to the GPU using the 
            # `to` method.
            #
            # `batch` contains three pytorch tensors:
            #   [0]: input ids 
            #   [1]: attention masks
            #   [2]: labels 
            b_input_ids = batch[0].to(device)
            b_labels = batch[1].to(device)
    
            # Always clear any previously calculated gradients before performing a
            # backward pass. PyTorch doesn't do this automatically because 
            # accumulating the gradients is "convenient while training RNNs". 
            # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
            clf_model.zero_grad()        
    
            # Perform a forward pass (evaluate the model on this training batch).
            # This will return the loss (rather than the model output) because we
            # have provided the `labels`.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = clf_model(b_input_ids)
             
            # The call to `model` always returns a tuple, so we need to pull the 
            # loss value out of the tuple.
            loss = criterion(outputs, b_labels)
    
            # Accumulate the training loss over all of the batches so that we can
            # calculate the average loss at the end. `loss` is a Tensor containing a
            # single value; the `.item()` function just returns the Python value 
            # from the tensor.
            total_loss += loss.item()
    
            # Perform a backward pass to calculate the gradients.
            loss.backward()
    
            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            optimizer.step()    
            # eveluate the performance after some iterations
            if iters % eval_every_num_iters == 0 and not iters == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)
                # Report progress.
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(iters, len(train_dataloader), elapsed))
                tmp_dir = base_dir+'/epoch'+str(epoch_i+1)+'iteration'+str(iters)
                ## save pretrained model
                evaluate_model(clf_model, validation_dataloader, tmp_dir)
        # Update the learning rate each epoch
        scheduler.step()
        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(train_dataloader)            
        #pdb.set_trace()
        # Store the loss value for plotting the learning curve.
        loss_values.append(avg_train_loss)
        # save the data after epochs
        tmp_dir = base_dir+'/epoch'+str(epoch_i+1)+'_done'
        ## save pretrained model
        evaluate_model(clf_model, validation_dataloader, tmp_dir)


In [29]:
    import os
    for seed in [42,52, 62, 72, 82]:
        batch_size = 8
        train_ids,test_ids,wd_ind,emb_ind=loadData_Tokenizer(X_train, X_test, MAX_NB_WORDS=10000,
                       MAX_SEQUENCE_LENGTH=256, w2v_file=w2v_file)
        embedding_matrix=init_embed(wd_ind, emb_ind, emb_dim=100)
        ## initialize a classifier
        clf_model=KIMCNN2D(label_size=3, kernel_sizes=[1,2,3,4,5], keep_dropout=.5,max_seq_len=256,
                  kernel_nums=[200, 300, 500, 500,200], embedding_matrix=torch.FloatTensor(embedding_matrix))
        # Create the DataLoader for our training set.
        train_data = TensorDataset(torch.LongTensor(train_ids), torch.tensor(y_train))
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
        # Create the DataLoader for our validation set.
        validation_data = TensorDataset(torch.LongTensor(test_ids), torch.tensor(y_test))
        validation_sampler = SequentialSampler(validation_data)
        validation_dataloader = DataLoader(validation_data, sampler=validation_sampler,
                                           batch_size=batch_size)
        base_dir = 'textCNN/CNN_seed'+str(seed)
        train_eval(clf_model, train_dataloader, validation_dataloader, base_dir, \
            lr=1.0e-2, epochs=5, eval_every_num_iters=160, seed_val = seed)


Found 7623 unique tokens.
(1756, 256)
Total 400000 word vectors.

Training...
 learning rate is:  0.01
  Batch   160  of    176.    Elapsed: 0:00:15.
              precision    recall  f1-score   support

           0      0.992     0.969     0.980       127
           1      0.897     0.822     0.858       191
           2      0.340     0.529     0.414        34

    accuracy                          0.847       352
   macro avg      0.743     0.773     0.751       352
weighted avg      0.877     0.847     0.859       352

              precision    recall  f1-score   support

           0      0.933     0.984     0.958       127
           1      0.896     0.764     0.825       191
           2      0.327     0.529     0.404        34

    accuracy                          0.821       352
   macro avg      0.719     0.759     0.729       352
weighted avg      0.854     0.821     0.832       352


Training...
 learning rate is:  0.01
  Batch   160  of    176.    Elapsed: 0:00:15.
   

              precision    recall  f1-score   support

           0      0.945     0.953     0.949       127
           1      0.875     0.880     0.877       191
           2      0.375     0.353     0.364        34

    accuracy                          0.855       352
   macro avg      0.732     0.728     0.730       352
weighted avg      0.852     0.855     0.854       352

Found 7623 unique tokens.
(1756, 256)
Total 400000 word vectors.

Training...
 learning rate is:  0.01
  Batch   160  of    176.    Elapsed: 0:00:15.
              precision    recall  f1-score   support

           0      1.000     0.890     0.942       127
           1      0.799     1.000     0.888       191
           2      0.000     0.000     0.000        34

    accuracy                          0.864       352
   macro avg      0.600     0.630     0.610       352
weighted avg      0.794     0.864     0.822       352

              precision    recall  f1-score   support

           0      0.918     0.969

  Batch   160  of    176.    Elapsed: 0:00:16.
              precision    recall  f1-score   support

           0      0.976     0.976     0.976       127
           1      0.886     0.932     0.908       191
           2      0.542     0.382     0.448        34

    accuracy                          0.895       352
   macro avg      0.801     0.764     0.778       352
weighted avg      0.885     0.895     0.888       352

              precision    recall  f1-score   support

           0      0.961     0.969     0.965       127
           1      0.915     0.848     0.880       191
           2      0.447     0.618     0.519        34

    accuracy                          0.869       352
   macro avg      0.774     0.811     0.788       352
weighted avg      0.886     0.869     0.876       352

Found 7623 unique tokens.
(1756, 256)
Total 400000 word vectors.

Training...
 learning rate is:  0.01
  Batch   160  of    176.    Elapsed: 0:00:14.
              precision    recall  f1-sco

In [30]:
res = np.array([[0.757,0.792,0.741,0.898],
               [0.788,0.774, 0.811,0.869],
                [0.756,0.818,0.734,0.889],
                [0.730,0.732,0.728,0.855],
                [0.737,0.746,0.732,0.869]
               ])

In [32]:
np.mean(res, axis=0)

array([0.7536, 0.7724, 0.7492, 0.876 ])

In [33]:
np.std(res, axis=0)

array([0.02016532, 0.03096837, 0.03118589, 0.01544021])

In [25]:
embedding_matrix[2,:]

array([-0.038194  , -0.24487001,  0.72812003, -0.39961001,  0.083172  ,
        0.043953  , -0.39140999,  0.3344    , -0.57545   ,  0.087459  ,
        0.28786999, -0.06731   ,  0.30906001, -0.26383999, -0.13231   ,
       -0.20757   ,  0.33395001, -0.33848   , -0.31742999, -0.48335999,
        0.1464    , -0.37303999,  0.34577   ,  0.052041  ,  0.44946   ,
       -0.46970999,  0.02628   , -0.54154998, -0.15518001, -0.14106999,
       -0.039722  ,  0.28277001,  0.14393   ,  0.23464   , -0.31020999,
        0.086173  ,  0.20397   ,  0.52623999,  0.17163999, -0.082378  ,
       -0.71787   , -0.41531   ,  0.20334999, -0.12763   ,  0.41367   ,
        0.55186999,  0.57907999, -0.33476999, -0.36559001, -0.54856998,
       -0.062892  ,  0.26583999,  0.30204999,  0.99774998, -0.80480999,
       -3.0243001 ,  0.01254   , -0.36941999,  2.21670008,  0.72201002,
       -0.24978   ,  0.92136002,  0.034514  ,  0.46744999,  1.10790002,
       -0.19358   , -0.074575  ,  0.23353   , -0.052062  , -0.22