Deep contextualized word representations have drawn wide attention because of their state-of-the-art performance on downstream tasks. Contextualized embeddings capture not only word-level information but also multi-sense information, thus improving results on sentiment analysis, SQuAD, etc. However, the language models adopted in [ELMo](https://allennlp.org/elmo) are biLSTMs with a huge number of parameters, which makes it hard for small labs to train and run such experiments.


In this project, we use a CNN language model to learn efficient word representations for sentiment analysis. We train a language model based on the [Gated CNN architecture](https://arxiv.org/abs/1612.08083) proposed by Yann Dauphin et al., then perform sentiment analysis with the embeddings generated by the language model.

The language model is trained on the 1 Billion Word Language Modeling Benchmark dataset.

In [1]:
import tensorflow as tf
import os
import time
import numpy as np
from bilm.training import load_options_latest_checkpoint, load_vocab
from bilm.data import Batcher, BidirectionalLMDataset
from conf_utils import *

  from ._conv import register_converters as _register_converters


## Initialize the configuration and prepare data batches

In [2]:
#with open('data/vocab-2016-09-10.txt') as f:
    #lines = f.readlines()

In [3]:
# Load the word vocabulary.
# The 1-billion-word vocabulary path ('data/vocab-2016-09-10.txt') used to be
# assigned here and was immediately overwritten, so only the wiki vocabulary
# was ever loaded; the dead assignment has been dropped.
vocab_file = 'data/wiki-vocab.txt'
# NOTE(review): the second argument is presumably the maximum word length in
# characters -- confirm against bilm.training.load_vocab.
vocab = load_vocab(vocab_file, 50)

In [4]:
class config:
    # Hyperparameters for the batched language model. This class is passed to
    # prepare_conf() and the result is used to build `model` / `model_test`.
    # NOTE(review): the exact meaning of each field is defined by
    # conf_utils.prepare_conf and char_cnn_lm_model.gated_char_cnn_model,
    # neither of which is visible in this notebook -- confirm before changing.
    vocab_size = vocab.size  # size of the vocabulary loaded above
    embedding_size = 128
    filter_size = 64
    num_layers = 3
    block_size = 3
    filter_h = 5
    context_size = 50
    text_size = context_size  # always kept equal to context_size
    batch_size = 64  # sent2vec() asserts that its input has this many rows
    epochs = 5
    num_sampled = 64
    learning_rate = 1
    momentum = 0.99
    grad_clip = 0.1
    num_batches = 0
    ckpt_path = 'ckpt_char_gated_cnn'  # root directory for checkpoints
    summary_path = 'logs'
    # Alternative corpus (disabled):
    #data_dir = "data/texts/reviews/movie_reviews"
    data_dir = "data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled"

In [5]:
class single_config:
    # Same values as `config`, except batch_size is 1 and there is no
    # data_dir. This class is passed directly to gated_char_cnn_model() to
    # build `model_single`, which is used for one-sentence-at-a-time inference.
    # NOTE(review): the values must stay in sync with `config` by hand;
    # presumably they have to match for variable reuse to work -- confirm.
    vocab_size = vocab.size
    embedding_size = 128
    filter_size = 64
    num_layers = 3
    block_size = 3
    filter_h = 5
    context_size = 50
    text_size = context_size  # always kept equal to context_size
    batch_size = 1  # sent2vec_single() asserts a single-row input
    epochs = 5
    num_sampled = 64
    learning_rate = 1
    momentum = 0.99
    grad_clip = 0.1
    num_batches = 0
    ckpt_path = 'ckpt_char_gated_cnn'
    summary_path = 'logs'
   

In [6]:
# Build the configuration object from the hyperparameter class.
# Later cells read conf.batch_size and conf.ckpt_file from the result.
conf = prepare_conf(config)

## Build a CharCNN-based language model

Note that each input word is converted into a sequence of character ids, so that the model can make use of subword information.

In [7]:
%load_ext autoreload
%autoreload 2

In [10]:
# Create the language-model graph.
# Note: the model variables need to be saved/restorable for subsequent tasks.
from char_cnn_lm_model import gated_char_cnn_model
graph = tf.Graph()
with graph.as_default():
    with tf.variable_scope('gated_cnn'):
        # First instantiation inside the 'gated_cnn' scope (no reuse flag).
        model = gated_char_cnn_model(conf, is_bidirectional=False)
        all_variables = tf.get_collection_ref(tf.GraphKeys.GLOBAL_VARIABLES)
        # Variables whose name contains "Adagrad" are left out of the list
        # handed to the Saver below.
        var_list=[v for v in all_variables if "Adagrad" not in v.name]
    with tf.variable_scope('gated_cnn', reuse=True):
        # Same scope with reuse=True and is_train=False: batched inference model.
        model_test = gated_char_cnn_model(conf, is_train=False, is_bidirectional=False)
    with tf.variable_scope('gated_cnn', reuse=True):
        # Same scope again, built from single_config (batch_size = 1).
        model_single = gated_char_cnn_model(single_config, is_train=False, is_bidirectional=False)
    # The Saver only covers var_list, which was collected before the two
    # reuse=True models were built.
    saver = tf.train.Saver(var_list=var_list)
    # NOTE(review): this cell only builds the graph; no training runs here
    # despite the message.
    print("Started Model Training...")

Started Model Training...


In [9]:
# batch_idx = 0
# with tf.Session(graph=graph) as sess:
#     sess.run(tf.global_variables_initializer())
#     summary_writer = tf.summary.FileWriter(conf.summary_path, graph=sess.graph)

#     if os.path.exists(conf.ckpt_file+'.index'):
#         saver.restore(sess, conf.ckpt_file)
#         print("Model Restored")

#     for i in np.arange(conf.epochs):
#         start = time.time()
#         for j in np.arange(10000):
#         #for j in np.arange(21):
#             x = next(data_gen)
#             inputs, labels = x['tokens_characters'], x['next_token_id']
#             labels = labels.reshape(-1, 1)
#             _, l = sess.run([model.optimizer, model.loss], 
#                             feed_dict={model.X:inputs, model.y:labels})
#             if j%200 == 0:
#                 print('epoch'+str(i), 'loop'+str(j), l)
#             if j%2000 == 1999:
#                 perp = sess.run(model.perplexity, 
#                                 feed_dict={model.X:inputs, model.y:labels})
#                 print("Perplexity: %.2f"%perp)
#                 saver.save(sess, conf.ckpt_file)
#         end = time.time()
#         print("Epoch: %.2f, Time: %.2f,  Loss: %.2f"%(i, end-start, l))

#         if i % 2 == 0:
#             perp = sess.run(model.perplexity, feed_dict={model.X:inputs, model.y:labels})
#             print("Perplexity: %.2f"%perp)
#             saver.save(sess, conf.ckpt_file)

#         #summaries = sess.run(model.merged_summary_op, feed_dict={model.X:inputs, model.y:labels})
#         #summary_writer.add_summary(summaries, i)

## Sentiment Analysis

In this part, we use a separate labeled dataset for sentiment analysis: the IMDB movie reviews.

In [11]:
import pandas as pd

# Read both IMDB review splits; the first CSV column is used as the index.
file_train, file_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'data/movie_data/IMDB_review_train.csv',
        'data/movie_data/IMDB_review_test.csv',
    )
)

In [12]:
# Extract the review texts and sentiment labels of each split as arrays.
text_train = file_train['text'].values
y_train = file_train['sentiment'].values
text_test = file_test['text'].values
y_test = file_test['sentiment'].values

## Map text into sequences of embeddings

In [13]:
from sents_handler import generate_char_samples
# Sampler over the training texts/labels, using the same vocabulary file as
# the language model.
# NOTE(review): 50 and True are positional arguments whose meaning is defined
# in sents_handler (presumably a length limit and a shuffle/train flag) --
# TODO confirm.
train_gs = generate_char_samples(text_train, y_train, vocab_file, 50, True)

In [14]:
# Draw one batch from the training sampler; generate() yields the character-id
# array, the labels and a `lengths` value (unused in this notebook).
sent_vecs, sent_labels, lengths = train_gs.generate(64)

In [95]:
# Note: padding positions are filled with character id 261, not 0 -- this is
# very important.
def sent_emb_padding(sent_vecs, max_words=50, pad_id=261):
    """Pad or truncate a batch of character-id sentences to a fixed word count.

    Args:
        sent_vecs: array of shape (n_sentences, n_words, n_chars).
        max_words: number of word positions in the result (default 50).
        pad_id: value written into the added word positions (default 261).

    Returns:
        An array of shape (n_sentences, max_words, n_chars) with the same
        dtype as the input. When the input has fewer than `max_words` words,
        a new array is returned with the trailing positions set to `pad_id`;
        otherwise a slice (view) of the first `max_words` words is returned.
    """
    sent_vecs = np.asarray(sent_vecs)
    n_sents, n_words, n_chars = sent_vecs.shape
    if n_words >= max_words:
        return sent_vecs[:, :max_words, :]
    # Keep the input dtype so both branches return the same kind of array
    # (previously the padded branch silently became float64).
    padded = np.full((n_sents, max_words, n_chars), pad_id, dtype=sent_vecs.dtype)
    padded[:, :n_words, :] = sent_vecs
    return padded

In [96]:
# Pad/truncate the sampled batch to a fixed number of words, then display
# the resulting shape.
sent_vecs = sent_emb_padding(sent_vecs)
sent_vecs.shape

(64, 50, 50)

In [91]:
from bilm import Batcher, TokenBatcher
# Batcher built from the same vocabulary file as the language model.
# NOTE(review): 50 is presumably the maximum number of characters per word --
# confirm against bilm.Batcher.
batcher = Batcher(vocab_file, 50)

In [94]:
# Sanity check: batch two tokenized sentences and display the first row of
# the first sentence's id matrix (unused positions show up as 261 below).
sents_words=[['i', 'love', 'meat'], ['what', 'a', 'good', 'day']]
batcher.batch_sentences(list(sents_words))[0][0]

array([259, 257, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
       261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
       261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
       261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261])

## Twitter Data Preprocessing

Get the contextualized representation.

In [80]:
# Open a session on the language-model graph and restore the saved variables
# from the checkpoint path held in conf.ckpt_file.
sess_lm = tf.Session(graph=graph)
saver.restore(sess_lm, conf.ckpt_file)
def sent2vec(inputs, sess):
    """Return the hidden layer of the batched inference model for `inputs`.

    `inputs` must have exactly `conf.batch_size` rows. It is fed to
    `model_test.X` and `model_test.hidden_layer` is evaluated in `sess`.
    """
    n_rows = inputs.shape[0]
    assert n_rows == conf.batch_size
    feed = {model_test.X: inputs}
    return sess.run(model_test.hidden_layer, feed_dict=feed)

def sent2vec_single(inputs, sess):
    """Return the hidden layer of the single-sample model for `inputs`.

    `inputs` must have exactly one row. It is fed to `model_single.X` and
    `model_single.hidden_layer` is evaluated in `sess`.
    """
    n_rows = inputs.shape[0]
    assert n_rows == 1
    feed = {model_single.X: inputs}
    return sess.run(model_single.hidden_layer, feed_dict=feed)
    

INFO:tensorflow:Restoring parameters from ckpt_char_gated_cnn/vocab267743_embed128_filters64_batch64_layers3_block3_fdim5/model.ckpt


In [81]:
# Smoke test of the feature pipeline: sample a batch of 64, pad it to a fixed
# length, print its shape and run it through the language model.
sent_vecs, sent_labels, lengths = train_gs.generate(64) 
sent_vecs = sent_emb_padding(sent_vecs)
print(sent_vecs.shape)
out_layer = sent2vec(sent_vecs, sess_lm)

(64, 50, 50)


In [97]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn import utils as nn_utils
class CNNClassifier(nn.Module):
    """CNN sentence classifier over precomputed word representations.

    For every filter size, a convolution spanning the full embedding width is
    applied along the word axis, followed by ReLU and both max- and
    average-pooling over all positions. The pooled features of all filter
    sizes are concatenated and mapped to label log-probabilities.

    Args:
        filters: sequence of filter sizes (number of words per convolution).
        embedding_dim: width of each input word vector.
        kernel_num: number of output channels per filter size.
        label_size: number of target classes.

    Raises:
        ValueError: if `filters` is empty.
    """

    def __init__(self, filters, embedding_dim, kernel_num, label_size):
        super(CNNClassifier, self).__init__()

        if len(filters) == 0:
            raise ValueError("filters must contain at least one filter size")

        self.filter_sizes = filters
        self.kernel_num = kernel_num

        for i, filter_size in enumerate(self.filter_sizes):
            # Each branch is registered under its own indexed name. The
            # previous code used the literal name 'conv_{i}' for every branch,
            # so all filter sizes shared the last convolution created.
            conv = nn.Conv2d(1, kernel_num, (filter_size, embedding_dim))
            setattr(self, 'conv_{}'.format(i), conv)

            # Adaptive pooling collapses the whole position axis to size 1
            # whatever the sentence length, so no global maximum length is
            # needed.
            setattr(self, 'maxpool_{}'.format(i), nn.AdaptiveMaxPool2d((None, 1)))
            setattr(self, 'avgpool_{}'.format(i), nn.AdaptiveAvgPool2d((None, 1)))

        num = len(self.filter_sizes)

        # Linear layer mapping the concatenated max+avg pooled features of
        # all branches to the label space.
        self.hidden2label = nn.Linear((2 * kernel_num) * num, label_size)

    def get_conv(self, i):
        """Return the convolution module of branch `i`."""
        return getattr(self, 'conv_{}'.format(i))

    def get_maxpool(self, i):
        """Return the max-pooling module of branch `i`."""
        return getattr(self, 'maxpool_{}'.format(i))

    def get_avgpool(self, i):
        """Return the average-pooling module of branch `i`."""
        return getattr(self, 'avgpool_{}'.format(i))

    def forward(self, sentence):
        """Compute label log-probabilities.

        Args:
            sentence: float tensor of shape (batch, n_words, embedding_dim).

        Returns:
            Tensor of shape (batch, label_size) holding log-softmax scores.
        """
        # Input dropout, active only in training mode.
        embeds = F.dropout(sentence, 0.5, self.training)
        batch, n_words, emb_dim = embeds.size()
        # Add the single input channel: (batch, 1, n_words, embedding_dim).
        inputs = embeds.view((batch, 1, n_words, emb_dim))

        pools = []
        for i in range(len(self.filter_sizes)):
            # Convolution output: (batch, kernel_num, n_positions, 1).
            outputs = self.get_conv(i)(inputs)
            # Rearranged to (batch, 1, kernel_num, n_positions) for pooling.
            outputs = outputs.view((batch, 1, self.kernel_num, -1))
            outputs = F.relu(outputs)
            # Pool over all positions -> (batch, kernel_num) each.
            max_pool = self.get_maxpool(i)(outputs).view(batch, self.kernel_num)
            avg_pool = self.get_avgpool(i)(outputs).view(batch, self.kernel_num)
            pools.append(torch.cat([max_pool, avg_pool], dim=1))

        pooled = torch.cat(pools, dim=1)
        # Dropout before the fully connected layer.
        pooled = F.dropout(pooled, 0.5, self.training)
        scores = self.hidden2label(pooled)

        return F.log_softmax(scores, dim=1)

In [99]:
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [110]:
# Build the sentiment classifier: one filter size (3 words), 350-dimensional
# inputs, 128 kernels, 2 labels.
# NOTE: this rebinds the name `model`, which previously referred to the
# TensorFlow language model; the feature extractors use model_test /
# model_single and are therefore unaffected.
filters = [3]
model = CNNClassifier(filters, 350, 128, 2)
#optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [120]:
# Adam optimizer over all classifier parameters.
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [121]:
# Train the CNN classifier on features produced by the language model.
epochs = 5
# Must equal conf.batch_size, because sent2vec() asserts that row count.
batch_size = 64
loops = len(text_train) // batch_size
# The classifier returns log-probabilities (log_softmax), so negative
# log-likelihood is the matching criterion. `loss_function` was not defined
# in any visible cell of this notebook.
loss_function = nn.NLLLoss()
l2_weight = 0.001
losses = []
model.train()
for i in range(epochs):
    total_loss = 0
    print('Epoch:', i)
    for j in range(loops):
        # Sample a batch, pad it, and turn it into contextual representations.
        sent_vecs, sent_labels, lengths = train_gs.generate(batch_size)
        sent_vecs = sent_emb_padding(sent_vecs)
        out_layer = sent2vec(sent_vecs, sess_lm)
        sents = torch.FloatTensor(out_layer)

        model.zero_grad()
        outputs = model(sents)
        loss = loss_function(outputs, torch.LongTensor(sent_labels))

        # Regularizer: sum of the norms of all parameters.
        l2_reg = sum(torch.norm(param) for param in model.parameters())
        loss = loss + l2_reg * l2_weight

        # Backward pass and parameter update. The graph is rebuilt on every
        # iteration, so it does not need to be retained.
        loss.backward()
        optimizer.step()

        # tensor.item() extracts the Python number from a 1-element tensor.
        total_loss += loss.item()
        if j % 80 == 0:
            print(loss.item())
    losses.append(total_loss)
print(losses)

Epoch: 0
0.7150627970695496
0.7172332406044006
0.6893958449363708
0.670016884803772
0.7130548357963562
Epoch: 1
0.7191800475120544
0.7569373250007629
0.7279062867164612
0.7354917526245117
0.7272653579711914
Epoch: 2
0.7073136568069458


KeyboardInterrupt: 

In [114]:
# NOTE(review): despite its name, this sampler is built from the training
# split (text_train / y_train), so the evaluation that uses it reports
# training-set metrics. The final False differs from the True passed for
# train_gs; its meaning is defined in sents_handler -- TODO confirm.
test_gs = generate_char_samples(text_train, y_train, vocab_file, 50, False)

In [115]:
# Evaluate the classifier one sample at a time.
from sklearn.metrics import f1_score
loops = len(y_train)
correct_num = 0
predictions = []
# Switch to evaluation mode once (this disables dropout).
model.eval()
# Gradients are not needed for inference.
with torch.no_grad():
    for i in range(loops):
        sent_vecs, sent_labels, lengths = test_gs.generate(1)
        sent_vecs = sent_emb_padding(sent_vecs)
        out_layer = sent2vec_single(sent_vecs, sess_lm)
        sents = torch.FloatTensor(out_layer)
        outputs = model(sents)
        # Predicted class per row, computed once and reused below.
        pred_batch = outputs.argmax(dim=1).numpy()
        predictions.append(pred_batch[0])
        correct_num += sum(pred_batch == sent_labels)
print('Accuracy:', correct_num/loops)
print('Macro F1:', f1_score(y_train, predictions, average='macro'))

Accuracy: 0.620803886925795
Macro F1: 0.5576409866526607


In [75]:
# Display the number of correctly classified samples from the evaluation loop.
correct_num

12503