Deep contextualized word representation has drawn wide attention because of state-of-the-art performances in downstream tasks. Contextualized embeddings can capture not only word-level information but also multi-sense information, thus improving the results in sentiment analysis, SQuad and etc. However, the language adopted in the [Elmo](https://allennlp.org/elmo) model were biLSTMs which contained a huge number of parameters, it was less likely for small labs to train and run such experiments.


In this project, we intend to make use of CNN language model in learning efficient word representations for sentiment analysis. We train a language model based on [Gated CNN architecture](https://arxiv.org/abs/1612.08083) proposed by Yann Daulphin, then do sentiment analysis with embeddings generated by the language model.

The language model training dataset is 1-billion-word-language.

In [1]:
import tensorflow as tf
import os
import time
import numpy as np
from bilm.training import load_options_latest_checkpoint, load_vocab
from bilm.data import Batcher, BidirectionalLMDataset
#from data_utils import data_helper
from conf_utils import *

  from ._conv import register_converters as _register_converters


## Initialize the configuration and prepare data batches

In [2]:
# with open('data/wiki.test.tokens') as f:
#     lines = f.readlines()
#     lines = [line for line in lines if len(line)>5]

In [3]:
# for i in range28):
#     start = i * 100
#     end = (i+1) * 100
#     texts = lines[start:end]
#     with open('data/wikitext-103-test/text'+str(i), 'w+') as f:
#         for line in texts:
#             f.write(line+'\n')

In [4]:
# for i in range(50):
#     start = i * 22000
#     end = (i+1) * 22000
#     texts = lines[start:end]
#     with open('data/wikitext-103/text'+str(i), 'w+') as f:
#         for line in texts:
#             f.write(line+'\n')
              

In [5]:
# with open('data/wiki-vocab.txt', 'w+') as f:
#     f.write('</S>\n')
#     f.write('<S>\n')
#     f.write('<UNK>\n')
#     for line in lines:
#         word = line.split()[0]
#         f.write(word+'\n')

In [6]:
#with open('data/vocab-2016-09-10.txt') as f:
    #lines = f.readlines()

In [7]:
#Load the words
vocab_file = 'data/vocab-2016-09-10.txt'
#vocab_file = 'data/wiki-vocab.txt'
vocab = load_vocab(vocab_file, 50)

In [25]:
class config:
    vocab_size = vocab.size
    embedding_size = 128
    filter_size = 64
    num_layers = 3
    block_size = 3
    filter_h = 5
    context_size = 50
    text_size = context_size
    batch_size = 64
    epochs = 5
    num_sampled = 64
    learning_rate = 3
    momentum = 0.99
    grad_clip = 0.1
    num_batches = 0
    ckpt_path = 'ckpt_char_gated_cnn'
    summary_path = 'logs'
    #data_dir = "data/texts/reviews/movie_reviews"
    data_dir = "data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled"

In [26]:
#Initialize configuration files
conf = prepare_conf(config)

In [10]:
train_prefix = 'data/1-billion-word-language-modeling-ben\
chmark-r13output/training-monolingual.tokenized.shuffled/*'

In [11]:
#train_prefix = 'data/wikitext-103/*'

In [12]:
data = BidirectionalLMDataset(train_prefix, vocab, test=False,
                                      shuffle_on_load=True)

Found 50 shards at data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/*
Loading data from: data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00034-of-00100
Loaded 305408 sentences.
Finished loading
Found 50 shards at data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/*
Loading data from: data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news.en-00059-of-00100
Loaded 306839 sentences.
Finished loading


In [13]:
data_gen = data.iter_batches(conf.batch_size * 1, conf.text_size)

In [14]:
x = next(data_gen)

In [15]:
x.keys()

dict_keys(['token_ids', 'tokens_characters', 'next_token_id', 'token_ids_reverse', 'tokens_characters_reverse', 'next_token_id_reverse'])

## Train a CharCNN-based language model

Note the inputs are transformed into chars of words, so as to make use of subword information.

In [16]:
%load_ext autoreload
%autoreload 2

In [27]:
#Create a language model
#Note we need to save the models for subsequent tasks
from bi_char_cnn_lm_model import gated_char_cnn_model
graph = tf.Graph()
with graph.as_default():
    with tf.variable_scope('gated_cnn'):
        model = gated_char_cnn_model(conf, is_bidirectional=True)
        all_variables = tf.get_collection_ref(tf.GraphKeys.GLOBAL_VARIABLES)
        var_list=[v for v in all_variables if "Adagrad" not in v.name]
    with tf.variable_scope('gated_cnn', reuse=True):
        model_test = gated_char_cnn_model(conf, is_train=False, is_bidirectional=True)
    saver = tf.train.Saver(var_list=var_list)
    print("Started Model Training...")

  num_elements)


Started Model Training...


In [None]:
batch_idx = 0
with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    summary_writer = tf.summary.FileWriter(conf.summary_path, graph=sess.graph)

    if os.path.exists(conf.ckpt_file+'.index'):
        saver.restore(sess, conf.ckpt_file)
        print("Model Restored")

    for i in np.arange(conf.epochs):
        start = time.time()
        for j in np.arange(10000):
        #for j in np.arange(21):
            x = next(data_gen)
            inputs, labels = x['tokens_characters'], x['next_token_id']
            inputs_reverse, labels_reverse = x['tokens_characters_reverse'], \
                                                    x['next_token_id_reverse']
            labels = labels.reshape(-1, 1)
            labels_reverse = labels_reverse.reshape(-1, 1)
            feed_dict = {model.X:inputs, model.y:labels, 
                         model.X_reverse:inputs_reverse, model.y_reverse:labels_reverse}
            _, l = sess.run([model.optimizer, model.loss], 
                            feed_dict=feed_dict)
            if j%200 == 0:
                print('epoch'+str(i), 'loop'+str(j), l)
            if j%2000 == 1999:
                perp = sess.run(model.perplexity, 
                                feed_dict=feed_dict)
                print("Perplexity: %.2f"%perp)
                saver.save(sess, conf.ckpt_file)
        end = time.time()
        print("Epoch: %.2f, Time: %.2f,  Loss: %.2f"%(i, end-start, l))

        if i % 2 == 0:
            perp = sess.run(model.perplexity, feed_dict={model.X:inputs, model.y:labels})
            print("Perplexity: %.2f"%perp)
            saver.save(sess, conf.ckpt_file)

        #summaries = sess.run(model.merged_summary_op, feed_dict={model.X:inputs, model.y:labels})
        #summary_writer.add_summary(summaries, i)

epoch0 loop0 30.012493


## Language Model Testing

In [21]:
class testConfig:
    vocab_size = vocab.size
    embedding_size = 128
    filter_size = 64
    num_layers = 3
    block_size = 3
    filter_h = 5
    context_size = 50
    text_size = context_size
    batch_size = 32
    epochs = 8
    num_sampled = 64
    learning_rate = 1
    momentum = 0.995
    grad_clip = 0.1
    num_batches = 0
    ckpt_path = 'ckpt_nlp_block'
    summary_path = 'logs'
    #data_dir = "data/texts/reviews/movie_reviews"
    data_dir = "data/1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled"

In [22]:
test_prefix = 'data/wikitext-103-test/*'
data_test = BidirectionalLMDataset(test_prefix, vocab, test=True,
                                      shuffle_on_load=False)

Found 28 shards at data/wikitext-103-test/*
Loading data from: data/wikitext-103-test/text17
Loaded 200 sentences.
Finished loading
Found 28 shards at data/wikitext-103-test/*
Loading data from: data/wikitext-103-test/text17
Loaded 200 sentences.
Finished loading


In [23]:
data_gen_test = data_test.iter_batches(conf.batch_size * 1, conf.text_size)

In [24]:
###Testting
with tf.Session(graph=graph) as sess:
    if os.path.exists(conf.ckpt_file+'.index'):
        saver.restore(sess, conf.ckpt_file)
        print("Model Restored")
    losses = []
    for j in np.arange(10000):
        #for j in np.arange(21):
            x = next(data_gen_test)
            inputs, labels = x['tokens_characters'], x['next_token_id']
            #inputs_r, labels_r = x['token_ids_reverse'], x['next_token_id_reverse']
            feed_dict = {model_test.X:inputs, model_test.y:labels}
            l = sess.run(model_test.loss, feed_dict={model_test.X:inputs, model_test.y:labels})
            if j%100 == 0:
                print('Tesing Loss:', l)
            losses.append(l)
    print(np.mean(losses))

INFO:tensorflow:Restoring parameters from ckpt_char_gated_cnn/vocab267743_embed128_filters64_batch64_layers3_block3_fdim5/model.ckpt
Model Restored
Tesing Loss: 4.6965504
Loading data from: data/wikitext-103-test/text13
Loaded 200 sentences.
Finished loading
Loading data from: data/wikitext-103-test/text13
Loaded 200 sentences.
Finished loading
Loading data from: data/wikitext-103-test/text8
Loaded 200 sentences.
Finished loading
Loading data from: data/wikitext-103-test/text8
Loaded 200 sentences.
Finished loading
Loading data from: data/wikitext-103-test/text14
Loaded 200 sentences.
Finished loading
Loading data from: data/wikitext-103-test/text14
Loaded 200 sentences.
Finished loading
Loading data from: data/wikitext-103-test/text9
Loaded 200 sentences.
Finished loading
Loading data from: data/wikitext-103-test/text9
Loaded 200 sentences.
Finished loading
Loading data from: data/wikitext-103-test/text1
Loaded 200 sentences.
Finished loading
Loading data from: data/wikitext-103-test/

StopIteration: 

## Train a RNN model

## Sentiment Analysis

In this part, we need to use other datasets for sentiment analysis, like the one of SemEval2013.

In [19]:
file_train = 'data/semeval/downloaded.tsv'
file_dev = 'data/semeval/dev_downloaded.tsv'
file_test = 'data/semeval/test.txt'
with open(file_train) as f:
    tweets_train = f.readlines()
with open(file_dev) as f:
    tweets_dev = f.readlines()
with open(file_test) as f:
    tweets_test = f.readlines()
    


#Filter empty tweets
def is_available(text):
    if 'Not Available' in text:
        return False
    if '\t"objective' in text:
        return False
    if '\t"neutral' in text:
        return False
    if '\tobjective' in text:
        return False
    if '\tneutral' in text:
        return False
    return True


In [20]:
tweets_train = list(filter(is_available, tweets_train))
tweets_dev = list(filter(is_available, tweets_dev))
tweets_test = list(filter(is_available, tweets_test))

In [21]:
tweets_train = [item.split('\t') for item in tweets_train]
tweets_dev = [item.split('\t') for item in tweets_dev]
tweets_test = [item.split('\t') for item in tweets_test]
_, _, y_train, text_train = list(zip(*tweets_train))
_, _, y_dev, text_dev = list(zip(*tweets_dev))
_, _, y_test, text_test = list(zip(*tweets_test))


In [22]:
text_train, y_train = list(text_train), list(y_train)
text_dev, y_dev = list(text_dev), list(y_dev)
text_test, y_test = list(text_test), list(y_test)
y_test = ['"' + item + '"' for item in y_test]

In [23]:
#Encode the labels to numbers
from sklearn import preprocessing
label_encode = preprocessing.LabelEncoder()  # 建立模型
y_train = label_encode.fit_transform(y_train)
y_dev = label_encode.transform(y_dev)
y_test = label_encode.transform(y_test)

## Twitter Data Preprocessing

In [28]:
from sents_handler import generate_char_samples
import numpy as np

train_gs = generate_char_samples(np.array(text_train), np.array(y_train), vocab_file, 50, True)
#sent_vecs, sent_labels, lengths = train_gs.generate(32)

Get the contextualized representation.

In [55]:
sess_lm = tf.Session(graph=graph)
saver.restore(sess_lm, conf.ckpt_file)
def sent2vec(inputs, sess):
    '''Get word representations'''
    #Get the contextualized representation
    #train_gs = generate_samples(np.array(text_train), np.array(y_train), word_to_idx, 20, False)
    #sent_vecs, sent_labels, lengths = train_gs.generate(32)
    out_layer = sess.run(model.hidden_layers, feed_dict={model.X:inputs})
    return out_layer
    

INFO:tensorflow:Restoring parameters from ckpt_char_gated_cnn/vocab267743_embed128_filters64_batch64_layers3_block3_fdim5/model.ckpt


In [64]:
sent_vecs, sent_labels, lengths = train_gs.generate(64)
def sent_emb_padding(sent_vecs):
    shape = sent_vecs.shape
    sent_char_matrix = np.zeros([64, 50, 50])
    if shape[1] < 50:
        sent_char_matrix[:, :shape[1], :] = sent_vecs[:, :, :]
    else:
        sent_char_matrix = sent_vecs
    return sent_char_matrix    
sent_vecs = sent_emb_padding(sent_vecs)
out_layer = sent2vec(sent_vecs, sess_lm)

In [62]:
sent_vecs.shape

(64, 50, 50)

In [48]:
max_word_len = out_layer.shape[1]
from sentiment_analysis import CNN_Model_Pretrained_Emb
class trainConfig:
    max_doc_len = max_word_len
    label_size = 2
    embed_size = 350
    hidden_size = 250
    batch_size = 64
    layer_size = 2
    
class testConfig:
    max_doc_len = max_word_len
    label_size = 2
    embed_size = 350
    hidden_size = 250
    batch_size = 64
    layer_size = 2
    
class singleConfig:
    max_doc_len = max_word_len
    label_size = 2
    embed_size = 350
    hidden_size = 250#hidden size for hidden state of rnn
    batch_size = 1
    layer_size = 2


In [74]:
import tensorflow as tf
graph_cnn = tf.Graph()
#Create models for training and testing data
with graph_cnn.as_default():
    initializer = tf.random_uniform_initializer(-0.01, 0.01)
    with tf.name_scope('train'):
        #Set different models for different buckets
        with tf.variable_scope("Model", reuse=None, initializer=initializer):
            train_model = CNN_Model_Pretrained_Emb(trainConfig)
            saver_sent=tf.train.Saver()
    with tf.name_scope('test'):
        #Set different models for different buckets
        with tf.variable_scope("Model", reuse=True, initializer=initializer):
            test_model = CNN_Model_Pretrained_Emb(testConfig, False)
            single_model = CNN_Model_Pretrained_Emb(singleConfig, False)



Model Initialized!
Model Initialized!
Model Initialized!


In [59]:
train_chunk_num  = int(len(text_train)/trainConfig.batch_size)
test_chunk_num = int(len(text_test)/testConfig.batch_size)
remain_num = len(text_test) - testConfig.batch_size*test_chunk_num
remain_num

21

In [None]:
import time, os
epochs = 5
#train_chunk_num = 10
file = "ckpt_cnn_pretrained_emb/cnn.ckpt"
with tf.Session(graph=graph_cnn) as sess:
    #Initialize parameters
    init = tf.global_variables_initializer()
    if not os.path.exists("ckpt_cnn_pretrained_emb"):
        os.mkdir('ckpt_cnn_pretrained_emb')
    if os.path.exists("ckpt_cnn_pretrained_emb/cnn.ckpt.index"):
        saver_sent.restore(sess, file)
    else:
        sess.run(init)
    start_time = time.time()
    for m in range(epochs):
        for i in range(train_chunk_num):
            #sess.run(tf.assign(learning_rate, 0.002*((0.98)**m)))
            x, y, lengths = train_gs.generate(trainConfig.batch_size)
            x1 = sent_emb_padding(x)
            x2 = sent2vec(x1, sess_lm)
            feed_dict = {train_model.x:x2, train_model.y:y, train_model.lengths:lengths}
            l, _ = sess.run([train_model.cost, train_model.optimize], feed_dict=feed_dict)
            if i%100 == 0:
                print('Loss:', round(l, 4))
        end_time = time.time()
        print('Epoch', m, 'time:{:.2f}'.format(end_time - start_time))
        if m%2 == 0:
            saver_sent.save(sess,'ckpt_cnn_pretrained_emb/cnn.ckpt')

INFO:tensorflow:Restoring parameters from ckpt_cnn_pretrained_emb/cnn.ckpt
Loss: 0.6661
Epoch 0 time:21.20
Loss: 0.5504
Epoch 1 time:42.21
Loss: 0.5348


In [72]:
#Calculate Testing Accuracy
with tf.Session(graph=graph_cnn) as sess:
    print('Testing...')
    count = 0
    #saver = tf.train.import_meta_graph('ckpt_cnn/cnn.ckpt.meta')
    saver_sent.restore(sess,tf.train.latest_checkpoint('ckpt_cnn_pretrained_emb/'))
    print('Parameters restored')
    start_time = time.time()
    test_gs = generate_char_samples(np.array(text_test), np.array(y_test),vocab_file, 20, False)
    for _ in range(test_chunk_num):
        #Traverse each data
        x, y, lengths = test_gs.generate(testConfig.batch_size)
        x = sent_emb_padding(x)
        x = sent2vec(x, sess_lm)
        feed_dict = {test_model.x:x, test_model.y:y, test_model.lengths:lengths}
        n = sess.run(test_model.correct_num, feed_dict=feed_dict)
        count += np.sum(n)
    for i in range(remain_num):
        #Traverse each data
        x, y, lengths = test_gs.generate(1)
        x = sent_emb_padding(x)
        #print(i, len(x))
        x = sent2vec(x, sess_lm)
        feed_dict = {single_model.x:x, single_model.y:y, 
                     single_model.lengths:lengths}
        n = sess.run(single_model.correct_num, feed_dict=feed_dict)
        count += np.sum(n)
    end_time = time.time()
    print('Testing Time:{:.2f}'.format(end_time - start_time))
    print(count*1.0/len(text_test))

Testing...
INFO:tensorflow:Restoring parameters from ckpt_cnn_pretrained_emb/cnn.ckpt
Parameters restored


ValueError: Cannot feed value of shape (64, 50, 350) for Tensor 'test/Model/Placeholder_3:0', which has shape '(1, 50, 350)'

In [73]:
print(count*1.0/len(text_test))

0.7260057016154577


In [23]:
sess_lm.close()