Deep contextualized word representations have drawn wide attention because of their state-of-the-art performance in downstream tasks. Contextualized embeddings can capture not only word-level information but also multi-sense information, thus improving results in sentiment analysis, SQuAD, and other tasks. However, the language models adopted in [ELMo](https://allennlp.org/elmo) are biLSTMs, which contain a huge number of parameters, making it difficult for small labs to train and run such experiments.


In this project, we intend to make use of a CNN language model to learn efficient word representations for sentiment analysis. We train a language model based on the [Gated CNN architecture](https://arxiv.org/abs/1612.08083) proposed by Yann Dauphin et al., then perform sentiment analysis with the embeddings generated by the language model.

The language model is trained on the 1 Billion Word Language Modeling Benchmark dataset.

In [1]:
import tensorflow as tf
import os
import time

from model import *
from data_utils import *
from conf_utils import *

  from ._conv import register_converters as _register_converters


## Initialize the configuration and prepare data batches

In [2]:
class config:
    """Hyper-parameters and paths for the Gated CNN language-model run.

    Used purely as an attribute namespace; the notebook hands the class
    itself to ``prepare_conf``.
    """

    # --- vocabulary / embedding ---
    vocab_size = 2000
    embedding_size = 200

    # --- convolutional stack (exact meaning is defined by GatedCNN) ---
    filter_size = 64
    num_layers = 5
    block_size = 2
    filter_h = 5

    # --- sequence shape ---
    context_size = 20
    text_size = context_size  # starts out equal to the context window

    # --- optimisation ---
    batch_size = 16
    epochs = 5
    num_sampled = 64
    learning_rate = 1e-4
    momentum = 0.99
    grad_clip = 0.1
    num_batches = 0  # placeholder; presumably filled in during data preparation -- confirm

    # --- output locations ---
    ckpt_path = 'ckpt'
    summary_path = 'logs'

    # --- input corpus ---
    # Alternative corpus: "data/texts/reviews/movie_reviews"
    data_dir = (
        "data/1-billion-word-language-modeling-benchmark-r13output/"
        "training-monolingual.tokenized.shuffled"
    )

In [4]:
# Initialize the run configuration from the `config` class above.
# The returned `conf` object is what every later cell reads
# (e.g. conf.text_size, conf.ckpt_file, conf.summary_path).
conf = prepare_conf(config)
# Create the input/target batches for language model training.
# Both results are indexed as [batch][row] further down in the notebook.
x_batches, y_batches = prepare_data(conf)

In [5]:
# Display the text size held by the prepared configuration. The recorded
# output is 22 although config.text_size was set to 20, so prepare_conf
# evidently adjusts it (presumably for two extra boundary tokens -- confirm).
conf.text_size

22

In [7]:
# Display the first row of the first input batch; the recorded output is an
# integer array with 22 entries (presumably vocabulary ids -- confirm).
x_batches[0][0]

array([  1,   1,   2,  67,   0,   6,  57,   0, 611, 134, 650,   7,   0,
        46,  14,   9,   0, 427,  31, 785,   4,   3])

## Train a CNN-based language model

In [9]:
# Create the Gated CNN language model from the prepared configuration.
# Note: the model must be saved, because later tasks restore it from disk.
model = GatedCNN(conf)
# The Saver is created after the model so that the model's variables exist;
# it is limited to the trainable variables, so only those are written to and
# read from the checkpoint.
saver = tf.train.Saver(tf.trainable_variables())
# This message is printed here; the training loop itself is in a later cell.
print("Started Model Training...")

Started Model Training...


In [11]:
# Short demonstration run of language-model training: optionally resume from
# a checkpoint, run a handful of batches, then report perplexity, save the
# model and log summaries.
batch_idx = 0
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    summary_writer = tf.summary.FileWriter(conf.summary_path, graph=sess.graph)

    try:
        # conf.ckpt_file is a checkpoint prefix (".../model.ckpt"). The Saver
        # writes "<prefix>.index" / "<prefix>.data-*" rather than a file with
        # that exact name, so os.path.exists() on the prefix reports False
        # even when a checkpoint is present and the model was never restored.
        # checkpoint_exists() understands the on-disk layout.
        if tf.train.checkpoint_exists(conf.ckpt_file):
            saver.restore(sess, conf.ckpt_file)
            print("Model Restored")

        # Demo-sized run: one epoch of 21 batches. The commented-out line
        # shows the full-epoch variant.
        for i in range(1):
            start = time.time()
            #for j in range(conf.num_batches):
            for j in range(21):
                inputs, labels, batch_idx = get_batch(x_batches, y_batches, batch_idx)
                feed = {model.X: inputs, model.y: labels}
                _, loss_value = sess.run([model.optimizer, model.loss], feed_dict=feed)
                if j % 20 == 0:
                    print(loss_value)
            end = time.time()
            print("Epoch: %.2f, Time: %.2f,  Loss: %.2f"%(i, end-start, loss_value))

            # Perplexity and summaries are evaluated on the last training
            # batch of the epoch, not on held-out data.
            # NOTE(review): the recorded run shows a loss of ~2.5e6 and an
            # infinite perplexity; the loss may be summed rather than
            # averaged -- confirm in the model definition.
            if i % 3 == 0:
                perp = sess.run(model.perplexity, feed_dict=feed)
                print("Perplexity: %.2f"%perp)
                saver.save(sess, conf.ckpt_file)

            summaries = sess.run(model.merged_summary_op, feed_dict=feed)
            summary_writer.add_summary(summaries, i)
    finally:
        # Flush pending events to disk and release the writer even if
        # training is interrupted.
        summary_writer.close()

2581739.2
2589265.0
Epoch: 0.00, Time: 99.48,  Loss: 2589265.00
Perplexity: inf


## Sentiment Analysis

In [12]:
# Reload the saved language-model weights into a fresh session and evaluate
# the model's `out_layer` tensor.
with tf.Session() as sess:
    saver.restore(sess, conf.ckpt_file)
    # Get the contextualized representation.
    # `inputs` is whatever batch the training cell fetched last.
    out_layer = sess.run(model.out_layer, feed_dict={model.X:inputs})
    
    # Unfinished line in the original ("out_layer.resha"); presumably a reshape of out_layer was planned.

INFO:tensorflow:Restoring parameters from ckpt/vocab2000_embed200_filters64_batch16_layers5_block2_fdim5/model.ckpt


In [17]:
# Length used as the document length of the sentiment models: the second
# dimension of the language-model output computed earlier in the notebook.
max_word_len = out_layer.shape[1]
from sentiment_analysis import CNN_Model_Pretrained_Emb


class _SentimentConfig:
    """Settings shared by all sentiment-model configurations.

    The three public configurations previously repeated these values
    verbatim; keeping them in one place prevents them from drifting apart.
    """

    max_doc_len = max_word_len
    label_size = 2
    embed_size = 100
    # Original comment: "hidden size for hidden state of rnn".
    # NOTE(review): the model built from these configs is named as a CNN --
    # confirm what hidden_size controls there.
    hidden_size = 250
    batch_size = 64
    layer_size = 2


class trainConfig(_SentimentConfig):
    """Configuration passed to the training model (shared defaults)."""


class testConfig(_SentimentConfig):
    """Configuration passed to the test model (shared defaults)."""


class singleConfig(_SentimentConfig):
    """Configuration for the single-example model: batch size of one."""

    batch_size = 1


In [18]:
# Build the sentiment-analysis models in a dedicated graph, separate from the
# default graph that holds the language model created earlier.
import tensorflow as tf
graph_cnn = tf.Graph()
# Create the training model first, then the test and single-example models.
with graph_cnn.as_default():
    initializer = tf.random_uniform_initializer(-0.02, 0.02)
    with tf.name_scope('train'):
        # First construction under variable scope "Model" (reuse=None).
        with tf.variable_scope("Model", reuse=None, initializer=initializer):
            train_model = CNN_Model_Pretrained_Emb(trainConfig)
            # NOTE(review): this rebinds `saver`, which earlier cells use for
            # the language-model checkpoint; re-running those cells after
            # this one would pick up this Saver instead.
            saver=tf.train.Saver()
    with tf.name_scope('test'):
        # Same variable scope with reuse=True, so that the models below are
        # meant to share the training model's variables (assuming the model
        # creates them via tf.get_variable -- confirm). The second argument
        # is presumably an is-training flag -- confirm.
        with tf.variable_scope("Model", reuse=True, initializer=initializer):
            test_model = CNN_Model_Pretrained_Emb(testConfig, False)
            single_model = CNN_Model_Pretrained_Emb(singleConfig, False)



Model Initialized!
Model Initialized!
Model Initialized!
