Deep contextualized word representations have drawn wide attention because of their state-of-the-art performance in downstream tasks. Contextualized embeddings can capture not only word-level information but also multi-sense information, thus improving results in sentiment analysis, SQuAD, etc. However, the language models adopted in [ELMo](https://allennlp.org/elmo) are biLSTMs, which contain a huge number of parameters, making it difficult for small labs to train and run such experiments.


In this project, we use a CNN language model to learn efficient word representations for sentiment analysis. We train a language model based on the [Gated CNN architecture](https://arxiv.org/abs/1612.08083) proposed by Yann Dauphin et al., and then perform sentiment analysis with the embeddings generated by the language model.

The language model is trained on the 1 Billion Word Language Modeling Benchmark dataset.

In [1]:
import tensorflow as tf
import os
import time

from model import *
from data_utils import data_helper
from conf_utils import *

  from ._conv import register_converters as _register_converters


## Initialize the configuration and prepare data batches

In [2]:
class config:
    # Settings for the language-model run; this class is handed to
    # prepare_conf() in the next cell to build `conf`.
    # NOTE(review): the checkpoint restored later in this notebook lives under
    # ckpt/vocab2000_embed200_filters64_batch32_layers3_block3_fdim5/, so
    # changing these values presumably changes the checkpoint directory —
    # confirm against prepare_conf.
    vocab_size = 2000
    embedding_size = 200
    filter_size = 64
    num_layers = 3
    block_size = 3
    filter_h = 5
    context_size = 20
    # text_size is tied to context_size, so both are 20 here.
    text_size = context_size
    batch_size = 32
    epochs = 5
    num_sampled = 64
    learning_rate = 0.0001
    momentum = 0.99
    grad_clip = 0.1
    # Placeholder only: conf.num_batches reads 32895 later in this notebook,
    # so the real value is filled in outside this class (presumably while the
    # data is prepared — TODO confirm).
    num_batches = 0
    ckpt_path = 'ckpt'
    summary_path = 'logs'
    #data_dir = "data/texts/reviews/movie_reviews"
    data_dir = "data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled"

In [3]:
#Initialize configuration files
conf = prepare_conf(config)

Please download the data as described in the Requirements section.


In [6]:
#Create data batches for language model training
# prepare_data() returns the input batches, the target batches and the two
# vocabulary mappings; word_to_idx is reused later to index the tweets.
dh = data_helper(conf)
x_batches, y_batches, word_to_idx, idx_to_word = dh.prepare_data()

In [7]:
conf.text_size

20

In [8]:
len(x_batches[0][0])

20

## Train a CNN-based language model

In [4]:
conf.text_size

20

In [4]:
#Create a language model
#Note we need to save the models for subsequent tasks
model = GatedCNN(conf)
# The saver covers trainable variables only; the same saver restores the
# language model in the sentiment-analysis section further down.
saver = tf.train.Saver(tf.trainable_variables())
print("Started Model Training...")

Tensor("Reshape:0", shape=(640, 200), dtype=float32)
Started Model Training...


In [9]:
#from RNN import RNN
#model = RNN(conf)
#saver = tf.train.Saver(tf.trainable_variables())

In [10]:
#labels.shape

In [7]:
conf.num_batches

32895

In [9]:
# Train the language model for conf.epochs passes of conf.num_batches steps.
# batch_idx is the batch cursor: it is passed to dh.get_batch and replaced by
# the value get_batch returns.
batch_idx = 0
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    summary_writer = tf.summary.FileWriter(conf.summary_path, graph=sess.graph)

    #if os.path.exists(conf.ckpt_file):
        #saver.restore(sess, conf.ckpt_file)
        #print("Model Restored")

    try:
        # Plain range() keeps this cell independent of numpy, which is not
        # imported explicitly before this point in the notebook.
        for i in range(conf.epochs):
            start = time.time()
            for j in range(conf.num_batches):
            #for j in range(21):
                inputs, labels, batch_idx = dh.get_batch(x_batches, y_batches, batch_idx)
                _, l = sess.run([model.optimizer, model.loss], feed_dict={model.X:inputs, model.y:labels})
                # Log the loss every 100 steps
                if j%100 == 0:
                    print(l)
            end = time.time()
            # The epoch index is an integer, so print it as one
            print("Epoch: %d, Time: %.2f,  Loss: %.2f"%(i, end-start, l))

            # Every second epoch: report perplexity on the last batch of the
            # epoch and checkpoint the model
            if i % 2 == 0:
                perp = sess.run(model.perplexity, feed_dict={model.X:inputs, model.y:labels})
                print("Perplexity: %.2f"%perp)
                saver.save(sess, conf.ckpt_file)

            # One summary per epoch, evaluated on the last batch of the epoch
            summaries = sess.run(model.merged_summary_op, feed_dict={model.X:inputs, model.y:labels})
            summary_writer.add_summary(summaries, i)
    finally:
        # Flush buffered events and release the log file even when training
        # is interrupted.
        summary_writer.close()

223.95566
209.2966
152.3483
101.426
68.43827
77.2864
66.055534
59.393745
38.960575
45.392616
17.473696
29.895039
21.156122
27.863947
32.53392
23.234015
26.559147
9.5749
6.820913
9.40538
7.2287917
6.42615
4.7389736
5.236864
4.5747757
4.6145086
5.0947075
4.5440392
4.839633
5.118791
5.0830765
4.748791
4.250371
4.3809414
4.370448
4.378999
4.255619
4.187014
4.547824
5.1464524
4.793499
4.245529
4.2263618
4.189144
4.2556214
4.2774653
4.2033205
4.212135
4.310278
4.2548914
4.120658
4.1922274
4.4311514
4.186319
4.1670666
4.1775374
4.1077495
4.038469
4.2280746
4.210022
4.207857
4.2384706
4.1314454
4.170774
3.9936473
4.1679344
4.1079607
4.0399995
4.2991195
4.1547203
4.1683517
4.1830072
4.222092
4.1393027
4.2010527
4.1076794
4.0903864
3.9797409
4.1051283
4.1165414
4.0133224
4.075248
3.8975117
3.990652
4.3057275
4.000367
4.1179743
4.2144375
4.044059
4.13034
4.0318656
3.9818788
3.969685
3.9911742
4.126415
4.097704
4.0329046
3.9665055
3.9559295
4.1285086
4.007395
4.0539637
4.070298
3.8510563
4.1116247

3.61334
3.643412
3.742365
3.6269886
3.6102433
3.5786102
3.5949948
3.6666284
3.6070457
3.5942636
3.6784482
3.6857471
3.62155
3.6090817
3.6326137
3.6653435
3.5948634
3.5928218
3.5155098
3.5341084
3.7249234
3.8118472
3.4735093
3.6714077
3.7636433
3.7590127
3.6077123
3.724278
3.6936889
3.5271218
3.5646882
3.7233834
3.5090365
3.6663532
3.582542
3.594579
3.5936196
3.6765742
3.623345
3.5867755
3.558548
3.6308064
3.5472329
3.7154994
3.6727226
3.6493888
3.7159762
3.5534186
3.6026845
3.69044
3.6279721
3.614453
3.6128547
3.575107
3.7331283
3.6972938
3.6730857
3.6319084
3.6509414
3.7335098
3.6913924
3.6239052
3.544989
3.5987067
3.651792
3.660118
3.6290488
3.6209805
3.7093391
3.6154819
3.5838382
3.646777
3.5448864
3.6452394
3.509856
3.6227226
3.648582
3.7298806
3.6311371
3.5810673
3.5327764
3.6230507
3.7241974
3.5833907
3.677082
3.6396255
3.7697308
3.6717458
3.5915933
3.7259097
3.6938825
3.7029343
3.6557014
3.587111
3.5233421
3.5504947
3.5925224
3.6441612
3.5414085
3.6674962
3.6699517
3.7582583
3.7

## Train an RNN model

## Sentiment Analysis

In this part, we use a separate dataset for sentiment analysis, such as the one from SemEval-2013.

In [1]:
#Use spacy tokenize sentences into words
#!pip install spacy
#!python -m spacy download en
#import spacy
#nlp = spacy.load('en')
#def sent_split(sent):
    #words = []
    #sent = nlp(sent.strip())
    #for w in sent:
        #words.append(w.text.lower())
    #return words

In [10]:
# Paths of the three SemEval splits
file_train = 'data/semeval/downloaded.tsv'
file_dev = 'data/semeval/dev_downloaded.tsv'
file_test = 'data/semeval/test.txt'
# Read every split as a list of raw lines (trailing newlines are kept).
# The encoding is pinned so decoding of the tweet text does not depend on the
# platform default (assumes the files are UTF-8 — TODO confirm).
with open(file_train, encoding='utf-8') as f:
    tweets_train = f.readlines()
with open(file_dev, encoding='utf-8') as f:
    tweets_dev = f.readlines()
with open(file_test, encoding='utf-8') as f:
    tweets_test = f.readlines()
    


#Filter empty tweets
def is_available(text):
    '''
    Tell whether a raw tweet line should be kept.

    A line is rejected when it contains the text 'Not Available' or a
    tab followed by an objective/neutral label, with or without an
    opening double quote. Every other line is accepted.
    '''
    rejected_markers = (
        'Not Available',
        '\t"objective',
        '\t"neutral',
        '\tobjective',
        '\tneutral',
    )
    return not any(marker in text for marker in rejected_markers)


In [11]:
# Keep only the lines accepted by is_available in each split
tweets_train = [line for line in tweets_train if is_available(line)]
tweets_dev = [line for line in tweets_dev if is_available(line)]
tweets_test = [line for line in tweets_test if is_available(line)]

In [12]:
# Break every line into its tab-separated fields
tweets_train = [line.split('\t') for line in tweets_train]
tweets_dev = [line.split('\t') for line in tweets_dev]
tweets_test = [line.split('\t') for line in tweets_test]
# Transpose rows into columns: the first two columns are discarded, the third
# holds the labels and the fourth the tweet text
_, _, y_train, text_train = zip(*tweets_train)
_, _, y_dev, text_dev = zip(*tweets_dev)
_, _, y_test, text_test = zip(*tweets_test)


In [13]:
# Turn the column tuples into lists
text_train = list(text_train)
y_train = list(y_train)
text_dev = list(text_dev)
y_dev = list(y_dev)
text_test = list(text_test)
y_test = list(y_test)
# Wrap each test label in double quotes
y_test = ['"{}"'.format(label) for label in y_test]

In [14]:
#Encode the labels to numbers
from sklearn import preprocessing
label_encode = preprocessing.LabelEncoder()  # 建立模型
y_train = label_encode.fit_transform(y_train)
y_dev = label_encode.transform(y_dev)
y_test = label_encode.transform(y_test)

## Twitter Data Preprocessing

In [45]:
from sents_handler import generate_samples
import numpy as np

# Sampler over the training tweets; going by the generate_samples signature
# shown at the end of this notebook, 20 is max_sent_len and True is
# is_training.
train_gs = generate_samples(np.array(text_train), np.array(y_train), word_to_idx, 20, True)
#sent_vecs, sent_labels, lengths = train_gs.generate(32)

In [46]:
# Dedicated session for the language model, loaded from the checkpoint that
# the language-model training cell saved to conf.ckpt_file.
sess_lm = tf.Session()
saver.restore(sess_lm, conf.ckpt_file)
def sent2vec(inputs, sess):
    '''
    Get word representations from the language model

    Args:
    inputs: batch fed to the model.X placeholder
    sess: session in which model.out_layer is evaluated

    Returns the evaluated model.out_layer for that batch.
    '''
    return sess.run(model.out_layer, feed_dict={model.X:inputs})
    

INFO:tensorflow:Restoring parameters from ckpt/vocab2000_embed200_filters64_batch32_layers3_block3_fdim5/model.ckpt


In [47]:
# Draw one batch of 32 tweets and run it through the language model; the
# result's shape is used in a later cell to size the sentiment model.
sent_vecs, sent_labels, lengths = train_gs.generate(32)
out_layer = sent2vec(sent_vecs, sess_lm)

In [23]:
# NOTE: the sentiment training and testing cells below still call
# sent2vec(x, sess_lm); re-create and restore sess_lm before running them
# if this cell has been executed.
sess_lm.close()

In [36]:
# Second axis of the language-model output; used as max_doc_len below.
max_word_len = out_layer.shape[1]
from sentiment_analysis import CNN_Model_Pretrained_Emb
# Settings for the classifier that is optimised in the training cell.
class trainConfig:
    max_doc_len = max_word_len
    label_size = 2
    embed_size = 200
    hidden_size = 250
    batch_size = 32
    layer_size = 2
    
# Same values as trainConfig; used for the batched evaluation model.
class testConfig:
    max_doc_len = max_word_len
    label_size = 2
    embed_size = 200
    hidden_size = 250
    batch_size = 32
    layer_size = 2
    
# Same values again except batch_size = 1, for the model that is fed one
# example at a time.
class singleConfig:
    max_doc_len = max_word_len
    label_size = 2
    embed_size = 200
    hidden_size = 250#hidden size for hidden state of rnn
    batch_size = 1
    layer_size = 2


In [37]:
import tensorflow as tf
# The sentiment classifier gets its own graph, separate from the default graph
# in which the language model was created.
graph_cnn = tf.Graph()
#Create models for training and testing data
with graph_cnn.as_default():
    initializer = tf.random_uniform_initializer(-0.02, 0.02)
    with tf.name_scope('train'):
        #Set different models for different buckets
        # reuse=None: presumably the "Model" variables are created by this
        # first instantiation — TODO confirm against CNN_Model_Pretrained_Emb.
        with tf.variable_scope("Model", reuse=None, initializer=initializer):
            train_model = CNN_Model_Pretrained_Emb(trainConfig)
            # Saver used by the later cells to checkpoint and restore the
            # classifier.
            saver_sent=tf.train.Saver()
    with tf.name_scope('test'):
        #Set different models for different buckets
        # reuse=True: both evaluation models are built in the same "Model"
        # scope so they can share variables with train_model. The second
        # argument (False) is presumably an is-training flag — TODO confirm.
        with tf.variable_scope("Model", reuse=True, initializer=initializer):
            test_model = CNN_Model_Pretrained_Emb(testConfig, False)
            single_model = CNN_Model_Pretrained_Emb(singleConfig, False)



Model Initialized!
Model Initialized!
Model Initialized!


In [38]:
# Number of full batches in each split (integer division drops the partial
# batch at the end).
train_chunk_num = len(text_train) // trainConfig.batch_size
test_chunk_num = len(text_test) // testConfig.batch_size
# Test examples left over after the full test batches. This must use the test
# batch size, the same one used for test_chunk_num above.
remain_num = len(text_test) - testConfig.batch_size*test_chunk_num
remain_num

21

In [49]:
import os
import time
epochs = 2
#train_chunk_num = 10
# Single source of truth for the classifier checkpoint location
file = "ckpt_cnn_pretrained_emb/cnn.ckpt"
with tf.Session(graph=graph_cnn) as sess:
    #Initialize parameters
    init = tf.global_variables_initializer()
    # exist_ok avoids the check-then-create race of exists() + mkdir()
    os.makedirs(os.path.dirname(file), exist_ok=True)
    # Resume from an earlier checkpoint when one exists, otherwise start from
    # freshly initialised variables
    if os.path.exists(file + ".index"):
        saver_sent.restore(sess, file)
    else:
        sess.run(init)
    start_time = time.time()
    for m in range(epochs):
        for i in range(train_chunk_num):
            #sess.run(tf.assign(learning_rate, 0.002*((0.98)**m)))
            x, y, lengths = train_gs.generate(trainConfig.batch_size)
            # Replace word indices by the language model's representations
            x = sent2vec(x, sess_lm)
            feed_dict = {train_model.x:x, train_model.y:y, train_model.lengths:lengths}
            l, _ = sess.run([train_model.cost, train_model.optimize], feed_dict=feed_dict)
            if i%100 == 0:
                print('Loss:', round(l, 4))
        end_time = time.time()
        # The reported time is cumulative since training started, not per epoch
        print('Epoch', m, 'time:{:.2f}'.format(end_time - start_time))
        
    saver_sent.save(sess, file)

Loss: 0.7179
Loss: 0.5585
Epoch 0 time:20.26
Loss: 0.5651
Loss: 0.6671
Epoch 1 time:40.22


In [55]:
#Calculate Testing Accuracy
with tf.Session(graph=graph_cnn) as sess:
    print('Testing...')
    count = 0
    #saver = tf.train.import_meta_graph('ckpt_cnn/cnn.ckpt.meta')
    saver_sent.restore(sess,tf.train.latest_checkpoint('ckpt_cnn_pretrained_emb/'))
    print('Parameters restored')
    start_time = time.time()
    test_gs = generate_samples(np.array(text_test), np.array(y_test),word_to_idx, 20, False)
    for _ in range(test_chunk_num):
        #Traverse each data
        x, y, lengths = test_gs.generate(testConfig.batch_size)
        x = sent2vec(x, sess_lm)
        feed_dict = {test_model.x:x, test_model.y:y, test_model.lengths:lengths}
        n = sess.run(test_model.correct_num, feed_dict=feed_dict)
        count += np.sum(n)
    for _ in range(remain_num):
        #Traverse each data
        x, y, lengths = test_gs.generate(1)
        x = sent2vec(x, sess_lm)
        feed_dict = {single_model.x:x, single_model.y:y, 
                     single_model.lengths:lengths}
        n = sess.run(single_model.correct_num, feed_dict=feed_dict)
        count += np.sum(n)
    end_time = time.time()
    print('Testing Time:{:.2f}'.format(end_time - start_time))
    print(count*1.0/len(test_processed))

Testing...
INFO:tensorflow:Restoring parameters from ckpt_cnn_pretrained_emb/cnn.ckpt
Parameters restored


ValueError: Cannot feed value of shape (1, 20) for Tensor 'X:0', which has shape '(32, 20)'

In [57]:
count*1.0/test_chunk_num/32

0.7308673469387755

In [89]:
import numpy as np
import spacy
nlp = spacy.load('en')
class generate_samples:
    '''
    Generate samples of training data or testing data for data analysis

    In training mode every batch is a random draw (with replacement) from
    the data; otherwise batches are consecutive slices that start over at
    the beginning once the end of the data has been reached.
    '''
    def __init__(self, data, labels, word_to_idx, max_sent_len=20, is_training=True):
        '''
        Args:
        data: numpy
        labels: numpy
        word_to_idx: mapping from token to index; must contain '<unk>'
            and '<pad>' when unknown words or short sentences occur
        max_sent_len: length every sentence is cut or padded to
        is_training: random batches when True, sequential batches otherwise
        '''
        self.data = data
        self.labels = labels
        self.word_to_idx = word_to_idx
        self.is_training = is_training
        self.max_sent_len = max_sent_len
        # Cursor of the next sequential batch (only used when not training)
        self.index = 0

    def sent_split(self, sent):
        '''
        Split a sentence into lower-cased tokens
        '''
        return [token.text.lower() for token in nlp(sent.strip())]

    def generate_samples(self, sents, labels, batch_size=64):
        '''
        Select a batch_size of sentences
        Transform each sentence into a sequence of idx

        Returns (sent_vecs, labels, sent_lens) for the drawn sentences.
        '''
        # Draw with replacement, so a sentence may appear twice in a batch
        indice = np.random.choice(len(sents), batch_size)
        sents = sents[indice]
        labels = labels[indice]
        sent_vecs, sent_lens = self.create_sent_idx(sents)
        return sent_vecs, labels, sent_lens

    def create_sent_idx(self, sents):
        '''
        Map sents into idx

        Returns a tuple of index sequences and a tuple of sentence lengths;
        both are empty when sents is empty.
        '''
        pairs = [self.sent2idx(sent) for sent in sents]
        if not pairs:
            # zip(*[]) yields nothing to unpack, so handle the empty batch here
            return (), ()
        sents_idx, sents_lens = zip(*pairs)
        return sents_idx, sents_lens

    def sent2idx(self, sent):
        '''
        Map a sentence into a sequence of idx

        Returns (sent_idx, lens): exactly max_sent_len indices and the
        number of real (non-padding) tokens among them.
        '''
        ##Cut long sentences
        words = self.sent_split(str(sent))[:self.max_sent_len]
        lens = len(words)
        sent_idx = []
        for w in words:
            idx = self.word_to_idx.get(w)
            # Compare against None: index 0 is a valid vocabulary entry and
            # must not be mistaken for an unknown word
            if idx is None:
                idx = self.word_to_idx['<unk>']
            sent_idx.append(idx)
        ###Pad short sentences
        missing = self.max_sent_len - lens
        if missing > 0:
            sent_idx.extend([self.word_to_idx['<pad>']] * missing)
        return sent_idx, lens

    def generate(self, batch_size=64):
        '''
        Return the next batch as (sent_vecs, sent_labels, lengths)

        The final sequential batch may hold fewer than batch_size items.
        '''
        if self.is_training:
            return self.generate_samples(self.data, self.labels, batch_size)

        start = self.index
        end = start + batch_size
        total = len(self.data)
        if end > total:
            print('Out of sample size')
        sents = self.data[start:end]
        sent_labels = self.labels[start:end]
        sent_vecs, lengths = self.create_sent_idx(sents)
        # Start over once the data is exhausted; the reset has to happen
        # after the slice so it is not overwritten by the cursor update
        self.index = end if end < total else 0
        return sent_vecs, sent_labels, lengths