In [1]:
import os
import pandas as pd
from collections import Counter

In [6]:
data = pd.read_csv('Datasets/imdb_master_train.csv', encoding="latin-1")
print("Shape of training data = ", data.shape)
data.sample(10)

Shape of training data =  (25000, 5)


Unnamed: 0.1,Unnamed: 0,type,review,sentiment,Processed_Reviews
7169,32169,train,Contains spoilers The movie plot can be summar...,0,contains spoiler the movie plot can be summari...
4032,29032,train,After being hugely entertained by Mr. Brosnan'...,0,after being hugely entertained by mr brosnan p...
12919,37919,train,This film is an hour or so of good entertainme...,1,this film is an hour or so of good entertainme...
18075,43075,train,"Oh, come on, learn to have a little fun. When ...",1,oh come on learn to have little fun when wa ki...
11821,36821,train,The number of goofs in this episode was higher...,0,the number of goof in this episode wa higher t...
7791,32791,train,"<br /><br />""step aside for hollywood veterans...",0,br br step aside for hollywood veteran the way...
13501,38501,train,This is probably the best documentary I have s...,1,this is probably the best documentary have see...
3386,28386,train,"Skippy from ""Family Ties"" plays Eddie, a wussy...",0,skippy from family tie play eddie wussy metal ...
2455,27455,train,"She may have an Oscar and a Golden Globe, but ...",0,she may have an oscar and golden globe but thi...
9810,34810,train,This was an awful movie. Basically Jane March ...,0,this wa an awful movie basically jane march wa...


In [8]:
from sklearn.model_selection import train_test_split

data_train, data_val = train_test_split(data, test_size=0.2)

In [4]:
#Adding white space separated full stop to each sentence in data. There are 25K sentences in train.csv here.
data_train['Processed_Reviews'] = data_train['Processed_Reviews'] + " ."
data_train['Processed_Reviews'].head()

0    story of man who ha unnatural feeling for pig ...
1    airport 77 start a brand new luxury 747 plane ...
2    this film lacked something couldn put my finge...
3    sorry everyone know this is supposed to be an ...
4    when wa little my parent took me along to the ...
Name: Processed_Reviews, dtype: object

In [9]:
# As the training requires multiple files with one text sentence per line, 
# we will create 4K training files by writing 6 sentences per file. 
# After running the below python snippet, we get 4K files in train directory.

if not os.path.exists("imdb/train"):
    os.makedirs("imdb/train")
 
for i in range(0,data_train.shape[0],6):
    text = "\n".join(data_train['Processed_Reviews'][i:i+6].tolist())
    fp = open("imdb/train/"+str(i)+".txt","w")
    fp.write(text)
    fp.close()

In [12]:
# Validation data is also prepared in the similar manner as training data.
data_val['Processed_Reviews'] = data_val['Processed_Reviews'] + " ."
if not os.path.exists("imdb/dev"):
    os.makedirs("imdb/dev")
 
for i in range(0,data_val.shape[0],6):
    text = "\n".join(data_val['Processed_Reviews'][i:i+6].tolist())
    fp = open("imdb/dev/"+str(i)+".txt","w")
    fp.write(text)
    fp.close()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


## Preparing Vocabulary File
The vocabulary file is a a text file with one token per line. It must also include the special tokens <S>, </S> and <UNK> (case sensitive) in the file. The vocabulary file should be sorted in descending order by token count in your training data. The first three lines should be the special tokens (<S>, </S> and <UNK>), then the most common token in the training data, ending with the least common token.

In [13]:
texts = " ".join(data_train['Processed_Reviews'].tolist())
words = texts.split(" ")
print("Number of tokens in Training data = ",len(words))
dictionary = Counter(words)
print("Size of Vocab",len(dictionary))
sorted_vocab = ["<S>","</S>","<UNK>"]
sorted_vocab.extend([pair[0] for pair in dictionary.most_common()])
 
text = "\n".join(sorted_vocab)
fp = open("imdb/vocab.txt","w")
fp.write(text)
fp.close()

Number of tokens in Training data =  4690217
Size of Vocab 66146


## Train the biLM model
We are ready to train our custom biLM model now.`

In [1]:
# !python bin/train_elmo.py --train_prefix='imdb/train/*' --vocab_file 'imdb/vocab.txt' --save_dir 'imdb/checkpoint'

In [2]:
# python bin/run_test.py --test_prefix='./imdb/dev/*' --vocab_file './imdb/vocab.txt' --save_dir './imdb/checkpoint'

In [4]:
# python bin/dump_weights.py --save_dir 'imdb/checkpoint' --outfile 'imdb/imdb_weights.hdf5'

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import numpy as np
import scipy.spatial.distance as ds
from bilm import Batcher, BidirectionalLanguageModel, weight_layers
 
# Location of pretrained LM.  Here we use the test fixtures.
datadir = os.path.join('imdb', 'model')
vocab_file = os.path.join(datadir, 'vocab.txt')
options_file = os.path.join(datadir, 'options.json')
weight_file = os.path.join(datadir, 'imdb_weights.hdf5')
 
# Create a Batcher to map text to character ids.
batcher = Batcher(vocab_file, 50)
 
# Input placeholders to the biLM.
context_character_ids = tf.placeholder('int32', shape=(None, None, 50))
 
# Build the biLM graph.
bilm = BidirectionalLanguageModel(options_file, weight_file)
 
# Get ops to compute the LM embeddings.
context_embeddings_op = bilm(context_character_ids)
 
# Get an op to compute ELMo (weighted average of the internal biLM layers)
elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0)
 
# Now we can compute embeddings.
raw_context = ['Technology has advanced so much in new scientific world',
                'My child participated in fancy dress competition',
                'Fashion industry has seen tremendous growth in new designs']
 
tokenized_context = [sentence.split() for sentence in raw_context]
print(tokenized_context)

In [None]:
with tf.Session() as sess:
    # It is necessary to initialize variables once before running inference.
    sess.run(tf.global_variables_initializer())
 
    # Create batches of data.
    context_ids = batcher.batch_sentences(tokenized_context)
    print("Shape of context ids = ", context_ids.shape)
 
    # Compute ELMo representations (here for the input only, for simplicity).
    elmo_context_input_ = sess.run(
        elmo_context_input['weighted_op'],
        feed_dict={context_character_ids: context_ids}
    )
 
print("Shape of generated embeddings = ",elmo_context_input_.shape)

In [None]:
# Computing euclidean distance between words embedding
euc_dist_bet_tech_computer = np.linalg.norm(elmo_context_input_[1,5,:]-elmo_context_input_[0,0,:])
euc_dist_bet_computer_fashion = np.linalg.norm(elmo_context_input_[1,5,:]-elmo_context_input_[2,0,:])
# Computing cosine distance between words embedding
cos_dist_bet_tech_computer = ds.cosine(elmo_context_input_[1,5,:],elmo_context_input_[0,0,:])
cos_dist_bet_computer_fashion = ds.cosine(elmo_context_input_[1,5,:],elmo_context_input_[2,0,:])
 
print("Euclidean Distance Comparison - ")
print("\nDress-Technology = ",np.round(euc_dist_bet_tech_computer,2),"\nDress-Fashion = ",
      np.round(euc_dist_bet_computer_fashion,2))
print("\n\nCosine Distance Comparison - ")
print("\nDress-Technology = ",np.round(cos_dist_bet_tech_computer,2),"\nDress-Fashion = ",
      np.round(cos_dist_bet_computer_fashion,2))