### Get sentence features from a pretrained model

In [1]:
import os,re
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
print ("Packages loaded.")

Packages loaded.


### Get dataset

In [2]:
# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
    """Read every review file in ``directory`` into a DataFrame.

    File names are expected to look like ``<id>_<rating>.txt``. Entries whose
    name does not start with that pattern (e.g. stray hidden files) are
    skipped instead of crashing the whole load.

    Args:
        directory: Path of a local directory containing the review files.

    Returns:
        A ``pandas.DataFrame`` with two columns:
        ``sentence`` (full text of each file) and ``sentiment`` (the rating
        digits captured from the file name, kept as a string).
    """
    # Raw string so that ``\d`` and ``\.`` reach the regex engine untouched.
    name_pattern = re.compile(r"\d+_(\d+)\.txt")
    data = {"sentence": [], "sentiment": []}
    # os.listdir order is arbitrary; sorting makes the row order deterministic.
    for file_name in sorted(os.listdir(directory)):
        match = name_pattern.match(file_name)
        if match is None:
            continue
        # The directory is enumerated with os.listdir, so it is always local;
        # built-in open() is sufficient. newline="" keeps line endings as-is.
        with open(os.path.join(directory, file_name), "r",
                  encoding="utf-8", newline="") as f:
            data["sentence"].append(f.read())
        data["sentiment"].append(match.group(1))
    return pd.DataFrame.from_dict(data)

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory, random_state=None):
    """Load the ``pos`` and ``neg`` sub-directories into one shuffled DataFrame.

    Args:
        directory: Directory that contains ``pos`` and ``neg`` sub-directories.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. The default ``None`` keeps the
            original behaviour (a different order on every call).

    Returns:
        A DataFrame with the columns produced by ``load_directory_data`` plus
        ``polarity`` (1 for rows from ``pos``, 0 for rows from ``neg``), rows
        shuffled and the index reset to 0..n-1.
    """
    pos_df = load_directory_data(os.path.join(directory, "pos")).assign(polarity=1)
    neg_df = load_directory_data(os.path.join(directory, "neg")).assign(polarity=0)
    merged = pd.concat([pos_df, neg_df])
    # sample(frac=1) returns every row in random order, i.e. a shuffle.
    shuffled = merged.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)

# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
    """Fetch the aclImdb archive and load its train and test splits.

    The archive is obtained (and extracted) through
    ``tf.keras.utils.get_file``; the ``aclImdb/train`` and ``aclImdb/test``
    folders located next to the returned archive path are then loaded with
    ``load_dataset``.

    Args:
        force_download: Accepted for interface compatibility; the body does
            not read it.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames.
    """
    archive_path = tf.keras.utils.get_file(
        fname="aclImdb.tar.gz",
        origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
        extract=True)

    # The extracted corpus sits beside the downloaded archive.
    corpus_root = os.path.join(os.path.dirname(archive_path), "aclImdb")
    frames = [load_dataset(os.path.join(corpus_root, split))
              for split in ("train", "test")]
    return frames[0], frames[1]

# Reduce logging output: set TensorFlow's log verbosity to ERROR.
tf.logging.set_verbosity(tf.logging.ERROR)
# Build the train/test DataFrames via download_and_load_datasets().
train_df, test_df = download_and_load_datasets()
# Last expression of the cell: display the first rows of the training frame.
train_df.head()

Unnamed: 0,sentence,sentiment,polarity
0,"My Take: A goofy, yet imaginative mess. <br />...",7,1
1,The first film had little ambition so nothing ...,10,1
2,I have to finish watching a movie once I start...,1,0
3,Omen IV (1991) was a bad made-for-T.V. movie. ...,1,0
4,I had watched this film from Ralph Bakshi (Wiz...,10,1


### Check dataset

In [3]:
# Peek at the first two rows: print both sentences, then both polarity labels.
for column in ('sentence', 'polarity'):
    for i in range(2):
        print (i,':',train_df[column][i])

0 : My Take: A goofy, yet imaginative mess. <br /><br />Keanu Reeves (Yes! That Keanu Reeves) and Alex Winter return as the two punk-rock idiots in this sequel to the time-trotting adventure comedy BILL AND TED'S EXCELLENT ADVENTURE, now a cult classic. In this sequel, Bill and Ted are given much more to do than travel through time. They might as well travel Heaven and Hell too! During the beginning of this sequel, Bill and Ted are preparing for a "Battle of the Bands" competition which may make them more famous than ever. Meanwhile, many years in a futuristic civilization, the time-wizard from the first film (the always watchable George Carlin) is running a university praising Bill and Ted's names. There, an evil tyrant (Joss Ackland, from THE HUNT FOR RED October) plots to get rid of the two idiot rock-stars once and for all. So he sends two identical android replicas (In the words of Bill and Ted: Robot Us's) to do the dirty job.<br /><br />There after, Bill and Ted experience death

### Get feature

In [4]:
# Load the pretrained sentence-embedding module from TF Hub.
embed_module = hub.Module("https://tfhub.dev/google/nnlm-en-dim128/1")
# Build the embedding ops; reshape to [-1] flattens each sentence column
# into a 1-D batch before it is passed to the module.
embed_train = embed_module(tf.reshape(train_df["sentence"], shape=[-1]))
embed_test = embed_module(tf.reshape(test_df["sentence"], shape=[-1]))
# Evaluate both ops inside a session to obtain the embeddings
# (the recorded output shows shape (25000, 128) for each split).
with tf.train.MonitoredTrainingSession(is_chief=True) as sess:
    x_train = sess.run(embed_train)
    x_test = sess.run(embed_test)
n_train,n_test = np.shape(x_train)[0],np.shape(x_test)[0]
# One-hot encode the binary polarity labels into (n, 2) arrays:
# column 0 is set for polarity 0, column 1 for polarity 1.
y_train,y_test = np.zeros((n_train,2)),np.zeros((n_test,2))
# Integer-array indexing sets exactly one entry per row: (row i, polarity[i]).
# The previous loop always read polarity[0] and assigned a whole row
# (y[polarity[0]] = 1), so the labels did not follow the data at all.
y_train[np.arange(n_train), train_df['polarity'].values] = 1
y_test[np.arange(n_test), test_df['polarity'].values] = 1
# Sanity check: every row must contain exactly one 1.
assert (y_train.sum(axis=1) == 1).all() and (y_test.sum(axis=1) == 1).all()
print("Shapes of 'x_train' and 'x_test' are %s and %s."%
      (x_train.shape,x_test.shape)) # (n_samples, embedding_dim)
print("Shapes of 'y_train' and 'y_test' are %s and %s."%
      (y_train.shape,y_test.shape)) # (n_samples, 2)

Shapes of 'x_train' and 'x_test' are (25000, 128) and (25000, 128).
Shapes of 'y_train' and 'y_test' are (25000, 2) and (25000, 2).
