# Data preprocessing
    - Download data to the server
    - Convert text to sequences.
    - Configure sequences for a RNN model.

## Download data to the server

### Command line in the server
    Path to data:
        cd /home/ubuntu/data/training/text/sentiment
    Download dataset: 
        wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
    Uncompress it:
        tar -zxvf aclImdb_v1.tar.gz

## Convert text to sequences
    - List of all text files
    - Read files into python
    - Tokenize
    - Create dictionaries to recode
    - Recode tokens into ids and create sentences

In [1]:
#Imports and paths
from __future__ import print_function

import numpy as np

# GPU path
#data_path='/home/ubuntu/data/training/text/sentiment/aclImdb/'

data_path='../../data/aclImdb/'


In [2]:
# Generator of list of files in a folder and subfolders
import os
import shutil
import fnmatch

def gen_find(filepattern, toppath):
    '''
    Generator with a recursive list of files in the toppath that match filepattern 
    Inputs:
        filepattern(str): Command stype pattern 
        toppath(str): Root path
    '''
    for path, dirlist, filelist in os.walk(toppath):
        for name in fnmatch.filter(filelist, filepattern):
            yield os.path.join(path, name)

#Test
print(next(gen_find("*.txt", data_path+'train/pos/')))

In [3]:
def read_sentences(path):
    sentences = []
    sentences_list = gen_find("*.txt", path)
    for ff in sentences_list:
        with open(ff, 'r', encoding='utf8') as f:
            sentences.append(f.readline().strip())
    return sentences        

#Test
print(read_sentences(data_path+'train/pos/')[0:2])

['For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. Imagine a movie where Joe Piscopo is actually funny! Maureen Stapleton is a scene stealer. The Moroni character is an absolute scream. Watch for Alan "The Skipper" Hale jr. as a police Sgt.', 'Bizarre horror movie filled with famous faces but stolen by Cristina Raines (later of TV\'s "Flamingo Road") as a pretty but somewhat unstable model with a gummy smile who is slated to pay for her attempted suicides by guarding the Gateway to Hell! The scenes with Raines modeling are very well captured, the mood music is perfect, Deborah Raffin is charming as Cristina\'s pal, but when Raines moves into a creepy Brooklyn Heights brownstone (inhabited by a blind priest on the top floor), things really start cooking. The neighbors, including a fantastically wicked Burgess Meredith and kinky couple Sylvia Miles & Beverly D\'Angelo, are a diabolical lot, and Eli Wallach is great fun as a wily police detect

In [4]:
print(read_sentences(data_path+'train/neg/')[0:2])

["Working with one of the best Shakespeare sources, this film manages to be creditable to it's source, whilst still appealing to a wider audience.<br /><br />Branagh steals the film from under Fishburne's nose, and there's a talented cast on good form.", 'Well...tremors I, the original started off in 1990 and i found the movie quite enjoyable to watch. however, they proceeded to make tremors II and III. Trust me, those movies started going downhill right after they finished the first one, i mean, ass blasters??? Now, only God himself is capable of answering the question "why in Gods name would they create another one of these dumpster dives of a movie?" Tremors IV cannot be considered a bad movie, in fact it cannot be even considered an epitome of a bad movie, for it lives up to more than that. As i attempted to sit though it, i noticed that my eyes started to bleed, and i hoped profusely that the little girl from the ring would crawl through the TV and kill me. did they really think t

In [5]:
def tokenize(sentences):
    from nltk import word_tokenize
    print( 'Tokenizing...',)
    tokens = []
    for sentence in sentences:
        tokens += [word_tokenize(sentence)]
    print('Done!')

    return tokens

print(tokenize(read_sentences(data_path+'train/pos/')[0:2]))

Tokenizing...
Done!
[['For', 'a', 'movie', 'that', 'gets', 'no', 'respect', 'there', 'sure', 'are', 'a', 'lot', 'of', 'memorable', 'quotes', 'listed', 'for', 'this', 'gem', '.', 'Imagine', 'a', 'movie', 'where', 'Joe', 'Piscopo', 'is', 'actually', 'funny', '!', 'Maureen', 'Stapleton', 'is', 'a', 'scene', 'stealer', '.', 'The', 'Moroni', 'character', 'is', 'an', 'absolute', 'scream', '.', 'Watch', 'for', 'Alan', '``', 'The', 'Skipper', "''", 'Hale', 'jr.', 'as', 'a', 'police', 'Sgt', '.'], ['Bizarre', 'horror', 'movie', 'filled', 'with', 'famous', 'faces', 'but', 'stolen', 'by', 'Cristina', 'Raines', '(', 'later', 'of', 'TV', "'s", '``', 'Flamingo', 'Road', "''", ')', 'as', 'a', 'pretty', 'but', 'somewhat', 'unstable', 'model', 'with', 'a', 'gummy', 'smile', 'who', 'is', 'slated', 'to', 'pay', 'for', 'her', 'attempted', 'suicides', 'by', 'guarding', 'the', 'Gateway', 'to', 'Hell', '!', 'The', 'scenes', 'with', 'Raines', 'modeling', 'are', 'very', 'well', 'captured', ',', 'the', 'mood', 

In [6]:
sentences_trn_pos = tokenize(read_sentences(data_path+'train/pos/'))
sentences_trn_neg = tokenize(read_sentences(data_path+'train/neg/'))
sentences_trn = sentences_trn_pos + sentences_trn_neg


Tokenizing...
Done!
Tokenizing...
Done!


In [7]:
#create the dictionary to conver words to numbers. Order it with most frequent words first
def build_dict(sentences):
#    from collections import OrderedDict

    '''
    Build dictionary of train words
    Outputs: 
     - Dictionary of word --> word index
     - Dictionary of word --> word count freq
    '''
    print( 'Building dictionary..',)
    wordcount = dict()
    #For each worn in each sentence, cummulate frequency
    for ss in sentences:
        for w in ss:
            if w not in wordcount:
                wordcount[w] = 1
            else:
                wordcount[w] += 1

    counts = list(wordcount.values()) # List of frequencies
    keys = list(wordcount) #List of words
    
    sorted_idx = reversed(np.argsort(counts))
    
    worddict = dict()
    for idx, ss in enumerate(sorted_idx):
        worddict[keys[ss]] = idx+2  # leave 0 and 1 (UNK)
    print( np.sum(counts), ' total words ', len(keys), ' unique words')

    return worddict, wordcount


worddict, wordcount = build_dict(sentences_trn)

print(worddict['the'], wordcount['the'])

Building dictionary..
7056532  total words  134957  unique words
2 289300


In [8]:
# 
def generate_sequence(sentences, dictionary):
    '''
    Convert tokenized text in sequences of integers
    '''
    seqs = [None] * len(sentences)
    for idx, ss in enumerate(sentences):
        seqs[idx] = [dictionary[w] if w in dictionary else 1 for w in ss]

    return seqs

In [9]:
# Create train and test data

#Read train sentences and generate target y
train_x_pos = generate_sequence(sentences_trn_pos, worddict)
train_x_neg = generate_sequence(sentences_trn_neg, worddict)
X_train_full = train_x_pos + train_x_neg
y_train_full = [1] * len(train_x_pos) + [0] * len(train_x_neg)

print(X_train_full[0], y_train_full[0])

[333, 6, 25, 17, 225, 85, 1202, 68, 300, 35, 6, 189, 7, 918, 4813, 3630, 24, 19, 1573, 4, 4274, 6, 25, 141, 902, 16036, 9, 183, 182, 41, 7659, 16758, 9, 6, 157, 21658, 4, 21, 37442, 123, 9, 46, 1632, 3211, 4, 1231, 24, 1541, 32, 21, 24735, 31, 9450, 47112, 22, 6, 679, 7545, 4] 1


In [10]:
#Read test sentences and generate target y
sentences_tst_pos = read_sentences(data_path+'test/pos/')
sentences_tst_neg = read_sentences(data_path+'test/neg/')

test_x_pos = generate_sequence(tokenize(sentences_tst_pos), worddict)
test_x_neg = generate_sequence(tokenize(sentences_tst_neg), worddict)
X_test_full = test_x_pos + test_x_neg
y_test_full = [1] * len(test_x_pos) + [0] * len(test_x_neg)

print(X_test_full[0])
print(y_test_full[0])

Tokenizing...
Done!
Tokenizing...
Done!
[4136, 33, 46, 772, 80, 3, 320, 13470, 299, 2, 1655, 7, 46, 338, 1251, 3, 625, 631, 5, 531, 81, 2023, 5, 75, 20, 6280, 9213, 23, 52, 1910, 4, 137, 3778, 8, 57024, 23, 52, 853, 474, 50, 6, 63, 339, 8, 98, 271, 49, 16, 45, 3, 29, 73, 52, 35283, 20, 2649, 14, 1, 3, 75, 96, 36, 604, 2, 757, 23, 52, 853, 3, 5, 20, 911, 8, 864, 171, 377, 75, 96, 98, 81565, 4, 7782, 49, 2, 338, 33204, 4, 415, 2224, 14, 6, 317, 187, 75, 96, 2609, 58, 3, 75, 569, 6, 1233, 97, 2, 4428, 23, 6, 3707, 4909, 4, 32, 15, 802, 1640, 166, 14, 172, 4713, 15090, 3, 29, 194, 16988, 14, 87, 4, 15, 20, 4713, 559, 4, 31, 12, 13, 10, 11, 12, 13, 10, 11, 7103, 45, 780, 3184, 1969, 5, 75, 20, 1057, 14, 6, 1012, 11101, 4, 565, 73, 16, 613, 50, 75, 76, 4112, 5, 7954, 24619, 6, 1552, 3, 75, 234, 52, 3707, 4909, 98, 3857, 5, 351, 4, 142, 6, 3519, 380, 75, 858, 8, 1927, 49, 2, 781, 1552, 5, 373, 8, 2392, 104, 3, 23, 85, 215, 7, 769, 4, 97780, 52, 144, 20, 14, 2610, 4, 12, 13, 10, 11, 12, 13, 10

## Configure sequences for a RNN model
    - Remove words with low frequency
    - Truncate / complete sequences to the same length

In [11]:
#Median length of sentences
print('Median length: ', np.median([len(x) for x in X_test_full]))

Median length:  208.0


In [12]:
max_features = 50000 # Number of most frequent words selected. the less frequent recode to 0
maxlen = 200  # cut texts after this number of words (among top max_features most common words)

In [13]:
#Select the most frequent max_features, recode others using 0
def remove_features(x):
    return [[0 if w >= max_features else w for w in sen] for sen in x]

X_train = remove_features(X_train_full)
X_test  = remove_features(X_test_full)
y_train = y_train_full
y_test = y_test_full

print(X_test[1])

[61, 9, 6, 1573, 4, 214, 6, 1353, 4934, 391, 91, 2, 7528, 529, 20, 1019, 2106, 4, 6659, 23, 106, 460, 17, 1578, 87, 62, 6409, 4628, 127, 3, 108, 7565, 5, 353, 3222, 4, 51, 18, 239, 294, 4809, 8, 2, 246, 16, 18, 7551, 4, 335, 139, 166, 105, 602, 28, 42, 71, 521, 44, 2, 593, 7, 10670, 7, 6, 421, 14, 2, 3096, 27, 29, 100, 35, 4956, 8, 84, 16, 3, 6, 244, 50, 6, 549, 1753, 14, 634, 1559, 4, 21, 80, 14492, 107, 18695, 1379, 5, 1374, 62, 3793, 89, 36, 373, 5, 13793, 49, 801, 2, 49224, 3085, 7, 2, 561, 3, 22, 111, 1993, 23, 2, 3517, 7, 2, 101, 1749, 337, 543, 104, 3, 1504, 188, 48, 89, 30, 672, 104, 14, 74, 4, 51, 569, 87, 6, 184, 646, 8, 98, 58, 7, 2, 8607, 17, 17517, 87, 138, 342, 19, 0, 109, 704, 4]


In [14]:
from tensorflow.contrib.keras import preprocessing

# Cut or complete the sentences to length = maxlen
print("Pad sequences (samples x time)")

X_train = preprocessing.sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = preprocessing.sequence.pad_sequences(X_test, maxlen=maxlen)

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

print(X_test[0])

  from ._conv import register_converters as _register_converters


Instructions for updating:
Use the retry module or similar alternatives.
Pad sequences (samples x time)
X_train shape: (25000, 200)
X_test shape: (25000, 200)
[    2   338 33204     4   415  2224    14     6   317   187    75    96
  2609    58     3    75   569     6  1233    97     2  4428    23     6
  3707  4909     4    32    15   802  1640   166    14   172  4713 15090
     3    29   194 16988    14    87     4    15    20  4713   559     4
    31    12    13    10    11    12    13    10    11  7103    45   780
  3184  1969     5    75    20  1057    14     6  1012 11101     4   565
    73    16   613    50    75    76  4112     5  7954 24619     6  1552
     3    75   234    52  3707  4909    98  3857     5   351     4   142
     6  3519   380    75   858     8  1927    49     2   781  1552     5
   373     8  2392   104     3    23    85   215     7   769     4     0
    52   144    20    14  2610     4    12    13    10    11    12    13
    10    11   897     9     6   272  

In [15]:
# Shuffle data
from sklearn.utils import shuffle

X_train, y_train = shuffle(X_train, y_train, random_state=0)

In [16]:
# Export train and test data
np.save(data_path + 'X_train', X_train)
np.save(data_path + 'y_train', y_train)
np.save(data_path + 'X_test',  X_test)
np.save(data_path + 'y_test',  y_test)


In [17]:
# Export worddict
import pickle

with open(data_path + 'worddict.pickle', 'wb') as pfile:
    pickle.dump(worddict, pfile)
