## Data Preparation 

In [1]:
from nltk.corpus import stopwords
import string
import re
from collections import Counter
import os

In [3]:
print(re.escape(string.punctuation))

\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^_\`\{\|\}\~


In [2]:
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    
    return text

In [3]:
def clean_doc(doc):
    
    tokens = doc.split()
    
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    
    tokens = [re_punc.sub('',w) for w in tokens]

    tokens = [word for word in tokens if word.isalpha()]
    
    stop_words = set(stopwords.words('english'))
    
    tokens = [word for word in tokens if not word in stop_words]
    
    tokens = [word for word in tokens if len(word)>1]
    
    return tokens

In [4]:
filename = '../../data/input/txt_sentoken/neg/cv000_29416.txt'

In [5]:
text = load_doc(filename)
tokens = clean_doc(text)
print(tokens)


['plot', 'two', 'teen', 'couples', 'go', 'church', 'party', 'drink', 'drive', 'get', 'accident', 'one', 'guys', 'dies', 'girlfriend', 'continues', 'see', 'life', 'nightmares', 'whats', 'deal', 'watch', 'movie', 'sorta', 'find', 'critique', 'mindfuck', 'movie', 'teen', 'generation', 'touches', 'cool', 'idea', 'presents', 'bad', 'package', 'makes', 'review', 'even', 'harder', 'one', 'write', 'since', 'generally', 'applaud', 'films', 'attempt', 'break', 'mold', 'mess', 'head', 'lost', 'highway', 'memento', 'good', 'bad', 'ways', 'making', 'types', 'films', 'folks', 'didnt', 'snag', 'one', 'correctly', 'seem', 'taken', 'pretty', 'neat', 'concept', 'executed', 'terribly', 'problems', 'movie', 'well', 'main', 'problem', 'simply', 'jumbled', 'starts', 'normal', 'downshifts', 'fantasy', 'world', 'audience', 'member', 'idea', 'whats', 'going', 'dreams', 'characters', 'coming', 'back', 'dead', 'others', 'look', 'like', 'dead', 'strange', 'apparitions', 'disappearances', 'looooot', 'chase', 'scen

In [6]:
def add_doc_to_vocab(filename, vocab):
    doc = load_doc(filename)
    
    tokens = clean_doc(doc)
    
    vocab.update(tokens)

In [7]:
def process_docs(directory, vocab):
    
    for filename in os.listdir(directory):
        #print(filename)
        
        if filename.startswith('cv9'):
            #print(filename)
            continue
            
        path = directory+'/'+filename
        #print(path)

        add_doc_to_vocab(path, vocab)

            

In [8]:
vocab = Counter()

process_docs('../../data/input/txt_sentoken/neg', vocab)
process_docs('../../data/input/txt_sentoken/pos', vocab)

print(len(vocab))

print(vocab.most_common(50))

44276
[('film', 7983), ('one', 4946), ('movie', 4826), ('like', 3201), ('even', 2262), ('good', 2080), ('time', 2041), ('story', 1907), ('films', 1873), ('would', 1844), ('much', 1824), ('also', 1757), ('characters', 1735), ('get', 1724), ('character', 1703), ('two', 1643), ('first', 1588), ('see', 1557), ('way', 1515), ('well', 1511), ('make', 1418), ('really', 1407), ('little', 1351), ('life', 1334), ('plot', 1288), ('people', 1269), ('bad', 1248), ('could', 1248), ('scene', 1241), ('movies', 1238), ('never', 1201), ('best', 1179), ('new', 1140), ('scenes', 1135), ('man', 1131), ('many', 1130), ('doesnt', 1118), ('know', 1092), ('dont', 1086), ('hes', 1024), ('great', 1014), ('another', 992), ('action', 985), ('love', 977), ('us', 967), ('go', 952), ('director', 948), ('end', 946), ('something', 945), ('still', 936)]


In [9]:
min_occurence = 2

tokens = [k for k,c in vocab.items() if c>min_occurence]

In [10]:
len(tokens)
print(tokens)



In [11]:
def save_list(lines, filename):
    
    data = '\n'.join(lines)
    
    file = open(filename, 'w')
    
    file.write(data)
    
    file.close()

In [12]:
save_list(tokens, 'vocab.txt')

## Train CNN with Embedding Layer 

### Preparing data for putting into Keras Model 

In [13]:
def load_doc(filename):
    
    file = open(filename, 'r')
    
    text = file.read()
    
    file.close()
    
    return text

In [14]:
vocab = load_doc('vocab.txt')
vocab = set(vocab.split())

In [15]:
vocab

{'vu',
 'amalgamation',
 'utilized',
 'sang',
 'allout',
 'janssen',
 'whiplash',
 'civil',
 'remorse',
 'wrongdoing',
 'snobs',
 'silent',
 'dudes',
 'penance',
 'dozen',
 'unflattering',
 'endangers',
 'cuesta',
 'cabin',
 'bud',
 'quirky',
 'virgils',
 'level',
 'regain',
 'plummer',
 'thumb',
 'bummer',
 'imparted',
 'forefront',
 'wellrounded',
 'yawn',
 'serves',
 'sylvia',
 'hall',
 'punishment',
 'intervals',
 'kitty',
 'miserable',
 'smash',
 'lift',
 'mingna',
 'forman',
 'vanish',
 'accuses',
 'education',
 'immoral',
 'alicia',
 'carmen',
 'darn',
 'poise',
 'stewardess',
 'gwen',
 'hooks',
 'unapologetically',
 'slices',
 'intolerable',
 'tee',
 'mob',
 'slaps',
 'commitment',
 'mens',
 'mirren',
 'devises',
 'manifestations',
 'persecution',
 'tickets',
 'violet',
 'election',
 'ricardo',
 'haunted',
 'improved',
 'claw',
 'smugly',
 'hamunaptra',
 'smalltime',
 'compensate',
 'mall',
 'verdict',
 'semler',
 'flawlessly',
 'paragraphs',
 'horses',
 'brighter',
 'proficien

In [16]:
def clean_doc(doc):
    
    tokens = doc.split()
    
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    
    tokens = [re_punc.sub('',w) for w in tokens]
    
    tokens = [w for w in tokens if w in vocab]
    
    tokens = ' '.join(tokens)
    
    return tokens

In [17]:
def process_docs(directory, vocab, is_train):
    documents = list()
    
    for filename in os.listdir(directory): 
        
        if is_train and filename.startswith('cv9'):
            continue
        if not is_train and not filename.startswith('cv9'):
            continue
        
        path = directory+'/'+filename

        doc = load_doc(path)

        tokens = clean_doc(doc, vocab)

        documents.append(tokens)
    
    return documents

In [18]:
# load and clean a dataset
def load_clean_dataset(vocab, is_train):
    # load documents
    neg = process_docs('txt_sentoken/neg', vocab, is_train)
    pos = process_docs('txt_sentoken/pos', vocab, is_train)
    docs = neg + pos
    # prepare labels
    labels = array([0 for _ in range(len(neg))] + [1 for _ in range(len(pos))])
    return docs, labels


In [19]:
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    
    return tokenizer

In [20]:
max_length = max([len(s.split()) for s in train_docs])

NameError: name 'train_docs' is not defined

In [21]:
def encode_docs(tokenizer, max_length, docs):
    
    encoded = tokenizer.texts_to_sequences(docs)
    
    padded = pad_sequences(encoded, maxlen= max_length, padding= 'post' )
    
    return padded

In [22]:
vocab = len(tokenizer.word_index) + 1
print(vocab)

NameError: name 'tokenizer' is not defined

In [23]:
def define_model(vocab_size, max_length):
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length = max_length))
    model.add(Conv1D(filters= 32, kernel_size =8, activation = 'relu'))
    model.add(MaxPooling1D(pool_size =2))
    model.add(Flatten())
    model.add(Dense(10, activation = 'relu' ))
    model.add(Dense(1, activation = 'sigmoid'))
    
    model.compile(loss ='binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    
    model.summary()
    plot_model(model, to_file = 'model.png', show_shapes = True)
    
    return model

In [5]:
import string
import re
import os
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.utils.vis_utils import plot_model

def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    
    return text

def clean_doc(doc, vocab):
    
    tokens = doc.split()
    
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    
    tokens = [re_punc.sub('',w) for w in tokens]
    
    tokens = [w for w in tokens if w in vocab]
    
    tokens = ' '.join(tokens)
    
    return tokens

def process_docs(directory, vocab, is_train):
    documents = list()
    
    for filename in os.listdir(directory): 
        
        if is_train and filename.startswith('cv9'):
            continue
        if not is_train and not filename.startswith('cv9'):
            continue
        
        path = directory+'/'+filename

        doc = load_doc(path)

        tokens = clean_doc(doc, vocab)

        documents.append(tokens)
    
    return documents

# load and clean a dataset
def load_clean_dataset(vocab, is_train):
    # load documents
    neg = process_docs('../../data/input/txt_sentoken/neg', vocab, is_train)
    pos = process_docs('../../data/input/txt_sentoken/pos', vocab, is_train)
    docs = neg + pos
    # prepare labels
    labels = array([0 for _ in range(len(neg))] + [1 for _ in range(len(pos))])
    return docs, labels

def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    
    return tokenizer

def encode_docs(tokenizer, max_length, docs):
    
    encoded = tokenizer.texts_to_sequences(docs)
    
    padded = pad_sequences(encoded, maxlen= max_length, padding= 'post' )
    
    return padded

def define_model(vocab_size, max_length):
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length = max_length))
    model.add(Conv1D(filters= 32, kernel_size =8, activation = 'relu'))
    model.add(MaxPooling1D(pool_size =2))
    model.add(Flatten())
    model.add(Dense(10, activation = 'relu' ))
    model.add(Dense(1, activation = 'sigmoid'))
    
    model.compile(loss ='binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    
    model.summary()
    #plot_model(model, to_file = 'model.png', show_shapes = True)
    
    return model



vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())

train_docs, ytrain = load_clean_dataset(vocab, True)
tokenizer = create_tokenizer(train_docs)

print(tokenizer)

vocab_size = len(tokenizer.word_index) +1

print('Vocab size %d' % vocab_size)

max_length = max([len(s.split()) for s in train_docs])

print('Maximum length %d' % max_length)

Xtrain = encode_docs(tokenizer, max_length, train_docs)

print(Xtrain)

model = define_model(vocab_size, max_length)

model.fit(Xtrain, ytrain, epochs = 10, verbose =2)

model.save('model.h5')

<keras_preprocessing.text.Tokenizer object at 0x00000146EF782FD0>
Vocab size 19605
Maximum length 1282


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 1/10
 - 8s - loss: 0.6891 - acc: 0.5389
Epoch 2/10
 - 8s - loss: 0.5645 - acc: 0.7194
Epoch 3/10
 - 8s - loss: 0.1155 - acc: 0.9628
Epoch 4/10
 - 8s - loss: 0.0089 - acc: 0.9989
Epoch 5/10
 - 9s - loss: 0.0037 - acc: 0.9994
Epoch 6/10
 - 9s - loss: 0.0026 - acc: 0.9994
Epoch 7/10
 - 9s - loss: 0.0020 - acc: 0.9994
Epoch 8/10
 - 10s - loss: 0.0016 - acc: 0.9994
Epoch 9/10
 - 10s - loss: 0.0012 - acc: 0.9994
Epoch 10/10
 - 10s - loss: 7.9814e-04 - acc: 0.9994


In [42]:
!pip install pydot

Collecting pydot
  Using cached https://files.pythonhosted.org/packages/33/d1/b1479a770f66d962f545c2101630ce1d5592d90cb4f083d38862e93d16d2/pydot-1.4.1-py2.py3-none-any.whl
Installing collected packages: pydot
Successfully installed pydot-1.4.1


distributed 1.21.8 requires msgpack, which is not installed.
tensorflow 1.10.0 has requirement numpy<=1.14.5,>=1.13.3, but you'll have numpy 1.16.3 which is incompatible.
tensorflow 1.10.0 has requirement setuptools<=39.1.0, but you'll have setuptools 41.0.1 which is incompatible.
ipython 6.4.0 has requirement prompt-toolkit<2.0.0,>=1.0.15, but you'll have prompt-toolkit 1.0.14 which is incompatible.
You are using pip version 10.0.1, however version 20.0.2 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [44]:
!pip list


Package                            Version  
---------------------------------- ---------
absl-py                            0.7.1    
alabaster                          0.7.10   
anaconda-client                    1.6.14   
anaconda-navigator                 1.8.7    
anaconda-project                   0.8.2    
aniso8601                          8.0.0    
APScheduler                        3.6.0    
asn1crypto                         0.24.0   
astor                              0.8.0    
astroid                            2.2.5    
astropy                            3.0.2    
attrs                              18.1.0   
Automat                            0.7.0    
awsebcli                           3.15.3   
Babel                              2.5.3    
backcall                           0.1.0    
backports.shutil-get-terminal-size 1.0.0    
beautifulsoup4                     4.6.0    
bitarray                           0.8.1    
bkcharts                           0.2      
blaze     

You are using pip version 10.0.1, however version 20.0.2 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


### Explore 

In [None]:
1. How to make padding to mean length of review
2. Different way of converting text into integers