## Importing the libraries

In [1]:
import string 
import re
from os import listdir
from keras.preprocessingprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense
from pandas import DataFrame
from matplotlib import pyplot

### Load the Document into memory

In [2]:
def load_doc(filename):
    """Read a text file and return its whole contents as a single string.

    Args:
        filename: Path of the file to read.

    Returns:
        The complete text of the file.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r') as file:
        return file.read()

In [3]:
def clean_doc(doc):
    """Turn raw document text into a list of cleaned word tokens.

    The text is split on whitespace and ASCII punctuation is removed from
    every piece. A piece is kept only if it is alphabetic, is not among the
    entries of ``stopwords.words('english')``, and is longer than one
    character.

    Args:
        doc: The document text.

    Returns:
        The surviving tokens, in their original order.
    """
    # NOTE(review): `stopwords` is not imported in the visible part of this
    # file; presumably it is nltk.corpus.stopwords -- confirm.
    strip_punc = str.maketrans('', '', string.punctuation)
    pieces = [piece.translate(strip_punc) for piece in doc.split()]
    english_stops = set(stopwords.words('english'))
    return [
        piece for piece in pieces
        if piece.isalpha() and piece not in english_stops and len(piece) > 1
    ]

In [5]:
# Load a document, clean it, and return its in-vocabulary tokens as one line
def doc_to_line(filename,vocab):
    """Load *filename*, clean it, and join the tokens found in *vocab*.

    Args:
        filename: Path of the document to load.
        vocab: Container of allowed tokens; only tokens for which
            ``token in vocab`` holds are kept.

    Returns:
        The kept tokens joined by single spaces.
    """
    cleaned = clean_doc(load_doc(filename))
    return ' '.join(token for token in cleaned if token in vocab)

In [6]:
# Load the documents of one data split from a directory
def process_docs(directory, vocab, is_train):
    """Convert the files of one data split in *directory* into token lines.

    Files whose names start with 'cv9' form the test split; every other file
    belongs to the training split.

    Args:
        directory: Directory whose entries are listed and loaded.
        vocab: Vocabulary passed on to ``doc_to_line``.
        is_train: Truthy to process the training files, falsy to process the
            'cv9' test files.

    Returns:
        One line per selected file, in ``listdir`` order.
    """
    def in_requested_split(name):
        held_out = name.startswith('cv9')
        return not held_out if is_train else held_out

    return [
        doc_to_line(directory+'/'+name, vocab)
        for name in listdir(directory)
        if in_requested_split(name)
    ]

In [7]:
def load_clean_dataset(vocab, is_train):
    """Load the negative and positive documents together with their labels.

    Args:
        vocab: Vocabulary passed on to ``process_docs``.
        is_train: Split selector passed on to ``process_docs``.

    Returns:
        A ``(docs, labels)`` pair: the negative documents followed by the
        positive ones, and a parallel list holding 0 for each negative
        document and 1 for each positive one.
    """
    negative_docs = process_docs('./txt_sentoken/neg/', vocab, is_train)
    positive_docs = process_docs('./txt_sentoken/pos/', vocab,is_train)
    # Labels follow the concatenation order: negatives first, then positives.
    labels = [0] * len(negative_docs) + [1] * len(positive_docs)
    return negative_docs + positive_docs, labels


## Defining the model

In [8]:
def define_model(n_words):
    """Build and compile the feed-forward classifier.

    The network has one hidden Dense layer of 50 ReLU units fed by inputs of
    width *n_words*, followed by a single sigmoid output unit. It is compiled
    with binary cross-entropy loss, the adam optimizer and accuracy as metric.

    Args:
        n_words: Number of input features per sample.

    Returns:
        The compiled ``Sequential`` model.
    """
    layers = [
        Dense(50, input_shape=(n_words,), activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

In [None]:
def evaluate_model(Xtrain, ytrain, Xtest, ytest, n_repeats=10):
    """Train and score a fresh model several times and collect the accuracies.

    Each repeat builds a new model with ``define_model``, fits it on the
    training data for 10 epochs and evaluates it on the test data.

    Args:
        Xtrain: Training inputs passed to ``model.fit``.
        ytrain: Training targets passed to ``model.fit``.
        Xtest: Test inputs; ``Xtest.shape[1]`` is used as the model's input
            width.
        ytest: Test targets passed to ``model.evaluate``.
        n_repeats: How many independent train/evaluate rounds to run.

    Returns:
        A list with one accuracy value per repeat.
    """
    scores = []
    n_words = Xtest.shape[1]

    # Loop over the number of repeats, not the number of input features.
    for i in range(n_repeats):
        # A new model per repeat so that no weights carry over between runs.
        model = define_model(n_words)
        model.fit(Xtrain, ytrain, epochs=10, verbose=0)
        # evaluate() yields the loss followed by the accuracy metric.
        _, acc = model.evaluate(Xtest, ytest, verbose=0)
        scores.append(acc)
        print('%d accuracy: %s'%((i+1), acc))
    return scores
