In [6]:
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
filename = '/home/tri/Downloads/MLdatasets/metamorphosis_clean.txt'
file = open(filename,'rt')
text = file.read()
file.close()
tokens = word_tokenize(text)
porter = PorterStemmer()
stemmed =[porter.stem(word) for word in tokens]
print(stemmed[:100])

['one', 'morn', ',', 'when', 'gregor', 'samsa', 'woke', 'from', 'troubl', 'dream', ',', 'he', 'found', 'himself', 'transform', 'in', 'hi', 'bed', 'into', 'a', 'horribl', 'vermin', '.', 'He', 'lay', 'on', 'hi', 'armour-lik', 'back', ',', 'and', 'if', 'he', 'lift', 'hi', 'head', 'a', 'littl', 'he', 'could', 'see', 'hi', 'brown', 'belli', ',', 'slightli', 'dome', 'and', 'divid', 'by', 'arch', 'into', 'stiff', 'section', '.', 'the', 'bed', 'wa', 'hardli', 'abl', 'to', 'cover', 'it', 'and', 'seem', 'readi', 'to', 'slide', 'off', 'ani', 'moment', '.', 'hi', 'mani', 'leg', ',', 'piti', 'thin', 'compar', 'with', 'the', 'size', 'of', 'the', 'rest', 'of', 'him', ',', 'wave', 'about', 'helplessli', 'as', 'he', 'look', '.', '``', 'what', "'s", 'happen', 'to']


## Example of CountVectorizer

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
text = ["The quick brown fox jumped over the lazy dog."]
# Transform
vectorizer = CountVectorizer()
# Tokenize
vectorizer.fit(text)
print(vectorizer.vocabulary_)
# Encode document
vector = vectorizer.transform(text)

print(vector.shape)
print(type(vector))
print(vector.toarray())

{'brown': 0, 'the': 7, 'fox': 2, 'quick': 6, 'lazy': 4, 'over': 5, 'dog': 1, 'jumped': 3}
(1, 8)
<class 'scipy.sparse.csr.csr_matrix'>
[[1 1 1 1 1 1 1 2]]


##  Example of TfidfVectorizer

Simply word count may be not useful when a word can appear multiple time but less informative. TfidfVectorizer deals with this problem by considering term frequency and Inverse Document Frequency. In fact, **TfidfVectorizer** equals to both **CountVectorizer** and **TfidfTransformer**.

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
text = ["The quick brown fox jumped over the lazy dog.","The dog.","The fox"]
vectorizer = TfidfVectorizer()
vectorizer.fit(text)
print(vectorizer.vocabulary_)
print(vectorizer.idf_)


vector = vectorizer.transform([text[0]])
print(vector.shape)
print(vector.toarray())


{'brown': 0, 'the': 7, 'fox': 2, 'quick': 6, 'lazy': 4, 'over': 5, 'dog': 1, 'jumped': 3}
[ 1.69314718  1.28768207  1.28768207  1.69314718  1.69314718  1.69314718
  1.69314718  1.        ]
(1, 8)
[[ 0.36388646  0.27674503  0.27674503  0.36388646  0.36388646  0.36388646
   0.36388646  0.42983441]]


## Hashing with HashingVectorizer
Count and frequencies have a limitation when vocabulary become large. One approach is **hashing** that converts text into number using fixed length vector.

In [26]:
from sklearn.feature_extraction.text import HashingVectorizer
text= ['The quick brown fox jumped over the lazy dog.']
vectorizer = HashingVectorizer(n_features=20)

vector = vectorizer.transform(text)

print(vector.shape)
print(vector.toarray())

(1, 20)
[[ 0.          0.          0.          0.          0.          0.33333333
   0.         -0.33333333  0.33333333  0.          0.          0.33333333
   0.          0.          0.         -0.33333333  0.          0.
  -0.66666667  0.        ]]


## Text operation with Keras   
Instead of nltk functions, keras provides similar methods. In the following example, we increase the vcabulary size by one-third to minimize collision when hashing words.

In [24]:
text_to_word_sequence("hello world")

['hello', 'world']

In [27]:
from keras.preprocessing.text import one_hot, text_to_word_sequence

text = "The quick brown fox jumpedover the lazy dog."   # note that there is no bracket

# estimate the size of the vocabulary
words = set(text_to_word_sequence(text))
vocab_size = len(words)

result =one_hot(text,n=round(vocab_size*1.3),lower=True)

print(result)

[7, 5, 6, 6, 7, 7, 7, 7]


## Hashing trick from keras   
Count-based encoding approach requires to maintain vocabulary of words and their mapping to integer. Aternative, a one-way hash function avoids to keep track of a vocabulary to convert words to integer. Keras provide **hashing trick** to tokenize and numeric encoding similar to **one_hot** function. More importance, **hashing_trick** function from keras allows either **hash** or other hash functions such as **md5**.

In [3]:
from keras.preprocessing.text import hashing_trick
from keras.preprocessing.text import text_to_word_sequence
text = 'The quick brown fox jumped over the lazy dog.'
set(text_to_word_sequence(text))

{'brown', 'dog', 'fox', 'jumped', 'lazy', 'over', 'quick', 'the'}

In [2]:
from keras.preprocessing.text import hashing_trick
from keras.preprocessing.text import text_to_word_sequence
text = 'The quick brown fox jumped over the lazy dog.'

words =set(text_to_word_sequence(text))
vocab_size = len(words)

result = hashing_trick(text, round(vocab_size*1.3), hash_function='md5')
print(result)

Using TensorFlow backend.


[6, 4, 1, 2, 7, 5, 6, 2, 6]


## Example of Tokenizer in Keras

In [4]:
from keras.preprocessing.text import Tokenizer

docs = ['Well done!','Good work','Great effort','nice work','Excellent!']

t = Tokenizer()
t.fit_on_texts(docs)
print(t.word_counts)

OrderedDict([('well', 1), ('done', 1), ('good', 1), ('work', 2), ('great', 1), ('effort', 1), ('nice', 1), ('excellent', 1)])


In [5]:
print(t.document_count)

5


In [6]:
print(t.word_index)

{'well': 2, 'effort': 6, 'excellent': 8, 'good': 4, 'great': 5, 'done': 3, 'nice': 7, 'work': 1}


In [7]:
print(t.word_docs)

{'done': 1, 'effort': 1, 'excellent': 1, 'good': 1, 'great': 1, 'well': 1, 'nice': 1, 'work': 2}


In [8]:
encoded_docs = t.texts_to_matrix(docs, mode='count')
print(encoded_docs)

[[ 0.  0.  1.  1.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  1.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  1.  1.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.  0.  1.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  1.]]


## Bag of Word   


## Movie Review

In [26]:
from collections import Counter
from os import listdir
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
import string
import re

In [5]:
def load_doc(filename):
    file = open(filename,'r')
    text = file.read()
    file.close()
    return text

def clean_doc(doc):
    tokens = doc.split()
    re_punc = re.compile('[%s]'% re.escape(string.punctuation))
    
    tokens =[re_punc.sub('',w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens  = [w for w in tokens if not w in stop_words ]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) >1]
    return tokens

In [7]:
# load doc and add to vocab
def add_doc_to_vocab(filename,vocab):
    doc = load_doc(filename)
    tokens = clean_doc(doc)
    vocab.update(tokens)
    
def process_docs(directory,vocab):
    # walk through all files
    for filename in listdir(directory):
        if filename.startswith('cv9'):
            continue
        # create the full path 
        path = directory + '/' + filename
        # add doc to vocab
        add_doc_to_vocab(path, vocab)
        
def save_list(lines, filename):
    data = '\n'.join(lines)
    file = open(filename,'w')
    file.write(data)
    file.close()

In [8]:
vocab = Counter()

process_docs('/home/tri/Downloads/txt_sentoken/pos',vocab)
process_docs('/home/tri/Downloads/txt_sentoken/neg',vocab)

print(len(vocab))

44276


In [9]:
min_occurence = 2
tokens =[k for k,c in vocab.items() if c >= min_occurence]
print(len(tokens))

25767


In [10]:
save_list(tokens,'/home/tri/Downloads/vocab.txt')

## Refine and adding useful function

In [22]:
def doc_to_line(filename, vocab):
    doc = load_doc(filename)
    tokens = clean_doc(doc)
    tokens = [w for w in tokens if w in vocab]
    return ' '.join(tokens)

# Modify process_docs
def process_docs(directory, vocab, is_train):
    lines = list()
    for filename in listdir(directory):
        # skip any reviews in the test set
        if filename.startswith('cv9'):
            continue
        if not is_train and not filename.startswith('cv9'):
            continue
            
        # create the full path of the file to open    
        
        path = directory +'/'+ filename
        # load and clean the doc
        line = doc_to_line(path,vocab)
        # add to list
        lines.append(line)
    return lines

def load_clean_dataset(vocab, is_train):
    neg = process_docs('/home/tri/Downloads/txt_sentoken/neg',vocab, is_train)
    pos = process_docs('/home/tri/Downloads/txt_sentoken/pos',vocab, is_train)
    docs = neg+ pos
    # prepare label
    labels = [0 for _ in range(len(neg))] +[1 for _ in range(len(pos))]
    return docs, labels


In [24]:
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer


In [27]:
# Redo the process, load all reviews
train_docs, ytrain = load_clean_dataset(vocab, True)
test_docs, ytest = load_clean_dataset(vocab, False)

tokenizer = create_tokenizer(train_docs)
# encoding data

Xtrain = tokenizer.texts_to_matrix(train_docs,mode = 'freq')
Xtest = tokenizer.texts_to_matrix(test_docs, mode='freq')

print(Xtrain.shape, Xtest.shape)

(1800, 25768) (0, 25768)


## Building the first sentiment analysis model   
Here we include full code

In [9]:
import re
import string
from os import listdir
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from keras.layers import Dense
from keras.models import Sequential
from keras.utils.vis_utils import plot_model
from keras.preprocessing.text import Tokenizer


In [2]:
def load_doc(filename):
    file= open(filename,'r')
    text = file.read()
    file.close()
    return text

def clean_doc(doc):
    tokens =doc.split()
    re_punc = re.compile('[%s]'% re.escape(string.punctuation))
    # Remove punctuation 
    tokens = [re_punc.sub('',w) for w in tokens]
    
    # Remoeve remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    
    # filter out stopword
    stop_words =set(stopwords.words('english'))
    tokens =[w for w in tokens if not w in stop_words]
    
    # filter out short tokens
    tokens = [word for word in tokens if len(word)>1]
    return tokens

# load doc, clean and generate line of tokens
def doc_to_line(filename, vocab):
    doc = load_doc(filename)
    tokens = clean_doc(doc)
    # filter by vocab
    tokens = [w for w in tokens if w in vocab]
    return ' '.join(tokens) 

# load all docs from directory
def process_docs(directory, vocab, is_train):
    lines = list()
    # walk through all files
    for filename in listdir(directory):
        # skip any reviews in the test set
        if is_train and filename.startswith('cv9'):
            continue
        if not is_train and not filename.startswith('cv9'):
            continue
        
        path = directory +'/'+filename
        # load and clean
        line = doc_to_line(path,vocab)
        lines.append(line)
    return lines

# load and clean dataset
def load_clean_dataset(vocab, is_train):
    neg = process_docs('/home/tri/Downloads/txt_sentoken/neg',vocab, is_train)
    pos = process_docs('/home/tri/Downloads/txt_sentoken/pos',vocab,is_train)
    docs = neg+pos
    labels = np.array([0 for _ in range(len(neg))] +[1 for _ in range(len(pos))])
    return docs,labels
    
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# create model
def create_model(n_words):
    model = Sequential()
    model.add(Dense(50,input_shape=(n_words,),activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    
    model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model
      

In [3]:
vocab_filename = '/home/tri/Downloads/vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())

# load reviews
train_docs , ytrain = load_clean_dataset(vocab, True)
test_docs, ytest = load_clean_dataset(vocab,False)
# create tokenizer
tokenizer =  create_tokenizer(train_docs)
# encoding
Xtrain = tokenizer.texts_to_matrix(train_docs, mode='freq')
Xtest  = tokenizer.texts_to_matrix(test_docs,mode='freq')

n_words = Xtest.shape[1]
model= create_model(n_words)

model.fit(Xtrain,ytrain,epochs =10,verbose=0)

# Evaluation
loss, acc = model.evaluate(Xtest, ytest, verbose =0)
print('Test accuracy %f' %(acc*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 50)                1288450   
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 51        
Total params: 1,288,501
Trainable params: 1,288,501
Non-trainable params: 0
_________________________________________________________________
Test accuracy 87.500000


## Comparing word scoring methods
The **text_to_matrix()** function for the Tokenizer in Keras API provides 4 different methods for scoring words:
* **binary** Where words are marked as present (1) or above (0)
* **count** Where occurence count for each word is marked s an integer
* **tfidf** Where each word is scored based on their frequency s.t more frequent words will be penalized.   
* **freq**: words are scored based on their frequenct of occurence within document.   

Let experiment the above model with different scoring by building **prepare_data()**

In [4]:
# prepare bag-of-word encoding of docs
def prepare_data(train_docs, test_docs,mode):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_docs)
    # encoding training data set
    Xtrain = tokenizer.texts_to_matrix(train_docs, mode=mode)
    # encoding testing 
    Xtest  = tokenizer.texts_to_matrix(test_docs, mode=mode)
    return Xtrain, Xtest


Since Neural network models is schochastic, they can genereate different results due to initial random weights and shuffling of pattern during mini-batch gradiant descent. As such we will introduce the **evaluate_mode** function to evaluate the performance on different test set.

In [6]:
def evaluate_mode(X_train,ytrain,Xtest,ytest):
    scores = list()
    n_repeats = 30
    n_words = Xtest.shape[1]
    for i in range(n_repeats):
        model = Sequential()
        model.add(Dense(50,input_shape=(n_words,),activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        
        model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
        model.fit(Xtrain,ytrain,epochs=10, verbose=0)
        # evaluate
        loss, acc =model.evaluate(Xtest,ytest,verbose=0)
        scores.append(acc)
        print('%d accuracy: %s' % ((i+1), acc))
    return scores   

In [10]:
# remaining program
vocab_filename = '/home/tri/Downloads/vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())

# load reviews
train_docs , ytrain = load_clean_dataset(vocab, True)
test_docs, ytest = load_clean_dataset(vocab,False)


modes = ['binary','count','tfidf','freq']
results = pd.DataFrame()

for mode in modes:
    Xtrain, Xtest = prepare_data(train_docs, test_docs,mode)
    # evaluate
    results[mode] = evaluate_mode(Xtrain, ytrain, Xtest, ytest)
# summarize
print(results.describe())

results.boxplot()
plt.show()

1 accuracy: 0.93
2 accuracy: 0.92
3 accuracy: 0.935
4 accuracy: 0.925
5 accuracy: 0.94
6 accuracy: 0.925
7 accuracy: 0.925
8 accuracy: 0.935
9 accuracy: 0.93
10 accuracy: 0.915
11 accuracy: 0.925
12 accuracy: 0.915
13 accuracy: 0.925
14 accuracy: 0.925
15 accuracy: 0.93
16 accuracy: 0.92
17 accuracy: 0.935
18 accuracy: 0.935
19 accuracy: 0.925
20 accuracy: 0.915
21 accuracy: 0.925
22 accuracy: 0.935
23 accuracy: 0.93
24 accuracy: 0.925
25 accuracy: 0.93
26 accuracy: 0.93
27 accuracy: 0.925
28 accuracy: 0.93
29 accuracy: 0.935
30 accuracy: 0.93
1 accuracy: 0.895
2 accuracy: 0.9
3 accuracy: 0.905
4 accuracy: 0.885
5 accuracy: 0.89
6 accuracy: 0.91
7 accuracy: 0.91
8 accuracy: 0.9
9 accuracy: 0.89
10 accuracy: 0.895
11 accuracy: 0.905
12 accuracy: 0.91
13 accuracy: 0.91
14 accuracy: 0.905
15 accuracy: 0.9
16 accuracy: 0.895
17 accuracy: 0.905
18 accuracy: 0.89
19 accuracy: 0.91
20 accuracy: 0.895
21 accuracy: 0.9
22 accuracy: 0.895
23 accuracy: 0.895
24 accuracy: 0.9
25 accuracy: 0.895
26

NameError: name 'plt' is not defined

## Predicting sentiment for new Reviews   
Finally, we can develop a final model to make prediction for new reviews.

In [11]:
def predict_sentiment(review, vocab, tokenizer, model):
    tokens =clean_doc(review)
    # filter by vocab
    tokens = [w for w in tokens if w in vocab]
    line = ' '.join(tokens)
    # encoding
    encoded = tokenizer.texts_to_matrix([line], mode='binary')
    # predict sentiment
    yhat = model.predict(encoded, verbose=0)
    # retrieve predicted percentage and label
    percent_pos = yhat[0,0]
    
    if round(percent_pos) ==0:
        return (1 - percent_pos), 'NEGATIVE'
    return percent_pos, 'POSITIVE'

In [12]:
text = 'Best movie ever! It was greate, I recommend it.'
percent, sentiment = predict_sentiment(text, vocab, tokenizer, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' %(text,sentiment,percent*100))
# test negative text
text ='This is a bad movie.'
percent, sentiment = predict_sentiment(text,vocab, tokenizer, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)'% (text, sentiment,percent *100))

Review: [Best movie ever! It was greate, I recommend it.]
Sentiment: NEGATIVE (83.057%)
Review: [This is a bad movie.]
Sentiment: NEGATIVE (100.000%)
