In [85]:
# imports
import math
import pickle
from collections import Counter, defaultdict

import pandas as pd
import numpy as np
import contractions

from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

## Convert the raw data to DataFrame

In [86]:
# create a dataframe
df = pd.DataFrame(columns=['text', 'label'])

# add the negative examples
with open('data/negative.txt', 'r', encoding="UTF-8") as f:
    for line in f:
        df = df.append({'text': line, 'label': 'huh'}, ignore_index=True)

# add the positive examples
with open('data/positive.txt', 'r', encoding="UTF-8") as f:
    for line in f:
        df = df.append({'text': line, 'label': 'twss'}, ignore_index=True)

# shuffle the dataframe
df = df.sample(frac=1).reset_index(drop=True)

## Clean the data

In [87]:
# Clean the data

# Lowercase all words, remove contractions and whitespace
df['text'] = df['text'].apply(lambda x: contractions.fix(x.lower()).strip())

# tokenize
df['tokens'] = df['text'].apply(lambda x: word_tokenize(x))

# create lemmatizer
l = WordNetLemmatizer()

# lemmatize
df['tokens'] = df['tokens'].apply(lambda x: [l.lemmatize(word) for word in x])

# clean words
df['cleaned'] = df['tokens'].apply(lambda x: " ".join(x))

print(df.head())

                                                text label  \
0  if we do not believe in free expression for pe...   huh   
1  if you wind up with a boring, miserable life b...   huh   
2  do not you think these buns are a little too b...  twss   
3                     hey look it fits in the crack!  twss   
4  america stands strongest in challenging terror...   huh   

                                              tokens  \
0  [if, we, do, not, believe, in, free, expressio...   
1  [if, you, wind, up, with, a, boring, ,, misera...   
2  [do, not, you, think, these, bun, are, a, litt...   
3            [hey, look, it, fit, in, the, crack, !]   
4  [america, stand, strongest, in, challenging, t...   

                                             cleaned  
0  if we do not believe in free expression for pe...  
1  if you wind up with a boring , miserable life ...  
2  do not you think these bun are a little too bi...  
3                     hey look it fit in the crack !  
4  america stan

In [88]:
# texts
texts = df['cleaned'].values

# gets labels
labels = df['label'].values

In [89]:
# Uses label encoder to encode labels. Convert to 0/1
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)
df['encoded'] = encoded_labels

encoder_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))

encoded_texts = df['encoded'].values

# split 80/20 - train/test
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=0)


In [90]:
# Uses Count vectorizer to get frequency of the words
vectorizer = CountVectorizer(max_features=2000)

# encodes all training sentences
sents_encoded = vectorizer.fit_transform(X_train)

counts = sents_encoded.sum(axis=0).A1
vocab = list(vectorizer.get_feature_names())



## Naive Bayes Model

In [91]:
def zero():
    return 0

# Builds the model.
# Uses laplace smoothing for words in test set not present in vocab of train set
class NaiveBayes:

    def __init__(self, classes):
        self.classes = classes

    def group_by_class(self, X, y):
        data = dict()
        for c in self.classes:
            data[c] = X[np.where(y == c)]
        return data

    def fit(self, X, y):
        self.n_class_items = {}
        self.log_class_priors = {}
        self.word_counts = {}
        self.vocab = vocab

        n = len(X)

        grouped_data = self.group_by_class(X, y)

        for c, data in grouped_data.items():
            self.n_class_items[c] = len(data)
            # taking log for easier calculation
            self.log_class_priors[c] = math.log(self.n_class_items[c] / n)
            self.word_counts[c] = defaultdict(zero)

            for text in data:
                counts = Counter(word_tokenize(text))
                for word, count in counts.items():
                    self.word_counts[c][word] += count

        return self

    def laplace_smoothing(self, word, text_class):
        num = self.word_counts[text_class][word] + 1
        denom = self.n_class_items[text_class] + len(self.vocab)
        return math.log(num / denom)

    def predict(self, X):
        result = []
        for text in X:

            class_scores = {c: self.log_class_priors[c] for c in self.classes}

            words = set(word_tokenize(text))
            for word in words:
                if word not in self.vocab:
                    continue

                for c in self.classes:

                    log_w_given_c = self.laplace_smoothing(word, c)
                    class_scores[c] += log_w_given_c

            result.append(max(class_scores, key=class_scores.get))

        return result

In [92]:
# instantiate the naive bayes classifier
naive_bayes = NaiveBayes(classes=np.unique(labels)).fit(X_train, y_train)

# Tests the model on test set and reports the Accuracy
predicted_labels = naive_bayes.predict(X_test)

naive_bayes_accuracy = 100 * accuracy_score(y_test, predicted_labels)

print("Naive Bayes Accuracy: {:.2f}%".format(naive_bayes_accuracy))

Naive Bayes Accuracy: 83.07%


In [93]:
# texts on which we need to predict
sentences = ["That was big", "I am loving it", "I like chinese noodles"]

# Gets probabilities
prediction = naive_bayes.predict(sentences)
print(prediction)


['twss', 'huh', 'huh']


## LSTM Network Classifier

In [94]:
# Hyperparameters of the model
oov_tok = '<OOK>'
embedding_dim = 100
max_length = 150
padding_type='post'
trunc_type='post'

# tokenizes sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# vocabulary size
word_index = tokenizer.word_index
vocab_size = len(tokenizer.word_index) + 1

# converts train dataset to sequence and pads sequences
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, padding='post', maxlen=max_length)

y_train_encoded = np.fromiter(map(lambda x: 0 if x == 'huh' else 1, y_train), dtype=np.int32)

# converts Test dataset to sequence and pads sequences
test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, padding='post', maxlen=max_length)

In [95]:
# model initialization
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    Bidirectional(LSTM(64)),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid')
])

# compiles model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# model summary
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 150, 100)          895700    
                                                                 
 bidirectional_4 (Bidirectio  (None, 128)              84480     
 nal)                                                            
                                                                 
 dense_8 (Dense)             (None, 24)                3096      
                                                                 
 dense_9 (Dense)             (None, 1)                 25        
                                                                 
Total params: 983,301
Trainable params: 983,301
Non-trainable params: 0
_________________________________________________________________


In [96]:
# training the model
history = model.fit(train_padded, y_train_encoded, epochs=5, verbose=1, validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [97]:
# Gets probabilities
prediction = model.predict(test_padded)

# Gets labels based on probability 1 if p>= 0.5 else 0
for each in prediction:
    if each[0] >= 0.5:
        each[0] = 1
    else:
        each[0] = 0
prediction = prediction.astype('int32')

y_test_encoded = np.fromiter(map(lambda x: 0 if x == 'huh' else 1, y_test), dtype=np.int32)

lstm_accuracy = 100 * accuracy_score(y_test_encoded, prediction)

# Calculates accuracy on Test data
print("The accuracy of the LSTM Network is {:.2f}%".format(lstm_accuracy))

The accuracy of the LSTM Network is 94.06%


In [108]:
# texts on which we need to predict
sentence = ["That's big", "I am loving it", "I like chinese noodles"]

# converts to a sequence
test_sequences = tokenizer.texts_to_sequences(sentence)

# pads the sequence
test_padded = pad_sequences(test_sequences, padding='post', maxlen=max_length)

# Gets probabilities
prediction = model.predict(test_padded)

results = []

# Gets labels based on probability 1 if p >= 0.5 else 0
for each in prediction:
    print(each)
    if each[0] >= 0.5:
        results.append('twss')
    else:
        results.append('huh')
        
print(results)

[0.7249748]
[0.590703]
[0.29367533]
['twss', 'twss', 'huh']


In [105]:
if lstm_accuracy > naive_bayes_accuracy:
    print("LSTM Network is better than Naive Bayes")
else:
    print("Naive Bayes is better than LSTM Network")

LSTM Network is better than Naive Bayes


In [106]:
# save the model
model.save('last_model')

# save tokenizer
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)



INFO:tensorflow:Assets written to: last_model\assets


INFO:tensorflow:Assets written to: last_model\assets
