<a href="https://colab.research.google.com/github/nam4dev/nlp_demonstration/blob/master/bag_of_words_demonstration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Sentiment Classifier using Bag of Words (BoW) techniques

In [0]:
import json
import re

# Import pandas
import pandas as pd
# Import CountVectorizer from scikit learn
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
# Column names applied (by position) to both review data sets
COLUMNS = ['reviews', 'sentiment']


def _load_reviews(path):
  """Read a reviews JSON file into a DataFrame labelled with COLUMNS.

  NOTE(review): the names are assigned positionally, so this presumably
  relies on the file holding the review text first and the label second.
  """
  frame = pd.read_json(path)
  frame.columns = COLUMNS
  return frame


test_df = _load_reviews('movie_reviews_test.json')
train_df = _load_reviews('movie_reviews_train.json')

In [0]:
# Remember the current limit so it can be restored after the previews
max_colwidth_ref = pd.get_option('display.max_colwidth')

# Display the full content of each column instead of truncating it.
# None means "no limit"; the older spelling -1 was deprecated in pandas 1.0
# and is rejected by recent pandas versions.
pd.set_option('display.max_colwidth', None)

In [0]:
# Preview the raw reviews (size and first rows) before any cleaning
for heading, reviews, n_rows in (
    ('Training materials', train_df['reviews'], 10),
    ('Testing materials', test_df['reviews'], 25),
):
  print(heading, reviews.shape, reviews.head(n_rows), sep='\n')

In [0]:
# Matches <br>, <br/>, <br > and <br /> in any letter case
_BR_TAG_RE = re.compile(r'<br\s*/?>', re.IGNORECASE)


# Cleaning function (replaces HTML line-break tags with a space)
def clean_br(review):
  """Return ``review`` with every HTML ``<br>`` tag replaced by a space.

  A space (rather than an empty string) is substituted so that the words on
  either side of a tag stay separate tokens: 'good<br />bad' becomes
  'good bad', not 'goodbad'.

  Args:
    review: The review text (a ``str``).

  Returns:
    The text with line-break tags replaced; other markup is left untouched.
  """
  return _BR_TAG_RE.sub(' ', review)

In [0]:
%time X_test = test_df['reviews'].apply(clean_br)
%time X_train = train_df['reviews'].apply(clean_br)

In [0]:
# Cleaned training reviews: size followed by the first ten entries
print('Training materials', X_train.shape, X_train.head(10), sep='\n')

In [0]:
# Cleaned testing reviews: size followed by the first ten entries
print('Testing materials', X_test.shape, X_test.head(10), sep='\n')

In [0]:
# Put back the column-width limit that was saved before the previews
pd.options.display.max_colwidth = max_colwidth_ref

In [0]:
# N-gram sizes to extract, given as (min_n, max_n).
# (1, 1) counts single words only and matches CountVectorizer's default;
# change to (1, 2) or (1, 3) to also count word pairs / triples.
NGRAM_RANGE = (1, 1)

# Create a CountVectorizer instance that drops common English stop words
vectorizer = CountVectorizer(stop_words='english', ngram_range=NGRAM_RANGE)

# Fit the vocabulary on the training materials and transform them
X_train_bow = vectorizer.fit_transform(X_train)

# Transform the testing materials with the already-fitted vocabulary
X_test_bow = vectorizer.transform(X_test)

In [0]:
# Report the shape of each bag-of-words matrix
for split_name, bow_matrix in (('test', X_test_bow), ('train', X_train_bow)):
  print(split_name, bow_matrix.shape)

In [0]:
from sklearn.naive_bayes import MultinomialNB

In [0]:
# Target labels for each data set, taken from its 'sentiment' column
y_test, y_train = (frame['sentiment'] for frame in (test_df, train_df))

In [0]:
# Sentiment map: numeric label -> human-readable name
SENTIMENTS = dict(enumerate(('Negative', 'Positive')))

In [0]:
# Build a MultinomialNB classifier and fit it on the training counts
# (fit() returns the fitted estimator itself, so the call can be chained)
clf = MultinomialNB().fit(X_train_bow, y_train)

# Score the trained model on the test set and report it as a percentage
accuracy = clf.score(X_test_bow, y_test)
accuracy_pct = accuracy * 100
print("The classifier accuracy on the test set is {0:.2f}%".format(accuracy_pct))

In [0]:
# Hand-written reviews, each paired with the sentiment label expected for it
sample_reviews = [
    ("The movie was awful. The music was hard to ear and the acting bad.", 0),
    ("The movie was great! The music was overwhelming. Really nice!", 1),
    ("The movie was not so good, even pretty bad :( Actors were not brillant.", 0),
]
report_template = "The sentiment predicted by the classifier is {} (expected {})"

# Predict the sentiment of each sample and show it next to the expectation
for review, expected in sample_reviews:
  # The vectorizer takes a collection of documents, hence the one-item list
  review_bow = vectorizer.transform([review])
  prediction = clf.predict(review_bow)[0]
  print(report_template.format(SENTIMENTS[prediction], SENTIMENTS[expected]))