In [None]:
# Movie review - Sentiment Analysis

# Toy dataset: six short movie reviews; labels[i] is the sentiment of reviews[i]
reviews = [
    'A very good film of Tony Stark',
    'Fantastic fighting scene , I love watching Jet Li',
    'Not bad ! I was impressed by the action scene',
    'The love story was amazing , but I do not prefer romantic movie at all',
    'The main actor was stupid',
    'I can not imagine such an ugggly guy like the main actor',
]

# The first three reviews are labelled positive, the last three negative
labels = ['Positive'] * 3 + ['Negative'] * 3


In [None]:
import string

# Rule: "Good", "Fantastic", "Amazing" - "Bad", "Ugly", "Stupid"
positive_words = ["good", "fantastic", "amazing"]
negative_words = ["bad", "ugly", "stupid"]

def review_classification(review):
  """Label a review with a simple keyword rule.

  The review is lowercased and split on whitespace. The words are scanned
  left to right and the first one found in `positive_words` or
  `negative_words` decides the label, so word order matters and negation
  ("not bad") is not understood.

  Args:
    review: the review text.

  Returns:
    'Positive', 'Negative', or 'Neutral' when no keyword is present.
  """
  for raw_word in review.lower().split():
    # Strip punctuation glued to the word so "good." or "(stupid)" still match
    word = raw_word.strip(string.punctuation)
    if word in positive_words:
      return 'Positive'
    if word in negative_words:
      return 'Negative'

  # No keyword found anywhere in the review
  return 'Neutral'

# Show the rule-based label of every sample review, one per line
for predicted_label in map(review_classification, reviews):
  print(predicted_label)

Positive
Positive
Negative
Positive
Negative
Neutral


In [None]:
# Create vocabulary

# all_tokens collects every lowercased, whitespace-separated token from all reviews
# (duplicates are kept, so its length is the total token count)
all_tokens = []
for review in reviews:
  review_lower = review.lower()
  tokens = review_lower.split()
  # extend() appends in place; `all_tokens + tokens` would copy the whole list each time
  all_tokens.extend(tokens)

print(all_tokens)
print(len(all_tokens))

# The vocabulary is the set of distinct tokens; a set has no guaranteed iteration order
vocab = set(all_tokens)
print(vocab)
print(len(vocab))

['a', 'very', 'good', 'film', 'of', 'tony', 'stark', 'fantastic', 'fighting', 'scene', ',', 'i', 'love', 'watching', 'jet', 'li', 'not', 'bad', '!', 'i', 'was', 'impressed', 'by', 'the', 'action', 'scene', 'the', 'love', 'story', 'was', 'amazing', ',', 'but', 'i', 'do', 'not', 'prefer', 'romantic', 'movie', 'at', 'all', 'the', 'main', 'actor', 'was', 'stupid', 'i', 'can', 'not', 'imagine', 'such', 'an', 'ugggly', 'guy', 'like', 'the', 'main', 'actor']
58
{'prefer', 'was', 'good', 'fighting', 'love', 'guy', 'i', 'do', 'impressed', 'all', 'stupid', ',', 'main', 'scene', 'imagine', 'by', 'amazing', 'ugggly', 'actor', 'jet', 'romantic', 'very', 'a', 'bad', 'film', 'but', 'not', 'such', 'can', 'of', 'like', 'fantastic', 'li', 'movie', '!', 'stark', 'action', 'story', 'at', 'watching', 'the', 'an', 'tony'}
43


In [None]:
# Vectorize a review

def review_to_vector(review, vocabulary=None):
  """Encode a review as a binary bag-of-words vector.

  The review is lowercased and split on whitespace. The result has one entry
  per vocabulary token, in the vocabulary's iteration order: 1 if the token
  occurs in the review, 0 otherwise.

  Args:
    review: the review text.
    vocabulary: iterable of tokens defining the vector positions. Defaults to
      the notebook-level `vocab`, which must be defined before calling
      without this argument.

  Returns:
    A list of 0/1 integers, one per vocabulary token.
  """
  if vocabulary is None:
    vocabulary = vocab
  # A set gives constant-time membership checks for each vocabulary token
  review_tokens = set(review.lower().split())
  return [1 if token in review_tokens else 0 for token in vocabulary]

In [None]:
# Vectorize every review, then inspect the first one

reviews2vectors = list(map(review_to_vector, reviews))
print(reviews[0])             # the original text
print(len(reviews2vectors))   # one vector per review
print(reviews2vectors[0])     # its binary bag-of-words encoding

A very good film of Tony Stark
6
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1]


In [None]:
# Use a Naive Bayes classifier in sklearn to build our model

from sklearn.naive_bayes import MultinomialNB

# Fit the classifier on the hand-built binary vectors and their labels
clf1 = MultinomialNB()
clf1.fit(X=reviews2vectors, y=labels)

MultinomialNB()

In [None]:
# A faster way, using sklearn pre-built library to convert reviews to vector

from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the reviews and build the review-by-word count matrix
vectorizer = CountVectorizer()
# NOTE: despite its name, review_vectorizer holds the transformed count matrix
review_vectorizer = vectorizer.fit_transform(reviews)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installations
if hasattr(vectorizer, 'get_feature_names_out'):
  feature_names = vectorizer.get_feature_names_out().tolist()
else:
  feature_names = vectorizer.get_feature_names()
print(feature_names)
print(review_vectorizer.toarray())

['action', 'actor', 'all', 'amazing', 'an', 'at', 'bad', 'but', 'by', 'can', 'do', 'fantastic', 'fighting', 'film', 'good', 'guy', 'imagine', 'impressed', 'jet', 'li', 'like', 'love', 'main', 'movie', 'not', 'of', 'prefer', 'romantic', 'scene', 'stark', 'story', 'stupid', 'such', 'the', 'tony', 'ugggly', 'very', 'was', 'watching']
[[0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0
  0 0 1]
 [1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0
  0 1 0]
 [0 0 1 1 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 1 1 0 0 1 0 0 1 0 0
  0 1 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0
  0 1 0]
 [0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1 1 0 1
  0 0 0]]




In [None]:
# Use a Naive Bayes classifier in sklearn to build our model

from sklearn.naive_bayes import MultinomialNB

# Fit the classifier on the CountVectorizer count matrix and the labels
clf = MultinomialNB()
clf.fit(X=review_vectorizer, y=labels)

MultinomialNB()

In [None]:
# Classify an unseen review with the model trained on the toy reviews
new_review = ["Good actors . I love it"]
# Encode with the already-fitted vectorizer so the columns match the training matrix
new_review_vector = vectorizer.transform(new_review)
print('New review vector: ', new_review_vector.toarray())
new_review_prediction = clf.predict(new_review_vector)
print(new_review_prediction)

New review vector:  [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0]]
['Positive']


# Test with our own dataset

In [None]:
# Read the dataset IMDB_dataset.csv, extract only 1000 rows
import pandas as pd
# NOTE(review): '...' looks like a placeholder directory; point it at the real dataset location
imdb_dataset = pd.read_csv(
    '.../IMDB_Dataset.csv',
    nrows=1000,
)
# Preview the first five rows (head() returns five rows by default)
print(imdb_dataset.head())

In [None]:
# Review texts (model inputs) as a plain Python list
X = imdb_dataset['review'].tolist()
# Sentiment labels (targets), in the same row order as X
y = imdb_dataset['sentiment'].tolist()

In [None]:
# Use CountVectorizer to convert each review in X to a vector of numbers

# Show the list of unique words (vocabulary) in the dataset

# Show the number of unique words in the vocabulary


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train a Naive Bayes model to classify review vectors
from sklearn.naive_bayes import MultinomialNB

# NOTE(review): X_train has to contain numeric feature vectors by this point; the
# CountVectorizer step earlier in this section is still a placeholder, so confirm it is filled in
clf = MultinomialNB()
clf.fit(X=X_train, y=y_train)

In [None]:
# Predict the sentiment of a new, unseen review
new_review = ["'Doctor Strange' is not a perfect film. If you expect a movie to have depth and the point, you should skip, not only this one, but whole Marvel production."]
# NOTE(review): `vectorizer` should be the one fitted on the data `clf` was trained on; confirm
new_review_vectorized = vectorizer.transform(new_review)
# Densify once and reuse the same array for both calls
new_review_dense = new_review_vectorized.toarray()
print(clf.predict(new_review_dense))
print(clf.predict_proba(new_review_dense))

In [None]:
# Evaluate by the accuracy score on the held-out test split
test_accuracy = clf.score(X_test, y_test)
print(test_accuracy)

# Homework

In [None]:
# Read the dataset IMDB_dataset.csv, extract only 2000 rows
import pandas as pd
# NOTE(review): '...' looks like a placeholder directory; point it at the real dataset location
imdb_dataset = pd.read_csv(
    '.../IMDB_Dataset.csv',
    nrows=2000,
)
# Preview the first five rows (head() returns five rows by default)
print(imdb_dataset.head())

In [None]:
# Review texts (model inputs) as a plain Python list
X = imdb_dataset['review'].tolist()
# Sentiment labels (targets), in the same row order as X
y = imdb_dataset['sentiment'].tolist()

In [None]:
# Remove stop words (a, an, the, another, is, are, was, were, will) from each review in X

# The whole list of stopwords can be found in stopwords.txt

In [None]:
# Use TfidfVectorizer to convert each review in X to a vector of numbers

# Show the list of unique words (vocabulary) in the dataset

# Show the number of unique words in the vocabulary


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train a Naive Bayes model to classify review vectors
# NOTE(review): X must already hold numeric vectors here; the TfidfVectorizer step
# earlier in this section is still a placeholder, so confirm it is filled in
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(X=X_train, y=y_train)

# Evaluate by the accuracy score on the held-out test split
test_accuracy = clf.score(X_test, y_test)
print(test_accuracy)