In [1]:
# Sentiment Analysis of Movie Reviews from IMDB data

In [2]:
# Introduction
# Sentiment Analysis determines the emotional tone (Positive, Negative, Neutral)
# Classify movie reviews as either Positive or Negative

In [3]:
# Importing libraries
import nltk
import random
import string
from nltk.corpus import movie_reviews
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [5]:
# Download necessary NLTK resources
nltk.download('movie_reviews')

nltk.download('stopwords')

nltk.download('wordnet')

nltk.download('punkt_tab')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
# Load the data
documents_raw = [(movie_reviews.raw(fileid), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)]

In [9]:
documents_raw[1]

('the happy bastard\'s quick movie review \ndamn that y2k bug . \nit\'s got a head start in this movie starring jamie lee curtis and another baldwin brother ( william this time ) in a story regarding a crew of a tugboat that comes across a deserted russian tech ship that has a strangeness to it when they kick the power back on . \nlittle do they know the power within . . . \ngoing for the gore and bringing on a few action sequences here and there , virus still feels very empty , like a movie going for all flash and no substance . \nwe don\'t know why the crew was really out in the middle of nowhere , we don\'t know the origin of what took over the ship ( just that a big pink flashy thing hit the mir ) , and , of course , we don\'t know why donald sutherland is stumbling around drunkenly throughout . \nhere , it\'s just " hey , let\'s chase these people around with some robots " . \nthe acting is below average , even from the likes of curtis . \nyou\'re more likely to get a kick out of 

In [10]:
# Shuffle the document for unbiased splitting.
random.seed(42)
random.shuffle(documents_raw)
print(f'Number of documents:{len(documents_raw)}')
print(f'First document:{documents_raw[0]}')
print(f'Sentiment of first document:{documents_raw[0][1]}')

Number of documents:2000
First document:("mr . bean , a bumbling security guard from england is sent to la to help with the grandiose homecoming of a masterpiece american painting . \nthe first two words should have said enough to let you know what occurs during bean's trip to la , but if they didn't look out because you are in for a rather interesting if not odd ride . \nheck depending on your humor you might end up laughing through the whole flick . \neither way look out america bean is coming . \nwell , what can really be said about this movie , there is very little discernible plot . \nthat much is not hard to grapple with for it is a slapstick comedy . \nit achieves that goal rather admirably , but because it is that , the plot is just screaming for help . \nthe whole premise that the movie is based on is to say the least flawed . \nthe movie had its funny moments but there was no real story line other than something that could be thought up on a whim and carried through and in ma

In [11]:
# Separate reviews and their labels
reviews_text = [doc[0] for doc in documents_raw]
sentiments = [1 if doc[1] =='pos' else 0 for doc in documents_raw]

In [12]:
# Step 2 - Text Preprocessing

In [21]:
# initialize lemmatizer and stopword list
lemmatizer = WordNetLemmatizer()

stop_words = set(stopwords.words('english'))

punctuation = set(string.punctuation)


def preprocess_text(text):
  """
  Clean the data, tokenize the data, lowercase, remove stopwords, remove punctuations, lemmatize
  """
  # lowercase the text
  text = text.lower()

  # tokenize the text
  words = nltk.word_tokenize(text)

  # clean,filter and lemmatize tokens

  processed_words = []
  for word in words:
    cleaned_word = ''.join(char for char in word if char not in punctuation)

    if cleaned_word and cleaned_word not in stop_words and cleaned_word.isalpha():
      lemmatized_word = lemmatizer.lemmatize(cleaned_word)
      processed_words.append(lemmatized_word)

  return processed_words

In [22]:
# Apply the preprocessing to all reviews

preprocessed_reviews = [" ".join(preprocess_text(review)) for review in reviews_text]

print(f"Example of preprocessed review :")
print(f'Original Review :{reviews_text[0][:200]}')
print(f'Preprocessed Review :{preprocessed_reviews[0][:200]}')

Example of preprocessed review :
Original Review :mr . bean , a bumbling security guard from england is sent to la to help with the grandiose homecoming of a masterpiece american painting . 
the first two words should have said enough to let you know
Preprocessed Review :mr bean bumbling security guard england sent la help grandiose homecoming masterpiece american painting first two word said enough let know occurs bean trip la nt look rather interesting odd ride heck


In [23]:
# Step 3 Feature Extraction (TF-IDF)

# convert text data into numerical data using TF-IDF

tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2), min_df=5, max_df=0.7)



In [24]:
# Fit transform the training data

X_train_text, X_test_text, y_train, y_test = train_test_split(preprocessed_reviews, sentiments, test_size=0.2, random_state= 42, stratify=sentiments)

# Fit TF - IDF on the training data and transform training text

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)

X_test_tfidf = tfidf_vectorizer.transform(X_test_text)

print(f"Shape of TF-IDF training data: {X_train_tfidf.shape}")
print(f"Shape of TF-IDF testing data: {X_test_tfidf.shape}")
print(f"First 5 feature names: {tfidf_vectorizer.get_feature_names_out()[:5]}")

Shape of TF-IDF training data: (1600, 5000)
Shape of TF-IDF testing data: (400, 5000)
First 5 feature names: ['abandon' 'abandoned' 'ability' 'able' 'aboard']


In [25]:
# Step 4 - Model training (Naive Bayes and Logistic Regression)

nb_classifier = MultinomialNB()

nb_classifier.fit(X_train_tfidf, y_train)

print(f'Naive bayes trained')

lr_classifier = LogisticRegression(solver='liblinear', random_state=42, C=1)
lr_classifier.fit(X_train_tfidf, y_train)

print(f'Logistic Regression trained')

Naive bayes trained
Logistic Regression trained


In [26]:
# Step 5 - Model Evaluation

def evaluate_model(classifier, X_test_features, y_test_labels, model_name = "Model"):
  y_pred = classifier.predict(X_test_features)

  # calculate the accuracy
  accuracy = accuracy_score(y_test_labels, y_pred)
  print(f"accuracy score {model_name}: {accuracy}")

  # generate classification report
  print(f"Classification report for Model : {model_name}")
  print(classification_report(y_test_labels, y_pred, target_names=['Negative (0)', 'Positive (1)']))

  # Generate the confusion matrix
  print(f"Confusion Matrix for Model : {model_name}")
  cm = confusion_matrix(y_test_labels, y_pred)
  print(cm)

In [27]:
# Evaluate Naive Bayes
evaluate_model(nb_classifier,X_test_tfidf, y_test, "Naive Bayes")

accuracy score Naive Bayes: 0.785
Classification report for Model : Naive Bayes
              precision    recall  f1-score   support

Negative (0)       0.75      0.84      0.80       200
Positive (1)       0.82      0.72      0.77       200

    accuracy                           0.79       400
   macro avg       0.79      0.78      0.78       400
weighted avg       0.79      0.79      0.78       400

Confusion Matrix for Model : Naive Bayes
[[169  31]
 [ 55 145]]


In [28]:
# Evaluate Logistic Regression
evaluate_model(lr_classifier, X_test_tfidf, y_test, "Logistic Regression")

accuracy score Logistic Regression: 0.825
Classification report for Model : Logistic Regression
              precision    recall  f1-score   support

Negative (0)       0.82      0.82      0.82       200
Positive (1)       0.82      0.82      0.82       200

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400

Confusion Matrix for Model : Logistic Regression
[[165  35]
 [ 35 165]]


In [29]:
# Step 6 - Testing with custom input

def predict_sentiment_tfidf(text, vectorizer, classifier):
  #1 Preprocess the input text
  preprocessed_text = " ".join(preprocess_text(text))

  #2 Transform the preprocessed text into TF-IDF vector
  text_vector = vectorizer.transform([preprocessed_text])

  #3 Predict the sentiment using the trained classifier
  prediction_probability = classifier.predict_proba(text_vector)[0]
  prediction = classifier.predict(text_vector)[0]

  sentiment = "Positive" if prediction == 1 else "Negative"

  confidence_pos = prediction_probability[1]
  confidence_neg = prediction_probability[0]

  return sentiment, confidence_pos if prediction ==1 else confidence_neg

In [30]:
# Testing with custom review

review_1 = "This movie was absolutely fantastic. A masterpiece of cinema"

review_2 = "This movie was terrible. I did not enjoyed at all"


for review in [review_1, review_2]:
  pred_sentiment, pred_confidence = predict_sentiment_tfidf(review,tfidf_vectorizer, lr_classifier)
  print(f"Review : {review}")
  print(f'Predicted sentiment : {pred_sentiment}, Confidence : {pred_confidence}')

Review : This movie was absolutely fantastic. A masterpiece of cinema
Predicted sentiment : Positive, Confidence : 0.6536673853388507
Review : This movie was terrible. I did not enjoyed at all
Predicted sentiment : Negative, Confidence : 0.5575248729111488
