In [1]:
# --- 1. Import necessary libraries ---
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer # For stemming words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB # For the sentiment classification model
from sklearn.metrics import accuracy_score, classification_report # For evaluating the model
import pickle
import re # For regular expressions in text preprocessing


In [2]:

# --- 2. Download NLTK data (if not already downloaded) ---
# These are essential for text preprocessing (stopwords) and the VADER lexicon check (if needed by Flask)
print("Checking and downloading NLTK data...")
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
    print("NLTK 'stopwords' downloaded.")

try:
    nltk.data.find('sentiment/vader_lexicon')
except LookupError:
    nltk.download('vader_lexicon')
    print("NLTK 'vader_lexicon' downloaded.")
print("NLTK data check complete.")


Checking and downloading NLTK data...
NLTK 'vader_lexicon' downloaded.
NLTK data check complete.


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\aksha\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:

# --- 3. Load the dataset ---
# This assumes 'reviews.csv' is in the same directory as the notebook.
# It should have 'review' and 'sentiment' columns.
try:
    dataset = pd.read_csv('reviews.csv')
    print(f"Dataset loaded successfully. Shape: {dataset.shape}")
    print("First 5 rows of the dataset:")
    print(dataset.head())
except FileNotFoundError:
    print("Error: 'reviews.csv' not found. Please ensure the file is in the same directory.")
    # Exit or handle gracefully if the dataset isn't found
    exit()


Dataset loaded successfully. Shape: (50000, 2)
First 5 rows of the dataset:
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [4]:

# --- 4. Prepare text preprocessing tools ---
# Set of English stopwords
stopset = set(stopwords.words('english'))
# Initialize Porter Stemmer for reducing words to their root form
ps = PorterStemmer()


In [5]:

# --- 5. Define text preprocessing function ---
# This function will clean and transform each review comment.
def preprocess_text(text):
    # Remove non-alphabetic characters and replace with space
    review = re.sub('[^a-zA-Z]', ' ', text)
    # Convert to lowercase
    review = review.lower()
    # Split into individual words
    review = review.split()
    # Apply stemming and remove stopwords
    # Only stem if the word is not a stopword
    review = [ps.stem(word) for word in review if not word in stopset]
    # Join words back into a single string
    review = ' '.join(review)
    return review


In [6]:

# --- 6. Apply preprocessing to the 'review' column ---
print("\nPreprocessing movie reviews... This might take a moment.")
# Apply the preprocessing function to each review in the 'review' column
# Using .apply() with a lambda for cleaner syntax
corpus = dataset['review'].apply(preprocess_text)
print("Preprocessing complete.")



Preprocessing movie reviews... This might take a moment.
Preprocessing complete.


In [7]:
# --- 7. Convert text data to TF-IDF features ---
# TF-IDF (Term Frequency-Inverse Document Frequency) vectorizer
# It assigns a weight to each word, indicating its importance in a document relative to the corpus.
# use_idf: Enable inverse-document frequency reweighting.
# lowercase: Convert all characters to lowercase before tokenizing.
# strip_accents: Remove accents during the preprocessing step.
# stop_words: Remove common English stop words.
print("Creating TF-IDF features...")
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii', stop_words=list(stopset)) # <-- Changed here
X = vectorizer.fit_transform(corpus) # Fit and transform the preprocessed text data
print(f"TF-IDF features created. Shape: {X.shape}")


Creating TF-IDF features...
TF-IDF features created. Shape: (50000, 68929)


In [8]:

# --- 8. Save the trained TF-IDF Vectorizer ---
# The vectorizer needs to be saved so it can be used later in the Flask app
# to transform new movie reviews in the same way the training data was transformed.
filename_vectorizer = 'tranform.pkl'
with open(filename_vectorizer, 'wb') as file:
    pickle.dump(vectorizer, file)
print(f"TF-IDF Vectorizer saved as '{filename_vectorizer}'")


TF-IDF Vectorizer saved as 'tranform.pkl'


In [9]:

# --- 9. Prepare labels (target variable) ---
# Map 'positive' to 1 and 'negative' to 0 for numerical classification
y = dataset['sentiment'].map({'positive': 1, 'negative': 0})
print(f"Sentiment labels mapped. First 5 labels: {y.head().tolist()}")


Sentiment labels mapped. First 5 labels: [1, 1, 1, 0, 1]


In [10]:

# --- 10. Split data into training and testing sets ---
# Split the data to evaluate the model's performance on unseen data.
# test_size=0.20: 20% of data for testing, 80% for training.
# random_state=42: Ensures reproducibility of the split.
print("Splitting data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
print(f"Training set shape (X_train, y_train): {X_train.shape}, {y_train.shape}")
print(f"Test set shape (X_test, y_test): {X_test.shape}, {y_test.shape}")


Splitting data into training and testing sets...
Training set shape (X_train, y_train): (40000, 68929), (40000,)
Test set shape (X_test, y_test): (10000, 68929), (10000,)


In [11]:

# --- 11. Train the Naive Bayes Classifier ---
# Multinomial Naive Bayes is a common and effective model for text classification.
print("\nTraining Multinomial Naive Bayes classifier...")
clf = MultinomialNB()
clf.fit(X_train, y_train) # Train the classifier on the training data
print("Classifier training complete.")



Training Multinomial Naive Bayes classifier...
Classifier training complete.


In [12]:

# --- 12. Evaluate the model on the test set ---
print("\nEvaluating model on test set...")
y_pred_test = clf.predict(X_test)
accuracy_test = accuracy_score(y_test, y_pred_test) * 100
print(f"Accuracy on test set: {accuracy_test:.2f}%")
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_pred_test))




Evaluating model on test set...
Accuracy on test set: 86.16%

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.85      0.87      0.86      4961
           1       0.87      0.85      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [13]:

# --- 13. Re-train the classifier on the full dataset ---
# It's common practice to train the final model on the entire dataset
# after evaluating it, to maximize the data used for learning.
print("\nRe-training classifier on the full dataset...")
clf_full_data = MultinomialNB()
clf_full_data.fit(X, y) # Train on all TF-IDF features and labels
print("Full data classifier training complete.")



Re-training classifier on the full dataset...
Full data classifier training complete.


In [14]:

# --- 14. Save the trained Naive Bayes Classifier ---
# This is the model that the Flask app will load to make sentiment predictions.
filename_model = 'nlp_model.pkl'
with open(filename_model, 'wb') as file:
    pickle.dump(clf_full_data, file)
print(f"NLP model saved as '{filename_model}'")

print("\nSentiment analysis model training and saving process finished successfully!")
print("You can now find 'nlp_model.pkl' and 'tranform.pkl' in your project directory.")


NLP model saved as 'nlp_model.pkl'

Sentiment analysis model training and saving process finished successfully!
You can now find 'nlp_model.pkl' and 'tranform.pkl' in your project directory.
