<a href="https://colab.research.google.com/github/shivasankari266/portfolio/blob/main/Sentiment_Analysis_Movie_review_using_Multinomial_Naive_bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install numpy pandas scikit-learn nltk joblib



In [2]:
import nltk
import random
from nltk.corpus import movie_reviews

# Download dataset
nltk.download('movie_reviews')

# Load movie reviews data
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

# Display first 3 data samples
print(documents[:3])

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


[(['you', 'know', 'you', "'", 're', 'in', 'for', 'a', 'truly', 'different', 'cinematic', 'experience', 'the', 'moment', 'you', 'realize', 'steve', 'buscemi', 'has', 'been', 'cast', 'as', 'the', 'movie', 'in', 'question', "'", 's', 'most', 'normal', 'character', '.', 'such', 'is', 'the', 'case', 'with', '"', 'the', 'big', 'lebowski', ',', '"', 'joel', 'and', 'ethan', 'coen', "'", 's', 'first', 'venture', 'since', 'their', '"', 'fargo', '"', 'nabbed', 'nationwide', 'acclaim', 'and', 'a', 'mantlepiece', 'of', 'academy', 'award', 'nods', '.', 'there', "'", 's', 'no', 'need', 'to', 'fear', 'that', 'the', 'coens', 'suffered', 'creative', 'burnout', 'with', 'that', 'accomplishment', ',', 'because', '"', 'lebowski', '"', 'is', 'an', 'effort', 'that', 'makes', 'words', 'like', '"', 'strange', '"', 'and', '"', 'unusual', '"', 'seem', 'like', 'an', 'understatement', '.', 'thanks', 'in', 'part', 'to', 'delicious', 'characterization', 'and', 'an', 'all', '-', 'encompassing', 'sense', 'of', 'humor',

In [3]:
# Get most common words
all_words = [word.lower() for word in movie_reviews.words() if word.isalpha()]
common_words = list(nltk.FreqDist(all_words).keys())[:3000]  # Top 3000 words

# Function to extract features from a review
def document_features(document):
    words = set(document)
    return {word: (word in words) for word in common_words}

# Convert data into feature sets
feature_sets = [(document_features(d), c) for (d, c) in documents]

# Display sample feature set
print(feature_sets[0])




In [4]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Convert dictionary features to numerical format
vectorizer = DictVectorizer(sparse=False)
X = vectorizer.fit_transform([features for features, label in feature_sets])
y = [label for features, label in feature_sets]

# Split into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

# Test model accuracy
y_pred = model.predict(X_test)
print(f"Model Accuracy: {accuracy_score(y_test, y_pred):.2f}")


Model Accuracy: 0.82


In [5]:
import joblib
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Save model and vectorizer
joblib.dump(model, "/content/drive/My Drive/sentiment_model.pkl")
joblib.dump(vectorizer, "/content/drive/My Drive/vectorizer.pkl")


Mounted at /content/drive


['/content/drive/My Drive/vectorizer.pkl']

In [8]:
# Load the trained model and vectorizer
model = joblib.load("/content/drive/My Drive/sentiment_model.pkl")
vectorizer = joblib.load("/content/drive/My Drive/vectorizer.pkl")

# Function to predict sentiment
def predict_sentiment(review):
    words = review.lower().split()
    features = {word: (word in words) for word in vectorizer.feature_names_}
    X_new = vectorizer.transform([features])
    return model.predict(X_new)[0]

# Test the model
print(predict_sentiment("This movie was fantastic, I loved it!"))  # Expected: pos
print(predict_sentiment("The film was terrible and boring."))  # Expected: neg


pos
neg


In [20]:
reviews = [
    "Amazing movie, I really enjoyed it!",
    "Worst film ever, waste of time.",
    "It was okay, not the best but not the worst.",
]

for review in reviews:
    print(f"Review: {review} --> Sentiment: {predict_sentiment(review)}")


Review: Amazing movie, I really enjoyed it! --> Sentiment: pos
Review: Worst film ever, waste of time. --> Sentiment: neg
Review: It was okay, not the best but not the worst. --> Sentiment: pos


In [21]:
user_review = input("Enter a movie review: ")
print(f"Predicted Sentiment: {predict_sentiment(user_review)}")


Enter a movie review: waste of time
Predicted Sentiment: neg


In [22]:
import joblib

# Load model and vectorizer
model = joblib.load("/content/drive/My Drive/sentiment_model.pkl")
vectorizer = joblib.load("/content/drive/My Drive/vectorizer.pkl")

print("Model loaded successfully!")

Model loaded successfully!


In [24]:
import joblib

# Save model and vectorizer to Google Drive
joblib.dump(model, "/content/drive/My Drive/sentiment_model.pkl")
joblib.dump(vectorizer, "/content/drive/My Drive/vectorizer.pkl")

print("Model saved successfully!")


Model saved successfully!


In [25]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
