In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Fetch the NLTK resources the preprocessing step relies on.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on newer NLTK
# releases; 'punkt' is kept so older installations keep working.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Read the review data and replace missing review text with an empty string.
file_path = "Movies_Reviews_modified_version1.csv"
df = pd.read_csv(file_path).assign(
    Reviews=lambda frame: frame['Reviews'].fillna('')
)

# Ratings of 3 or more are labelled 'positive'; everything else is 'negative'.
df['Sentiment'] = np.where(df['Ratings'] >= 3, 'positive', 'negative')

# Cached English stop-word set; filled on first use by _english_stop_words().
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalize one review into a space-separated string of tokens.

    Steps: lowercase, strip every character that is neither a word character
    nor whitespace, tokenize with ``word_tokenize``, drop English stop words,
    and join the remaining tokens with single spaces.

    Args:
        text: Raw review. ``None`` and float NaN are treated as an empty
            review; any other non-string value is converted with ``str()``.

    Returns:
        The cleaned review as a single string (possibly empty).
    """
    # Missing values would otherwise crash on .lower(); treat them as empty.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # The stop-word set is loop-invariant, so it is loaded once and reused
    # instead of being rebuilt for every review.
    stop_words = _english_stop_words()

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    return " ".join(word for word in tokens if word not in stop_words)

# Clean every review's text with preprocess_text.
df['Cleaned_Reviews'] = df['Reviews'].astype(str).map(preprocess_text)

# Encode the string labels as integers: positive -> 1, negative -> 0.
df['Sentiment'] = df['Sentiment'].map({'positive': 1, 'negative': 0})

# Hold out 20% of the rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    df['Cleaned_Reviews'],
    df['Sentiment'],
    test_size=0.2,
    random_state=42,
)

# Fit TF-IDF on the training text only, then apply it to the test text.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training chain.
logistic_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict the held-out labels with each model, in the same order as above.
logistic_preds, nb_preds = (
    model.predict(X_test_tfidf) for model in (logistic_model, nb_model)
)

def evaluate_model(name, y_true, y_pred):
    """Print accuracy, F1-score and a classification report for one model.

    Args:
        name: Label shown in the report header.
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model.

    Returns:
        None. All results are written to standard output.
    """
    print(f"\n{name} Model Performance:")
    # Each scalar metric is computed and printed in turn.
    for label, metric in (("Accuracy:", accuracy_score), ("F1-score:", f1_score)):
        print(label, metric(y_true, y_pred))
    report = classification_report(y_true, y_pred)
    print("Classification Report:\n", report)
    print("-" * 50)

# Report test-set performance for each trained model.
for model_label, model_preds in (
    ("Logistic Regression", logistic_preds),
    ("Naïve Bayes", nb_preds),
):
    evaluate_model(model_label, y_test, model_preds)

def predict_sentiment(review):
    """Classify a single review with both trained models.

    The review is cleaned with ``preprocess_text``, vectorized with the
    fitted ``vectorizer``, and passed to ``logistic_model`` and ``nb_model``.

    Args:
        review: Raw review text.

    Returns:
        A dict mapping "Logistic Regression" and "Naïve Bayes" to either
        "Positive" (predicted label 1) or "Negative" (any other label).
    """
    features = vectorizer.transform([preprocess_text(review)])

    verdicts = {}
    for model_name, model in (
        ("Logistic Regression", logistic_model),
        ("Naïve Bayes", nb_model),
    ):
        label = model.predict(features)[0]
        if label == 1:
            verdicts[model_name] = "Positive"
        else:
            verdicts[model_name] = "Negative"
    return verdicts

# Run one hard-coded review through both models and show the predictions.
user_review = (
    "This movie was absolutely fantastic! The storyline was engaging."
)
predictions = predict_sentiment(user_review)
print(
    "User Review Sentiment Prediction:",
    predictions,
)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NAC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NAC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Logistic Regression Model Performance:
Accuracy: 0.9355711965349215
F1-score: 0.9628519697821065
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.65      0.76      1421
           1       0.94      0.99      0.96      7814

    accuracy                           0.94      9235
   macro avg       0.92      0.82      0.86      9235
weighted avg       0.93      0.94      0.93      9235

--------------------------------------------------

Naïve Bayes Model Performance:
Accuracy: 0.8943151055766108
F1-score: 0.9410770345327215
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.33      0.49      1421
           1       0.89      1.00      0.94      7814

    accuracy                           0.89      9235
   macro avg       0.92      0.66      0.71      9235
weighted avg       0.90      0.89      0.87      9235

--------------------------------------------------
User Rev

In [2]:

import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [3]:
# Fetch the NLTK resources the preprocessing step relies on.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on newer NLTK
# releases; 'punkt' is kept so older installations keep working.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NAC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NAC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Load the review data into a DataFrame. The path is relative, so it is
# resolved against the directory the notebook kernel is running in.
file_path = "Movies_Reviews_modified_version1.csv"  # Ensure correct path
df = pd.read_csv(file_path)

In [5]:
# Replace missing review text with an empty string.
df = df.assign(Reviews=df['Reviews'].fillna(''))

In [6]:
# Ratings of 3 or more are labelled 'positive'; everything else is 'negative'.
df['Sentiment'] = np.where(df['Ratings'] >= 3, 'positive', 'negative')

In [7]:
# Cached English stop-word set; filled on first use by _english_stop_words().
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalize one review into a space-separated string of tokens.

    Steps: lowercase, strip every character that is neither a word character
    nor whitespace, tokenize with ``word_tokenize``, drop English stop words,
    and join the remaining tokens with single spaces.

    Args:
        text: Raw review. ``None`` and float NaN are treated as an empty
            review; any other non-string value is converted with ``str()``.

    Returns:
        The cleaned review as a single string (possibly empty).
    """
    # Missing values would otherwise crash on .lower(); treat them as empty.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # The stop-word set is loop-invariant, so it is loaded once and reused
    # instead of being rebuilt for every review.
    stop_words = _english_stop_words()

    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text)  # Tokenize text
    # Remove stopwords and rebuild the review as one string
    return " ".join(word for word in tokens if word not in stop_words)

In [8]:
# Clean every review's text with preprocess_text.
df['Cleaned_Reviews'] = df['Reviews'].astype(str).apply(preprocess_text)

# Encode the label as 1 (positive) / 0 (negative) directly from Ratings,
# using the same ">= 3" threshold that defines the string labels.
# Mapping the existing Sentiment column through {'positive': 1, 'negative': 0}
# is not repeatable: on a second run the values are already 1/0, match no
# key, and become NaN. Deriving from Ratings gives the same result every run.
df['Sentiment'] = (df['Ratings'] >= 3).astype('int64')


In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# the same across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['Cleaned_Reviews'],
    df['Sentiment'],
    test_size=0.2,
    random_state=42,
)

In [10]:
# Build TF-IDF features with max_features=5000.
# The vectorizer is fitted on the training text only; the test text is then
# transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [11]:
# Train logistic regression on the TF-IDF features; fit() returns the
# estimator, so construction and training are chained.
logistic_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
logistic_model


In [12]:
# Train multinomial naive Bayes on the same TF-IDF features; fit() returns
# the estimator, so construction and training are chained.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
nb_model

In [13]:
# Predict the held-out labels with each model (logistic regression first).
logistic_preds, nb_preds = (
    model.predict(X_test_tfidf) for model in (logistic_model, nb_model)
)

In [14]:
def evaluate_model(name, y_true, y_pred):
    """Print accuracy, F1-score and a classification report for one model.

    Args:
        name: Label shown in the report header.
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model.

    Returns:
        None. All results are written to standard output.
    """
    print(f"\n{name} Model Performance:")
    # Each scalar metric is computed and printed in turn.
    for label, metric in (("Accuracy:", accuracy_score), ("F1-score:", f1_score)):
        print(label, metric(y_true, y_pred))
    report = classification_report(y_true, y_pred)
    print("Classification Report:\n", report)
    print("-" * 50)

In [15]:
# Report test-set performance for each trained model.
for model_label, model_preds in (
    ("Logistic Regression", logistic_preds),
    ("Naïve Bayes", nb_preds),
):
    evaluate_model(model_label, y_test, model_preds)



Logistic Regression Model Performance:
Accuracy: 0.9355711965349215
F1-score: 0.9628519697821065
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.65      0.76      1421
           1       0.94      0.99      0.96      7814

    accuracy                           0.94      9235
   macro avg       0.92      0.82      0.86      9235
weighted avg       0.93      0.94      0.93      9235

--------------------------------------------------

Naïve Bayes Model Performance:
Accuracy: 0.8943151055766108
F1-score: 0.9410770345327215
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.33      0.49      1421
           1       0.89      1.00      0.94      7814

    accuracy                           0.89      9235
   macro avg       0.92      0.66      0.71      9235
weighted avg       0.90      0.89      0.87      9235

--------------------------------------------------


In [16]:
def predict_sentiment(review):
    """Classify a single review with both trained models.

    The review is cleaned with ``preprocess_text``, vectorized with the
    fitted ``vectorizer``, and passed to ``logistic_model`` and ``nb_model``.

    Args:
        review: Raw review text.

    Returns:
        A dict mapping "Logistic Regression" and "Naïve Bayes" to either
        "Positive" (predicted label 1) or "Negative" (any other label).
    """
    features = vectorizer.transform([preprocess_text(review)])

    verdicts = {}
    for model_name, model in (
        ("Logistic Regression", logistic_model),
        ("Naïve Bayes", nb_model),
    ):
        label = model.predict(features)[0]
        if label == 1:
            verdicts[model_name] = "Positive"
        else:
            verdicts[model_name] = "Negative"
    return verdicts

In [17]:
# Run one hard-coded review through both models and show the predictions.
user_review = (
    "This movie was absolutely fantastic! The storyline was engaging."
)
predictions = predict_sentiment(user_review)
print(
    "User Review Sentiment Prediction:",
    predictions,
)

User Review Sentiment Prediction: {'Logistic Regression': 'Positive', 'Naïve Bayes': 'Positive'}


In [18]:
# Read a review typed by the user (input() waits for keyboard entry),
# classify it with predict_sentiment, and print the resulting labels.
user_review = input("Enter a movie review: ")
predictions = predict_sentiment(user_review)
print("Predicted Sentiment:", predictions)



Enter a movie review:  Fair Game


Predicted Sentiment: {'Logistic Regression': 'Positive', 'Naïve Bayes': 'Positive'}
