In [21]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [22]:
# Fetch the NLTK resources that preprocess_text relies on: the stopword lists
# (stopwords.words) and the Punkt tokenizer data used by word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the tokenizer from the 'punkt_tab' resource instead
# of 'punkt'; request both so word_tokenize works on either side of that change.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /Users/rahul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/rahul/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [23]:
def replace_censored_words(text, replacement='CENSORED'):
    """Replace asterisk-censored words (e.g. ``s**t``, ``f***``) with a placeholder.

    A censored word is a run of word characters and asterisks that contains at
    least one asterisk and at least one word character. Runs made only of
    asterisks (star ratings, separators) are left untouched.

    Args:
        text: The string to scan.
        replacement: Text substituted for each censored word. It is passed to
            ``re.sub`` as the replacement template.

    Returns:
        ``text`` with every censored word replaced by ``replacement``.
    """
    # `\b` cannot delimit these tokens: '*' is not a word character, so there is
    # no word boundary between a trailing/leading '*' and the neighbouring space,
    # and words such as "f***" or "***k" would never match. Lookarounds on the
    # token's own character class delimit it instead.
    censored_token = (
        r'(?<![\w*])'      # not preceded by another token character
        r'(?=[\w*]*\w)'    # token contains at least one word character
        r'(?=[\w*]*\*)'    # token contains at least one asterisk
        r'[\w*]+'          # the whole token
        r'(?![\w*])'       # not followed by another token character
    )
    return re.sub(censored_token, replacement, text)

In [24]:
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return NLTK's English stopwords as a set, reading the corpus only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def preprocess_text(text):
    """Normalise a raw review into a space-separated string of content words.

    Steps, in order: mask asterisk-censored words, lowercase, strip HTML tags,
    drop every character that is not an ASCII letter or whitespace, tokenize,
    remove English stopwords, and re-join the remaining tokens with spaces.

    Args:
        text: Raw review text (may contain HTML such as ``<br />``).

    Returns:
        The cleaned text as a single string; empty if no tokens survive.
    """
    text = replace_censored_words(text)
    # To lowercase (the censor placeholder is lowercased along with the rest)
    text = text.lower()
    # Remove HTML tags. Substitute a space rather than nothing so the words on
    # either side of a tag (e.g. "good.<br /><br />the") are not fused together.
    text = re.sub(r'<[^>]+>', ' ', text)
    # Keep only ASCII letters and whitespace (digits and punctuation are dropped)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    words = word_tokenize(text)
    # Remove stopwords; the set is cached so it is not reloaded for every review
    stop_words = _english_stop_words()
    words = [word for word in words if word not in stop_words]
    # Join words back to text
    return ' '.join(words)

In [25]:
# Load the IMDB reviews CSV and clean the 'review' column in one chained step.
data = (
    pd.read_csv('/Users/rahul/Downloads/IMDB Dataset.csv')
    .assign(review=lambda frame: frame['review'].apply(preprocess_text))
)

In [15]:
# Two-stage text classifier: TF-IDF features feeding a multinomial Naive Bayes.
pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),  # Vectorizer
        ('clf', MultinomialNB()),      # Classifier
    ]
)

In [26]:
# Features are the cleaned review strings; targets are the sentiment labels.
X, y = data['review'], data['sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [17]:
# Fit the vectorizer and the classifier on the training split only.
# Left as the cell's last expression so the fitted pipeline is displayed.
pipeline.fit(X_train, y_train)

In [18]:

# Predict labels for the held-out reviews and print the per-class metrics.
y_pred = pipeline.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

    negative       0.86      0.88      0.87      4961
    positive       0.88      0.85      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [19]:
def test_review(review):
    """Return the label the module-level ``pipeline`` predicts for one raw review.

    The review is cleaned with ``preprocess_text`` first, the same function
    applied to the dataset's 'review' column before the train/test split.
    """
    # predict() takes a collection of documents, so wrap the single cleaned
    # review in a list and unwrap the single prediction afterwards.
    return pipeline.predict([preprocess_text(review)])[0]

In [20]:
# Example input with mixed signals: praise ("absolutely amazing"), a neutral
# remark ("average") and a warning ("please avoid it").
my_review = "I just watched the movie and it was absolutely amazing but it was average. please avoid it!"

# Get sentiment prediction
predicted_sentiment = test_review(my_review)
print(f'Your review sentiment: {predicted_sentiment}')

Your review sentiment: negative
