In [None]:
!pip install nltk

In [None]:
!pip install wordcloud

In [None]:
# Importing necessary libraries
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from wordcloud import WordCloud

# Read the IMDB reviews CSV (path is relative to the notebook's working directory)
dataset_path = "IMDB Dataset.csv"
data = pd.read_csv(dataset_path)


In [None]:
# Quick look at the first five rows of the loaded frame
data.head(n=5)

# Dataset Preparation


In [None]:
# Preview the first ten rows
first_ten_rows = data.head(10)
print(first_ten_rows)

# Number of missing entries in each column
missing_per_column = data.isnull().sum()
print(missing_per_column)

# Number of rows that repeat an earlier row exactly
duplicate_row_count = data.duplicated().sum()
print(duplicate_row_count)


In [None]:
# Convert text to lowercase
data['review'] = data['review'].str.lower()

# Remove punctuation (every character that is neither a word character nor
# whitespace). regex=True is stated explicitly: since pandas 2.0,
# Series.str.replace treats the pattern as literal text by default, which would
# leave all punctuation in place. The raw string keeps \w and \s from being
# parsed as (invalid) Python string escapes.
data['review'] = data['review'].str.replace(r'[^\w\s]', '', regex=True)

# Tokenization and Stopword removal
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  # set gives O(1) membership checks
data['review'] = data['review'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stop_words])
)

# Stemming (reducing words to their base forms)
stemmer = PorterStemmer()
data['review'] = data['review'].apply(
    lambda x: ' '.join([stemmer.stem(word) for word in x.split()])
)


# Tokenization and TF-IDF Feature Extraction

In [None]:
# Splitting the dataset into features (X) and target (y)
X = data['review']
y = data['sentiment']

# Convert text data into numerical feature vectors using TF-IDF.
# The result is kept as the sparse matrix returned by fit_transform: calling
# .toarray() would allocate a dense (n_reviews x 2000) float array, while
# train_test_split and MultinomialNB both accept sparse input directly.
# NOTE(review): the vectorizer is fitted on every review before the train/test
# split below, so its vocabulary and IDF weights also reflect the test reviews.
vectorizer = TfidfVectorizer(max_features=2000)  # You can adjust the max_features as needed
X_tfidf = vectorizer.fit_transform(X)


# Naive Bayes Classification

In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a multinomial Naive Bayes model on the training portion
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train, y_train)

# Score the fitted model on the held-out portion
y_pred = naive_bayes_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Report the metrics, each heading followed by its value on the next line
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:", conf_matrix, sep="\n")
print("Classification Report:", class_report, sep="\n")


# Prediction with New Data

In [None]:
def predict_sentiment(review):
    """Predict the sentiment label for a single raw review string.

    The text is lowercased, stripped of punctuation, filtered for stopwords
    and stemmed, then vectorized with the fitted TF-IDF vectorizer and passed
    to the trained classifier.

    Relies on the notebook-level objects ``stop_words``, ``stemmer``,
    ``vectorizer`` and ``naive_bayes_classifier``; the cells that create them
    must have been run first.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    The classifier's predicted label for the review (the first and only
    element of the prediction array).
    """
    # Preprocess the review.
    # str.replace substitutes literal text only, so the character class must be
    # applied with re.sub for punctuation to actually be removed.
    review = re.sub(r'[^\w\s]', '', review.lower())
    review = ' '.join([word for word in review.split() if word not in stop_words])
    review = ' '.join([stemmer.stem(word) for word in review.split()])

    # Convert the review into numerical feature vector using TF-IDF
    # (transform expects an iterable of documents, hence the one-element list)
    review_tfidf = vectorizer.transform([review]).toarray()

    # Predict the sentiment
    prediction = naive_bayes_classifier.predict(review_tfidf)

    return prediction[0]


In [None]:
# Separate the review texts by sentiment label so each class gets its own word cloud
is_positive = data['sentiment'] == 'positive'
is_negative = data['sentiment'] == 'negative'
positive_reviews = data.loc[is_positive, 'review'].values
negative_reviews = data.loc[is_negative, 'review'].values

def generate_word_cloud(reviews, title):
    """Draw a word cloud for a collection of review strings.

    Parameters
    ----------
    reviews : iterable of str
        Review texts; they are joined with single spaces into one document.
    title : str
        Title shown above the figure.
    """
    combined_text = ' '.join(reviews)
    cloud_builder = WordCloud(width=800, height=400, background_color='white')
    cloud = cloud_builder.generate(combined_text)

    # Show the rendered cloud as an image with the axes hidden
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.show()

# One word cloud per sentiment class, positive first
for class_reviews, cloud_title in (
    (positive_reviews, "Word Cloud - Positive Reviews"),
    (negative_reviews, "Word Cloud - Negative Reviews"),
):
    generate_word_cloud(class_reviews, cloud_title)
