This project develops a sentiment analysis system that classifies text as positive or negative. Using the NLTK movie reviews dataset, it preprocesses the text and converts it into numerical features with TF-IDF, then trains and evaluates Logistic Regression, Naive Bayes, and SVM models. It also provides a function that returns the predicted sentiment for new text input on demand, showcasing the application of NLP and machine learning in extracting insights from textual data.

Import Libraries and Download NLTK Datasets

In [17]:
import pandas as pd
import nltk
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


Download required NLTK datasets

In [20]:
# Corpus of labelled movie reviews used for training and evaluation
nltk.download('movie_reviews')
# Punkt tokenizer models used by word_tokenize
nltk.download('punkt')
# Recent NLTK releases load the Punkt models from the 'punkt_tab' resource
# instead of 'punkt'; fetching both keeps word_tokenize working on old and
# new installs.
nltk.download('punkt_tab')
# English stop-word list used during cleaning
nltk.download('stopwords')
# WordNet data required by WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/riteshgaire/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/riteshgaire/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/riteshgaire/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/riteshgaire/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Load and Preprocess the Data

In [23]:
# Build a DataFrame of tokenized reviews and their labels from the NLTK corpus
def load_movie_reviews():
    """Return the NLTK ``movie_reviews`` corpus as a two-column DataFrame.

    Each row corresponds to one corpus file. 'Review' holds the list of
    tokens returned by ``movie_reviews.words`` for that file, and 'Sentiment'
    holds the corpus category the file is listed under. Rows are ordered by
    category, then by file id within the category.
    """
    rows = []
    for label in movie_reviews.categories():
        for file_id in movie_reviews.fileids(label):
            tokens = list(movie_reviews.words(file_id))
            rows.append((tokens, label))
    return pd.DataFrame(rows, columns=['Review', 'Sentiment'])

# Preprocess the data by cleaning the text
def preprocess_data(df):
    """Return a new DataFrame whose 'Review' column holds cleaned text.

    Each entry of ``df['Review']`` must be an iterable of word tokens. Every
    token is lowercased; tokens made up entirely of punctuation characters
    and English stop words are dropped; the remaining tokens are lemmatized
    and joined with single spaces.

    Args:
        df: DataFrame with a 'Review' column of token sequences. It is not
            modified; other columns are carried over unchanged.

    Returns:
        A DataFrame with the same columns as ``df`` in which 'Review' has
        been replaced by the cleaned strings.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    def is_punctuation(token):
        # Check every character so multi-character tokens such as "--" or
        # "..." are removed too, not only single punctuation marks. An empty
        # token also counts as punctuation and is dropped.
        return all(char in punctuation for char in token)

    # Function to clean a single review
    def clean_review(review):
        lowered = (word.lower() for word in review)
        kept = (
            word for word in lowered
            if not is_punctuation(word) and word not in stop_words
        )
        return ' '.join(lemmatizer.lemmatize(word) for word in kept)

    # assign() builds a new frame, so the caller's DataFrame keeps its
    # original token lists instead of being overwritten in place.
    return df.assign(Review=df['Review'].apply(clean_review))

# Load the corpus, clean every review, and preview the first rows
df = preprocess_data(load_movie_reviews())
print(df.head())


                                              Review Sentiment
0  plot two teen couple go church party drink dri...       neg
1  happy bastard quick movie review damn y2k bug ...       neg
2  movie like make jaded movie viewer thankful in...       neg
3  quest camelot warner bros first feature length...       neg
4  synopsis mentally unstable man undergoing psyc...       neg


Convert Text to Numerical Features (TF-IDF)

In [26]:
# Convert the text data to TF-IDF features
def convert_to_tfidf(df, max_features=3000):
    """Fit a TF-IDF vectorizer on the reviews and return the feature matrix.

    The vectorizer is fitted on every row of ``df['Review']``, so the
    vocabulary and IDF weights reflect the whole DataFrame passed in.

    Args:
        df: DataFrame with a 'Review' column of text and a 'Sentiment'
            column of labels.
        max_features: Value forwarded to ``TfidfVectorizer(max_features=...)``
            to cap the vocabulary size. Defaults to 3000, the value that was
            previously hard-coded.

    Returns:
        A tuple ``(X, y, vectorizer)``: ``X`` is the dense TF-IDF array,
        ``y`` is ``df['Sentiment']``, and ``vectorizer`` is the fitted
        ``TfidfVectorizer`` for transforming new text later.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    # toarray() converts the sparse result into a dense array
    X = vectorizer.fit_transform(df['Review']).toarray()
    return X, df['Sentiment'], vectorizer

# Vectorize the cleaned reviews; keep the fitted vectorizer so new text can
# later be transformed with the same vocabulary
X, y, vectorizer = convert_to_tfidf(df)


Split the Data and Train the Models

In [29]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# the same across runs.
# NOTE(review): X was produced by a vectorizer fitted on the full dataset
# before this split, so the held-out reviews contributed to the vocabulary
# and IDF weights.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Logistic Regression: fit, predict on the held-out set, then report
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_test)
print(
    "Logistic Regression Accuracy:",
    accuracy_score(y_test, y_pred_lr),
)
print(
    "Logistic Regression Classification Report:\n",
    classification_report(y_test, y_pred_lr),
)

# Multinomial Naive Bayes: same procedure
model_nb = MultinomialNB()
model_nb.fit(X_train, y_train)
y_pred_nb = model_nb.predict(X_test)
print(
    "Naive Bayes Accuracy:",
    accuracy_score(y_test, y_pred_nb),
)
print(
    "Naive Bayes Classification Report:\n",
    classification_report(y_test, y_pred_nb),
)

# Support Vector Machine with a linear kernel: same procedure
model_svm = SVC(kernel='linear')
model_svm.fit(X_train, y_train)
y_pred_svm = model_svm.predict(X_test)
print(
    "SVM Accuracy:",
    accuracy_score(y_test, y_pred_svm),
)
print(
    "SVM Classification Report:\n",
    classification_report(y_test, y_pred_svm),
)


Logistic Regression Accuracy: 0.815
Logistic Regression Classification Report:
               precision    recall  f1-score   support

         neg       0.82      0.81      0.81       199
         pos       0.81      0.82      0.82       201

    accuracy                           0.81       400
   macro avg       0.82      0.81      0.81       400
weighted avg       0.82      0.81      0.81       400

Naive Bayes Accuracy: 0.8
Naive Bayes Classification Report:
               precision    recall  f1-score   support

         neg       0.78      0.83      0.81       199
         pos       0.82      0.77      0.79       201

    accuracy                           0.80       400
   macro avg       0.80      0.80      0.80       400
weighted avg       0.80      0.80      0.80       400

SVM Accuracy: 0.81
SVM Classification Report:
               precision    recall  f1-score   support

         neg       0.82      0.79      0.81       199
         pos       0.80      0.83      0.81       201

Real-Time Sentiment Analysis Function

In [32]:
# Function to analyze sentiment of new text input
def analyze_sentiment(text, model, vectorizer):
    """Predict the sentiment label of a single piece of raw text.

    The text is tokenized with ``word_tokenize`` and lowercased; tokens made
    up entirely of punctuation characters and English stop words are
    dropped; the remaining tokens are lemmatized and joined with spaces. The
    cleaned string is then transformed by ``vectorizer`` and passed to
    ``model.predict``.

    Args:
        text: Raw input string to classify.
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``; it should be
            the one used to build the model's training features.

    Returns:
        The first (and only) element of ``model.predict``'s output for the
        input text.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    def is_punctuation(token):
        # Check every character so multi-character tokens (e.g. "...") are
        # removed too, not only single punctuation marks.
        return all(char in punctuation for char in token)

    # Function to clean a single review
    def clean_review(review):
        lowered = (word.lower() for word in word_tokenize(review))
        kept = (
            word for word in lowered
            if not is_punctuation(word) and word not in stop_words
        )
        return ' '.join(lemmatizer.lemmatize(word) for word in kept)

    # Clean and vectorize the input text; transform() expects an iterable of
    # documents, hence the one-element list.
    processed_text = clean_review(text)
    tfidf_text = vectorizer.transform([processed_text]).toarray()
    sentiment = model.predict(tfidf_text)
    return sentiment[0]

# Example usage: classify one sentence with the fitted Logistic Regression
# model, reusing the vectorizer returned by convert_to_tfidf
print(analyze_sentiment("This movie was absolutely fantastic!", model_lr, vectorizer))


pos
