In [1]:
import random
import nltk
from nltk.corpus import twitter_samples
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from utils import process_tweet

In [2]:
# downloads sample twitter dataset.
nltk.download('twitter_samples')

# select the set of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# Combine positive and negative tweets
all_tweets = all_positive_tweets + all_negative_tweets

# Create labels for the tweets (1 for positive, 0 for negative)
labels = [1] * len(all_positive_tweets) + [0] * len(all_negative_tweets)

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /home/bapary/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


In [3]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(all_tweets, labels, test_size=0.2, random_state=42)

# Preprocess the training and testing data
X_train_preprocessed = [' '.join(process_tweet(tweet)) for tweet in X_train]
X_test_preprocessed = [' '.join(process_tweet(tweet)) for tweet in X_test]

In [4]:
# Convert tweets to TF-IDF features
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train_preprocessed)
X_test_tfidf = vectorizer.transform(X_test_preprocessed)

In [5]:
# Train Multinomial Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

In [6]:
# Make predictions
y_pred = nb_model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{class_report}")

Accuracy: 0.742
Confusion Matrix:
[[753 235]
 [281 731]]
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.76      0.74       988
           1       0.76      0.72      0.74      1012

    accuracy                           0.74      2000
   macro avg       0.74      0.74      0.74      2000
weighted avg       0.74      0.74      0.74      2000



In [7]:
# Create a list of custom tweets for testing
custom_tweets = [
    "I love using this product! It's amazing.",
    "This movie is terrible, I didn't enjoy it at all.",
    "The weather is beautiful today!",
    "Feeling down and tired, not a good day."
]

# Preprocess the custom tweets
custom_tweets_preprocessed = [' '.join(process_tweet(tweet)) for tweet in custom_tweets]

# Convert custom tweets to TF-IDF features
custom_tweets_tfidf = vectorizer.transform(custom_tweets_preprocessed)

# Make predictions for custom tweets
custom_sentiment_predictions = nb_model.predict(custom_tweets_tfidf)

# Print the custom sentiment predictions
for tweet, prediction in zip(custom_tweets, custom_sentiment_predictions):
    sentiment_label = "Positive" if prediction == 1 else "Negative"
    print(f"Tweet: {tweet}\nPredicted Sentiment: {sentiment_label}\n")

Tweet: I love using this product! It's amazing.
Predicted Sentiment: Positive

Tweet: This movie is terrible, I didn't enjoy it at all.
Predicted Sentiment: Positive

Tweet: The weather is beautiful today!
Predicted Sentiment: Positive

Tweet: Feeling down and tired, not a good day.
Predicted Sentiment: Negative

