In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [4]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import re

print("Loading Sentiment140 dataset...")
dataset = load_dataset('sentiment140')

train_data = pd.DataFrame(dataset['train'])

train_data = train_data[train_data['sentiment'].isin([0, 4])]
train_data['sentiment'] = train_data['sentiment'].replace(4, 1)

train_data = train_data.sample(10000, random_state=42)

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    return text

train_data['text'] = train_data['text'].apply(clean_text)

X = train_data['text']
y = train_data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)  # Limit to 5000 features
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)

y_pred = nb_classifier.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'positive']))

def predict_sentiment(text, vectorizer, model):
    cleaned_text = clean_text(text)
    text_vector = vectorizer.transform([cleaned_text])
    prediction = model.predict(text_vector)
    return "positive" if prediction[0] == 1 else "negative"

custom_tweet1 = "I love this new phone, it’s amazing!"
result1 = predict_sentiment(custom_tweet1, vectorizer, nb_classifier)
print(f"\nCustom Tweet: '{custom_tweet1}'")
print(f"Predicted Sentiment: {result1}")

custom_tweet2 = "This weather is awful, so depressing."
result2 = predict_sentiment(custom_tweet2, vectorizer, nb_classifier)
print(f"\nCustom Tweet: '{custom_tweet2}'")
print(f"Predicted Sentiment: {result2}")

Loading Sentiment140 dataset...
The repository for sentiment140 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/sentiment140.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/81.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1600000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/498 [00:00<?, ? examples/s]

Accuracy: 71.05%

Classification Report:
              precision    recall  f1-score   support

    negative       0.69      0.73      0.71       980
    positive       0.73      0.69      0.71      1020

    accuracy                           0.71      2000
   macro avg       0.71      0.71      0.71      2000
weighted avg       0.71      0.71      0.71      2000


Custom Tweet: 'I love this new phone, it’s amazing!'
Predicted Sentiment: positive

Custom Tweet: 'This weather is awful, so depressing.'
Predicted Sentiment: negative
