In [2]:
!pip install datasets


Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from datasets import load_dataset

In [5]:
# Load the "carblacac/twitter-sentiment-analysis" dataset and, from its
# 'train' split, take the 'text' column as model input and the 'feeling'
# column as the class label.
dataset = load_dataset("carblacac/twitter-sentiment-analysis")
tweets, labels = (dataset['train'][column] for column in ('text', 'feeling'))

In [6]:
# Hold out 20% of the examples for evaluation. The fixed random_state makes
# the partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    tweets,
    labels,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Bag-of-words features. The vocabulary is learned from the training tweets
# only; the same fitted vectorizer then encodes both the training and the
# held-out tweets, so the test set never influences the feature space.
vectorizer = CountVectorizer().fit(X_train)
X_train_vec = vectorizer.transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [8]:
# Multinomial Naive Bayes (default parameters) trained on the count features.
# The fit call is left as the cell's final expression so the fitted estimator
# is still displayed as the cell output.
nb_classifier = MultinomialNB()
nb_classifier.fit(X=X_train_vec, y=y_train)

In [9]:

# Score the held-out tweets with the fitted Naive Bayes model.
y_pred = nb_classifier.predict(X_test_vec)

# Overall accuracy first, then the per-class precision / recall / F1 table.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_true=y_test, y_pred=y_pred))


Accuracy: 0.7638136511375948
              precision    recall  f1-score   support

           0       0.73      0.83      0.78     11954
           1       0.80      0.70      0.75     12044

    accuracy                           0.76     23998
   macro avg       0.77      0.76      0.76     23998
weighted avg       0.77      0.76      0.76     23998



In [16]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import HashingVectorizer

In [13]:
# Raw-text split for the SVM pipeline (which vectorizes internally).
# This uses the same inputs, test_size and random_state as the earlier
# train_test_split call in this notebook, so it yields the same partition;
# y_train / y_test are simply rebound to equal values.
x_train, x_test, y_train, y_test = train_test_split(
    tweets,
    labels,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Two-step pipeline: hash raw tweet text into feature vectors, then classify
# with a support vector classifier. Both steps use their default parameters.
model_svm = make_pipeline(
    HashingVectorizer(),
    SVC(),
)

In [18]:
# Train the SVM on only the first `subset_size` training examples
# (presumably to keep SVC fitting time manageable -- confirm before changing).
subset_size = 1000
x_train_subset = x_train[:subset_size]
y_train_subset = y_train[:subset_size]

In [20]:
# Fit the hashing + SVC pipeline on the reduced training subset. Kept as the
# cell's final expression so the fitted pipeline is still displayed.
model_svm.fit(X=x_train_subset, y=y_train_subset)

In [21]:
# Predict a label for every held-out tweet (the full x_test, not a subset)
# using the fitted SVM pipeline.
y_pred_svm = model_svm.predict(x_test)

In [22]:
# Evaluate the SVM pipeline on the held-out tweets.
# NOTE: this model was fitted on only `subset_size` training examples, so its
# scores are not a like-for-like comparison with a model trained on the full
# training split.
accuracy_svm = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
print("SVM Accuracy:", accuracy_svm)
print("Classification Report:\n",classification_report(y_true=y_test, y_pred=y_pred_svm))

SVM Accuracy: 0.6607633969497458
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.65      0.66     11954
           1       0.66      0.67      0.67     12044

    accuracy                           0.66     23998
   macro avg       0.66      0.66      0.66     23998
weighted avg       0.66      0.66      0.66     23998

