# Import necessary libraries

In [1]:
#import numpy
import numpy as np

#import libraries for Naive Bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics
import requests
from sklearn import preprocessing
from sklearn.metrics import classification_report

#import libraries for SVM
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

#import libraries for BI-LSTM
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense




# Text preprocessing: Cleaning data using nltk

In [2]:
#import nltk libarary for text preprocessing
import re
import requests
import nltk

# Download NLTK stopwords
import ssl
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Bypass SSL certificate verification
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context


# Download NLTK stopwords
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)


# Fetch the data
url = "https://datasets-server.huggingface.co/first-rows?dataset=carblacac%2Ftwitter-sentiment-analysis&config=default&split=train"
data = requests.get(url).json()
texts = [row['row']['text'] for row in data['rows']]
labels = [row['row']['feeling'] for row in data['rows']]

# Function to clean and preprocess the text
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and links using regular expressions
    text = re.sub(r'http\S+|www\S+|https\S+|\[.*?\]|\W', ' ', text, flags=re.MULTILINE)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    words = [word for word in words if word.isalnum() and word not in stop_words]
    
    return ' '.join(words)

# Apply the cleaning function to each text
cleaned_texts = [clean_text(text) for text in texts]

# Print cleaned text for the first few samples
for i in range(5):
    print(f"Original: {texts[i]}")
    print(f"Cleaned: {cleaned_texts[i]}\n")


Original: @fa6ami86 so happy that salman won.  btw the 14sec clip is truely a teaser
Cleaned: fa6ami86 happy salman btw 14sec clip truely teaser

Original: @phantompoptart .......oops.... I guess I'm kinda out of it.... Blonde moment -blushes- epic fail
Cleaned: phantompoptart oops guess kinda blonde moment blushes epic fail

Original: @bradleyjp decidedly undecided. Depends on the situation. When I'm out with the people I'll be in Chicago with? Maybe.
Cleaned: bradleyjp decidedly undecided depends situation people chicago maybe

Original: @Mountgrace lol i know! its so frustrating isnt it?!
Cleaned: mountgrace lol know frustrating isnt

Original: @kathystover Didn't go much of any where - Life took over for a while
Cleaned: kathystover go much life took



# Naive Bayes

In [3]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Create a pipeline with CountVectorizer and Multinomial Naive Bayes
model_nb = make_pipeline(CountVectorizer(), MultinomialNB())

# Train the model
model_nb.fit(X_train, y_train)

# Predict on the test set
y_pred_nb = model_nb.predict(X_test)

# Evaluate the model
accuracy_nb = metrics.accuracy_score(y_test, y_pred_nb)
print(f"Naive Bayes Accuracy: {accuracy_nb}")

report_nb = classification_report(y_test, y_pred_nb)
print("Naive Bayes Classification Report:")
print(report_nb)



Naive Bayes Accuracy: 0.45
Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.38      0.62      0.48         8
           1       0.57      0.33      0.42        12

    accuracy                           0.45        20
   macro avg       0.48      0.48      0.45        20
weighted avg       0.50      0.45      0.44        20



# Support Vector Machine

In [4]:
# Create a pipeline with TfidfVectorizer and Support Vector Machine
model_svm = make_pipeline(TfidfVectorizer(), SVC())

# Train the model
model_svm.fit(X_train, y_train)

# Predict on the test set
y_pred_svm = model_svm.predict(X_test)

# Evaluate the model
accuracy_svm = metrics.accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {accuracy_svm}")

report_svm = classification_report(y_test, y_pred_svm)
print("SVM Report:")
print(report_svm)



SVM Accuracy: 0.5
SVM Report:
              precision    recall  f1-score   support

           0       0.44      1.00      0.62         8
           1       1.00      0.17      0.29        12

    accuracy                           0.50        20
   macro avg       0.72      0.58      0.45        20
weighted avg       0.78      0.50      0.42        20



# BI-LSTM

In [5]:
# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
total_words = len(tokenizer.word_index) + 1

# Convert text to sequences and pad sequences
sequences = tokenizer.texts_to_sequences(X_train)
X_train_lstm = pad_sequences(sequences)

# Convert labels to numpy array
y_train = np.array(y_train)

# Convert test labels to numpy array
y_test = np.array(y_test)

# Split the data into training and validation sets
X_train_lstm, X_val_lstm, y_train, y_val = train_test_split(X_train_lstm, y_train, test_size=0.2, random_state=42)

# Build the Bi-LSTM model
model_lstm = Sequential()
model_lstm.add(Embedding(total_words, 100, input_length=len(X_train_lstm[0])))
model_lstm.add(Bidirectional(LSTM(64)))
model_lstm.add(Dense(1, activation='sigmoid'))

# Compile the model
model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model_lstm.fit(np.array(X_train_lstm), y_train, epochs=30, batch_size=32, validation_data=(np.array(X_val_lstm), y_val))

# Convert test data to sequences and pad sequences
X_test_lstm = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=len(X_train_lstm[0]))

# Evaluate the model
accuracy_lstm = model_lstm.evaluate(X_test_lstm, y_test)[1]
print(f"Bi-LSTM Accuracy: {accuracy_lstm}")


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Bi-LSTM Accuracy: 0.550000011920929
