In [None]:
# Mount Google Drive into the Colab filesystem so files under MyDrive can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.linear_model import LogisticRegression
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

import ssl

# NOTE(security): nltk.download() can fail on hosts whose CA bundle is missing,
# so certificate verification is relaxed while the NLTK resources are fetched.
# The relaxed context is restored to the default afterwards so that every other
# HTTPS request made later in this kernel is still verified.
_default_https_context = ssl._create_default_https_context
_create_unverified_https_context = getattr(ssl, "_create_unverified_context", None)
if _create_unverified_https_context is not None:
    ssl._create_default_https_context = _create_unverified_https_context
try:
    # Tokenizer models (word_tokenize) and the WordNet corpus (lemmatizer).
    for _nltk_resource in ('punkt_tab', 'wordnet'):
        nltk.download(_nltk_resource)
finally:
    # Always re-enable certificate verification, even if a download raised.
    ssl._create_default_https_context = _default_https_context

# Shared lemmatizer instance used by the tweet preprocessing below.
lemmatizer = WordNetLemmatizer()



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Read the tweet CSV from the mounted Drive and summarise its columns.
TWEETS_CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/Tweets_5K.csv'
data = pd.read_csv(TWEETS_CSV_PATH)
data.describe()

Unnamed: 0,textID,text,sentiment
count,5000,5000,5000
unique,5000,4999,3
top,cb774db0d1,may the 4th be with you,neutral
freq,1,2,2023


In [None]:
# Separate the tweet text from its sentiment and encode the sentiment as an integer label.
raw_tweets, sentiments = data['text'], data['sentiment']

# positive -> 1, neutral -> 0, negative -> -1; any other value would map to NaN.
sentiment_mapping = dict(positive=1, neutral=0, negative=-1)
labels = sentiments.map(sentiment_mapping)
raw_tweets.head()

Unnamed: 0,text
0,"I`d have responded, if I were going"
1,Sooo SAD I will miss you here in San Diego!!!
2,my boss is bullying me...
3,what interview! leave me alone
4,"Sons of ****, why couldn`t they put them on t..."


In [None]:
# Basic preprocessing: turn each tweet into a list of whitespace-separated tokens.
# split() with no separator treats any run of whitespace as one delimiter and
# ignores leading/trailing whitespace, so consecutive spaces no longer produce
# empty-string tokens (which would otherwise become a spurious "" feature in the
# bag-of-words matrix built from these lists).
basic_preproc_tweets = raw_tweets.str.split()
basic_preproc_tweets.head()


Unnamed: 0,text
0,"[I`d, have, responded,, if, I, were, going]"
1,"[Sooo, SAD, I, will, miss, you, here, in, San,..."
2,"[my, boss, is, bullying, me...]"
3,"[what, interview!, leave, me, alone]"
4,"[Sons, of, ****,, why, couldn`t, they, put, th..."


In [None]:
def _token_counts(tokens):
    """Return a Series mapping each distinct token in ``tokens`` to its count."""
    return pd.Series(tokens).value_counts()


# Bag-of-words matrix: one row per tweet, one column per distinct token.
# Tokens that do not occur in a tweet come out as NaN and are filled with 0.
basic_preproc_bow = basic_preproc_tweets.apply(_token_counts).fillna(0)

In [None]:
# Dimensions of the bag-of-words matrix: (number of tweets, number of distinct tokens).
basic_preproc_bow.shape

(5000, 15122)

In [None]:
# Spot-check one entry of the bag-of-words matrix and the token its column stands for.
probe = 1460
print(basic_preproc_bow.iloc[probe, probe])
print(basic_preproc_bow.columns[probe])

0.0
HUGE


In [None]:
# Order-preserving 80/20 hold-out: shuffle=False keeps the rows in file order,
# so the last 20% of the tweets form the test set.
(Tweets_train, Tweets_test,
 Labels_train, Labels_test) = train_test_split(
    basic_preproc_bow,
    labels,
    test_size=0.2,
    shuffle=False,
)



In [None]:
# Multinomial Naive Bayes on the token-count features (additive smoothing alpha=1.0).
multi_nb = MultinomialNB(alpha=1.0)
multi_nb.fit(Tweets_train, Labels_train)
model = multi_nb  # fit() returns the estimator itself, so this is the same object

pred_labels = model.predict(Tweets_test)
accuracy = accuracy_score(Labels_test, pred_labels)
print("Accuracy of NB: ", accuracy)

Accuracy of NB:  0.577


In [None]:
# Baseline: predict the most frequent training label for every test tweet.
most_common_label = Labels_train.value_counts().idxmax()
labels_pred_most_common = np.full(Labels_test.count(), most_common_label)
accuracy_most_common = accuracy_score(Labels_test, labels_pred_most_common)
print("Accuracy of most common: ", accuracy_most_common)

Accuracy of most common:  0.427


In [None]:
# Logistic regression on the same token-count features, with the iteration cap raised to 500.
log_reg = LogisticRegression(max_iter=500)
log_reg.fit(Tweets_train, Labels_train)
model = log_reg  # fit() returns the estimator itself, so this is the same object

pred_labels = model.predict(Tweets_test)
accuracy = accuracy_score(Labels_test, pred_labels)
print("Accuracy of Logistic Regression: ", accuracy)

Accuracy of Logistic Regression:  0.587


In [None]:
# Shape of the logistic-regression coefficient matrix. Read it from `log_reg`
# rather than the reused `model` name so the cell does not depend on which
# classifier was fitted most recently.
print(log_reg.coef_.shape)

(3, 15122)


In [None]:
# Links start at a word boundary with "http..." or "www."; the boundary keeps
# elongated words such as "awwww" from being mistaken for a "www..." link.
_URL_PATTERN = re.compile(r'\bhttp\S+|\bwww\.\S+')
# A hashtag is removed together with its word.
_HASHTAG_PATTERN = re.compile(r'#\w+')
# Anything that is not a letter, digit or underscore becomes a space.
_NON_WORD_PATTERN = re.compile(r'\W')


def preprocess_tweet(tweet):
    """Normalise one tweet into a lower-cased, lemmatized, space-joined string.

    Steps, in order: strip URLs, strip hashtags, replace non-word characters
    with spaces, lower-case, tokenize with ``word_tokenize`` and lemmatize each
    token with the module-level ``lemmatizer``.

    Args:
        tweet: The raw tweet text. Non-string values (for example the NaN that
            pandas uses for a missing cell) are treated as an empty tweet.

    Returns:
        The cleaned tweet as a single string of tokens separated by one space;
        ``''`` when the input is empty or not a string.
    """
    if not isinstance(tweet, str):
        return ''

    text = _URL_PATTERN.sub('', tweet)
    text = _HASHTAG_PATTERN.sub('', text)
    text = _NON_WORD_PATTERN.sub(' ', text).lower()

    lemmas = [lemmatizer.lemmatize(token) for token in word_tokenize(text)]
    return ' '.join(lemmas)

# Clean every raw tweet with preprocess_tweet, then show the first rows of the
# raw and the cleaned text for a before/after comparison.
adv_preproc_tweets = raw_tweets.apply(preprocess_tweet)
print(raw_tweets.head())
adv_preproc_tweets.head()


0                  I`d have responded, if I were going
1        Sooo SAD I will miss you here in San Diego!!!
2                            my boss is bullying me...
3                       what interview! leave me alone
4     Sons of ****, why couldn`t they put them on t...
Name: text, dtype: object


Unnamed: 0,text
0,i d have responded if i were going
1,sooo sad i will miss you here in san diego
2,my bos is bullying me
3,what interview leave me alone
4,son of why couldn t they put them on the relea...


In [None]:
# Same order-preserving 80/20 split as before, this time on the cleaned tweet strings.
(New_Tweets_train, New_Tweets_test,
 New_Labels_train, New_Labels_test) = train_test_split(
    adv_preproc_tweets, labels, test_size=0.2, shuffle=False
)

# Fit the TF-IDF vocabulary and weights on the training tweets only, then reuse
# the fitted vectorizer to transform the test tweets.
vectorizer = TfidfVectorizer()
Tfidf_Tweets_train = vectorizer.fit_transform(New_Tweets_train)
Tfidf_Tweets_test = vectorizer.transform(New_Tweets_test)

# Logistic regression with default settings on the TF-IDF features.
model_tfidf = LogisticRegression().fit(Tfidf_Tweets_train, New_Labels_train)

# Score the held-out tweets.
labels_tfidf_pred = model_tfidf.predict(Tfidf_Tweets_test)
accuracy_tfidf = accuracy_score(New_Labels_test, labels_tfidf_pred)
print("Accuracy of Logistic Regression with TF-IDF: ", accuracy_tfidf)

Accuracy of Logistic Regression with TF-IDF:  0.614


In [None]:
# Collect the test tweets the TF-IDF model got wrong, with predicted and actual labels.
# Compare plain arrays so the mask is purely positional and does not depend on
# how pandas aligns an ndarray against the Series index.
misclassified_mask = labels_tfidf_pred != New_Labels_test.to_numpy()
wrongly_classified_indices = np.flatnonzero(misclassified_mask)

# Positions (within the test split) of the first 50 misclassified tweets.
print(wrongly_classified_indices[:50])

wrongly_classified_tweets = New_Tweets_test.iloc[wrongly_classified_indices]
wrongly_classified_pred_labels = labels_tfidf_pred[wrongly_classified_indices]
wrongly_classified_actual_labels = New_Labels_test.iloc[wrongly_classified_indices]
wrongly_classified_df = pd.DataFrame({
    'Tweet': wrongly_classified_tweets,
    'Predicted Label': wrongly_classified_pred_labels,
    'Actual Label': wrongly_classified_actual_labels
})

# Write every misclassified tweet to CSV for offline inspection.
wrongly_classified_df.to_csv('wrongly_classified_tweets.csv', index=False)

# Show the first 50 misclassified tweets alongside their predicted and actual labels.
wrongly_classified_df.head(50)