In [89]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.linear_model import LogisticRegression
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

import ssl
# Swap the default HTTPS context factory for the unverified one when the
# running Python exposes it; otherwise leave the ssl module untouched.
# NOTE(review): this turns off certificate verification for every HTTPS
# request made through the ssl defaults in this kernel -- presumably only
# needed so the nltk.download calls below succeed; confirm it is still required.
_create_unverified_https_context = getattr(ssl, '_create_unverified_context', None)
if _create_unverified_https_context is not None:
    ssl._create_default_https_context = _create_unverified_https_context

# NLTK resources requested by this notebook.
for nltk_package in ('punkt_tab', 'wordnet'):
    nltk.download(nltk_package)

# Shared lemmatizer instance used by preprocess_tweet.
lemmatizer = WordNetLemmatizer()



[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/shrenikborad/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shrenikborad/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [90]:
# Read the tweet dataset from CSV and show summary statistics.
# NOTE(review): absolute, machine-specific path -- the notebook will not run
# elsewhere without editing it.
tweets_csv_path = '/Users/shrenikborad/pless/csci6515_nlu/data/Tweets_5K.csv'
data = pd.read_csv(tweets_csv_path)
data.describe()

Unnamed: 0,textID,text,sentiment
count,5000,5000,5000
unique,5000,4999,3
top,375b64de34,may the 4th be with you,neutral
freq,1,2,2023


In [91]:
# Pull out the text and sentiment columns and encode the sentiment strings
# as integers: positive -> 1, neutral -> 0, negative -> -1.
raw_tweets = data['text']
sentiments = data['sentiment']

sentiment_mapping = dict(positive=1, neutral=0, negative=-1)
labels = sentiments.map(sentiment_mapping)

raw_tweets.head()

0                  I`d have responded, if I were going
1        Sooo SAD I will miss you here in San Diego!!!
2                            my boss is bullying me...
3                       what interview! leave me alone
4     Sons of ****, why couldn`t they put them on t...
Name: text, dtype: object

In [92]:
# Basic preprocessing: trim surrounding whitespace from each tweet, then
# split it on single spaces into a list of tokens. Punctuation and casing
# are left as they are.
stripped_tweets = raw_tweets.str.strip()
basic_preproc_tweets = stripped_tweets.str.split(" ")
basic_preproc_tweets.head()


0          [I`d, have, responded,, if, I, were, going]
1    [Sooo, SAD, I, will, miss, you, here, in, San,...
2                      [my, boss, is, bullying, me...]
3                 [what, interview!, leave, me, alone]
4    [Sons, of, ****,, why, couldn`t, they, put, th...
Name: text, dtype: object

In [93]:
def _token_counts(tokens):
    """Return a Series mapping each distinct token in `tokens` to its count."""
    return pd.Series(tokens).value_counts()


# Bag-of-words matrix: one row per tweet, one column per distinct token.
# Tokens that do not occur in a tweet come back as NaN and are set to 0.
basic_preproc_bow = basic_preproc_tweets.apply(_token_counts).fillna(0)

In [94]:
# Show the bag-of-words matrix dimensions as (rows, columns).
basic_preproc_bow.shape

(5000, 15122)

In [95]:
# Spot-check one cell of the bag-of-words matrix and the name of the
# column sitting at the same position.
probe_position = 1460
print(basic_preproc_bow.iloc[probe_position, probe_position])
print(basic_preproc_bow.columns[probe_position])

0.0
300th


In [96]:
# Hold out 20% of the rows for testing. shuffle=False keeps the original
# row order, so the test set is the tail of the dataset.
Tweets_train, Tweets_test, Labels_train, Labels_test = train_test_split(
    basic_preproc_bow,
    labels,
    test_size=0.2,
    shuffle=False,
)



In [97]:
# Multinomial Naive Bayes on the raw token counts (alpha=1.0).
multi_nb = MultinomialNB(alpha=1.0)
multi_nb.fit(Tweets_train, Labels_train)
model = multi_nb  # fit() returns the estimator itself

pred_labels = model.predict(Tweets_test)
accuracy = accuracy_score(y_true=Labels_test, y_pred=pred_labels)
print("Accuracy of NB: ", accuracy)

Accuracy of NB:  0.577


In [98]:
# Baseline: predict the most frequent training label for every test row.
majority_label = Labels_train.value_counts().idxmax()
labels_pred_most_common = np.full(Labels_test.count(), majority_label)
accuracy_most_common = accuracy_score(Labels_test, labels_pred_most_common)
print("Accuracy of most common: ", accuracy_most_common)

Accuracy of most common:  0.427


In [99]:
# Logistic regression on the same bag-of-words features, with the
# iteration cap raised to 500.
# Note: this rebinds `model`, `pred_labels` and `accuracy` from the
# Naive Bayes cell.
log_reg = LogisticRegression(max_iter=500)
log_reg.fit(Tweets_train, Labels_train)
model = log_reg  # fit() returns the estimator itself

pred_labels = model.predict(Tweets_test)
accuracy = accuracy_score(y_true=Labels_test, y_pred=pred_labels)
print("Accuracy of Logistic Regression: ", accuracy)

Accuracy of Logistic Regression:  0.587


In [100]:
# Print the shape of the fitted coefficient matrix of whichever estimator
# `model` currently refers to.
print(model.coef_.shape)

(3, 15122)


In [101]:
def preprocess_tweet(tweet):
    """Clean one tweet and return it as a single lemmatized, lower-case string.

    Steps, in order:
      1. remove URL-like runs (anything starting with http, www or https),
      2. remove hashtags (``#`` followed by word characters),
      3. remove backticks,
      4. remove exclamation marks,
      5. lower-case the text,
      6. split on single spaces and lemmatize each token with the
         module-level ``lemmatizer``,
      7. join the tokens back together with single spaces.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet.
    """
    # (pattern, flags) pairs, applied one after another; each match is deleted.
    removals = (
        (r'http\S+|www\S+|https\S+', re.MULTILINE),
        (r'#\w+', 0),
        (r'`', 0),
        (r'!', 0),
    )
    cleaned = tweet
    for pattern, flags in removals:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)

    # Splitting on a literal space keeps empty tokens, so the original
    # spacing survives the final join.
    lemmas = map(lemmatizer.lemmatize, cleaned.lower().split(" "))
    return ' '.join(lemmas)

# Run the cleaning function over a copy of the raw tweets, then show the
# first few raw and cleaned tweets for comparison.
tweets_for_cleaning = raw_tweets.copy()
adv_preproc_tweets = tweets_for_cleaning.apply(preprocess_tweet)

print(raw_tweets.head())
adv_preproc_tweets.head()


0                  I`d have responded, if I were going
1        Sooo SAD I will miss you here in San Diego!!!
2                            my boss is bullying me...
3                       what interview! leave me alone
4     Sons of ****, why couldn`t they put them on t...
Name: text, dtype: object


0                   id have responded, if i were going
1           sooo sad i will miss you here in san diego
2                             my bos is bullying me...
3                        what interview leave me alone
4     son of ****, why couldnt they put them on the...
Name: text, dtype: object

In [102]:
# Same unshuffled 80/20 split as before, this time on the cleaned tweets.
New_Tweets_train, New_Tweets_test, New_Labels_train, New_Labels_test = train_test_split(
    adv_preproc_tweets,
    labels,
    test_size=0.2,
    shuffle=False,
)

# TF-IDF over unigrams and bigrams. The vocabulary is fitted on the
# training tweets only and then reused to transform the test tweets.
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2))
Tfidf_Tweets_train = vectorizer.fit_transform(New_Tweets_train)
Tfidf_Tweets_test = vectorizer.transform(New_Tweets_test)

# Logistic regression with default settings on the TF-IDF features.
model_tfidf = LogisticRegression()
model_tfidf.fit(Tfidf_Tweets_train, New_Labels_train)

# Score the held-out tweets.
labels_tfidf_pred = model_tfidf.predict(Tfidf_Tweets_test)
accuracy_tfidf = accuracy_score(New_Labels_test, labels_tfidf_pred)
print("Accuracy of Logistic Regression with TF-IDF: ", accuracy_tfidf)

Accuracy of Logistic Regression with TF-IDF:  0.617


In [105]:
# Collect every misclassified test tweet with its predicted and actual
# label, save the table to CSV, and print how many errors fall on each
# actual label.

# Compare as plain arrays so the mask is purely positional.
misclassified_mask = labels_tfidf_pred != New_Labels_test.to_numpy()
wrongly_classified_indices = np.where(misclassified_mask)[0]

wrongly_classified_tweets = New_Tweets_test.iloc[wrongly_classified_indices]
wrongly_classified_pred_labels = labels_tfidf_pred[wrongly_classified_indices]
wrongly_classified_actual_labels = New_Labels_test.iloc[wrongly_classified_indices]

wrongly_classified_df = pd.DataFrame({
    'Cleaned Tweet': wrongly_classified_tweets,
    # Fetch the originals by index label rather than by a hard-coded
    # positional offset: the cleaned tweets keep the index of raw_tweets,
    # so this stays correct if the dataset size, test_size or shuffling
    # of the split changes.
    'Original Tweet': raw_tweets.loc[wrongly_classified_tweets.index].values,
    'Predicted Label': wrongly_classified_pred_labels,
    'Actual Label': wrongly_classified_actual_labels
})

# Persist the misclassified tweets for manual inspection.
wrongly_classified_df.to_csv('wrongly_classified_tweets.csv', index=False)

# Counts of wrongly classified tweets per actual label.
print(wrongly_classified_df['Actual Label'].value_counts())

Actual Label
-1    145
 0    122
 1    116
Name: count, dtype: int64


In [104]:
# check if missclossfication increses with length of tweet
def tweet_length(tweet):
    return len(tweet.split())

# Word count of each misclassified (cleaned) tweet.
wrongly_classified_df['Tweet Length'] = wrongly_classified_df['Cleaned Tweet'].apply(tweet_length)

# Bucket the lengths into 5-word bins.
bins = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
# Deliberately NOT named `labels`: that name holds the sentiment labels
# used by the train/test split cells, and overwriting it here breaks
# those cells when they are re-run.
length_bin_labels = ['0-5', '6-10', '11-15', '16-20', '21-25', '26-30', '31-35', '36-40', '41-45', '46-50']
wrongly_classified_df['Tweet Length Bin'] = pd.cut(
    wrongly_classified_df['Tweet Length'],
    bins=bins,
    labels=length_bin_labels,
    # Without this the first bin is (0, 5], so a zero-length tweet would
    # get no bin even though the bin is labelled '0-5'.
    include_lowest=True,
)
print(wrongly_classified_df['Tweet Length Bin'].value_counts())

Tweet Length Bin
6-10     89
11-15    84
21-25    74
16-20    68
0-5      42
26-30    26
31-35     0
36-40     0
41-45     0
46-50     0
Name: count, dtype: int64
