<a href="https://colab.research.google.com/github/ramarh1/FIFA-World-Cup-Sentiment-Analysis/blob/main/tweet_final_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import spacy
import nltk
import re
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


**References**

https://medium.com/@piocalderon/vader-sentiment-analysis-explained-f1c4f9101cd9

# **Read Dataset**

In [None]:
tweets = pd.read_csv("./fifa_world_cup_2022_tweets.csv", encoding='utf-8')
tweets

FileNotFoundError: ignored

In [None]:
#Check number columns and their name
tweets.columns

## **1. Visualization of the Dataset**

In [None]:
# Breakdown of the tweets' pre-assigned sentiment labels

# Series.value_counts() replaces the deprecated top-level pd.value_counts().
sentiment_counts = tweets['Sentiment'].value_counts()

# Create the axes explicitly and hand them to pandas, so the figure size is
# guaranteed to apply to this chart rather than to whatever figure is current.
fig, ax = plt.subplots(figsize=(10, 5))
sentiment_counts.plot.bar(ax=ax, color=['purple', 'yellow', 'black'])
ax.set_title('Tweets per sentiment label')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Number of tweets')

# Write each count above its bar; counts are whole numbers, so no decimals.
for bar in ax.patches:
    ax.annotate('{:.0f}'.format(bar.get_height()),
                (bar.get_x() + bar.get_width() / 2, bar.get_height()),
                ha='center', va='bottom')
plt.show()


It looks like most of the tweets about the FIFA World Cup are labelled positive. However, the predetermined sentiment tag does not seem to do well at deciding a tweet's sentiment when the emotion is not at either extreme, which results in a lot of neutral tags as well.

In [None]:
# Share of tweets by client application (five most frequent sources).
# The percentages are relative to these five sources only, not to all tweets.
sources = tweets['Source of Tweet'].value_counts()[:5]

# Take the labels from the counts themselves: a hand-written list is paired
# with the wedges purely by position, so it silently mislabels them whenever
# the ranking in the data differs from the order that was typed in.
labels = sources.index.tolist()
plt.pie(sources, labels=labels, autopct='%1.2f%%')
plt.title('Top 5 tweet sources')
plt.show()


Most of the tweets seem to come from mobile platforms, which makes sense because Twitter is a mobile-oriented social media platform.

## **2. Data Preprocessing**

### 2.1 Extract urls, hashtags, usernames and remove them from original text

In [None]:
# Extract the first URL found in each tweet (NaN where the pattern does not match).
# `url` keeps one row per tweet; only the preview on the last line is limited
# to five rows, so the variable is not silently truncated.
url = tweets.Tweet.str.extract(r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})')
url.head()

In [None]:
# function to pick the hashtag of a tweet
def extract_hashtags(text):
    """Return the last hashtag found in ``text``, without the leading '#'.

    The text is split on whitespace and every word that starts with '#' is a
    candidate; when a tweet carries several hashtags only the last one is kept.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str or None
        The hashtag text, or ``None`` when the tweet has no hashtag, so that
        pandas treats the value as missing instead of storing a placeholder
        object.
    """
    last_hashtag = None
    for word in text.split():
        # A bare "#" carries no tag text, so require at least one more character.
        if word.startswith("#") and len(word) > 1:
            last_hashtag = word[1:]
    return last_hashtag


#create an object to store all the hashtags in tweets
tweets['hashtags'] = tweets['Tweet'].apply(lambda x: extract_hashtags(x))


In [None]:
# Breakdown of the hashtags in the tweets
# top 10 most mentioned hashtags
ax = tweets['hashtags'].value_counts()[:10].plot(kind='barh', figsize=(5, 5))

The host country, Qatar, was a hot topic during the FIFA World Cup period and is mentioned quite often.


In [None]:
# Tweets contain embedded newlines - flatten each tweet onto a single line
def whitespace_sub(text):
    """Return ``text`` with every newline character replaced by a space.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The same text with each '\\n' turned into ' '.
    """
    # str.replace already substitutes every occurrence in one pass, so there is
    # no need to collect the matches first and loop over them.
    return text.replace('\n', ' ')

tweets['Tweet']= tweets['Tweet'].apply(whitespace_sub)


In [None]:
# username extraction from tweets
def extract_usernames(text):
    """Return the last @-mentioned username in ``text``, lower-cased.

    The text is lower-cased and split on whitespace; a word counts as a mention
    when it starts with '@' followed by at least one letter, digit or
    underscore. Anything after that run (for example a trailing ',' or ':') is
    dropped, so "@user," and "@user" yield the same username.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The username without the '@', or "" when the tweet mentions nobody.
        The empty string is falsy, so rows without a mention can be filtered
        out by truthiness.
    """
    last_username = ""
    for word in text.lower().split():
        # re.match anchors at the start of the word, so "name@host" is ignored.
        handle = re.match(r"@([a-z0-9_]+)", word)
        if handle:
            last_username = handle.group(1)
    return last_username


tweets['usernames'] = tweets['Tweet'].apply(lambda x: extract_usernames(x))



In [None]:
tweets1 = tweets[tweets['usernames'].astype(bool)]
usernames = tweets1['usernames'].value_counts()[:10]
usernames.plot(kind='barh')


BTS, the opening singer for the World Cup, and Elon Musk, Twitter's new CEO, are mentioned the most during the event period.

In [None]:
tweets.head()

### 2.2 Remove stop words

In [None]:
#download the stopword library
nltk.download("stopwords")

#save stop words as a set
stop_words = set(stopwords.words("english"))


In [None]:
#custom stop words - to minimize repeated words
# Tokens are lower-cased and split on whitespace before they are compared with
# this set, so every entry has to be a single lower-case token: upper-case
# entries such as 'FIFA' or multi-word entries such as 'World Cup' can never
# match. Duplicates are dropped as well.
new_stop_words = ['fifa', 'fifaworldcup', 'football', 'world', 'cup',
                  'worldcup', 'wordcup2022', 'worldcup2022',
                  'qatar', 'qatar2022', 'qatarworldcup', 'qatarworldcup2022',
                  '2022']
all_stop_words = stop_words.union(new_stop_words)

# convert the stop words into a list
all_stop_words_list = list(all_stop_words)


In [None]:
#stemming
snow = nltk.stem.SnowballStemmer('english')


In [None]:
# Combine all above to one function to fix texts
def fix_Text(text):
    """Normalise one tweet into space-separated, stemmed, lower-case tokens.

    Steps, in order:
      1. remove URLs, wherever they occur in the text;
      2. remove hashtags and @-mentions;
      3. replace every remaining non-letter character with a space;
      4. lower-case the text and split it on whitespace;
      5. drop tokens found in ``all_stop_words``;
      6. stem the remaining tokens with ``snow``.

    Parameters
    ----------
    text : object
        Tweet text; it is passed through ``str()`` first.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" if nothing is left).
    """
    # The URL pattern must not be prefixed with [^a-zA-Z]: that form only
    # matches a URL that directly follows a non-letter, so a URL at the very
    # start of a tweet survives and plain punctuation is never removed.
    cleaned = re.sub(r"https?://\S+", " ", str(text))
    cleaned = re.sub(r"#[A-Za-z0-9_]+", " ", cleaned)  # hashtags
    cleaned = re.sub(r"@[A-Za-z0-9_]+", " ", cleaned)  # mentions
    # Hashtags and mentions have to be removed before this step, otherwise the
    # '#'/'@' markers disappear and the tag text stays behind as a normal word.
    cleaned = re.sub(r"[^a-zA-Z]+", " ", cleaned)

    return " ".join(
        snow.stem(token)
        for token in cleaned.lower().split()
        if token not in all_stop_words
    )


In [None]:
#apply the function on all the tweets
clean_tweets = tweets.Tweet.apply(fix_Text)
clean_tweets.head()

In [None]:
#check for empty documents
# fix_Text returns a string for every tweet, so a tweet that lost all of its
# tokens shows up as "" rather than as a null; count both cases.
(clean_tweets.isnull() | clean_tweets.eq("")).sum()

In [None]:
# fix the format on sentiment tag
def fix_sentiment(df):
    """Return a copy of ``df`` with abbreviated sentiment tags spelled out.

    'NEU' and 'NEUTRAL' become 'neutral', 'POS' becomes 'positive' and 'NEG'
    becomes 'negative'; every other value is left untouched.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Object holding the sentiment tags.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        A new object of the same kind; the input is not modified, so the
        caller has to assign the result.
    """
    label_map = {
        'NEU': 'neutral',
        'NEUTRAL': 'neutral',
        'POS': 'positive',
        'NEG': 'negative',
    }
    # Returning the result matters: without it the function yields None and the
    # replacements are lost.
    return df.replace(label_map)

### 2.3 Create Word Clouds

In [None]:

from wordcloud import WordCloud
import matplotlib.pyplot as plt


In [None]:
# Join every real hashtag into one global document for the wordcloud.
# Tweets without a hashtag hold a non-string placeholder in this column;
# casting the whole column to str would feed that placeholder's text into the
# cloud, so only non-empty strings are kept and the column itself is left as is.
hashtags = " ".join(
    tag for tag in tweets['hashtags'] if isinstance(tag, str) and tag
)

#create the wordcloud of all hashtags
hashtag_wordcloud = WordCloud(width=1600, height=800,
                              background_color='white').generate(hashtags)

# Display the wordcloud
plt.figure(figsize=(5, 5))
plt.imshow(hashtag_wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()


In [None]:
# word cloud for all the tweets
text_wc = " ".join(tweets['Tweet'].to_list())
wordcloud_tweets = WordCloud(background_color="white").generate(text_wc)
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud_tweets, interpolation="bilinear")
plt.axis("off")
plt.show()


### 2.4 Create TFIDF and Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer()


In [None]:
# Combine all above to one function to fix texts
# NOTE(review): fix_Text is already defined in an earlier cell (section 2.2);
# this cell re-defines it. Keep the two definitions identical, or delete one.
def fix_Text(text):
    """Normalise one tweet into space-separated, stemmed, lower-case tokens.

    Steps, in order:
      1. remove URLs, wherever they occur in the text;
      2. remove hashtags and @-mentions;
      3. replace every remaining non-letter character with a space;
      4. lower-case the text and split it on whitespace;
      5. drop tokens found in ``all_stop_words``;
      6. stem the remaining tokens with ``snow``.

    Parameters
    ----------
    text : object
        Tweet text; it is passed through ``str()`` first.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" if nothing is left).
    """
    # The URL pattern must not be prefixed with [^a-zA-Z]: that form only
    # matches a URL that directly follows a non-letter, so a URL at the very
    # start of a tweet survives and plain punctuation is never removed.
    cleaned = re.sub(r"https?://\S+", " ", str(text))
    cleaned = re.sub(r"#[A-Za-z0-9_]+", " ", cleaned)  # hashtags
    cleaned = re.sub(r"@[A-Za-z0-9_]+", " ", cleaned)  # mentions
    # Hashtags and mentions have to be removed before this step, otherwise the
    # '#'/'@' markers disappear and the tag text stays behind as a normal word.
    cleaned = re.sub(r"[^a-zA-Z]+", " ", cleaned)

    return " ".join(
        snow.stem(token)
        for token in cleaned.lower().split()
        if token not in all_stop_words
    )


#apply function
tweets['tweet_lem'] = tweets['Tweet'].apply(lambda x: fix_Text(x))

In [None]:
def fix_Text2(text):
    """Clean one tweet without removing stop words and without stemming.

    Steps, in order:
      1. remove URLs, wherever they occur in the text;
      2. remove hashtags and @-mentions;
      3. replace every remaining non-letter character with a space;
      4. lower-case the text and collapse runs of whitespace.

    Parameters
    ----------
    text : object
        Tweet text; it is passed through ``str()`` first.

    Returns
    -------
    str
        Lower-case words joined by single spaces ("" if nothing is left).
    """
    # The URL pattern must not be prefixed with [^a-zA-Z]: that form only
    # matches a URL that directly follows a non-letter, so a URL at the very
    # start of a tweet survives and plain punctuation is never removed.
    cleaned = re.sub(r"https?://\S+", " ", str(text))
    cleaned = re.sub(r"#[A-Za-z0-9_]+", " ", cleaned)  # hashtags
    cleaned = re.sub(r"@[A-Za-z0-9_]+", " ", cleaned)  # mentions
    # Hashtags and mentions have to be removed before this step, otherwise the
    # '#'/'@' markers disappear and the tag text stays behind as a normal word.
    cleaned = re.sub(r"[^a-zA-Z]+", " ", cleaned)

    return " ".join(cleaned.lower().split())


#apply function
tweets['tweet_updated'] = tweets['Tweet'].apply(lambda x: fix_Text2(x))


In [None]:
# word cloud for all the tweets
text_wc = " ".join(tweets['tweet_updated'].to_list())
wordcloud_tweets = WordCloud(background_color="white").generate(text_wc)
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud_tweets, interpolation="bilinear")
plt.axis("off")
plt.show()


In [None]:
cv = CountVectorizer(stop_words=all_stop_words_list, ngram_range=(
    1, 1), tokenizer=tweet_tokenizer.tokenize)
text_count = cv.fit_transform(tweets['tweet_lem'])


In [None]:
text_count

##  **3. Sentiment Analysis Using Vader**

We want to create our own sentiment labels using a different module, to see whether the two methods' sentiment tags agree and to potentially reduce the number of neutral tags, because neutrality does not tell us much.

In [None]:
!pip3 install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sent = SentimentIntensityAnalyzer()


In [None]:
# Go through tweets and save polarity score
tweets['scores'] = tweets['Tweet'].apply(lambda Tweet: sent.polarity_scores(str(Tweet)))

#Extract compound score from polarity score
tweets['compound']=tweets['scores'].apply(lambda score_dict:score_dict['compound'])

In [None]:
#Creates the Vader Sentiment Analysis Column using polarity score values
# The label is decided by the sign of the compound score alone: any score above
# zero is 'positive', any score below zero is 'negative', and only an exact
# zero is 'neutral'.
# NOTE(review): the vaderSentiment README suggests +/-0.05 as the usual
# cut-offs for the compound score; with a cut-off of 0, weakly scored tweets
# are labelled positive/negative rather than neutral - confirm this is intended.
tweets['vader_analysis']=''
tweets.loc[tweets.compound>0,'vader_analysis']='positive'
tweets.loc[tweets.compound==0,'vader_analysis']='neutral'
tweets.loc[tweets.compound<0,'vader_analysis']='negative'
tweets.head()

In [None]:
tweets['Sentiment'].value_counts()

In [None]:
tweets['vader_analysis'].value_counts()

The VADER module reduces the number of neutral tags, but it also decreases the count of negative tweets in the dataset. From our research, the VADER lexicon was built with slang and colloquialisms in mind, so we believe it is more suitable for evaluating tweets' sentiment. On Twitter, users often use slang, emojis, punctuation, etc. to express their ideas, so taking these nuances into consideration is more useful for understanding the tweets' sentiments.


## **4. Models**

### 4.1 Logistic Regression -- using vader analysis as the known sentiment label

In [None]:
# split dataset to train and test
# train_test_split draws a random partition; fixing random_state makes the
# split, and therefore every metric reported below, reproducible on each
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    tweets['Tweet'], tweets['vader_analysis'], test_size=0.2, random_state=42)


In [None]:
# Build TF-IDF features: the vocabulary and IDF weights are learned from the
# training tweets only, then the same fitted vectorizer projects the test tweets.
tf = TfidfVectorizer()
x_train = tf.fit_transform(X_train)
x_test = tf.transform(X_test)


In [None]:
#fit model on train test
from sklearn.linear_model import LogisticRegression

#set seed
random_seed = 42

lr = LogisticRegression(random_state=random_seed)
lr.fit(x_train, y_train)


In [None]:
lr_predict = lr.predict(x_test)
lr_predict


In [None]:
# evaluate accurary
from sklearn.metrics import accuracy_score
print("Accuracy = {:.2f}%".format(accuracy_score(y_test, lr_predict)*100))


In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, lr_predict))


In [None]:
#evaluate confusion matrix
# confusion_matrix and ConfusionMatrixDisplay are imported in the first cell.
# The `metrics` module alias is only imported further down (Naive Bayes
# section), so using it here raises NameError on a fresh Restart & Run All.
# Passing labels=lr.classes_ keeps the matrix rows/columns in the same order as
# the tick labels shown on the plot.
cm2 = confusion_matrix(y_test, lr_predict, labels=lr.classes_)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm2, display_labels=lr.classes_)
disp.plot()
plt.show()


### 4.2 Naive Bayes Model

In [None]:
#By default, Sklearn train_test_split will make random partitions for the two subsets.
X_train, X_test, y_train, y_test = train_test_split(
    text_count, tweets['Sentiment'], test_size=0.2, random_state=5)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
#set seed
np.random.seed(42)
MNB = MultinomialNB()
MNB.fit(X_train,y_train)

In [None]:
nb_predict = MNB.predict(X_test)

In [None]:
# Store the score under its own name: binding it to `accuracy_score` would
# overwrite the sklearn function of that name imported in the logistic
# regression section, so re-running that cell afterwards would fail.
# Note that this is the *balanced* accuracy (mean per-class recall), while the
# logistic regression cell reports plain accuracy.
nb_balanced_accuracy = metrics.balanced_accuracy_score(y_test, nb_predict)
print("NB Accuracy = {:.2f}%".format(nb_balanced_accuracy*100))


In [None]:
print(f"Classification Report:{metrics.classification_report(y_test,nb_predict)}")

In [None]:
cm = confusion_matrix(y_test,nb_predict)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=MNB.classes_)
disp.plot()
plt.show()

### 4.3 Random Forest Classifier (Ensemble Learning Model)

In [None]:
# train_test_split draws a random partition; fixing random_state makes the
# split, and therefore the random forest metrics below, reproducible on each
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    tweets['tweet_updated'], tweets['Sentiment'], test_size=0.2, random_state=42)

In [None]:
# Turn the tweets into TF-IDF matrices: fit on the training tweets only, then
# reuse the fitted vectorizer for the test tweets.
tf = TfidfVectorizer()
x_train2 = tf.fit_transform(X_train)  # sparse training matrix
x_test2 = tf.transform(X_test)  # sparse test matrix

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=random_seed)
clf = clf.fit(x_train2,y_train)
rf_predict = clf.predict(x_test2)
accuracy_score2 = metrics.balanced_accuracy_score(y_test,rf_predict)

In [None]:
print("RF Accuracy = {:.2f}%".format(accuracy_score2*100))

In [None]:
print(f"Classification Report:{metrics.classification_report(y_test,rf_predict)}")

In [None]:
#evaluate confusion matrix
cm1 = metrics.confusion_matrix(y_test,rf_predict)
disp = metrics.ConfusionMatrixDisplay(
    confusion_matrix=cm1, display_labels=clf.classes_)
disp.plot()
plt.show()


In [None]:
# Compare the random forest predictions with the original labels, tweet by tweet.
# rf_predict is a plain array that follows the row order of the current test
# split, so the table is indexed by y_test.index and both the label and the
# tweet text are looked up through that same index. Building the table from
# the full `tweets` frame instead would pair each prediction with an unrelated
# tweet, because the test rows are a shuffled subset.
RCF_preds_comparasion = pd.DataFrame(
    {
        'RCF_prediction': rf_predict,
        'actual_sentiment': y_test,
        'tweet': tweets.loc[y_test.index, 'Tweet'],
    },
    index=y_test.index,
)
RCF_preds_comparasion.head(10)

## Transformer Based Model

In [None]:
!pip3 install torch torchvision torchaudio


In [None]:
!pip install transformers
from transformers import pipeline
sentiment_pipeline = pipeline(model="finiteautomata/bertweet-base-sentiment-analysis")

In [None]:
def get_sentiment(tweet):
    """Return the label of the first result that ``sentiment_pipeline`` gives for ``tweet``."""
    first_result = sentiment_pipeline(tweet)[0]
    return first_result['label']
  

In [None]:
def get_score(tweet):
    """Return the score of the first result that ``sentiment_pipeline`` gives for ``tweet``."""
    first_result = sentiment_pipeline(tweet)[0]
    return first_result['score']

In [None]:
# Run the model once per tweet and read both fields from that single result.
# Filling the two columns with separate apply() calls sends every tweet through
# the model twice, which doubles the runtime of the slowest cell in the notebook.
transformer_results = [
    sentiment_pipeline(text)[0] for text in tweets['tweet_updated']
]
tweets['Transformer Sentiment'] = [result['label'] for result in transformer_results]
tweets['Transformer Score'] = [result['score'] for result in transformer_results]

In [None]:
# Spell out the model's abbreviated labels so they match the dataset's own tags.
transformer_label_names = {'NEU': 'neutral', 'POS': 'positive', 'NEG': 'negative'}
tweets["Transformer Sentiment"] = tweets["Transformer Sentiment"].replace(
    transformer_label_names)

In [None]:
tb_predict = tweets['Transformer Sentiment']

In [None]:
# NOTE(review): the result of this call is not assigned to anything, so the
# `tweets` frame is left unchanged by this cell. The 'vader_analysis' column is
# filled with 'positive' / 'neutral' / 'negative' when it is created, so there
# appear to be no abbreviated tags in it to fix - confirm whether this cell is
# still needed.
fix_sentiment(tweets['vader_analysis'])

In [None]:
tweets.head()

In [None]:
accuracy_score3 = metrics.balanced_accuracy_score(tweets['Sentiment'],tb_predict)
print("Transformer Base Accuracy = {:.2f}%".format(accuracy_score3*100))

In [None]:
print(f"Classification Report:{classification_report(tweets['Sentiment'],tb_predict)}")

In [None]:
#evaluate confusion matrix
# Take the label order from the data being scored and pass the same list to
# both the matrix and the display. Borrowing classes_ from the random forest
# only lines up if that unrelated model happens to have seen exactly the same
# set of labels as the transformer produces.
tb_labels = sorted(set(tweets['Sentiment']) | set(tb_predict))
cm2 = metrics.confusion_matrix(tweets['Sentiment'], tb_predict, labels=tb_labels)
disp = metrics.ConfusionMatrixDisplay(
    confusion_matrix=cm2, display_labels=tb_labels)
disp.plot()
plt.show()

In [None]:
#compare all models
# NOTE(review): the rows of this table do not describe the same tweets.
# lr_predict, nb_predict and rf_predict each come from a separate
# train_test_split call, so they hold predictions for three different test
# subsets, and they are attached here purely by position. tb_predict and the
# tweets[...] columns are pandas Series and are aligned by index label instead.
# Row-wise agreement between the columns is therefore not meaningful until all
# models predict on one shared set of rows.
preds_comparasion = pd.DataFrame()
preds_comparasion['LR_prediction'] = lr_predict
preds_comparasion['NB_prediction'] = nb_predict
preds_comparasion['RF_prediction'] = rf_predict
preds_comparasion['TB_prediction'] = tb_predict

preds_comparasion['original_sentiment'] = tweets['Sentiment']
preds_comparasion['vader_sentiment'] = tweets['vader_analysis']
preds_comparasion['tweet'] = tweets['Tweet']
preds_comparasion.head(50)


The random forest model seems to have the worst accuracy score as well as the most disagreements with the other models' sentiment tags.