In [11]:
import pandas as pd
import re
import emoji

df=pd.read_csv("covid19_tweets.csv")

Eseguiamo un text cleaning, emoji feature extraction, emoji semantic conversation e keyword filtering

In [12]:
#Clean text
def clean_tweet(text):
    text=text.lower() #lower text
    text=re.sub(r"https?://\S+","",text) #remove links
    text = re.sub(r"www\.\S+", "", text)
    text=re.sub(r"@\w+","",text) #remove mention
    text=re.sub(r"\brt\b","",text) #remove retweet
    text=re.sub(r"#(\w+)",r"\1",text) # remove hashtag
    text=re.sub(r"[\n\t]","",text) # remove newline and tab
    return text.strip()

df["text_clean"]=df["text"].apply(clean_tweet)

def has_emoji(text):
    return any(char in emoji.EMOJI_DATA for char in text)

def extract_emojis(text):
    return [char for char in text if char in emoji.EMOJI_DATA]
df["emojis"]=df["text_clean"].apply(extract_emojis)
df["emoji_count"]=df["emojis"].apply(len)
df["has_emoji"]=df["emoji_count"]>0

#Transform emoji in token
def demojize_text(text):
    text=emoji.demojize(text,language='en')
    text=re.sub(r":([a-z_]+):",r" emoji_\1",text)
    return text.strip()
df["text_demojize"]=df["text_clean"].apply(demojize_text)

# List of keywords to identify COVID-19 related tweets
covid_keywords=["covid","covid19","coronavirus","sars-cov-2","pandemic"]

# Function to check if a text mentions COVID-19
def is_covid_related(text):
    pattern = r"\b(" + "|".join(covid_keywords) + r")\b"
    return bool(re.search(pattern, text))

df_with_emoji=df[df["has_emoji"]].copy()
df_with_emoji=df_with_emoji[df_with_emoji["text_clean"].apply(is_covid_related)].copy()

def remove_duplicate_emoji(emoji_list):
    return list(dict.fromkeys(emoji_list))


df_with_emoji["emojis_unique"]=df_with_emoji["emojis"]

df_with_emoji[["text_clean","emojis","emoji_count","emojis_unique","text_demojize"]].sample(100)


Unnamed: 0,text_clean,emojis,emoji_count,emojis_unique,text_demojize
127412,watch live üîó: nz government to give the latest...,[üîó],1,[üîó],watch live emoji_link: nz government to give ...
109812,dr scott atlas now an advisor to the wh...hopi...,"[üôè, üèº]",2,"[üôè, üèº]",dr scott atlas now an advisor to the wh...hopi...
136114,coronavirusupdates: üìçtotal covid19 cases in in...,"[üìç, ‚û°]",2,"[üìç, ‚û°]",coronavirusupdates: emoji_round_pushpintotal ...
76295,bluetiful üíô..cakesinosogbo bbnaijalockdown202...,[üíô],1,[üíô],bluetiful emoji_blue_heart..cakesinosogbo bb...
150279,üìπ click like follow i love ‚ù§Ô∏è you my tribe! th...,"[üìπ, ‚ù§]",2,"[üìπ, ‚ù§]",emoji_video_camera click like follow i love e...
...,...,...,...,...,...
4512,covid19 impact on indiansüë∑üèΩ‚Äç‚ôÇÔ∏èmillions lost jo...,"[üë∑, üèΩ, ‚ôÇ, üö∂, üè†, üß¥]",6,"[üë∑, üèΩ, ‚ôÇ, üö∂, üè†, üß¥]",covid19 impact on indians emoji_man_constructi...
139355,‚ú®note: dr. limengyan‚Äôs allegation of china‚Äôs c...,[‚ú®],1,[‚ú®],emoji_sparklesnote: dr. limengyan‚Äôs allegation...
178584,hungary covid19 procurement ü§î,[ü§î],1,[ü§î],hungary covid19 procurement emoji_thinking_face
108016,be your own muse...‚ú®‚ú®üßöüèΩ‚Äç‚ôÄÔ∏ègoodmorningtwitterwo...,"[‚ú®, ‚ú®, üßö, üèΩ, ‚ôÄ]",5,"[‚ú®, ‚ú®, üßö, üèΩ, ‚ôÄ]",be your own muse... emoji_sparkles emoji_spark...


In [20]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords,wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

#Tokenize text using NLTK
df_with_emoji["tokens"]=df_with_emoji["text_demojize"].apply(word_tokenize)

#Remove Stopwords
stopwords=set(stopwords.words('english'))

def remove_stopwords(tokens):
    #Remove english and non-alphabetic tokens
    return [word for word in tokens if word.isalpha() and word not in stopwords]
df_with_emoji["tokens_no_stop"]=df_with_emoji["tokens"].apply(remove_stopwords)


#Lemmantization
lemmatizer=WordNetLemmatizer()

#Function to convert NLTK POS tags to WordNet POS tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'): 
        return wordnet.NOUN
    elif tag.startswith('R'): 
        return wordnet.ADV
    else:
        return wordnet.NOUN #Default

def lemmatize_tokens(tokens):
    if not isinstance(tokens,list) or len(tokens) ==0:
        return []
    tagged_tokens=pos_tag(tokens)
    return [lemmatizer.lemmatize(word,get_wordnet_pos(pos)) for word,pos in tagged_tokens]

df_with_emoji["final_tokens"]=df_with_emoji["tokens_no_stop"].apply(lemmatize_tokens)

df_with_emoji["final_text"]=df_with_emoji["final_tokens"].apply(lambda x: " ".join(x))
df_with_emoji[["tokens","tokens_no_stop","final_tokens","final_text"]].sample(10)



Unnamed: 0,tokens,tokens_no_stop,final_tokens,final_text
32841,"[everyday, day, i, open, social, media, my, he...","[everyday, day, open, social, media, heart, ac...","[everyday, day, open, social, medium, heart, a...",everyday day open social medium heart ache see...
122094,"[worried, about, hospital, bills, due, to, cov...","[worried, hospital, bills, due, stay, protecte...","[worried, hospital, bill, due, stay, protected...",worried hospital bill due stay protected healt...
62404,"[i, forgot, there, was, only, one, person, in,...","[forgot, one, person, whole, world, could, get]","[forgot, one, person, whole, world, could, get]",forgot one person whole world could get
90425,"[the, american, problem, emoji_backhand_index_...","[american, problem, right, get, leadership, co...","[american, problem, right, get, leadership, co...",american problem right get leadership covidiots
120583,"[emoji_fire, 10, coronavirus, scams, to, be, w...","[coronavirus, scams, wary, uk, finance, urges,...","[coronavirus, scam, wary, uk, finance, urge, p...",coronavirus scam wary uk finance urge people g...
167608,"[covid19, ‚Äò, scientific, ‚Äô, publications, :, o...","[scientific, publications, oops, also, see]","[scientific, publication, oops, also, see]",scientific publication oops also see
1051,"[maharashtra, police, covid19, update, frontli...","[maharashtra, police, update, frontlineheroes,...","[maharashtra, police, update, frontlineheroes,...",maharashtra police update frontlineheroes posi...
136860,"[clean, bold, ,, due, to, covid19, mamy, alrea...","[clean, bold, due, mamy, already, fear, n, dil...","[clean, bold, due, mamy, already, fear, n, dil...",clean bold due mamy already fear n dilem click...
147748,"[here, 's, your, solution, :, oncoming_fist_me...","[solution, bybyfitness, schoolsreopening, bbna...","[solution, bybyfitness, schoolsreopening, bbna...",solution bybyfitness schoolsreopening bbnaija ...
66817,"[covid19, be, a, good, start, so.u.k, ., start...","[good, start, starting, look, like, one, world...","[good, start, start, look, like, one, world, b...",good start start look like one world big problem


In [22]:
#Create Bag of Words

from sklearn.feature_extraction.text import CountVectorizer

vectorize=CountVectorizer()

X=vectorize.fit_transform(df_with_emoji["final_text"])

print(X.shape)

print(vectorize.get_feature_names_out)

(9073, 11109)
<bound method CountVectorizer.get_feature_names_out of CountVectorizer()>


In [26]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB,MultinomialNB
from sklearn.metrics import classification_report,accuracy_score

analyzer=SentimentIntensityAnalyzer()

#We generate labels using VADER (Automatically handles emohis)
def label_sentiment(text):
   score=analyzer.polarity_scores(text)['compound']
   if score>0.05:
      return "Positive"
   elif score<=-0.05:
      return "Negative"
   else:
      return "Neutral"
   
df_with_emoji["sentiment"]=df_with_emoji["text_clean"].apply(label_sentiment)

X=df_with_emoji["text_demojize"]
y=df_with_emoji["sentiment"] #Target labels

#Training
X_train_raw,X_test_raw,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

#TF-IDF
tfidf=TfidfVectorizer(ngram_range=(1,2),token_pattern=r"\b\w\w+\b",stop_words="english")

#Fit on training data and transform both sets into numerical matrices
X_train_tfidf=tfidf.fit_transform(X_train_raw)
X_test_tfidf=tfidf.transform(X_test_raw)

#Initializing and training Complement Naive Bayes Classifier
model=MultinomialNB()
model.fit(X_train_tfidf,y_train)

#Model evaluation
y_pred=model.predict(X_test_tfidf)

print(f"Global Accuracy: {accuracy_score(y_test,y_pred):.2f}\n")
print(classification_report(y_test,y_pred))

Global Accuracy: 0.63

              precision    recall  f1-score   support

    Negative       0.92      0.22      0.35       386
     Neutral       0.77      0.50      0.61       632
    Positive       0.57      0.94      0.71       797

    accuracy                           0.63      1815
   macro avg       0.75      0.55      0.56      1815
weighted avg       0.71      0.63      0.60      1815

