In [11]:
import pandas as pd
import re
import emoji
import numpy as np

df=pd.read_csv("covid19_tweets.csv")

Eseguiamo un text cleaning, emoji feature extraction, emoji semantic conversation e keyword filtering

In [12]:
#Clean text
def clean_tweet(text):
    text=text.lower() #lower text
    text=re.sub(r"https?://\S+","",text) #remove links
    text = re.sub(r"www\.\S+", "", text)
    text=re.sub(r"@\w+","",text) #remove mention
    text=re.sub(r"\brt\b","",text) #remove retweet
    text=re.sub(r"#(\w+)",r"\1",text) # remove hashtag
    text=re.sub(r"[\n\t]","",text) # remove newline and tab
    return text.strip()

df["text_clean"]=df["text"].apply(clean_tweet)

def has_emoji(text):
    return any(char in emoji.EMOJI_DATA for char in text)

def extract_emojis(text):
    return [char for char in text if char in emoji.EMOJI_DATA]
df["emojis"]=df["text_clean"].apply(extract_emojis)
df["emoji_count"]=df["emojis"].apply(len)
df["has_emoji"]=df["emoji_count"]>0

#Transform emoji in token
def demojize_text(text):
    text=emoji.demojize(text,language='en')
    text=re.sub(r":([a-z_]+):",r" emoji_\1",text)
    return text.strip()
df["text_demojize"]=df["text_clean"].apply(demojize_text)

# List of keywords to identify COVID-19 related tweets
covid_keywords=["covid","covid19","coronavirus","sars-cov-2","pandemic"]

# Function to check if a text mentions COVID-19
def is_covid_related(text):
    pattern = r"\b(" + "|".join(covid_keywords) + r")\b"
    return bool(re.search(pattern, text))

df_with_emoji=df[df["has_emoji"]].copy()
df_with_emoji=df_with_emoji[df_with_emoji["text_clean"].apply(is_covid_related)].copy()

def remove_duplicate_emoji(emoji_list):
    return list(dict.fromkeys(emoji_list))


df_with_emoji["emojis_unique"]=df_with_emoji["emojis"]

df_with_emoji[["text_clean","emojis","emoji_count","emojis_unique","text_demojize"]].sample(100)


Unnamed: 0,text_clean,emojis,emoji_count,emojis_unique,text_demojize
25563,biotech stocks are like literally lottery tick...,[üòÇ],1,[üòÇ],biotech stocks are like literally lottery tick...
10781,indiafightscorona:covid19 labs in india (as on...,[üëá],1,[üëá],indiafightscorona:covid19 labs in india (as on...
63237,well done mate. waiting for arthroscopy on my ...,[üò°],1,[üò°],well done mate. waiting for arthroscopy on my ...
157545,ersa2020 web conference d-day -3!‚û°Ô∏è register n...,[‚û°],1,[‚û°],ersa2020 web conference d-day -3! emoji_right_...
104737,"üò∑ ""i am forced to celebrate my 60th birthday i...",[üò∑],1,[üò∑],"emoji_face_with_medical_mask ""i am forced to c..."
...,...,...,...,...,...
170502,üì¢coronavirusupdates:üìçcovid19 india tracker(as ...,"[üì¢, üìç, ‚û°, ‚û°]",4,"[üì¢, üìç, ‚û°, ‚û°]",emoji_loudspeakercoronavirusupdates: emoji_rou...
30854,and instead gave a big fu üñïüèª to covid19 preven...,"[üñï, üèª]",2,"[üñï, üèª]",and instead gave a big fu emoji_middle_finger...
111453,a new youth &amp; covid19 üò∑report finds that 6...,[üò∑],1,[üò∑],a new youth &amp; covid19 emoji_face_with_med...
108162,‚Å¶‚Å© said at the start of lockdown ‚Äúwe are all i...,[üëá],1,[üëá],‚Å¶‚Å© said at the start of lockdown ‚Äúwe are all i...


In [13]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords,wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

#Tokenize text using NLTK
df_with_emoji["tokens"]=df_with_emoji["text_demojize"].apply(word_tokenize)

#Remove Stopwords
stopwords=set(stopwords.words('english'))

def remove_stopwords(tokens):
    #Remove english and non-alphabetic tokens
    return [word for word in tokens if word.isalpha() and word not in stopwords]
df_with_emoji["tokens_no_stop"]=df_with_emoji["tokens"].apply(remove_stopwords)


#Lemmantization
lemmatizer=WordNetLemmatizer()

#Function to convert NLTK POS tags to WordNet POS tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'): 
        return wordnet.NOUN
    elif tag.startswith('R'): 
        return wordnet.ADV
    else:
        return wordnet.NOUN #Default

def lemmatize_tokens(tokens):
    if not isinstance(tokens,list) or len(tokens) ==0:
        return []
    tagged_tokens=pos_tag(tokens)
    return [lemmatizer.lemmatize(word,get_wordnet_pos(pos)) for word,pos in tagged_tokens]

df_with_emoji["final_tokens"]=df_with_emoji["tokens_no_stop"].apply(lemmatize_tokens)

df_with_emoji["final_text"]=df_with_emoji["final_tokens"].apply(lambda x: " ".join(x))
df_with_emoji[["tokens","tokens_no_stop","final_tokens","final_text"]].sample(10)



Unnamed: 0,tokens,tokens_no_stop,final_tokens,final_text
54293,"[isn, ‚Äô, t, everyone, technically, anti-covid1...","[everyone, technically, except, maybe, big, ph...","[everyone, technically, except, maybe, big, ph...",everyone technically except maybe big pharma
29814,"[australia, :, medical, gps, ,, specialists, ,...","[australia, medical, gps, specialists, nurses,...","[australia, medical, gps, specialist, nurse, a...",australia medical gps specialist nurse ambulan...
175878,"[please, read, this, thread, ~, emoji_hundred_...","[please, read, thread, scamdemic, coronavirus]","[please, read, thread, scamdemic, coronavirus]",please read thread scamdemic coronavirus
53157,"[august1st, it, is, the, first, time, that, ge...","[first, time, getting, closer, september, make...","[first, time, get, close, september, make, bit...",first time get close september make bit anxious
125522,"[while, senators, go, on, vacation, until, sep...","[senators, go, vacation, sept, infected, month]","[senator, go, vacation, sept, infect, month]",senator go vacation sept infect month
32967,"[emoji_loudspeakercoronavirusupdates, :, emoji...","[india, recovery, rate, crosses, improves, july]","[india, recovery, rate, cross, improve, july]",india recovery rate cross improve july
9469,"[time, emoji_watch, is, what, we, do, n't, hav...","[time, rushing, leave, legacy]","[time, rush, leave, legacy]",time rush leave legacy
158859,"[and, they, tested, positive, emoji_face_with_...","[tested, positive, im, starting, show, symptoms]","[test, positive, im, start, show, symptom]",test positive im start show symptom
151965,"[emoji_loudspeaker, pa, covid19, update, (, as...","[pa, update]","[pa, update]",pa update
34476,"[seriously, ?, they, should, have, known, bett...","[seriously, known, better, wearamask, stayhome...","[seriously, know, well, wearamask, stayhomesta...",seriously know well wearamask stayhomestaysafe...


In [14]:
#Create Bag of Words

from sklearn.feature_extraction.text import CountVectorizer

vectorize=CountVectorizer()

X=vectorize.fit_transform(df_with_emoji["final_text"])

print(X.shape)

print(vectorize.get_feature_names_out)

(9073, 11109)
<bound method CountVectorizer.get_feature_names_out of CountVectorizer()>


In [20]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB,MultinomialNB
from sklearn.metrics import classification_report,accuracy_score
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import hstack


analyzer=SentimentIntensityAnalyzer()

#We generate labels using VADER (Automatically handles emohis)
def label_sentiment(text):
   score=analyzer.polarity_scores(text)['compound']
   if score>0.05:
      return "Positive"
   elif score<=-0.05:
      return "Negative"
   else:
      return "Neutral"

df_with_emoji["sentiment"]=df_with_emoji["text_clean"].apply(label_sentiment)

df_with_emoji["vader_compound"]=df_with_emoji["text_clean"].apply(lambda t: analyzer.polarity_scores(t)['compound'])

custom_emoji = {
    "üò≠": -2.5,
    "üò∑": -1.5,
    "üíâ": 0.5,
    "ü¶†": -2.0,
    "üôè": 1.2
}
def emoji_score(text):
   raw_score=sum(custom_emoji.get(ch,0) for ch in text)
   return np.tanh(raw_score/3)

df_with_emoji["emoji_score"]=df_with_emoji["text_clean"].apply(emoji_score)

X_text=df_with_emoji["text_demojize"]
X_extra=df_with_emoji[["vader_compound","emoji_score"]]
y=df_with_emoji["sentiment"] #Target labels

#Training
X_train_text,X_test_text,X_train_extra,X_test_extra,y_train,y_test=train_test_split(X_text,X_extra,y,test_size=0.2,random_state=42)

#TF-IDF
tfidf=TfidfVectorizer(ngram_range=(1,2),min_df=3,max_df=0.9,stop_words="english",sublinear_tf=True)

X_train_tfidf=tfidf.fit_transform(X_train_text)
X_test_tfidf=tfidf.fit_transform(X_test_text)

#Fit on training data and transform both sets into numerical matrices
X_train_tfidf=tfidf.fit_transform(X_train_text)
X_test_tfidf=tfidf.transform(X_test_text)

#Scaling numeric feature
scaler=MinMaxScaler()
X_train_extra_scaled=scaler.fit_transform(X_train_extra)
X_test_extra_scaled=scaler.fit_transform(X_test_extra)

#Union+Emoji
X_train_final=hstack([X_train_tfidf,X_train_extra_scaled])
X_test_final=hstack([X_test_tfidf,X_test_extra_scaled])
#Initializing and training Complement Naive Bayes Classifier
model=ComplementNB()
model.fit(X_train_final,y_train)

#Model evaluation
y_pred=model.predict(X_test_final)

print(f"Global Accuracy: {accuracy_score(y_test,y_pred):.2f}\n")
print(classification_report(y_test,y_pred))

Global Accuracy: 0.72

              precision    recall  f1-score   support

    Negative       0.68      0.71      0.70       386
     Neutral       0.76      0.56      0.65       632
    Positive       0.72      0.86      0.78       797

    accuracy                           0.72      1815
   macro avg       0.72      0.71      0.71      1815
weighted avg       0.73      0.72      0.72      1815

