In [66]:
import pandas as pd
import re
import emoji
import numpy as np

df=pd.read_csv("covid19_tweets.csv")

Eseguiamo un text cleaning, emoji feature extraction, emoji semantic conversation e keyword filtering

In [67]:
#Clean text
def clean_tweet(text):
    text=text.lower() #lower text
    text=re.sub(r"https?://\S+","",text) #remove links
    text = re.sub(r"www\.\S+", "", text)
    text=re.sub(r"@\w+","",text) #remove mention
    text=re.sub(r"\brt\b","",text) #remove retweet
    text=re.sub(r"#(\w+)",r"\1",text) # remove hashtag
    text=re.sub(r"[\n\t]","",text) # remove newline and tab
    return text.strip()

df["text_clean"]=df["text"].apply(clean_tweet)

def has_emoji(text):
    return any(char in emoji.EMOJI_DATA for char in text)

def extract_emojis(text):
    return [char for char in text if char in emoji.EMOJI_DATA]
df["emojis"]=df["text_clean"].apply(extract_emojis)
df["emoji_count"]=df["emojis"].apply(len)
df["has_emoji"]=df["emoji_count"]>0

#Transform emoji in token
def demojize_text(text):
    text=emoji.demojize(text,language='en')
    text=re.sub(r":([a-z_]+):",r" emoji_\1",text)
    return text.strip()
df["text_demojize"]=df["text_clean"].apply(demojize_text)

# List of keywords to identify COVID-19 related tweets
covid_keywords=["covid","covid19","coronavirus","sars-cov-2","pandemic"]

# Function to check if a text mentions COVID-19
def is_covid_related(text):
    pattern = r"\b(" + "|".join(covid_keywords) + r")\b"
    return bool(re.search(pattern, text))

df_with_emoji=df[df["has_emoji"]].copy()
df_with_emoji=df_with_emoji[df_with_emoji["text_clean"].apply(is_covid_related)].copy()

def remove_duplicate_emoji(emoji_list):
    return list(dict.fromkeys(emoji_list))


df_with_emoji["emojis_unique"]=df_with_emoji["emojis"]

df_with_emoji[["text_clean","emojis","emoji_count","emojis_unique","text_demojize"]].sample(100)


Unnamed: 0,text_clean,emojis,emoji_count,emojis_unique,text_demojize
41676,covid19 ü¶† - sdmc 29-7-2020 üö® üö® postponing on t...,"[ü¶†, üö®, üö®]",3,"[ü¶†, üö®, üö®]",covid19 emoji_microbe - sdmc 29-7-2020 emoji...
33095,‡¥Ü‡¥∂‡¥ô‡µç‡¥ï ‡¥ï‡µÅ‡¥±‡¥Ø‡¥æ‡¥§‡µÜ ‡¥§‡¥µ‡¥ø‡¥û‡µç‡¥û‡¥æ‡¥≤‡µç‚Äç | thavinjal covid 19c...,[üëâ],1,[üëâ],‡¥Ü‡¥∂‡¥ô‡µç‡¥ï ‡¥ï‡µÅ‡¥±‡¥Ø‡¥æ‡¥§‡µÜ ‡¥§‡¥µ‡¥ø‡¥û‡µç‡¥û‡¥æ‡¥≤‡µç‚Äç | thavinjal covid 19c...
9218,indiafightscorona:üìçpreventive measures in rest...,"[üìç, ‚ñ∂, ‚ñ∂]",3,"[üìç, ‚ñ∂, ‚ñ∂]",indiafightscorona: emoji_round_pushpinpreventi...
89824,coronavirusupdate7-8-2020 23:59 utctotal covid...,"[‚¨á, ‚Üî]",2,"[‚¨á, ‚Üî]",coronavirusupdate7-8-2020 23:59 utctotal covid...
173285,level2 covid19 nz jacindaardern level 2.5: cov...,"[‚ú®, ü•∞, üßö, ‚ôÄ]",4,"[‚ú®, ü•∞, üßö, ‚ôÄ]",level2 covid19 nz jacindaardern level 2.5: cov...
...,...,...,...,...,...
168411,"cross fingers, perhaps soon we won't need to s...",[üòä],1,[üòä],"cross fingers, perhaps soon we won't need to s..."
94779,"we are at snec expo, shanghai now! come hug us...",[üòò],1,[üòò],"we are at snec expo, shanghai now! come hug us..."
58146,covid19 and its subsequent health and economic...,"[üìâ, ü•¶]",2,"[üìâ, ü•¶]",covid19 and its subsequent health and economic...
40033,allahdoesnotneedqurbani cmomaharashtra maharas...,[üí•],1,[üí•],allahdoesnotneedqurbani cmomaharashtra maharas...


In [71]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords,wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

#Tokenize text using NLTK
df_with_emoji["tokens"]=df_with_emoji["text_demojize"].apply(word_tokenize)

#Remove Stopwords
stopwords=set(stopwords.words('english'))

#Lemmantization
lemmatizer=WordNetLemmatizer()

#Function to convert NLTK POS tags to WordNet POS tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'): 
        return wordnet.NOUN
    elif tag.startswith('R'): 
        return wordnet.ADV
    else:
        return wordnet.NOUN #Default

def lemmatize_tokens(text):
    tokens=word_tokenize(text.lower())
    tagged_tokens=pos_tag(tokens)
    final=[]
    for word, tag in tagged_tokens:
        if(word.isalpha() or word.startswith(':') and word not in stopwords):
            final.append(lemmatizer.lemmatize(word,get_wordnet_pos(tag)))

    return [lemmatizer.lemmatize(word,get_wordnet_pos(pos)) for word,pos in tagged_tokens]

df_with_emoji["final_tokens"]=df_with_emoji["text_demojize"].apply(lemmatize_tokens)

df_with_emoji["final_text"]=df_with_emoji["final_tokens"].apply(lambda x: " ".join(x))
df_with_emoji[["text_demojize","final_tokens","final_text",]].sample(10)



Unnamed: 0,text_demojize,final_tokens,final_text
171109,more dumbasses. emoji_woman_facepalming_light...,"[more, dumbass, ., emoji_woman_facepalming_lig...",more dumbass . emoji_woman_facepalming_light_s...
71478,coronavirusupdates: covid19 india tracker(as o...,"[coronavirusupdates, :, covid19, india, tracke...",coronavirusupdates : covid19 india tracker ( a...
4214,emoji_airplane learn more about iata's safety ...,"[emoji_airplane, learn, more, about, iata, 's,...",emoji_airplane learn more about iata 's safety...
11687,covid19 tenders. niggas bought bmw stock out. ...,"[covid19, tender, ., nigga, buy, bmw, stock, o...",covid19 tender . nigga buy bmw stock out . emo...
59175,love should tule the world emoji_red_heart em...,"[love, should, tule, the, world, emoji_red_hea...",love should tule the world emoji_red_heart emo...
13330,kids wouldn‚Äôt pose with us with masks! emoji_...,"[kid, wouldn, ‚Äô, t, pose, with, u, with, mask,...",kid wouldn ‚Äô t pose with u with mask ! emoji_f...
146777,no covid19 didn‚Äôt infect climatechange it‚Äôs st...,"[no, covid19, didn, ‚Äô, t, infect, climatechang...",no covid19 didn ‚Äô t infect climatechange it ‚Äô ...
4007,please don't view n pass... patronize me abeg ...,"[please, do, n't, view, n, pas, ..., patronize...",please do n't view n pas ... patronize me abeg...
92558,"amitshah to chinese virus, covid19 emoji_smil...","[amitshah, to, chinese, virus, ,, covid19, emo...","amitshah to chinese virus , covid19 emoji_smil..."
101288,the latest the force for health emoji_register...,"[the, late, the, force, for, health, emoji_reg...",the late the force for health emoji_registered...


In [70]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB,MultinomialNB
from sklearn.metrics import classification_report,accuracy_score
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import hstack

analyzer=SentimentIntensityAnalyzer()

#We generate labels using VADER (Automatically handles emojis)
def label_sentiment(text):
   score=analyzer.polarity_scores(text)['compound']
   if score>0.05:
      return "Positive"
   elif score<=-0.05:
      return "Negative"
   else:
      return "Neutral"

df_with_emoji["sentiment"]=df_with_emoji["text_clean"].apply(label_sentiment)
df_with_emoji=df_with_emoji[df_with_emoji["sentiment"] != "Neutral"].copy()

df_with_emoji["vader_compound"]=df_with_emoji["text_clean"].apply(lambda t: analyzer.polarity_scores(t)['compound'])

def emoji_score(text):
   #Estraggo le emoji dal testo
   emoji_found=[c for c in text if c in emoji.EMOJI_DATA]
   if not emoji_found:
      return 0
   
   score=sum(analyzer.polarity_scores(e)['compound'] for e in emoji_found)
   return np.tanh(score) #Per 'schiacciare il valore'

df_with_emoji["emoji_score"]=df_with_emoji["text_clean"].apply(emoji_score)

X_text=df_with_emoji["final_text"]
X_extra=df_with_emoji[["vader_compound","emoji_score"]]
y=df_with_emoji["sentiment"] #Target labels

#Training
X_train_text,X_test_text,X_train_extra,X_test_extra,y_train,y_test=train_test_split(X_text,X_extra,y,test_size=0.2,random_state=42)

#TF-IDF
tfidf=TfidfVectorizer(ngram_range=(1,3),min_df=3,max_df=0.8,stop_words="english",sublinear_tf=True,use_idf=True)

#Fit on training data and transform both sets into numerical matrices
X_train_tfidf=tfidf.fit_transform(X_train_text)
X_test_tfidf=tfidf.transform(X_test_text)

#Scaling numeric feature
scaler=MinMaxScaler()
X_train_extra_scaled=scaler.fit_transform(X_train_extra)
X_test_extra_scaled=scaler.transform(X_test_extra)

#Union+Emoji
X_train_final=hstack([X_train_tfidf,X_train_extra_scaled])
X_test_final=hstack([X_test_tfidf,X_test_extra_scaled])
#Initializing and training Complement Naive Bayes Classifier
model=ComplementNB(alpha=1)
model.fit(X_train_final,y_train)

#Model evaluation
y_pred=model.predict(X_test_final)

print(f"Global Accuracy: {accuracy_score(y_test,y_pred):.2f}\n")
print(classification_report(y_test,y_pred))


Global Accuracy: 0.88

              precision    recall  f1-score   support

    Negative       0.88      0.73      0.80       383
    Positive       0.88      0.95      0.92       810

    accuracy                           0.88      1193
   macro avg       0.88      0.84      0.86      1193
weighted avg       0.88      0.88      0.88      1193

