# ***Upload and Load dataset***

In [19]:
#Upload the file
from google.colab import files
uploaded = files.upload()

Saving Corona.csv to Corona (1).csv


In [20]:
#Load the dataset
import pandas as pd
df = pd.read_csv('Corona.csv', encoding='ISO-8859-1')

# ***Data Description***

In [21]:
#Display first few rows of the dataset
print(df.head(5))

   UserName  ScreenName   Location     TweetAt  \
0      3799       48751     London  16-03-2020   
1      3800       48752         UK  16-03-2020   
2      3801       48753  Vagabonds  16-03-2020   
3      3802       48754        NaN  16-03-2020   
4      3803       48755        NaN  16-03-2020   

                                       OriginalTweet           Sentiment  
0  @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...             Neutral  
1  advice Talk to your neighbours family to excha...            Positive  
2  Coronavirus Australia: Woolworths to give elde...            Positive  
3  My food stock is not the only one which is emp...            Positive  
4  Me, ready to go at supermarket during the #COV...  Extremely Negative  


In [22]:
#Display basic information about the dataset
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB
None


In [23]:
#Display numbers of rows and columns
print(df.shape)

(41157, 6)


# ***Data Cleaning***

In [24]:
#Identify missing values
print(df.isnull().sum())

UserName            0
ScreenName          0
Location         8590
TweetAt             0
OriginalTweet       0
Sentiment           0
dtype: int64


In [25]:
#Identify duplicate rows
print(df.duplicated().sum())

0


In [26]:
#Remove columns
df = df.drop(['UserName', 'ScreenName', 'Location', 'TweetAt'], axis=1)

In [27]:
#Class imbalance
print(df['Sentiment'].value_counts())

Sentiment
Positive              11422
Negative               9917
Neutral                7713
Extremely Positive     6624
Extremely Negative     5481
Name: count, dtype: int64


In [28]:
#Display rows where Sentiment is 'Neutral'
print(df[df['Sentiment'] == 'Neutral'])

                                           OriginalTweet Sentiment
0      @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...   Neutral
7      Was at the supermarket today. Didn't buy toile...   Neutral
10     All month there hasn't been crowding in the su...   Neutral
16     ????? ????? ????? ????? ??\r\r\n?????? ????? ?...   Neutral
17     @eyeonthearctic 16MAR20 Russia consumer survei...   Neutral
...                                                  ...       ...
41141  #Coronavirus ?? ????? ??? ????? ?? ??? ???????...   Neutral
41143  https://t.co/8s4vKvcO1r #5gtowers?? #EcuadorUn...   Neutral
41144  @_Sunrise_SV @Gamzap @NPR What does not having...   Neutral
41152  Airline pilots offering to stock supermarket s...   Neutral
41155  Is it wrong that the smell of hand sanitizer i...   Neutral

[7713 rows x 2 columns]


In [29]:
#Remove 'Neutral' in sentiment (wrongly labeled neutral)
df = df[df['Sentiment'] != 'Neutral']

# ***Model Preprocessing***

In [30]:
#Label simplification
def simplify_sentiment(label):
    if label in ['Positive', 'Extremely Positive']:
        return 'Positive'
    elif label in ['Negative', 'Extremely Negative']:
        return 'Negative'
    else:
        return 'Other'

df = df[['OriginalTweet','Sentiment']].dropna().copy()
df['simplified_sentiment'] = df['Sentiment'].apply(simplify_sentiment)
df = df[df['simplified_sentiment'] != 'Other'].copy()

In [31]:
#Clean
import re
from bs4 import BeautifulSoup
def clean_text(t):
    t = BeautifulSoup(str(t), "html.parser").get_text()
    t = re.sub(r"http\S+|www\S+", " ", t)
    t = re.sub(r"@\w+", " ", t)
    t = re.sub(r"#", "", t)
    t = re.sub(r"\brt\b", " ", t, flags=re.IGNORECASE)
    t = re.sub(r"&amp;", " and ", t)
    t = re.sub(r"[^A-Za-z0-9'\s.,!?]", " ", t)
    return re.sub(r"\s+", " ", t).strip().lower()

df['clean_tweet'] = df['OriginalTweet'].apply(clean_text)

In [32]:
#Split
from sklearn.model_selection import train_test_split
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['clean_tweet'].values,
    df['simplified_sentiment'].values,
    test_size=0.2,
    random_state=42,
    stratify=df['simplified_sentiment'].values
)

# ***Linear SVM***

In [33]:
#Linear SVM
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

svm_pipe = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), min_df=5, max_df=0.95, max_features=100_000, strip_accents="unicode", lowercase=True)),
    ("clf", LinearSVC(random_state=42))
])
svm_pipe.fit(X_train_text, y_train)
svm_pred = svm_pipe.predict(X_test_text)
print("SVM Accuracy:", round(accuracy_score(y_test, svm_pred), 4))
print("SVM Macro F1:", round(f1_score(y_test, svm_pred, average="macro"), 4))
print(confusion_matrix(y_test, svm_pred))
print(classification_report(y_test, svm_pred))

SVM Accuracy: 0.8619
SVM Macro F1: 0.861
[[2613  467]
 [ 457 3152]]
              precision    recall  f1-score   support

    Negative       0.85      0.85      0.85      3080
    Positive       0.87      0.87      0.87      3609

    accuracy                           0.86      6689
   macro avg       0.86      0.86      0.86      6689
weighted avg       0.86      0.86      0.86      6689



# ***XGBoost***

In [35]:
#Xgboost
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import xgboost as xgb

le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc  = le.transform(y_test)

xgb_pipe = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1,2), min_df=5, max_df=0.95, max_features=100_000,
        strip_accents="unicode", lowercase=True
    )),
    ("clf", xgb.XGBClassifier(
        objective="binary:logistic", eval_metric="logloss",
        tree_method="hist", n_estimators=400, learning_rate=0.1,
        max_depth=6, subsample=0.8, colsample_bytree=0.8, random_state=42
    ))
])

xgb_pipe.fit(X_train_text, y_train_enc)
y_pred_enc = xgb_pipe.predict(X_test_text)

print("XGB Accuracy:", round(accuracy_score(y_test_enc, y_pred_enc), 4))
print("XGB Macro F1:", round(f1_score(y_test_enc, y_pred_enc, average="macro"), 4))
print(confusion_matrix(y_test_enc, y_pred_enc))
print(classification_report(y_test_enc, y_pred_enc, target_names=le.classes_))

XGB Accuracy: 0.8433
XGB Macro F1: 0.8417
[[2486  594]
 [ 454 3155]]
              precision    recall  f1-score   support

    Negative       0.85      0.81      0.83      3080
    Positive       0.84      0.87      0.86      3609

    accuracy                           0.84      6689
   macro avg       0.84      0.84      0.84      6689
weighted avg       0.84      0.84      0.84      6689

