In [1]:
!pip install -q contractions transformers sent2vec
!pip install -q imbalanced-learn

[K     |████████████████████████████████| 3.8 MB 6.3 MB/s 
[K     |████████████████████████████████| 284 kB 70.4 MB/s 
[K     |████████████████████████████████| 106 kB 49.6 MB/s 
[K     |████████████████████████████████| 6.5 MB 18.4 MB/s 
[K     |████████████████████████████████| 895 kB 48.4 MB/s 
[K     |████████████████████████████████| 67 kB 5.1 MB/s 
[K     |████████████████████████████████| 596 kB 60.8 MB/s 
[?25h

In [2]:
import numpy as np
import re
import warnings
import contractions
import pandas as pd
import string
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb

from sent2vec.vectorizer import Vectorizer
from collections import Counter
from nltk.corpus import stopwords
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn import preprocessing
from imblearn.combine import SMOTETomek
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

nltk.download('stopwords')

warnings.filterwarnings("ignore")
pd.options.display.max_colwidth=None
pd.options.display.max_rows=None
pd.options.display.max_columns=None
pd.options.display.width=None
%matplotlib inline

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
PUNCTUATIONS = string.punctuation
STOPWORDS = set(stopwords.words('english'))
STOPWORDS.remove('not')

In [4]:
train_filename = "train.tsv"
val_filename = "valid.tsv"

# Load data
train = pd.read_csv(train_filename, sep="\t")
validation = pd.read_csv(val_filename, sep="\t")
print(f"Shape of training data is {train.shape} and validation data is {validation.shape}")

Shape of training data is (9067, 3) and validation data is (500, 3)


In [5]:
# Train top 5 rows
train.head().style.set_caption("Task 3: Classify Covid Tweets")

Unnamed: 0,tweet_id,tweet,label
0,13729,A growing number of Covid-19 patients whose symptoms were initially mild are now facing mysterious long-term neurological problems https://t.co/If2SgRduuw,Lit-News_mentions
1,12399,"Medical experts advise that symptoms of the novel coronavirus include fever, shortness of breath, and stinky smelly pits and feet 😳🤪",Lit-News_mentions
2,20056,"@drdavidsamadi Hubby/I:same symptoms n November 2019 after a weekend trip 2 Vegas where bus loads of Chinese tourists.1 day fever,3 days sore throat,several weeks of fatigue.He's healthy,I'm not: diabetes,hypertension,obese, respiratory issues @ 53. No meds/pneumonia,we believe was COVID-19",Nonpersonal_reports
3,10175,"1/x In the April 11 BC briefing Dr. Bonnie Henry had mentioned that there's now reports of neurologic complications after COVID-19 infection, even during recovery. There's now anecdotal reporting of neurological manifestations from WUHAN patients in JAMA:https://t.co/7spTyk7l2M",Lit-News_mentions
4,12179,Major study PHOSP-COVID investigates health impacts of #COVID19 on hospitalised patients including #mentalhealth & neurological problems. Find out more https://t.co/JLTrz0BA7f @OxfordHealthNHS https://t.co/jZ2kPyPqmS,Lit-News_mentions


In [6]:
# Label transform
le = preprocessing.LabelEncoder()
train['label'] = le.fit_transform(train['label'])
validation['label'] = le.transform(validation['label'])

In [7]:
# Drop unwanted columns
train.drop(['tweet_id'], axis=1, inplace=True)
validation.drop(['tweet_id'], axis=1, inplace=True)

In [8]:
#@title Helper function for cleaning tweets
def clean_tweets(df, col):
    
    # Lower case
    df[col] = df[col].str.lower()

    # Remove URL's
    df[col] = df[col].str.replace(r'https?://\S+|www\.\S+', '', regex=True)
    
    # Remove Mentions and Hastags
    df[col] = df[col].apply(lambda x: remove_tags_mentions(x))

    # Remove redundant characters
    df[col] = df[col].apply(lambda x: remove_redundant_chars(x))

    # Remove emoji's
    df[col] = df[col].apply(lambda x: remove_emoji(x))

    # Remove emoticons
    df[col] = df[col].apply(lambda x: remove_emoticons(x))
    
    # Expand contractions
    df[col] = df[col].apply(lambda x: expand_contractions(x))
    
    # Remove stopwords
    df[col] = df[col].apply(lambda x: remove_stopwords(x))

    # Remove punctuations
    df[col] = df[col].apply(lambda x: remove_punctuation(x))

    # Remove repeated characters in tweet
    df[col] = df[col].apply(lambda x: remove_repeated_chars(x))

    # Remove single character within text
    df[col] = df[col].apply(lambda x: remove_single_char(x))

    # Remove extract whitespaces within tweet
    df[col] = df[col].apply(lambda x: remove_whitespaces(x))

    # Drop duplicate rows
    df.drop_duplicates(subset=['tweet'], inplace=True, keep='first')
    
    return df
    
def remove_tags_mentions(text):
    pattern = re.compile(r'(@\S+|#\S+)')
    return pattern.sub('', text)

def remove_redundant_chars(text):
    pattern = re.compile(r'(&gt|&amp|&lt|wtf|fuck\'n|®|©|rt)')
    return pattern.sub('', text)

# Reference: https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

EMOTICONS = {
    u":‑\)":"Happy face or smiley",
    u":\)":"Happy face or smiley",
    u":-\]":"Happy face or smiley",
    u":\]":"Happy face or smiley",
    u":-3":"Happy face smiley",
    u":3":"Happy face smiley",
    u":->":"Happy face smiley",
    u":>":"Happy face smiley",
    u"8-\)":"Happy face smiley",
    u":o\)":"Happy face smiley",
    u":-\}":"Happy face smiley",
    u":\}":"Happy face smiley",
    u":-\)":"Happy face smiley",
    u":c\)":"Happy face smiley",
    u":\^\)":"Happy face smiley",
    u"=\]":"Happy face smiley",
    u"=\)":"Happy face smiley",
    u":‑D":"Laughing, big grin or laugh with glasses",
    u":D":"Laughing, big grin or laugh with glasses",
    u"8‑D":"Laughing, big grin or laugh with glasses",
    u"8D":"Laughing, big grin or laugh with glasses",
    u"X‑D":"Laughing, big grin or laugh with glasses",
    u"XD":"Laughing, big grin or laugh with glasses",
    u"=D":"Laughing, big grin or laugh with glasses",
    u"=3":"Laughing, big grin or laugh with glasses",
    u"B\^D":"Laughing, big grin or laugh with glasses",
    u":-\)\)":"Very happy",
    u":‑\(":"Frown, sad, andry or pouting",
    u":-\(":"Frown, sad, andry or pouting",
    u":\(":"Frown, sad, andry or pouting",
    u":‑c":"Frown, sad, andry or pouting",
    u":c":"Frown, sad, andry or pouting",
    u":‑<":"Frown, sad, andry or pouting",
    u":<":"Frown, sad, andry or pouting",
    u":‑\[":"Frown, sad, andry or pouting",
    u":\[":"Frown, sad, andry or pouting",
    u":-\|\|":"Frown, sad, andry or pouting",
    u">:\[":"Frown, sad, andry or pouting",
    u":\{":"Frown, sad, andry or pouting",
    u":@":"Frown, sad, andry or pouting",
    u">:\(":"Frown, sad, andry or pouting",
    u":'‑\(":"Crying",
    u":'\(":"Crying",
    u":'‑\)":"Tears of happiness",
    u":'\)":"Tears of happiness",
    u"D‑':":"Horror",
    u"D:<":"Disgust",
    u"D:":"Sadness",
    u"D8":"Great dismay",
    u"D;":"Great dismay",
    u"D=":"Great dismay",
    u"DX":"Great dismay",
    u":‑O":"Surprise",
    u":O":"Surprise",
    u":‑o":"Surprise",
    u":o":"Surprise",
    u":-0":"Shock",
    u"8‑0":"Yawn",
    u">:O":"Yawn",
    u":-\*":"Kiss",
    u":\*":"Kiss",
    u":X":"Kiss",
    u";‑\)":"Wink or smirk",
    u";\)":"Wink or smirk",
    u"\*-\)":"Wink or smirk",
    u"\*\)":"Wink or smirk",
    u";‑\]":"Wink or smirk",
    u";\]":"Wink or smirk",
    u";\^\)":"Wink or smirk",
    u":‑,":"Wink or smirk",
    u";D":"Wink or smirk",
    u":‑P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"X‑P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"XP":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":‑Þ":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":Þ":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":b":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"d:":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"=p":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u">:P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":‑/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":-[.]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u">:[(\\\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u">:/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":[(\\\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=[(\\\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":L":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=L":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":S":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":‑\|":"Straight face",
    u":\|":"Straight face",
    u":$":"Embarrassed or blushing",
    u":‑x":"Sealed lips or wearing braces or tongue-tied",
    u":x":"Sealed lips or wearing braces or tongue-tied",
    u":‑#":"Sealed lips or wearing braces or tongue-tied",
    u":#":"Sealed lips or wearing braces or tongue-tied",
    u":‑&":"Sealed lips or wearing braces or tongue-tied",
    u":&":"Sealed lips or wearing braces or tongue-tied",
    u"O:‑\)":"Angel, saint or innocent",
    u"O:\)":"Angel, saint or innocent",
    u"0:‑3":"Angel, saint or innocent",
    u"0:3":"Angel, saint or innocent",
    u"0:‑\)":"Angel, saint or innocent",
    u"0:\)":"Angel, saint or innocent",
    u":‑b":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"0;\^\)":"Angel, saint or innocent",
    u">:‑\)":"Evil or devilish",
    u">:\)":"Evil or devilish",
    u"\}:‑\)":"Evil or devilish",
    u"\}:\)":"Evil or devilish",
    u"3:‑\)":"Evil or devilish",
    u"3:\)":"Evil or devilish",
    u">;\)":"Evil or devilish",
    u"\|;‑\)":"Cool",
    u"\|‑O":"Bored",
    u":‑J":"Tongue-in-cheek",
    u"#‑\)":"Party all night",
    u"%‑\)":"Drunk or confused",
    u"%\)":"Drunk or confused",
    u":-###..":"Being sick",
    u":###..":"Being sick",
    u"<:‑\|":"Dump",
    u"\(>_<\)":"Troubled",
    u"\(>_<\)>":"Troubled",
    u"\(';'\)":"Baby",
    u"\(\^\^>``":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(\^_\^;\)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(-_-;\)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(~_~;\) \(・\.・;\)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(-_-\)zzz":"Sleeping",
    u"\(\^_-\)":"Wink",
    u"\(\(\+_\+\)\)":"Confused",
    u"\(\+o\+\)":"Confused",
    u"\(o\|o\)":"Ultraman",
    u"\^_\^":"Joyful",
    u"\(\^_\^\)/":"Joyful",
    u"\(\^O\^\)／":"Joyful",
    u"\(\^o\^\)／":"Joyful",
    u"\(__\)":"Kowtow as a sign of respect, or dogeza for apology",
    u"_\(\._\.\)_":"Kowtow as a sign of respect, or dogeza for apology",
    u"<\(_ _\)>":"Kowtow as a sign of respect, or dogeza for apology",
    u"<m\(__\)m>":"Kowtow as a sign of respect, or dogeza for apology",
    u"m\(__\)m":"Kowtow as a sign of respect, or dogeza for apology",
    u"m\(_ _\)m":"Kowtow as a sign of respect, or dogeza for apology",
    u"\('_'\)":"Sad or Crying",
    u"\(/_;\)":"Sad or Crying",
    u"\(T_T\) \(;_;\)":"Sad or Crying",
    u"\(;_;":"Sad of Crying",
    u"\(;_:\)":"Sad or Crying",
    u"\(;O;\)":"Sad or Crying",
    u"\(:_;\)":"Sad or Crying",
    u"\(ToT\)":"Sad or Crying",
    u";_;":"Sad or Crying",
    u";-;":"Sad or Crying",
    u";n;":"Sad or Crying",
    u";;":"Sad or Crying",
    u"Q\.Q":"Sad or Crying",
    u"T\.T":"Sad or Crying",
    u"QQ":"Sad or Crying",
    u"Q_Q":"Sad or Crying",
    u"\(-\.-\)":"Shame",
    u"\(-_-\)":"Shame",
    u"\(一一\)":"Shame",
    u"\(；一_一\)":"Shame",
    u"\(=_=\)":"Tired",
    u"\(=\^\·\^=\)":"cat",
    u"\(=\^\·\·\^=\)":"cat",
    u"=_\^=	":"cat",
    u"\(\.\.\)":"Looking down",
    u"\(\._\.\)":"Looking down",
    u"\^m\^":"Giggling with hand covering mouth",
    u"\(\・\・?":"Confusion",
    u"\(?_?\)":"Confusion",
    u">\^_\^<":"Normal Laugh",
    u"<\^!\^>":"Normal Laugh",
    u"\^/\^":"Normal Laugh",
    u"\（\*\^_\^\*）" :"Normal Laugh",
    u"\(\^<\^\) \(\^\.\^\)":"Normal Laugh",
    u"\(^\^\)":"Normal Laugh",
    u"\(\^\.\^\)":"Normal Laugh",
    u"\(\^_\^\.\)":"Normal Laugh",
    u"\(\^_\^\)":"Normal Laugh",
    u"\(\^\^\)":"Normal Laugh",
    u"\(\^J\^\)":"Normal Laugh",
    u"\(\*\^\.\^\*\)":"Normal Laugh",
    u"\(\^—\^\）":"Normal Laugh",
    u"\(#\^\.\^#\)":"Normal Laugh",
    u"\（\^—\^\）":"Waving",
    u"\(;_;\)/~~~":"Waving",
    u"\(\^\.\^\)/~~~":"Waving",
    u"\(-_-\)/~~~ \($\·\·\)/~~~":"Waving",
    u"\(T_T\)/~~~":"Waving",
    u"\(ToT\)/~~~":"Waving",
    u"\(\*\^0\^\*\)":"Excited",
    u"\(\*_\*\)":"Amazed",
    u"\(\*_\*;":"Amazed",
    u"\(\+_\+\) \(@_@\)":"Amazed",
    u"\(\*\^\^\)v":"Laughing,Cheerful",
    u"\(\^_\^\)v":"Laughing,Cheerful",
    u"\(\(d[-_-]b\)\)":"Headphones,Listening to music",
    u'\(-"-\)':"Worried",
    u"\(ーー;\)":"Worried",
    u"\(\^0_0\^\)":"Eyeglasses",
    u"\(\＾ｖ\＾\)":"Happy",
    u"\(\＾ｕ\＾\)":"Happy",
    u"\(\^\)o\(\^\)":"Happy",
    u"\(\^O\^\)":"Happy",
    u"\(\^o\^\)":"Happy",
    u"\)\^o\^\(":"Happy",
    u":O o_O":"Surprised",
    u"o_0":"Surprised",
    u"o\.O":"Surpised",
    u"\(o\.o\)":"Surprised",
    u"oO":"Surprised",
    u"\(\*￣m￣\)":"Dissatisfied",
    u"\(‘A`\)":"Snubbed or Deflated"
}

def remove_emoticons(text):
    emoticons_pattern = re.compile(u'(' + u'|'.join(emo for emo in EMOTICONS) + u')')
    return emoticons_pattern.sub(r'', text)

def expand_contractions(text):
    return contractions.fix(text)

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCTUATIONS))

def remove_whitespaces(text):
    pattern = re.compile(r" +")
    return pattern.sub(" ", text).strip()

def remove_repeated_chars(text):
    return re.sub("(.)\\1{2,}", "\\1", text)

def remove_single_char(text):
    pattern = re.compile(r' ([a-zA-Z]{1}) ')
    return pattern.sub(" ", text)

def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in STOPWORDS])

In [9]:
train = clean_tweets(train, 'tweet').reset_index(drop=True)
validation = clean_tweets(validation, 'tweet').reset_index(drop=True)

### Doc2Vec

In [25]:
train_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(train.tweet)]
test_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(validation.tweet)]
model = Doc2Vec(epochs=30)
model.build_vocab(train_documents)

model.train(train_documents, total_examples=model.corpus_count, epochs=model.epochs)

def vector_for_learning(model, input_docs):
    sents = input_docs
    targets, feature_vectors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, feature_vectors

In [26]:
y_train_indexs, X_train = vector_for_learning(model, train_documents)
y_valid_indexs, X_valid = vector_for_learning(model, test_documents)

### Logistic Regression

In [12]:
logreg = LogisticRegression()
logreg.fit(X_train, train['label'])
y_pred = logreg.predict(X_valid)

print(f'Testing accuracy {accuracy_score(validation["label"], y_pred)}')
print(f'Testing classification report \n{classification_report(validation["label"], y_pred)}')
print(f'Testing confusion matrix \n{confusion_matrix(validation["label"], y_pred)}')
print(f'Testing F1 score: {f1_score(validation["label"], y_pred, average="micro")}')

Testing accuracy 0.7358490566037735
Testing classification report 
              precision    recall  f1-score   support

           0       0.83      0.82      0.82       224
           1       0.65      0.79      0.71       180
           2       0.71      0.34      0.46        73

    accuracy                           0.74       477
   macro avg       0.73      0.65      0.67       477
weighted avg       0.74      0.74      0.73       477

Testing confusion matrix 
[[183  40   1]
 [ 28 143   9]
 [ 10  38  25]]
Testing F1 score: 0.7358490566037735


In [27]:
svc = SVC()
svc.fit(X_train, train['label'])
y_pred = svc.predict(X_valid)

print(f'Testing accuracy {accuracy_score(validation["label"], y_pred)}')
print(f'Testing classification report \n{classification_report(validation["label"], y_pred)}')
print(f'Testing confusion matrix \n{confusion_matrix(validation["label"], y_pred)}')
print(f'Testing F1 score: {f1_score(validation["label"], y_pred, average="micro")}')

Testing accuracy 0.7568134171907757
Testing classification report 
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       224
           1       0.68      0.78      0.73       180
           2       0.78      0.38      0.51        73

    accuracy                           0.76       477
   macro avg       0.76      0.67      0.69       477
weighted avg       0.76      0.76      0.75       477

Testing confusion matrix 
[[192  31   1]
 [ 32 141   7]
 [ 10  35  28]]
Testing F1 score: 0.7568134171907758


In [28]:
linearSVC = LinearSVC()
linearSVC.fit(X_train, train['label'])
y_pred = linearSVC.predict(X_valid)

print(f'Testing accuracy {accuracy_score(validation["label"], y_pred)}')
print(f'Testing classification report \n{classification_report(validation["label"], y_pred)}')
print(f'Testing confusion matrix \n{confusion_matrix(validation["label"], y_pred)}')
print(f'Testing F1 score: {f1_score(validation["label"], y_pred, average="micro")}')

Testing accuracy 0.7358490566037735
Testing classification report 
              precision    recall  f1-score   support

           0       0.82      0.84      0.83       224
           1       0.64      0.81      0.71       180
           2       0.78      0.25      0.38        73

    accuracy                           0.74       477
   macro avg       0.75      0.63      0.64       477
weighted avg       0.75      0.74      0.72       477

Testing confusion matrix 
[[188  36   0]
 [ 30 145   5]
 [ 10  45  18]]
Testing F1 score: 0.7358490566037735


### SMOTETomek

In [13]:
smotetomek = SMOTETomek(random_state = 0)
X_train_resample, train_resample = smotetomek.fit_resample(X_train, train['label'])

### SMOTETomek + Logistic Regression

In [14]:
logreg = LogisticRegression(random_state=0)
logreg.fit(pd.DataFrame(X_train_resample), train_resample)
y_pred = logreg.predict(pd.DataFrame(X_valid))

print(f'Testing accuracy {accuracy_score(validation["label"], y_pred)}')
print(f'Testing classification report \n{classification_report(validation["label"], y_pred)}')
print(f'Testing confusion matrix \n{confusion_matrix(validation["label"], y_pred)}')
print(f'Testing F1 score: {f1_score(validation["label"], y_pred, average="micro")}')

Testing accuracy 0.6918238993710691
Testing classification report 
              precision    recall  f1-score   support

           0       0.84      0.81      0.83       224
           1       0.66      0.59      0.63       180
           2       0.42      0.56      0.48        73

    accuracy                           0.69       477
   macro avg       0.64      0.66      0.64       477
weighted avg       0.71      0.69      0.70       477

Testing confusion matrix 
[[182  31  11]
 [ 27 107  46]
 [  8  24  41]]
Testing F1 score: 0.6918238993710691


### Tf-Idf

In [31]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, ngram_range=(1, 2))

In [32]:
cv = 5

models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
    SVC(),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=42)
]

result_cv_df = pd.DataFrame(index=range(cv * len(models)))

scorer = make_scorer(f1_score, average='micro')

In [33]:
X_train = tfidf.fit_transform(train['tweet'])
X_valid = tfidf.transform(validation['tweet'])

In [18]:
cv_result = []

for model in models:
  model_name = model.__class__.__name__
  f1_score_cv = cross_val_score(model, X_train, train['label'], scoring=scorer, cv=cv)
  for fold_idx, score in enumerate(f1_score_cv):
    cv_result.append((model_name, fold_idx, score))
    
result_cv_df = pd.DataFrame(cv_result, columns=['model_name', 'fold_idx', 'f1_score'])

In [19]:
result_cv_df.groupby('model_name').f1_score.mean()

model_name
LinearSVC                 0.938457
LogisticRegression        0.931891
MultinomialNB             0.887214
RandomForestClassifier    0.771726
SVC                       0.933436
Name: f1_score, dtype: float64

### SVC

In [20]:
svc = SVC()
svc.fit(X_train, train['label'])
y_pred = svc.predict(X_valid)

print(f'Testing accuracy {accuracy_score(validation["label"], y_pred)}')
print(f'Testing classification report \n{classification_report(validation["label"], y_pred)}')
print(f'Testing confusion matrix \n{confusion_matrix(validation["label"], y_pred)}')
print(f'Testing F1 score: {f1_score(validation["label"], y_pred, average="micro")}')

Testing accuracy 0.9580712788259959
Testing classification report 
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       224
           1       0.96      0.94      0.95       180
           2       0.90      0.95      0.92        73

    accuracy                           0.96       477
   macro avg       0.94      0.95      0.95       477
weighted avg       0.96      0.96      0.96       477

Testing confusion matrix 
[[219   4   1]
 [  4 169   7]
 [  1   3  69]]
Testing F1 score: 0.9580712788259959


### Linear SVC

In [21]:
linearSVC = LinearSVC()
linearSVC.fit(X_train, train['label'])
y_pred = linearSVC.predict(X_valid)

print(f'Testing accuracy {accuracy_score(validation["label"], y_pred)}')
print(f'Testing classification report \n{classification_report(validation["label"], y_pred)}')
print(f'Testing confusion matrix \n{confusion_matrix(validation["label"], y_pred)}')
print(f'Testing F1 score: {f1_score(validation["label"], y_pred, average="micro")}')

Testing accuracy 0.960167714884696
Testing classification report 
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       224
           1       0.96      0.94      0.95       180
           2       0.93      0.93      0.93        73

    accuracy                           0.96       477
   macro avg       0.95      0.95      0.95       477
weighted avg       0.96      0.96      0.96       477

Testing confusion matrix 
[[220   3   1]
 [  6 170   4]
 [  1   4  68]]
Testing F1 score: 0.960167714884696


### Multinomial NB

In [23]:
mnb = MultinomialNB()
mnb.fit(X_train, train['label'])
y_pred = mnb.predict(X_valid)

print(f'Testing accuracy {accuracy_score(validation["label"], y_pred)}')
print(f'Testing classification report \n{classification_report(validation["label"], y_pred)}')
print(f'Testing confusion matrix \n{confusion_matrix(validation["label"], y_pred)}')
print(f'Testing F1 score: {f1_score(validation["label"], y_pred, average="micro")}')

Testing accuracy 0.9014675052410901
Testing classification report 
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       224
           1       0.82      0.96      0.88       180
           2       0.95      0.58      0.72        73

    accuracy                           0.90       477
   macro avg       0.91      0.83      0.86       477
weighted avg       0.91      0.90      0.90       477

Testing confusion matrix 
[[216   7   1]
 [  7 172   1]
 [  1  30  42]]
Testing F1 score: 0.9014675052410901


### Random Forest

In [35]:
rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
rf.fit(X_train, train['label'])
y_pred = rf.predict(X_valid)

print(f'Testing accuracy {accuracy_score(validation["label"], y_pred)}')
print(f'Testing classification report \n{classification_report(validation["label"], y_pred)}')
print(f'Testing confusion matrix \n{confusion_matrix(validation["label"], y_pred)}')
print(f'Testing F1 score: {f1_score(validation["label"], y_pred, average="micro")}')

Testing accuracy 0.7966457023060797
Testing classification report 
              precision    recall  f1-score   support

           0       0.95      0.93      0.94       224
           1       0.66      0.96      0.78       180
           2       0.00      0.00      0.00        73

    accuracy                           0.80       477
   macro avg       0.54      0.63      0.57       477
weighted avg       0.70      0.80      0.74       477

Testing confusion matrix 
[[208  16   0]
 [  8 172   0]
 [  2  71   0]]
Testing F1 score: 0.7966457023060797


### Best Model - Logistic Regression

In [22]:
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train, train['label'])
y_pred = logreg.predict(X_valid)

print(f'Testing accuracy {accuracy_score(validation["label"], y_pred)}')
print(f'Testing classification report \n{classification_report(validation["label"], y_pred)}')
print(f'Testing confusion matrix \n{confusion_matrix(validation["label"], y_pred)}')
print(f'Testing F1 score: {f1_score(validation["label"], y_pred, average="micro")}')

Testing accuracy 0.9643605870020965
Testing classification report 
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       224
           1       0.97      0.95      0.96       180
           2       0.93      0.95      0.94        73

    accuracy                           0.96       477
   macro avg       0.96      0.96      0.96       477
weighted avg       0.96      0.96      0.96       477

Testing confusion matrix 
[[220   3   1]
 [  5 171   4]
 [  1   3  69]]
Testing F1 score: 0.9643605870020965


In [66]:
le.inverse_transform([0,1,2])

array(['Lit-News_mentions', 'Nonpersonal_reports', 'Self_reports'],
      dtype=object)

In [72]:
y_pred.shape

(477,)

In [91]:
tempo = validation.reset_index(drop=True)

In [92]:
temp_val = tempo[y_pred != tempo["label"]]
temp_val = temp_val.iloc[:-3]
temp = pd.concat([temp_val.reset_index(drop=True), pd.DataFrame(y_pred[temp_val.index], columns=['y_pred'])], axis=1)

In [100]:
validation_check = pd.read_csv(val_filename, sep="\t")
validation_check

Unnamed: 0,tweet_id,tweet,label
0,13361,"Loss of taste and smell is best indicator of COVID-19, study shows https://t.co/wJ4B3fqoUh via @Harvard",Lit-News_mentions
1,20228,"Me and my girl swear we have already had COVID-19. We were sick for nearly a month, fever, cough, sore throat, the doctors told me I had the flu combined with bronchitis because some days I felt like I was drowning in chest mucus. Idk dawg....",Nonpersonal_reports
2,20120,It’s great to have some guidelines around fatigue management. Something I have found very present front he start of patients recovery from covid-19 #RCOT #COVIDrehab,Nonpersonal_reports
3,23604,"My partner &amp; I had #coronavirus in late February. We both recovered. I had high temperature to the point of hallucination and every time I coughed I was sick. I had no strength. Dean kept me supplied with water, even though he was pretty much wiped out by it. Nasty virus. https://t.co/0bXXekDXvz",Nonpersonal_reports
4,31412,"Covid week 13 update. Week 11 kidney pain on the wane 🥳 Vascular system now. Presenting as high BP (affecting brain speed, vision, tightness in veins). BP readings are ‘prehypertension’ apparently which is good news I think! Presumably just still inflammation not actually heart.",Self_reports
5,11975,Doctors are finding striking similarities between chronic fatigue syndrome and long-term coronavirus symptoms https://t.co/1xYCZj3YwX via @businessinsider,Lit-News_mentions
6,11788,"GRT DRIFFIELD NEWS CENTRE - Coronavirus: Four out of five with sudden loss of smell or taste had COVID-19, study finds - @greatdriffradio https://t.co/oXCrsmDb0I RT",Lit-News_mentions
7,12252,Doctors say they are being 'inundated' with calls about 'COVID toes' from people whose only coronavirus symptoms are bruises and rashes on their feet via https://t.co/VSUl7QLy5e https://t.co/zXB5DPZ4Xg,Lit-News_mentions
8,12091,CNBC looks into scientists reporting neurological damage in COVID-19 patients. More @ https://t.co/Ts6DsdW1ex,Lit-News_mentions
9,21789,"i was sick after traveling to stanford in december 2019 &amp; february 2020 for my sisters surgeries. on valentine’s day i had major shortness of breath, a bad cough, exhausted, &amp; i didn’t eat for 4 days. my temp even went up to 104. i’m convinced covid-19 was here for awhile.",Nonpersonal_reports


In [93]:
temp[temp.label == 0]

Unnamed: 0,tweet,label,y_pred
2,i crippling body aches fatigue could not concentrate longhauler sounds familiar,0,1
4,medical community right now ok longcovid post viral symptoms not faking illnessbut every yet unexplained illness 100 psycosomatic including post viral,0,2
6,70 percent patients sick enough admitted new york state’s largest hospital system not feverremember workplaces tell employees safe return work taking everybodys temperature,0,1
13,become familiar symptoms coronavirus medical professionals tell us common ones dry cough high fever shoness breath not doctor believe symptoms coronavirus,0,1


In [98]:
temp[temp.label == 1]

Unnamed: 0,tweet,label,y_pred
0,great guidelines around fatigue management something found present front sta patients recovery covid19,1,0
1,hi symptoms covid19 similar common cold flu symptoms are fatigue fever coughing stuffy nose sore throat diarrhea seek medical attention you child family member show signs,1,0
3,children usually showed milder respiratory symptoms asymptomatic loss taste sensation smell seldom repoed,1,0
9,images users put repository tissues organs lungs liver kidney hea patients infected covid19 similar types coronaviruses respiratory illnesses,1,0
10,wifes covid19 journey nurse day 1 coughday 2 headache later onday 3 fever bad headacheday 4 tested muscle aches feverday 5 chest pain breathlessness no fever day 6 dizziness diarrheaday 7 worse breathless dizzinessday 8 not bad today,1,2
11,one symptoms covid19 diarrhea,1,0


In [96]:
temp[temp.label == 2]

Unnamed: 0,tweet,label,y_pred
5,time much coronavirus not qualify test first symptoms sore throat sneezing dizziness think huge problem folk thinking not covid going cough never cough tested positive,2,1
7,rushed ae weds night suffering struggling breath chest pains 8hrs tests xrays oxygen allowed home told got worst covid19 would take time recover still cannot even walk dogs,2,1
8,tell personal experience fever contracted virus temp never went 99 respiratory issue problem severe dehydration extreme fatigue doctors scratched heads not knowing,2,1
12,scary read every symptom feeling since january doctors dismissed brain fog severe fatigue phantom vibrations loss memory here felt people also gotten sick 1,2,0


In [101]:
temp

Unnamed: 0,tweet,label,y_pred
0,great guidelines around fatigue management something found present front sta patients recovery covid19,1,0
1,hi symptoms covid19 similar common cold flu symptoms are fatigue fever coughing stuffy nose sore throat diarrhea seek medical attention you child family member show signs,1,0
2,i crippling body aches fatigue could not concentrate longhauler sounds familiar,0,1
3,children usually showed milder respiratory symptoms asymptomatic loss taste sensation smell seldom repoed,1,0
4,medical community right now ok longcovid post viral symptoms not faking illnessbut every yet unexplained illness 100 psycosomatic including post viral,0,2
5,time much coronavirus not qualify test first symptoms sore throat sneezing dizziness think huge problem folk thinking not covid going cough never cough tested positive,2,1
6,70 percent patients sick enough admitted new york state’s largest hospital system not feverremember workplaces tell employees safe return work taking everybodys temperature,0,1
7,rushed ae weds night suffering struggling breath chest pains 8hrs tests xrays oxygen allowed home told got worst covid19 would take time recover still cannot even walk dogs,2,1
8,tell personal experience fever contracted virus temp never went 99 respiratory issue problem severe dehydration extreme fatigue doctors scratched heads not knowing,2,1
9,images users put repository tissues organs lungs liver kidney hea patients infected covid19 similar types coronaviruses respiratory illnesses,1,0


In [63]:
temp_val.shape

(94, 2)