In [2]:
!pip install -q contractions transformers sent2vec
!pip install -q imbalanced-learn

[K     |████████████████████████████████| 3.8 MB 6.1 MB/s 
[K     |████████████████████████████████| 284 kB 51.6 MB/s 
[K     |████████████████████████████████| 106 kB 57.3 MB/s 
[K     |████████████████████████████████| 6.5 MB 43.4 MB/s 
[K     |████████████████████████████████| 895 kB 45.0 MB/s 
[K     |████████████████████████████████| 67 kB 5.6 MB/s 
[K     |████████████████████████████████| 596 kB 47.1 MB/s 
[?25h

In [3]:
import numpy as np
import re
import warnings
import contractions
import pandas as pd
import string
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb

from sent2vec.vectorizer import Vectorizer
from collections import Counter
from nltk.corpus import stopwords
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn import preprocessing
from imblearn.combine import SMOTETomek
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')

warnings.filterwarnings("ignore")
pd.options.display.max_colwidth=None
pd.options.display.max_rows=None
pd.options.display.max_columns=None
pd.options.display.width=None
%matplotlib inline

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
PUNCTUATIONS = string.punctuation
STOPWORDS = set(stopwords.words('english'))
STOPWORDS.remove('not')
stemmer = PorterStemmer()

In [5]:
train_filename = "train.tsv"
val_filename = "valid.tsv"

# Load data
train = pd.read_csv(train_filename, sep="\t", names=["tweet_id", "user_id", "tweet", "label"])
validation = pd.read_csv(val_filename, sep="\t")
print(f"Shape of training data is {train.shape} and validation data is {validation.shape}")

# Train top 5 rows
train.head().style.set_caption("Task 3: Classify Covid Tweets")

Shape of training data is (6465, 4) and validation data is (716, 4)


Unnamed: 0,tweet_id,user_id,tweet,label
0,1239172732690014208,2391447188,We’re parking at the airport and my mom rolled down the window to speak to an attendant and my dad immediately said “we have the coronavirus sir”,0
1,1223737201030246402,1200539436167159809,I really didn’t expect this will go wide this way. I hope safety & health for all people of #Chine & whole world. We are just trying to show some support & respect to them as much we can especially doctors who bravely facing the dirty #coronaVirus.,0
2,1239385333319389185,838382730,"For those who believe they are immortal and continue to go out to the park without paying attention to the order to remain at home, these are the x-rays of a 28-year-old boy intubated in the ICU in my hospital for #coronavirus. Hint: the lungs are black, white is pneumonia",1
3,1236209435241938945,780855138,My flight from Jordan back to the US stops in Paris 😂 will I be quarantined? Stay tuned to find out 😂😂 #coronavirus,0
4,1233855551605440514,337103373,I went to the movies and the air was on. Now I'm out to eat and Olive Garden has the air on. I see these establishments are doing their best to fight the coronavirus.,0


In [6]:
# Drop unwanted columns
train.drop(['tweet_id','user_id'], axis=1, inplace=True)
validation.drop(['tweet_id','user_id'], axis=1, inplace=True)

In [7]:
#@title Helper function for cleaning tweets
def clean_tweets(df, col):
    
    # Lower case
    df[col] = df[col].str.lower()

    # Remove URL's
    df[col] = df[col].str.replace(r'https?://\S+|www\.\S+', '', regex=True)
    
    # Remove Mentions and Hastags
    df[col] = df[col].apply(lambda x: remove_tags_mentions(x))

    # Remove redundant characters
    df[col] = df[col].apply(lambda x: remove_redundant_chars(x))

    # Remove emoji's
    df[col] = df[col].apply(lambda x: remove_emoji(x))

    # Remove emoticons
    df[col] = df[col].apply(lambda x: remove_emoticons(x))
    
    # Expand contractions
    df[col] = df[col].apply(lambda x: expand_contractions(x))
    
    # Remove stopwords
    df[col] = df[col].apply(lambda x: remove_stopwords(x))

    # Remove Numbers
    # df.text = df[col].str.replace(r'\d+', '', regex=True)

    # Remove punctuations
    df[col] = df[col].apply(lambda x: remove_punctuation(x))

    # Remove repeated characters in tweet
    df[col] = df[col].apply(lambda x: remove_repeated_chars(x))

    # Remove single character within text
    df[col] = df[col].apply(lambda x: remove_single_char(x))

    # Remove extract whitespaces within tweet
    df[col] = df[col].apply(lambda x: remove_whitespaces(x))

    # Drop duplicate rows
    df.drop_duplicates(subset=['tweet'], inplace=True, keep='first')

    # Perform Stemming
    # df[col] = df[col].apply(lambda text: stem_words(text))

    return df

def stem_words(text):
    return ' '.join([stemmer.stem(word) for word in text.split()])

def remove_tags_mentions(text):
    pattern = re.compile(r'(@\S+|#\S+)')
    return pattern.sub('', text)

def remove_redundant_chars(text):
    pattern = re.compile(r'(&gt|&amp|&lt|wtf|fuck\'n|®|©|rt)')
    return pattern.sub('', text)

# Reference: https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

EMOTICONS = {
    u":‑\)":"Happy face or smiley",
    u":\)":"Happy face or smiley",
    u":-\]":"Happy face or smiley",
    u":\]":"Happy face or smiley",
    u":-3":"Happy face smiley",
    u":3":"Happy face smiley",
    u":->":"Happy face smiley",
    u":>":"Happy face smiley",
    u"8-\)":"Happy face smiley",
    u":o\)":"Happy face smiley",
    u":-\}":"Happy face smiley",
    u":\}":"Happy face smiley",
    u":-\)":"Happy face smiley",
    u":c\)":"Happy face smiley",
    u":\^\)":"Happy face smiley",
    u"=\]":"Happy face smiley",
    u"=\)":"Happy face smiley",
    u":‑D":"Laughing, big grin or laugh with glasses",
    u":D":"Laughing, big grin or laugh with glasses",
    u"8‑D":"Laughing, big grin or laugh with glasses",
    u"8D":"Laughing, big grin or laugh with glasses",
    u"X‑D":"Laughing, big grin or laugh with glasses",
    u"XD":"Laughing, big grin or laugh with glasses",
    u"=D":"Laughing, big grin or laugh with glasses",
    u"=3":"Laughing, big grin or laugh with glasses",
    u"B\^D":"Laughing, big grin or laugh with glasses",
    u":-\)\)":"Very happy",
    u":‑\(":"Frown, sad, andry or pouting",
    u":-\(":"Frown, sad, andry or pouting",
    u":\(":"Frown, sad, andry or pouting",
    u":‑c":"Frown, sad, andry or pouting",
    u":c":"Frown, sad, andry or pouting",
    u":‑<":"Frown, sad, andry or pouting",
    u":<":"Frown, sad, andry or pouting",
    u":‑\[":"Frown, sad, andry or pouting",
    u":\[":"Frown, sad, andry or pouting",
    u":-\|\|":"Frown, sad, andry or pouting",
    u">:\[":"Frown, sad, andry or pouting",
    u":\{":"Frown, sad, andry or pouting",
    u":@":"Frown, sad, andry or pouting",
    u">:\(":"Frown, sad, andry or pouting",
    u":'‑\(":"Crying",
    u":'\(":"Crying",
    u":'‑\)":"Tears of happiness",
    u":'\)":"Tears of happiness",
    u"D‑':":"Horror",
    u"D:<":"Disgust",
    u"D:":"Sadness",
    u"D8":"Great dismay",
    u"D;":"Great dismay",
    u"D=":"Great dismay",
    u"DX":"Great dismay",
    u":‑O":"Surprise",
    u":O":"Surprise",
    u":‑o":"Surprise",
    u":o":"Surprise",
    u":-0":"Shock",
    u"8‑0":"Yawn",
    u">:O":"Yawn",
    u":-\*":"Kiss",
    u":\*":"Kiss",
    u":X":"Kiss",
    u";‑\)":"Wink or smirk",
    u";\)":"Wink or smirk",
    u"\*-\)":"Wink or smirk",
    u"\*\)":"Wink or smirk",
    u";‑\]":"Wink or smirk",
    u";\]":"Wink or smirk",
    u";\^\)":"Wink or smirk",
    u":‑,":"Wink or smirk",
    u";D":"Wink or smirk",
    u":‑P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"X‑P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"XP":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":‑Þ":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":Þ":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":b":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"d:":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"=p":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u">:P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":‑/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":-[.]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u">:[(\\\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u">:/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":[(\\\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=[(\\\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":L":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=L":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":S":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":‑\|":"Straight face",
    u":\|":"Straight face",
    u":$":"Embarrassed or blushing",
    u":‑x":"Sealed lips or wearing braces or tongue-tied",
    u":x":"Sealed lips or wearing braces or tongue-tied",
    u":‑#":"Sealed lips or wearing braces or tongue-tied",
    u":#":"Sealed lips or wearing braces or tongue-tied",
    u":‑&":"Sealed lips or wearing braces or tongue-tied",
    u":&":"Sealed lips or wearing braces or tongue-tied",
    u"O:‑\)":"Angel, saint or innocent",
    u"O:\)":"Angel, saint or innocent",
    u"0:‑3":"Angel, saint or innocent",
    u"0:3":"Angel, saint or innocent",
    u"0:‑\)":"Angel, saint or innocent",
    u"0:\)":"Angel, saint or innocent",
    u":‑b":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"0;\^\)":"Angel, saint or innocent",
    u">:‑\)":"Evil or devilish",
    u">:\)":"Evil or devilish",
    u"\}:‑\)":"Evil or devilish",
    u"\}:\)":"Evil or devilish",
    u"3:‑\)":"Evil or devilish",
    u"3:\)":"Evil or devilish",
    u">;\)":"Evil or devilish",
    u"\|;‑\)":"Cool",
    u"\|‑O":"Bored",
    u":‑J":"Tongue-in-cheek",
    u"#‑\)":"Party all night",
    u"%‑\)":"Drunk or confused",
    u"%\)":"Drunk or confused",
    u":-###..":"Being sick",
    u":###..":"Being sick",
    u"<:‑\|":"Dump",
    u"\(>_<\)":"Troubled",
    u"\(>_<\)>":"Troubled",
    u"\(';'\)":"Baby",
    u"\(\^\^>``":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(\^_\^;\)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(-_-;\)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(~_~;\) \(・\.・;\)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(-_-\)zzz":"Sleeping",
    u"\(\^_-\)":"Wink",
    u"\(\(\+_\+\)\)":"Confused",
    u"\(\+o\+\)":"Confused",
    u"\(o\|o\)":"Ultraman",
    u"\^_\^":"Joyful",
    u"\(\^_\^\)/":"Joyful",
    u"\(\^O\^\)／":"Joyful",
    u"\(\^o\^\)／":"Joyful",
    u"\(__\)":"Kowtow as a sign of respect, or dogeza for apology",
    u"_\(\._\.\)_":"Kowtow as a sign of respect, or dogeza for apology",
    u"<\(_ _\)>":"Kowtow as a sign of respect, or dogeza for apology",
    u"<m\(__\)m>":"Kowtow as a sign of respect, or dogeza for apology",
    u"m\(__\)m":"Kowtow as a sign of respect, or dogeza for apology",
    u"m\(_ _\)m":"Kowtow as a sign of respect, or dogeza for apology",
    u"\('_'\)":"Sad or Crying",
    u"\(/_;\)":"Sad or Crying",
    u"\(T_T\) \(;_;\)":"Sad or Crying",
    u"\(;_;":"Sad of Crying",
    u"\(;_:\)":"Sad or Crying",
    u"\(;O;\)":"Sad or Crying",
    u"\(:_;\)":"Sad or Crying",
    u"\(ToT\)":"Sad or Crying",
    u";_;":"Sad or Crying",
    u";-;":"Sad or Crying",
    u";n;":"Sad or Crying",
    u";;":"Sad or Crying",
    u"Q\.Q":"Sad or Crying",
    u"T\.T":"Sad or Crying",
    u"QQ":"Sad or Crying",
    u"Q_Q":"Sad or Crying",
    u"\(-\.-\)":"Shame",
    u"\(-_-\)":"Shame",
    u"\(一一\)":"Shame",
    u"\(；一_一\)":"Shame",
    u"\(=_=\)":"Tired",
    u"\(=\^\·\^=\)":"cat",
    u"\(=\^\·\·\^=\)":"cat",
    u"=_\^=	":"cat",
    u"\(\.\.\)":"Looking down",
    u"\(\._\.\)":"Looking down",
    u"\^m\^":"Giggling with hand covering mouth",
    u"\(\・\・?":"Confusion",
    u"\(?_?\)":"Confusion",
    u">\^_\^<":"Normal Laugh",
    u"<\^!\^>":"Normal Laugh",
    u"\^/\^":"Normal Laugh",
    u"\（\*\^_\^\*）" :"Normal Laugh",
    u"\(\^<\^\) \(\^\.\^\)":"Normal Laugh",
    u"\(^\^\)":"Normal Laugh",
    u"\(\^\.\^\)":"Normal Laugh",
    u"\(\^_\^\.\)":"Normal Laugh",
    u"\(\^_\^\)":"Normal Laugh",
    u"\(\^\^\)":"Normal Laugh",
    u"\(\^J\^\)":"Normal Laugh",
    u"\(\*\^\.\^\*\)":"Normal Laugh",
    u"\(\^—\^\）":"Normal Laugh",
    u"\(#\^\.\^#\)":"Normal Laugh",
    u"\（\^—\^\）":"Waving",
    u"\(;_;\)/~~~":"Waving",
    u"\(\^\.\^\)/~~~":"Waving",
    u"\(-_-\)/~~~ \($\·\·\)/~~~":"Waving",
    u"\(T_T\)/~~~":"Waving",
    u"\(ToT\)/~~~":"Waving",
    u"\(\*\^0\^\*\)":"Excited",
    u"\(\*_\*\)":"Amazed",
    u"\(\*_\*;":"Amazed",
    u"\(\+_\+\) \(@_@\)":"Amazed",
    u"\(\*\^\^\)v":"Laughing,Cheerful",
    u"\(\^_\^\)v":"Laughing,Cheerful",
    u"\(\(d[-_-]b\)\)":"Headphones,Listening to music",
    u'\(-"-\)':"Worried",
    u"\(ーー;\)":"Worried",
    u"\(\^0_0\^\)":"Eyeglasses",
    u"\(\＾ｖ\＾\)":"Happy",
    u"\(\＾ｕ\＾\)":"Happy",
    u"\(\^\)o\(\^\)":"Happy",
    u"\(\^O\^\)":"Happy",
    u"\(\^o\^\)":"Happy",
    u"\)\^o\^\(":"Happy",
    u":O o_O":"Surprised",
    u"o_0":"Surprised",
    u"o\.O":"Surpised",
    u"\(o\.o\)":"Surprised",
    u"oO":"Surprised",
    u"\(\*￣m￣\)":"Dissatisfied",
    u"\(‘A`\)":"Snubbed or Deflated"
}

def remove_emoticons(text):
    emoticons_pattern = re.compile(u'(' + u'|'.join(emo for emo in EMOTICONS) + u')')
    return emoticons_pattern.sub(r'', text)

def expand_contractions(text):
    return contractions.fix(text)

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCTUATIONS))

def remove_whitespaces(text):
    pattern = re.compile(r" +")
    return pattern.sub(" ", text).strip()

def remove_repeated_chars(text):
    return re.sub("(.)\\1{2,}", "\\1", text)

def remove_single_char(text):
    pattern = re.compile(r' ([a-zA-Z]{1}) ')
    return pattern.sub(" ", text)

def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in STOPWORDS])

In [8]:
train = clean_tweets(train, 'tweet')
validation = clean_tweets(validation, 'tweet')

In [9]:
train_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(train.tweet)]
test_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(validation.tweet)]

model = Doc2Vec(epochs=30)
model.build_vocab(train_documents)

model.train(train_documents, total_examples=model.corpus_count, epochs=model.epochs)

def vector_for_learning(model, input_docs):
    sents = input_docs
    targets, feature_vectors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, feature_vectors

In [10]:
y_train_indexs, X_train = vector_for_learning(model, train_documents)
y_valid_indexs, X_valid = vector_for_learning(model, test_documents)

### Logistic Regression

In [11]:
logreg = LogisticRegression()
logreg.fit(X_train, train['label'])
y_pred = logreg.predict(X_valid)

print(f'Testing accuracy {accuracy_score(validation["label"], y_pred)}')
print(f'Testing classification report \n{classification_report(validation["label"], y_pred)}')
print(f'Testing confusion matrix \n{confusion_matrix(validation["label"], y_pred)}')
print(f'Testing F1 score: {f1_score(validation["label"], y_pred)}')

Testing accuracy 0.8293706293706293
Testing classification report 
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       593
           1       0.00      0.00      0.00       122

    accuracy                           0.83       715
   macro avg       0.41      0.50      0.45       715
weighted avg       0.69      0.83      0.75       715

Testing confusion matrix 
[[593   0]
 [122   0]]
Testing F1 score: 0.0


In [12]:
svc = SVC()
svc.fit(X_train, train['label'])
y_pred = svc.predict(X_valid)

print(f'Testing accuracy {accuracy_score(validation["label"], y_pred)}')
print(f'Testing classification report \n{classification_report(validation["label"], y_pred)}')
print(f'Testing confusion matrix \n{confusion_matrix(validation["label"], y_pred)}')
print(f'Testing F1 score: {f1_score(validation["label"], y_pred, average="micro")}')

Testing accuracy 0.8293706293706293
Testing classification report 
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       593
           1       0.00      0.00      0.00       122

    accuracy                           0.83       715
   macro avg       0.41      0.50      0.45       715
weighted avg       0.69      0.83      0.75       715

Testing confusion matrix 
[[593   0]
 [122   0]]
Testing F1 score: 0.8293706293706293


In [13]:
linearSVC = LinearSVC()
linearSVC.fit(X_train, train['label'])
y_pred = linearSVC.predict(X_valid)

print(f'Testing accuracy {accuracy_score(validation["label"], y_pred)}')
print(f'Testing classification report \n{classification_report(validation["label"], y_pred)}')
print(f'Testing confusion matrix \n{confusion_matrix(validation["label"], y_pred)}')
print(f'Testing F1 score: {f1_score(validation["label"], y_pred, average="micro")}')

Testing accuracy 0.8293706293706293
Testing classification report 
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       593
           1       0.00      0.00      0.00       122

    accuracy                           0.83       715
   macro avg       0.41      0.50      0.45       715
weighted avg       0.69      0.83      0.75       715

Testing confusion matrix 
[[593   0]
 [122   0]]
Testing F1 score: 0.8293706293706293


### SMOTETomek

In [14]:
smotetomek = SMOTETomek(random_state = 0)
X_train_resample, train_resample = smotetomek.fit_resample(X_train, train['label'])

### SMOTETomek + Logistic Regression

In [36]:
logreg = LogisticRegression(random_state=0)
logreg.fit(pd.DataFrame(X_train_resample), train_resample)
y_pred = logreg.predict(pd.DataFrame(X_valid))

print(f'Testing accuracy {accuracy_score(validation["label"], y_pred)}')
print(f'Testing classification report \n{classification_report(validation["label"], y_pred)}')
print(f'Testing confusion matrix \n{confusion_matrix(validation["label"], y_pred)}')
print(f'Testing F1 score: {f1_score(validation["label"], y_pred)}')

Testing accuracy 0.6167832167832168
Testing classification report 
              precision    recall  f1-score   support

           0       0.88      0.63      0.73       593
           1       0.24      0.57      0.34       122

    accuracy                           0.62       715
   macro avg       0.56      0.60      0.53       715
weighted avg       0.77      0.62      0.66       715

Testing confusion matrix 
[[371 222]
 [ 52  70]]
Testing F1 score: 0.3381642512077294


### SMOTETomek + Linear SVC

In [16]:
logreg = LinearSVC()
logreg.fit(pd.DataFrame(X_train_resample), train_resample)
y_pred = logreg.predict(pd.DataFrame(X_valid))

print(f'Testing accuracy {accuracy_score(validation["label"], y_pred)}')
print(f'Testing classification report \n{classification_report(validation["label"], y_pred)}')
print(f'Testing confusion matrix \n{confusion_matrix(validation["label"], y_pred)}')
print(f'Testing F1 score: {f1_score(validation["label"], y_pred)}')

Testing accuracy 0.6181818181818182
Testing classification report 
              precision    recall  f1-score   support

           0       0.87      0.63      0.73       593
           1       0.24      0.56      0.33       122

    accuracy                           0.62       715
   macro avg       0.56      0.59      0.53       715
weighted avg       0.77      0.62      0.66       715

Testing confusion matrix 
[[374 219]
 [ 54  68]]
Testing F1 score: 0.33251833740831294


In [17]:
logreg = RandomForestClassifier(random_state=0)
logreg.fit(pd.DataFrame(X_train_resample), train_resample)
y_pred = logreg.predict(pd.DataFrame(X_valid))

print(f'Testing accuracy {accuracy_score(validation["label"], y_pred)}')
print(f'Testing classification report \n{classification_report(validation["label"], y_pred)}')
print(f'Testing confusion matrix \n{confusion_matrix(validation["label"], y_pred)}')
print(f'Testing F1 score: {f1_score(validation["label"], y_pred)}')

Testing accuracy 0.7748251748251749
Testing classification report 
              precision    recall  f1-score   support

           0       0.83      0.91      0.87       593
           1       0.21      0.11      0.15       122

    accuracy                           0.77       715
   macro avg       0.52      0.51      0.51       715
weighted avg       0.73      0.77      0.75       715

Testing confusion matrix 
[[540  53]
 [108  14]]
Testing F1 score: 0.14814814814814814


In [18]:
logreg = SVC(random_state=0)
logreg.fit(pd.DataFrame(X_train_resample), train_resample)
y_pred = logreg.predict(pd.DataFrame(X_valid))

print(f'Testing accuracy {accuracy_score(validation["label"], y_pred)}')
print(f'Testing classification report \n{classification_report(validation["label"], y_pred)}')
print(f'Testing confusion matrix \n{confusion_matrix(validation["label"], y_pred)}')
print(f'Testing F1 score: {f1_score(validation["label"], y_pred)}')

Testing accuracy 0.6447552447552447
Testing classification report 
              precision    recall  f1-score   support

           0       0.87      0.67      0.76       593
           1       0.24      0.51      0.33       122

    accuracy                           0.64       715
   macro avg       0.56      0.59      0.54       715
weighted avg       0.76      0.64      0.69       715

Testing confusion matrix 
[[399 194]
 [ 60  62]]
Testing F1 score: 0.328042328042328


### TF-IDF

In [19]:
vectorizer = TfidfVectorizer()

In [20]:
cv = 5

models = [
    RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
    SVC(),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=42)
]

result_cv_df = pd.DataFrame(index=range(cv * len(models)))

scorer = make_scorer(f1_score)

In [21]:
train_documents = vectorizer.fit_transform(train.tweet)
test_documents = vectorizer.transform(validation.tweet)

In [22]:
cv_result = []

for model in models:
  model_name = model.__class__.__name__
  f1_score_cv = cross_val_score(model, train_documents, train['label'], scoring=scorer, cv=cv)
  for fold_idx, score in enumerate(f1_score_cv):
    cv_result.append((model_name, fold_idx, score))
    
result_cv_df = pd.DataFrame(cv_result, columns=['model_name', 'fold_idx', 'f1_score'])

In [23]:
result_cv_df.groupby('model_name').f1_score.mean()

model_name
LinearSVC                 0.444973
LogisticRegression        0.235296
MultinomialNB             0.011594
RandomForestClassifier    0.000000
SVC                       0.200674
Name: f1_score, dtype: float64

### Best Model - Linear SVC

In [37]:
linearSVC = LinearSVC()
linearSVC.fit(train_documents, train['label'])
y_pred = linearSVC.predict(test_documents)

print(f'Testing accuracy {accuracy_score(validation["label"], y_pred)}')
print(f'Testing classification report \n{classification_report(validation["label"], y_pred)}')
print(f'Testing confusion matrix \n{confusion_matrix(validation["label"], y_pred)}')
print(f'Testing F1 score: {f1_score(validation["label"], y_pred)}')

Testing accuracy 0.8699300699300699
Testing classification report 
              precision    recall  f1-score   support

           0       0.89      0.96      0.92       593
           1       0.70      0.41      0.52       122

    accuracy                           0.87       715
   macro avg       0.80      0.69      0.72       715
weighted avg       0.86      0.87      0.86       715

Testing confusion matrix 
[[572  21]
 [ 72  50]]
Testing F1 score: 0.5181347150259068


In [38]:
tempo = validation.reset_index(drop=True)

In [39]:
temp_val = tempo[y_pred != tempo["label"]]
temp_val = temp_val
temp = pd.concat([temp_val.reset_index(drop=True), pd.DataFrame(y_pred[temp_val.index], columns=['y_pred'])], axis=1)

In [42]:
temp

Unnamed: 0,tweet,label,y_pred
0,exactly spent hours investigating issue staed kids self quarantined symptoms question someone elderly people get tested still answers,1,0
1,anyway still sick flu not coronavirus back bed me the banality media racism thread day,1,0
2,time 0307the medical team trying get get tested likely not tomorrow remain quarantine 72 hours results come back depending results next steps play accordingly,1,0
3,today came across first customer self isolating due coronavirus thankfully not get close vicinity infected seems sent home self isolate family appreciative visit paid,1,0
4,way knowing december present pneumonia respiratory flu whatever cases coronavirus tie not testing,0,1
5,woke california cough guess coronavirus,1,0
6,realize cruise coronavirus rampant god make us wash hands time stand cafe door spray hands singing “washy washy” you so that,1,0
7,coronavirus whatever causing pain sinuses,1,0
8,confirmed case coronavirus one hospitals personally taken care tb meningitis prion diseases ya know influenza patients really concerned job work unit quarantineddesignated virus comes is,1,0
9,laying bed flu admit sick hearing coronavirus not fathom hearing anything worst flu,1,0


In [43]:
validation_check = pd.read_csv(val_filename, sep="\t")

In [45]:
validation_check

Unnamed: 0,tweet_id,user_id,tweet,label
0,1236914850728534018,23560081,"I went to a buffet, a football game with 20,000 people, and karaoke today. If I don't have the #coronavirus after this, it don't exist or I'm already immune. Stay tuned!",0
1,1233851573995884545,1268665964,people at the airport are going to think i have the damn Coronavirus... just wonderful,0
2,1238080702836543489,1119974425,@Swanny1875 I've had a cough but there again I always get a cough. The doc said that if you're able to take a deep breath then it's definitely not coronavirus.,1
3,1223482298902822914,209123498,Soon as I hear a cough I’m assuming it’s the Coronavirus back tf up !,0
4,1234868935096492032,66518109,"Can I self quarantine from people over obsessing about the Coronavirus!? Yes, we it Sharon! You read the Sun!",0
5,1237104881166680073,380285402,"Girl, 7, who lives in The Bronx is the YOUNGEST person in the US to be diagnosed with coronavirus https://t.co/cr1iYH7FUZ",0
6,1236086887418187776,30757332,When I went to buy a few bottles of hand sanitizer at Walmart 2 weeks ago there were only 3 on the shelf. I assumed they were other doomsday prepers. My husband thought I was being ridiculous. Now I’m seeing this on @eBay &amp; I’m happy with my decision. #handsanitizer #coronavirus https://t.co/RkfHjJDoc2,0
7,1236016473849466880,919016771486486529,"Exactly @JasonZocchi ! I have spent hours investigating this issue. It all started with my kids I self quarantined when they had all the symptoms, then a question from someone about where elderly people can get tested. Still no answers. #CoronavirusUSA #coronavirus #orleg",1
8,1230123002890723329,250722404,"@specterm Not really. I have had the flu shot which should limit the extent of my illness, I know the treatment protocol for it, and it’s less likely to be “hidden” so that I can take adequate precautions to protect my family and my newborn. The unknowns of coronavirus make me nervous.",0
9,1221291720890912768,991749720124084224,If you been to China in th past month stay in your home and don’t come in contact with other people. I don’t need the coronavirus spreading to Nevada,0


In [52]:
# Most frequent words in tweets having adverse reactions
counter_pos = Counter([word for tweet in validation_check[validation_check['label'] == 1].tweet.tolist() for word in tweet.split() if word.lower() not in STOPWORDS])

In [53]:
counter_pos.most_common(20)

[('coronavirus', 42),
 ('#coronavirus', 28),
 ('get', 21),
 ('got', 21),
 ('not', 20),
 ('coughing', 20),
 ('sick', 19),
 ('people', 18),
 ('cough', 16),
 ('back', 15),
 ('know', 15),
 ('Coronavirus', 15),
 ('coronavirus.', 14),
 ('flight', 14),
 ('symptoms', 13),
 ('I’m', 12),
 ('tested', 11),
 ('like', 10),
 ('said', 9),
 ("I'm", 9)]

In [54]:
# Most frequent words in tweets having adverse reactions
counter_neg = Counter([word for tweet in validation_check[validation_check['label'] == 0].tweet.tolist() for word in tweet.split() if word.lower() not in STOPWORDS])

In [56]:
counter_neg.most_common(50)

[('coronavirus', 241),
 ('#coronavirus', 126),
 ('not', 114),
 ('people', 96),
 ('get', 90),
 ('Coronavirus', 81),
 ('sick', 75),
 ('I’m', 61),
 ('US', 54),
 ('&amp;', 52),
 ('like', 47),
 ('home', 46),
 ('work', 45),
 ("I'm", 40),
 ('need', 40),
 ('don’t', 39),
 ('going', 38),
 ('go', 38),
 ('us', 37),
 ('know', 35),
 ('quarantine', 32),
 ('got', 30),
 ('went', 29),
 ('it’s', 29),
 ('think', 28),
 ('hospital', 28),
 ('flights', 28),
 ('one', 27),
 ('time', 26),
 ('stay', 25),
 ('want', 25),
 ('even', 25),
 ('symptoms', 25),
 ('quarantined', 24),
 ('coronavirus.', 24),
 ('cough', 22),
 ('make', 22),
 ('told', 22),
 ('would', 22),
 ('cruise', 22),
 ('China', 21),
 ('every', 21),
 ('still', 21),
 ('💀', 21),
 ('back', 20),
 ('working', 20),
 ('2', 19),
 ('say', 19),
 ('airport', 18),
 ('doctor', 18)]

In [25]:
svc = SVC()
svc.fit(X_train, train['label'])
y_pred = svc.predict(X_valid)

print(f'Testing accuracy {accuracy_score(validation["label"], y_pred)}')
print(f'Testing classification report \n{classification_report(validation["label"], y_pred)}')
print(f'Testing confusion matrix \n{confusion_matrix(validation["label"], y_pred)}')
print(f'Testing F1 score: {f1_score(validation["label"], y_pred, average="micro")}')

Testing accuracy 0.8293706293706293
Testing classification report 
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       593
           1       0.00      0.00      0.00       122

    accuracy                           0.83       715
   macro avg       0.41      0.50      0.45       715
weighted avg       0.69      0.83      0.75       715

Testing confusion matrix 
[[593   0]
 [122   0]]
Testing F1 score: 0.8293706293706293


### Random Forest

In [26]:
rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
rf.fit(X_train, train['label'])
y_pred = rf.predict(X_valid)

print(f'Testing accuracy {accuracy_score(validation["label"], y_pred)}')
print(f'Testing classification report \n{classification_report(validation["label"], y_pred)}')
print(f'Testing confusion matrix \n{confusion_matrix(validation["label"], y_pred)}')
print(f'Testing F1 score: {f1_score(validation["label"], y_pred, average="micro")}')

Testing accuracy 0.8293706293706293
Testing classification report 
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       593
           1       0.00      0.00      0.00       122

    accuracy                           0.83       715
   macro avg       0.41      0.50      0.45       715
weighted avg       0.69      0.83      0.75       715

Testing confusion matrix 
[[593   0]
 [122   0]]
Testing F1 score: 0.8293706293706293


### Logistic Regression

In [27]:
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train, train['label'])
y_pred = logreg.predict(X_valid)

print(f'Testing accuracy {accuracy_score(validation["label"], y_pred)}')
print(f'Testing classification report \n{classification_report(validation["label"], y_pred)}')
print(f'Testing confusion matrix \n{confusion_matrix(validation["label"], y_pred)}')
print(f'Testing F1 score: {f1_score(validation["label"], y_pred, average="micro")}')

Testing accuracy 0.8293706293706293
Testing classification report 
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       593
           1       0.00      0.00      0.00       122

    accuracy                           0.83       715
   macro avg       0.41      0.50      0.45       715
weighted avg       0.69      0.83      0.75       715

Testing confusion matrix 
[[593   0]
 [122   0]]
Testing F1 score: 0.8293706293706293


In [28]:
# from sklearn.model_selection import StratifiedShuffleSplit

In [29]:
# train_set = train.copy(deep=True)

In [30]:
# sss = StratifiedShuffleSplit(n_splits=5, test_size=0.5, random_state=0)

In [31]:
# train_documents = vectorizer.fit_transform(train_set.tweet)
# train_documents = pd.concat([train_set, pd.DataFrame(train_documents, columns=['vec_tweet'])], axis=1)

In [32]:
# train_documents['vec_tweet'].head()

In [33]:
# positive_class = train[train.label == 1].tweet
# negative_class = train[train.label == 0].tweet

# for fold in range(5):
#     if fold != 4:
#         fold_part = negative_class.iloc[1025*fold:1025(fold+1)]
#     else:
#         fold_part = negative_class.iloc[1025*fold:]
#     fold_part = pd.concat([fold_part, positive_class]).reset_index(drop=True).sample(frac=1)
#     linearSVC = LinearSVC()
#     linearSVC.fit(train_documents, train['label'])
#     y_pred = linearSVC.predict(test_documents)

In [34]:
train.label.value_counts()

0    5316
1    1025
Name: label, dtype: int64

In [35]:
5316/1025

5.186341463414634