In [None]:
import numpy as np
import pandas as pd
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import tensorflow as tf
import torch 
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torch.utils.data.sampler import SequentialSampler
from transformers import BertTokenizer
import transformers as ppb 

import warnings
warnings.filterwarnings('ignore')


# Prefer the GPU whenever PyTorch reports one; otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

class CovidDataset(Dataset):
    """Tweet dataset yielding BERT inputs plus hand-crafted surface features.

    The constructor reads a tab-separated file with ``Text`` and ``Label``
    columns, maps the labels ``INFORMATIVE``/``UNINFORMATIVE`` to 1/0,
    computes the hand-crafted features on the raw text and then normalises
    the text (URL/mention removal, entity decoding, punctuation spacing).

    Attributes set by ``__init__``:
        tokenizer: ``bert-base-uncased`` tokenizer.
        df: frame with ``sentence``, ``label`` and every feature column.
        hand_features / hand_features_DF: the feature columns listed in
            ``_HAND_FEATURE_COLUMNS``.
        sentences: the cleaned ``sentence`` column.
        labels: numpy array of the label column.
        maxlen: fixed token length every item is padded/truncated to.

    Items are moved to the module-level ``device`` before being returned.
    """

    # Order of the hand-crafted feature vector returned by __getitem__.
    _HAND_FEATURE_COLUMNS = ['nchars', 'nwords', 'bhash', 'nhash', 'blink',
                             'nlink', 'bat', 'nat', 'rt', 'slang', 'dlex']

    _LABEL_IDS = {'INFORMATIVE': 1, 'UNINFORMATIVE': 0}

    # Raw string: the pattern contains `\(` which is an invalid escape in a
    # normal string literal.
    _URL_PATTERN = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    ## List of  US slangs.
    _SLANG = ['ASAP', 'BBIAB', 'BBL', 'BBS', 'BF', 'BFF', 'BFFL', 'BRB', 'CYA', 'DS', 'FAQ', 'FB', 'FITBLR', 'FLBP', 'FML', 'FTFY', 'FTW', 'FYI', 'G2G', 'GF', 'GR8', 'GTFO', 'HBIC', 'HML', 'HRU', 'HTH', 'IDK', 'IGHT', 'IMO', 'IMHO', 'IMY', 'IRL', 'ISTG', 'JK', 'JMHO', 'KTHX', 'L8R', 'LMAO', 'LMFAO', 'LMK', 'LOL', 'MWF', 'NM', 'NOOB', 'NP', 'NSFW', 'OOAK', 'OFC', 'OMG', 'ORLY', 'OTOH', 'RN', 'ROFL', 'RUH', 'SFW', 'SOML', 'SOZ', 'STFU', 'TFTI', 'TIL', 'TMI', 'TTFN', 'TTYL', 'TWSS', 'U', 'W/', 'WB', 'W/O', 'WYD', 'WTH', 'WTF', 'WYM', 'WYSIWYG', 'Y', 'YMMV', 'YW', 'YWA']

    # Emoticons as regex alternatives (parentheses escaped).
    _HAPPY_EMOJIS = [r':\)', r';\)', r'\(:']
    _SAD_EMOJIS = [r':\(', r';\(', r'\):']

    # (pattern, replacement) pairs applied in this order by _clean_sentences.
    _CLEANING_STEPS = (
        (r'http(\S)+', r''),
        # NOTE(review): as a regex this removes "http " plus the next three
        # characters; it may have been meant as a literal "http ..." - confirm.
        (r'http ...', r''),
        (r'(RT|rt)[ ]*@[ ]*[\S]+', r''),
        (r'@[\S]+', r''),
        (r'_[\S]?', r''),
        # `{2,}` must not contain a space, otherwise `re` treats the braces
        # as literal text and runs of spaces are never collapsed.
        (r'[ ]{2,}', r' '),
        (r'&amp;?', r'and'),
        (r'&lt;', r'<'),
        (r'&gt;', r'>'),
        # Put a space between word characters and adjacent punctuation runs.
        (r'([\w\d]+)([^\w\d ]+)', r'\1 \2'),
        (r'([^\w\d ]+)([\w\d]+)', r'\1 \2'),
    )

    def __init__(self, filename):
        """Load ``filename`` (TSV), build features and clean the sentences.

        Args:
            filename: path of a tab-separated file with ``Text`` and
                ``Label`` columns.
        """
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

        raw = pd.read_csv(filename, delimiter='\t', encoding='utf-8')
        raw = raw.rename(columns={'Text': 'sentence', 'Label': 'label'})
        # Whole-value mapping; any other label value is passed through as is.
        label_ids = self._LABEL_IDS
        raw['label'] = raw['label'].map(lambda value: label_ids.get(value, value))
        # Copy so the column assignments below never write into a view.
        self.df = raw[['sentence', 'label']].copy()

        # --- hand-crafted features, computed on the raw (uncleaned) text ---
        text = self.df['sentence']
        self.df['nchars'] = text.str.len()
        self.df['nwords'] = text.str.split().str.len()
        self.df['bhash'] = text.str.contains('#', regex=False).astype(int)
        self.df['nhash'] = text.str.count('#')
        self.df['blink'] = text.str.contains(self._URL_PATTERN, flags=re.IGNORECASE, regex=True).astype(int)
        self.df['nlink'] = text.str.count(self._URL_PATTERN, flags=re.IGNORECASE)
        self.df['bat'] = text.str.contains('@', regex=False).astype(int)
        self.df['nat'] = text.str.count('@')
        self.df['rt'] = text.str.contains('@rt|rt@', flags=re.IGNORECASE, regex=True).astype(int)
        self.df['dlex'] = text.apply(self.lexical_diversity)

        self.df['sentence'] = self.df['sentence'].str.lower()
        lowered = self.df['sentence']

        # Slang is matched as whole words against the lower-cased text.
        slang_pattern = r'\b(?:{})\b'.format('|'.join(word.lower() for word in self._SLANG))
        self.df['slang'] = lowered.str.contains(slang_pattern, regex=True).astype(int)

        # Emoticons consist of non-word characters only, so they must not be
        # wrapped in \b (that would require a letter/digit glued to each side).
        happy_pattern = '(?:{})'.format('|'.join(self._HAPPY_EMOJIS))
        self.df['hemojis'] = lowered.str.contains(happy_pattern, regex=True).astype(int)
        sad_pattern = '(?:{})'.format('|'.join(self._SAD_EMOJIS))
        self.df['semojis'] = lowered.str.contains(sad_pattern, regex=True).astype(int)

        self.hand_features = self.df[self._HAND_FEATURE_COLUMNS]
        self.hand_features_DF = pd.DataFrame(self.hand_features)

        # --- text normalisation for the tokenizer ---
        self.df['sentence'] = self._clean_sentences(self.df['sentence'])
        self.sentences = self.df['sentence']
        self.labels = self.df['label'].values
        self.maxlen = 512

    @classmethod
    def _clean_sentences(cls, sentences):
        """Apply ``_CLEANING_STEPS`` to a Series of strings, then lower-case and strip.

        ``regex=True`` is passed explicitly because the default of
        ``Series.str.replace`` differs between pandas versions.
        """
        cleaned = sentences
        for pattern, replacement in cls._CLEANING_STEPS:
            cleaned = cleaned.str.replace(pattern, replacement, regex=True)
        return cleaned.str.lower().str.strip()

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(token_ids, attention_mask, label, hand_features)`` for row ``idx``.

        All four values are tensors placed on ``device``. ``token_ids`` and
        ``attention_mask`` have length ``self.maxlen``.
        """
        sentence = self.df.loc[idx, 'sentence']
        label = self.df.loc[idx, 'label']

        # Convert explicitly to a float array instead of handing torch a
        # label-indexed pandas Series.
        h_values = self.hand_features_DF.loc[idx, self._HAND_FEATURE_COLUMNS].to_numpy(dtype=np.float64)
        h_features_tensor = torch.tensor(h_values).to(device)

        tokens = ['[CLS]'] + self.tokenizer.tokenize(sentence) + ['[SEP]']
        if len(tokens) < self.maxlen:
            # Padding sentences
            tokens = tokens + ['[PAD]'] * (self.maxlen - len(tokens))
        else:
            # Pruning the list to the specified max length, keeping a final [SEP]
            tokens = tokens[:self.maxlen - 1] + ['[SEP]']

        # Indices of the tokens in the BERT vocabulary
        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        tokens_ids_tensor = torch.tensor(tokens_ids).to(device)
        # Positions whose id is 0 are treated as padding and masked out.
        attn_mask_tensor = (tokens_ids_tensor != 0).float()
        label_tensor = torch.tensor(label).to(device)
        return tokens_ids_tensor, attn_mask_tensor, label_tensor, h_features_tensor

    def lexical_diversity(self, text):
        """Return unique-words / total-words for ``text`` (0.0 when it has no words)."""
        words = text.split()
        if not words:
            return 0.0
        return len(set(words)) / len(words)


train_dataset = CovidDataset("/home/joao/COVID19Tweet-master/train.tsv")
val_dataset = CovidDataset("/home/joao/COVID19Tweet-master/valid.tsv")


# Load pretrained model/tokenizer
model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')
model = model_class.from_pretrained(pretrained_weights)

# Run the model on `device` (GPU when available, CPU otherwise). An
# unconditional model.cuda() would crash on CPU-only machines.
model.to(device)
# Feature extraction only: make sure dropout is disabled.
model.eval()


batch_size = 32

# Create the DataLoaders for our training and validation sets.
# Sequential sampling keeps the extracted rows in file order.
train_dataloader = DataLoader(train_dataset, sampler=SequentialSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)


def _stack_batches(chunks):
    """Concatenate per-batch arrays into one DataFrame (empty frame if no batches)."""
    if not chunks:
        return pd.DataFrame()
    return pd.DataFrame(np.concatenate(chunks))


def _extract_features(bert_model, dataloader):
    """Run every batch through ``bert_model`` and collect the results.

    Returns a ``(labels, bert_features, hand_features)`` tuple of DataFrames
    with one row per example, in dataloader order. ``bert_features`` holds the
    hidden state of the first token ([CLS]) of each sentence, which serves as
    an embedding of the whole sentence.
    """
    label_chunks, cls_chunks, hand_chunks = [], [], []
    with torch.no_grad():
        for input_ids, input_mask, batch_labels, hand_feats in dataloader:
            outputs = bert_model(input_ids.to(device), attention_mask=input_mask.to(device))
            # outputs[0] is the last hidden state; keep only position 0 ([CLS]).
            cls_chunks.append(outputs[0][:, 0, :].cpu().numpy())
            label_chunks.append(batch_labels.cpu().numpy())
            hand_chunks.append(hand_feats.cpu().numpy())
    # Build each frame once instead of appending per batch (DataFrame.append
    # copies the whole frame every call and no longer exists in pandas 2).
    return _stack_batches(label_chunks), _stack_batches(cls_chunks), _stack_batches(hand_chunks)


train_BertdfLabels, train_BertdfFeatures, train_h_dfFeatures = _extract_features(model, train_dataloader)
val_BertdfLabels, val_BertdfFeatures, val_h_dfFeatures = _extract_features(model, val_dataloader)


In [None]:

# Model 2. Train and Test Split
# The output from BERT is going to be input to SKLEARN
#
# The data already comes as separate train/validation files, so that split is
# used directly: classifiers are fitted on the train features and evaluated on
# the validation features extracted above.

# sklearn expects 1-D label arrays, not (n, 1) DataFrames.
train_labels = train_BertdfLabels.values.ravel()
test_labels = val_BertdfLabels.values.ravel()

feature_sets = {
    ### Hand designed Features  only
    'hand': (train_h_dfFeatures, val_h_dfFeatures),
    ### Bert features only ( BertModel [CLS] token )
    'bert': (train_BertdfFeatures, val_BertdfFeatures),
    ### Combined features  BertModel [CLS] token  + Hand designed Features
    # ignore_index renumbers the columns so both blocks get unique names.
    'combined': (
        pd.concat([train_BertdfFeatures, train_h_dfFeatures], axis=1, ignore_index=True),
        pd.concat([val_BertdfFeatures, val_h_dfFeatures], axis=1, ignore_index=True),
    ),
}

# Which feature set to train on: 'hand', 'bert' or 'combined'.
FEATURE_SET = 'combined'
train_features, test_features = feature_sets[FEATURE_SET]


lr_clf = LogisticRegression()
dt_clf = DecisionTreeClassifier()
rf_clf = RandomForestClassifier()
ab_clf = AdaBoostClassifier(n_estimators=100, random_state=0)
nb_clf = GaussianNB()
nn_clf = MLPClassifier(random_state=1, max_iter=300)
svm_clf = svm.SVC(gamma=0.001, C=100.)

classifiers = {
    'LogisticRegression': lr_clf,
    'DecisionTree': dt_clf,
    'RandomForest': rf_clf,
    'AdaBoost': ab_clf,
    'GaussianNB': nb_clf,
    'MLP': nn_clf,
    'SVM': svm_clf,
}

for clf in classifiers.values():
    clf.fit(train_features, train_labels)


def _print_metrics(y_true, y_hat):
    """Print accuracy, macro F1/recall/precision, the report and the confusion matrix."""
    print('Accuracy:', accuracy_score(y_true, y_hat))
    print('F1 score:', f1_score(y_true, y_hat, average='macro'))
    print('Recall:', recall_score(y_true, y_hat, average='macro'))
    print('Precision:', precision_score(y_true, y_hat, average='macro'))
    print('\n clasification report:\n', classification_report(y_true, y_hat))
    print('\n confussion matrix:\n', confusion_matrix(y_true, y_hat))


#Evaluating Model #2
#So how well does our model do in classifying sentences? Check every
#classifier against the held-out data instead of only the last one predicted.
for clf_name, clf in classifiers.items():
    print('\n=== {} ==='.format(clf_name))
    y_pred = clf.predict(test_features)
    _print_metrics(test_labels, y_pred)



##### Grid Search for Parameters
#We can dive into Logistic regression directly with the Scikit Learn default parameters, but sometimes it's worth searching for the best value of the C parameter, which determines regularization strength.
parameters = {'C': np.linspace(0.0001, 100, 20)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(train_features, train_labels)

print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)

# Report the tuned logistic regression itself rather than the predictions left
# over from the last classifier evaluated above.
y_pred = grid_search.predict(test_features)
target_names = ['class 0', 'class 1']
print(classification_report(test_labels, y_pred, target_names=target_names))