In [None]:
#!/usr/bin/env python3
import math 
import random
import numpy as np
import pandas as pd
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import tensorflow as tf
import torch 
from torch.utils.data import DataLoader, Dataset, TensorDataset, random_split, RandomSampler,IterableDataset
from torch.utils.data.sampler import SequentialSampler
from transformers import BertTokenizer
import transformers as ppb 

import warnings
warnings.filterwarnings('ignore')

# Select the compute device once at import time: use the GPU when PyTorch can
# see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to ``random``, ``numpy.random`` and
            ``torch`` (and to every CUDA device when CUDA is available).
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


class CustomDataset(Dataset):
    """Tweet dataset that yields BERT inputs, a label and hand-crafted features.

    The constructor reads one of four supported corpora, maps its string
    labels to 1 / 0, computes a set of surface features on the raw text
    (lengths, hashtag / link / mention counts, slang flag, lexical
    diversity) and then normalises the text before it is tokenized.

    Args:
        filename: Path of the file to read. For ``'crisislext26'`` and
            ``'crisislext6'`` it may be ``None``, in which case the default
            path from ``_DEFAULT_PATHS`` is used.
        name: One of ``'crisismmd'``, ``'covid'``, ``'crisislext26'`` or
            ``'crisislext6'``.

    Raises:
        ValueError: If ``name`` is not one of the supported dataset names.
    """

    # Hand-crafted feature columns, in the order returned by __getitem__.
    _HAND_FEATURE_COLUMNS = ['nchars', 'nwords', 'bhash', 'nhash', 'blink', 'nlink', 'bat', 'nat', 'rt', 'slang', 'dlex']

    # Files read when no explicit filename is supplied.
    _DEFAULT_PATHS = {
        'crisislext26': "/home/joao/crisisLexT26.csv",
        'crisislext6': "/home/joao/crisisLexT6.csv",
    }

    # Datasets padded to a fixed length instead of their longest sentence, so
    # that separately loaded splits can be concatenated and batched together.
    _FIXED_MAXLEN = {'covid': 256, 'crisismmd': 256}

    _URL_PATTERN = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    ## List of  US slangs.
    _SLANG_TERMS = ['ASAP','BBIAB','BBL','BBS','BF','BFF','BFFL','BRB','CYA','DS','FAQ','FB','FITBLR','FLBP','FML','FTFY','FTW','FYI','G2G','GF','GR8','GTFO','HBIC','HML','HRU','HTH','IDK','IGHT','IMO','IMHO','IMY','IRL','ISTG','JK','JMHO','KTHX','L8R','LMAO','LMFAO','LMK','LOL','MWF','NM','NOOB','NP','NSFW','OOAK','OFC','OMG','ORLY','OTOH','RN','ROFL','RUH','SFW','SOML','SOZ','STFU','TFTI','TIL','TMI','TTFN','TTYL','TWSS','U','W/','WB','W/O','WYD','WTH','WTF','WYM','WYSIWYG','Y','YMMV','YW','YWA']

    # Regex fragments (already escaped) for happy / sad emoticons.
    _HAPPY_EMOJIS = [r':\)', r';\)', r'\(:']
    _SAD_EMOJIS = [r':\(', r';\(', r'\):']

    # (pattern, replacement) pairs applied in order to normalise the text.
    _CLEANING_RULES = (
        (r'http(\S)+', r''),                      # URLs
        (r'http ...', r''),                       # truncated "http ..." leftovers
        (r'(RT|rt)[ ]*@[ ]*[\S]+', r''),          # retweet markers
        (r'@[\S]+', r''),                         # user mentions
        (r'_[\S]?', r''),
        (r'[ ]{2,}', r' '),                       # collapse runs of spaces
        (r'&amp;?', r'and'),                      # HTML entities
        (r'&lt;', r'<'),
        (r'&gt;', r'>'),
        (r'([\w\d]+)([^\w\d ]+)', r'\1 \2'),      # split words from punctuation
        (r'([^\w\d ]+)([\w\d]+)', r'\1 \2'),
    )

    def __init__(self,filename,name):
        # Validate the name and read the data before loading the tokenizer.
        self.df = self._load_frame(filename, name)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

        self._add_hand_features()
        self.hand_features = self.df[self._HAND_FEATURE_COLUMNS]
        self.hand_features_DF = pd.DataFrame(self.hand_features)

        self._clean_sentences()
        self.sentences = self.df['sentence']
        self.labels = self.df['label'].values

        if name in self._FIXED_MAXLEN:
            self.maxlen = self._FIXED_MAXLEN[name]
        else:
            # Pad to the longest tokenized sentence (special tokens included).
            self.maxlen = max(
                (len(self.tokenizer.encode(sent, add_special_tokens=True)) for sent in self.sentences),
                default=0,
            )

    @classmethod
    def _load_frame(cls, filename, name):
        """Read the raw file for ``name`` and return a ``sentence``/``label`` frame."""
        if name == 'crisismmd':
            df = pd.read_csv(filename,delimiter='\t',encoding='utf-8')
            df = df[['tweet_text','label_text']]
            df = df.rename(columns={'tweet_text': 'sentence', 'label_text': 'label'})
            label_map = {'informative': 1, 'not_informative': 0}
        elif name == 'covid':
            df = pd.read_csv(filename,delimiter='\t',encoding='utf-8')
            df = df.rename(columns={'Text': 'sentence', 'Label': 'label'})
            label_map = {'INFORMATIVE': 1, 'UNINFORMATIVE': 0}
        elif name == 'crisislext26':
            path = filename if filename is not None else cls._DEFAULT_PATHS[name]
            df = pd.read_csv(path, encoding='utf-8')
            df = df.drop(['Tweet ID', ' Information Source', ' Information Type' ], axis=1)
            # The column titles in the file carry a leading space.
            df = df.rename(columns={' Tweet Text': 'sentence', ' Informativeness': 'label'})
            df = df[df.label!= 'Not related']
            df = df[df.label!= 'Not applicable']
            label_map = {'Related and informative': 1, 'Related - but not informative': 0}
        elif name == 'crisislext6':
            path = filename if filename is not None else cls._DEFAULT_PATHS[name]
            df = pd.read_csv(path, encoding='utf-8')
            df = df.rename(columns={' tweet': 'sentence', ' label': 'label'})
            label_map = {'on-topic': 1, 'off-topic': 0}
        else:
            raise ValueError("Unknown dataset name: {!r}".format(name))

        # reset_index gives a fresh frame with a 0..n-1 index, which
        # __getitem__ relies on for its label-based ``.loc`` lookups.
        df = df[['sentence','label']].reset_index(drop=True)
        df['label'] = df['label'].replace(label_map)
        return df

    def _add_hand_features(self):
        """Add the hand-crafted feature columns to ``self.df`` and lower-case the text."""
        df = self.df
        raw = df['sentence']
        df['nchars'] = raw.str.len()
        df['nwords'] = raw.str.split().str.len()
        df['bhash'] = raw.str.contains(pat = '#',flags=re.IGNORECASE, regex = True).astype(int)
        df['nhash'] = raw.str.count('#')
        df['blink'] = raw.str.contains(pat = self._URL_PATTERN, flags=re.IGNORECASE, regex = True).astype(int)
        df['nlink'] = raw.str.count(pat = self._URL_PATTERN, flags=re.IGNORECASE)
        df['bat'] = raw.str.contains(pat = '@',flags=re.IGNORECASE, regex = True).astype(int)
        df['nat'] = raw.str.count(pat = '@')
        df['rt'] = raw.str.contains(pat = '@rt|rt@',flags=re.IGNORECASE, regex = True).astype(int)
        df['dlex'] = raw.apply(self.lexical_diversity)

        # Slang and emoticon checks run on the lower-cased text.
        df['sentence'] = raw.str.lower()
        lowered = df['sentence']

        slang_pattern = r'\b(?:{})\b'.format('|'.join(term.lower() for term in self._SLANG_TERMS))
        df['slang'] = lowered.str.contains(slang_pattern).astype(int)

        # Emoticons consist of non-word characters, so they must not be wrapped
        # in \b word boundaries (that would only match them between letters).
        happy_pattern = r'(?:{})'.format('|'.join(self._HAPPY_EMOJIS))
        df['hemojis'] = lowered.str.contains(happy_pattern, regex = True).astype(int)
        sad_pattern = r'(?:{})'.format('|'.join(self._SAD_EMOJIS))
        df['semojis'] = lowered.str.contains(sad_pattern, regex = True).astype(int)

    def _clean_sentences(self):
        """Apply ``_CLEANING_RULES`` to ``self.df['sentence']``, then lower-case and strip."""
        cleaned = self.df['sentence']
        for pattern, replacement in self._CLEANING_RULES:
            # regex=True is explicit: newer pandas defaults to literal matching.
            cleaned = cleaned.str.replace(pattern, replacement, regex=True)
        self.df['sentence'] = cleaned.str.lower().str.strip()

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(token_ids, attention_mask, label, hand_features)`` tensors for row ``idx``.

        All four tensors are moved to the module-level ``device``.
        """
        sentence = self.df.loc[idx, 'sentence']
        label = self.df.loc[idx, 'label']
        h_features = self.hand_features_DF.loc[idx, self._HAND_FEATURE_COLUMNS]
        # Convert the row explicitly instead of letting torch index the
        # string-labelled Series positionally.
        h_features_tensor = torch.tensor(np.asarray(h_features, dtype=np.float64)).to(device)
        tokens = self.tokenizer.tokenize(sentence)
        # NOTE(review): ``pad_to_max_length`` is deprecated in recent transformers
        # releases in favour of ``padding='max_length'`` -- confirm the installed version.
        encoded_dict = self.tokenizer.encode_plus(tokens, add_special_tokens = True, max_length = self.maxlen, pad_to_max_length = True,return_attention_mask = True)
        tokens_ids_tensor = torch.tensor(encoded_dict['input_ids']).to(device)
        attn_mask_tensor = torch.tensor(encoded_dict['attention_mask']).to(device)
        label_tensor = torch.tensor(label).to(device)
        return tokens_ids_tensor, attn_mask_tensor, label_tensor,h_features_tensor

    def lexical_diversity(self,text):
        """Return the ratio of distinct to total whitespace-separated tokens.

        Returns ``0.0`` for empty or whitespace-only text instead of dividing
        by zero.
        """
        tokens = text.split()
        if not tokens:
            return 0.0
        return len(set(tokens)) / len(tokens)

 

def _load_dataset(data):
    """Build the full dataset for ``data`` (all available splits concatenated)."""
    if data == 'crisismmd':
        splits = [
            CustomDataset("/home/joao/task_informative_text_img_train.tsv","crisismmd"),
            CustomDataset("/home/joao/task_informative_text_img_dev.tsv","crisismmd"),
            CustomDataset("/home/joao/task_informative_text_img_test.tsv","crisismmd"),
        ]
        return torch.utils.data.ConcatDataset(splits)
    if data == 'covid':
        splits = [
            CustomDataset("/home/joao/COVID19Tweet-master/train.tsv","covid"),
            CustomDataset("/home/joao/COVID19Tweet-master/valid.tsv","covid"),
        ]
        return torch.utils.data.ConcatDataset(splits)
    if data in ('crisislext6', 'crisislext26'):
        return CustomDataset(None, data)
    raise ValueError("Unknown dataset name: {!r}".format(data))


def _extract_features(model, dataloader):
    """Run ``model`` over ``dataloader`` and collect features for scikit-learn.

    Returns:
        A tuple ``(bert_features, labels, hand_features)`` where
        ``bert_features`` and ``hand_features`` are DataFrames with one row per
        sample and ``labels`` is a 1-D NumPy array.
    """
    bert_chunks, label_chunks, hand_chunks = [], [], []
    with torch.no_grad():
        for batch in dataloader:
            b_input_ids, b_input_mask, b_labels, b_h_features = tuple(t.to(device) for t in batch)
            last_hidden_states = model(b_input_ids,attention_mask = b_input_mask)
            # Keep only the output of the first token of each sentence ([CLS]),
            # which serves as an embedding of the whole sentence.
            bert_chunks.append(last_hidden_states[0][:,0,:].cpu().numpy())
            label_chunks.append(b_labels.cpu().numpy())
            hand_chunks.append(b_h_features.cpu().numpy())
    # Concatenate once at the end instead of growing DataFrames batch by batch.
    return (
        pd.DataFrame(np.concatenate(bert_chunks)),
        np.concatenate(label_chunks).ravel(),
        pd.DataFrame(np.concatenate(hand_chunks)),
    )


def _build_classifiers():
    """Return fresh ``(row label, estimator)`` pairs in table order."""
    return [
        ("Logistic Regression", LogisticRegression()),
        ("Decision Tree", DecisionTreeClassifier()),
        ("Random Forest", RandomForestClassifier()),
        ("Adaboost", AdaBoostClassifier(n_estimators=100, random_state=0)),
        ("NaiveBayes", GaussianNB()),
        ("MLP", MLPClassifier(random_state=1, max_iter=300)),
        ("SVM", svm.SVC(gamma=0.001, C=100.)),
    ]


def _evaluate_feature_set(title, X, y):
    """Print ``title`` and a 10-fold cross-validation table for every classifier."""
    scoring = ['accuracy','precision_macro', 'recall_macro', 'f1_macro']
    print(title)
    results = [
        (label, cross_validate(clf, X, y, cv=10, scoring=scoring, return_train_score=False))
        for label, clf in _build_classifiers()
    ]
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("||    Classifiers     |   Accuracy  |  Precision  |  Recall  |  F-score  |")
    print("||+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++|")
    for label, cv_scores in results:
        # One positional index per metric so each column shows its own value.
        print("||{0:<20}|   {1:.3f}   |   {2:.3f}   |  {3:.3f} |   {4:.3f} |".format(
            label,
            cv_scores['test_accuracy'].mean()*100.0,
            cv_scores['test_precision_macro'].mean()*100.0,
            cv_scores['test_recall_macro'].mean()*100.0,
            cv_scores['test_f1_macro'].mean()*100.0,
        ))
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print()


def main():
    """Compare classifiers on BERT, hand-crafted and combined features.

    For every dataset, the pretrained ``bert-base-uncased`` model is used as a
    frozen feature extractor; seven scikit-learn classifiers are then scored
    with 10-fold cross-validation on (1) the BERT features, (2) the
    hand-crafted features and (3) both concatenated. Results are printed as
    tables of mean accuracy / precision / recall / F-score in percent.
    """
    set_seed(42)

    dataset_names = ['covid', 'crisislext6', 'crisislext26', 'crisismmd']
    batch_size = 64

    # Load the pretrained model once; it is never fine-tuned, so it can be
    # shared by all datasets. ``.to(device)`` also works on CPU-only hosts.
    model = ppb.BertModel.from_pretrained('bert-base-uncased')
    model.to(device)
    model.eval()

    for data in dataset_names:
        print("=== {} ===".format(data))
        dataset = _load_dataset(data)
        dataloader = DataLoader(dataset, sampler = RandomSampler(dataset),batch_size = batch_size)

        # The output from BERT is going to be the input to scikit-learn.
        bert_features, labels, hand_features = _extract_features(model, dataloader)

        _evaluate_feature_set("   Firstly evaluate for BERT  ONLY encoded Features ", bert_features, labels)
        _evaluate_feature_set(" Secondly evaluate for HAND CRAFTED  ONLY encoded Features   ", hand_features, labels)
        combined = pd.concat([bert_features, hand_features],axis=1, ignore_index=True)
        _evaluate_feature_set(" Thirdly evaluate for BERT + HAND CRAFTED   encoded Features ", combined, labels)
    
# Run the experiments only when this code is executed directly (script or
# notebook cell), not when the module is imported.
if __name__ == "__main__":
    main()

In [None]:


# NOTE(review): this cell appears to rely on names left over from an earlier
# notebook session -- `dfFeatures`, `dfLabels`, `features` and already-fitted
# classifiers `lr_clf` ... `svm_clf` are not defined at module level in the
# visible cells. Confirm where they come from before running it.

### Bert features only ( BertModel [CLS] token )
# This split is overwritten by the combined-feature split just below.
train_features, test_features, train_labels, test_labels = train_test_split(dfFeatures, dfLabels,test_size=0.33, random_state=42)


### Combined features  BertModel [CLS] token  + Hand designed Features
combine_df = pd.concat([dfFeatures, features],axis=1, ignore_index=True)
train_features, test_features, train_labels, test_labels = train_test_split(combine_df, dfLabels,test_size=0.33, random_state=42)




#Evaluating Model #2
#So how well does our model do in classifying sentences? One way is to check the accuracy against the testing dataset:

# One positional index per classifier so each column shows its own score.
print("++++++++++++++ SCORES +++++++++++++++++++++++++++++++++++++++++++++++++++")
print("|| {0:.3f} | {1:.3f} | {2:.3f} | {3:.3f} | {4:.3f} | {5:.3f} | {6:.3f} ||".format(lr_clf.score(test_features, test_labels),dt_clf.score(test_features, test_labels),rf_clf.score(test_features, test_labels),ab_clf.score(test_features, test_labels),nb_clf.score(test_features, test_labels),nn_clf.score(test_features, test_labels),svm_clf.score(test_features, test_labels)))
print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

y_pred_lr = lr_clf.predict(test_features)
y_pred_dt = dt_clf.predict(test_features)
y_pred_rf = rf_clf.predict(test_features)
y_pred_ab = ab_clf.predict(test_features)
y_pred_nb = nb_clf.predict(test_features)
y_pred_nn = nn_clf.predict(test_features)
y_pred_svm = svm_clf.predict(test_features)


# Each row reports accuracy, macro precision, macro recall and macro F-score.
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
print("||    Classifiers     |   Accuracy  |  Precision  |  Recall  |  F-score  |")
print("||+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++|")
print("||Logistic Regression |   {0:.3f}   |   {1:.3f}   |  {2:.3f} |   {3:.3f} |".format(accuracy_score(test_labels, y_pred_lr),precision_score(test_labels, y_pred_lr, average='macro'),recall_score(test_labels, y_pred_lr, average='macro'),f1_score(test_labels, y_pred_lr, average='macro')))
print("||Decision Tree       |   {0:.3f}   |   {1:.3f}   |  {2:.3f} |   {3:.3f} |".format(accuracy_score(test_labels, y_pred_dt),precision_score(test_labels, y_pred_dt, average='macro'),recall_score(test_labels, y_pred_dt, average='macro'),f1_score(test_labels, y_pred_dt, average='macro')))
print("||Random Forest       |   {0:.3f}   |   {1:.3f}   |  {2:.3f} |   {3:.3f} |".format(accuracy_score(test_labels, y_pred_rf),precision_score(test_labels, y_pred_rf, average='macro'),recall_score(test_labels, y_pred_rf, average='macro'),f1_score(test_labels, y_pred_rf, average='macro')))
print("||Adaboost            |   {0:.3f}   |   {1:.3f}   |  {2:.3f} |   {3:.3f} |".format(accuracy_score(test_labels, y_pred_ab),precision_score(test_labels, y_pred_ab, average='macro'),recall_score(test_labels, y_pred_ab, average='macro'),f1_score(test_labels, y_pred_ab, average='macro')))
print("||NaiveBayes          |   {0:.3f}   |   {1:.3f}   |  {2:.3f} |   {3:.3f} |".format(accuracy_score(test_labels, y_pred_nb),precision_score(test_labels, y_pred_nb, average='macro'),recall_score(test_labels, y_pred_nb, average='macro'),f1_score(test_labels, y_pred_nb, average='macro')))
print("||MLP                 |   {0:.3f}   |   {1:.3f}   |  {2:.3f} |   {3:.3f} |".format(accuracy_score(test_labels, y_pred_nn),precision_score(test_labels, y_pred_nn, average='macro'),recall_score(test_labels, y_pred_nn, average='macro'),f1_score(test_labels, y_pred_nn, average='macro')))
print("||SVM                 |   {0:.3f}   |   {1:.3f}   |  {2:.3f} |   {3:.3f} |".format(accuracy_score(test_labels, y_pred_svm),precision_score(test_labels, y_pred_svm, average='macro'),recall_score(test_labels, y_pred_svm, average='macro'),f1_score(test_labels, y_pred_svm, average='macro')))
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
      
   
#print('\n clasification report:\n', classification_report(test_labels,y_pred))
#print('\n confussion matrix:\n',confusion_matrix(test_labels, y_pred))



##### Grid Search for Parameters
#We can dive into Logistic regression directly with the Scikit Learn default parameters, but sometimes it's worth searching for the best value of the C parameter, which determines regularization strength.
#parameters = {'penalty': ['l1', 'l2'],'C':[0.001,0.009,0.01,0.09,1,3,5,10,25,50,100,200]}

#grid_search = GridSearchCV(LogisticRegression(),param_grid = parameters,scoring = )
#grid_search.fit(train_features, train_labels)
#print('best parameters: ', grid_search.best_params_)
#print('best scrores: ', grid_search.best_score_)
#grid_clf_acc = GridSearchCV(clf, param_grid = grid_values,scoring = 'recall')




#target_names = ['class 0', 'class 1']
#print(classification_report(test_labels, y_pred, target_names=target_names))

In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

# Load the scikit-learn digits dataset and flatten each image into one
# feature row (one row per sample, all remaining dimensions merged).
digits = datasets.load_digits()
n_samples = len(digits.images)
X = digits.images.reshape(n_samples, -1)
y = digits.target

# Hold out 30% of the samples for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Search space explored by cross-validation: the same candidate values are
# tried for `gamma` with the RBF kernel and for `C` with the linear kernel.
_search_values = [0.0009, 0.001, 0.002, 0.004, 0.008, 0.01, 0.09, 1, 10, 100, 1000]
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': list(_search_values)},
    {'kernel': ['linear'], 'C': list(_search_values)},
]

# Metric families to optimise; the macro-averaged variant of each is used.
scores = ['precision', 'recall', 'f1']


# Unfitted logistic-regression estimator; it is not used by the SVC search
# below.
lr_clf = LogisticRegression()

# Run one SVC grid search per metric in `scores`, selecting the best
# parameters by the macro-averaged version of that metric. `clf` is rebound
# on every iteration, so after the loop it refers to the last search only.
for score in scores:
    print("# Tuning hyper-parameters for {}".format(score))
    clf = GridSearchCV(SVC(), param_grid=tuned_parameters, scoring=score + '_macro')
    clf.fit(X_train, y_train)
    print("Best parameters set found on development set:")
    print(clf.best_params_)
    #print("Grid scores on development set:")
    #means = clf.cv_results_['mean_test_score']
    #stds = clf.cv_results_['std_test_score']
    #for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    #    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    #print()
    #print("Detailed classification report:")
    #print()
    #print("The model is trained on the full development set.")
    #print("The scores are computed on the full evaluation set.")
    #print()
    #y_true, y_pred = y_test, clf.predict(X_test)
    #print(classification_report(y_true, y_pred))
    
#print()

#tuned_parameters = {'penalty': ['l1', 'l2'],'C':[0.0009,0.001, 0.002,  0.004, 0.008, 0.01,0.09, 1, 10, 100, 1000]}
#for score in scores:
#    print("# Tuning hyper-parameters for %s" % score)
#    clf  = GridSearchCV(LogisticRegression(), tuned_parameters, scoring='%s_macro' % score)
#    clf.fit(X_train, y_train)
#    print("Best parameters set found on development set:")
#    print(clf.best_params_)
    #grid_values = {'penalty': ['l1', 'l2'],'C':[0.001,.009,0.01,.09,1,5,10,25]}
    #grid_clf_acc = GridSearchCV(clf, param_grid = grid_values,scoring = 'recall')
    #grid_clf_acc.fit(X_train, y_train)

# Predict the held-out split with `clf`, i.e. whichever grid search was
# fitted last before this point.
y_pred = clf.predict(X_test)

from sklearn import metrics

# Derive the class labels from the data instead of hard-coding a binary
# [0, 1]: the targets here can contain more than two classes. The sorted
# union of true and predicted labels is also the order confusion_matrix uses
# by default, so passing it explicitly keeps rows/columns and tick labels in sync.
class_names = np.unique(np.concatenate([y_test, y_pred]))
cnf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=class_names)

fig, ax = plt.subplots()
# create heatmap; the labelled DataFrame gives one correctly named tick per
# class on both axes (rows = actual label, columns = predicted label)
sns.heatmap(
    pd.DataFrame(cnf_matrix, index=class_names, columns=class_names),
    annot=True,
    cmap="YlGnBu",
    fmt='g',
    ax=ax,
)
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.model_selection import cross_validate

# Baseline logistic regression fitted on the training split. Per the
# scikit-learn docs, cross_validate works on clones of the estimator, so this
# fit does not influence the cross-validation scores; it only leaves `lr_clf`
# fitted for later use.
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)

# Metrics collected for every fold.
scoring = ['accuracy','precision_macro', 'recall_macro', 'f1_macro']

# 10-fold cross-validation on the training split; the result is a dict whose
# per-fold test scores are stored under 'test_<metric>'.
scores = cross_validate(lr_clf, X_train, y_train, cv=10, scoring=scoring, return_train_score=False)
#scores['train_precision_macro'].mean()

print()
print(scores)
# Report every metric as "mean (+/- 2*std)" in percent. Mean and spread are
# both computed from the scaled fold scores so they share the same unit
# (previously only the mean was multiplied by 100, and only accuracy was shown).
for metric in scoring:
    fold_pct = scores['test_' + metric] * 100.0
    print("%s: %0.2f (+/- %0.2f)" % (metric, fold_pct.mean(), fold_pct.std() * 2))


In [None]:
#y_pred_proba = clf.predict_proba(X_test)[::,1]
#fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
#auc = metrics.roc_auc_score(y_test, y_pred_proba)
#plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
#plt.legend(loc=4)
#plt.show()