In [1]:
import re
import os
import emot 
import json
import nltk
import spacy
import gensim
import unicodedata
import pandas as pd
from bs4 import BeautifulSoup
from string import punctuation
from num2words import num2words
from nltk.corpus import stopwords
pd.set_option('display.max_colwidth', None)

In [2]:
class NoiseClean:
    def __init__(self):
        """Load the spaCy pipeline and every lookup table used by the cleaners.

        Reads ``short_forms.txt`` (one ``ABBREVIATION=expansion`` entry per
        line), ``contractions.json``, ``emoticons.json`` and
        ``emoji_word.json`` from the current working directory, and merges the
        NLTK, spaCy and gensim English stop-word lists into
        ``self.all_stopwords``.
        """
        self.nlp = spacy.load("en_core_web_sm")
        # Same pipeline exposed under its second historical name; loading the
        # model a second time only costs start-up time and memory.
        self.sp = self.nlp

        # Chat abbreviations: the file is closed as soon as it has been read.
        with open('short_forms.txt', 'r') as short_form_file:
            chat_words_str = short_form_file.read()
        self.chat_words_map_dict = {}
        for line in chat_words_str.split("\n"):
            if line == "":
                continue
            parts = line.split("=")
            if len(parts) < 2:
                # Malformed entry (no '='): report it and skip it instead of
                # pairing the abbreviation with the previous line's expansion.
                print(line)
                continue
            self.chat_words_map_dict[parts[0]] = parts[1]
        self.chat_words_list = set(self.chat_words_map_dict)

        with open("contractions.json",'r') as outfile:
            contractions_json_data = outfile.read()
        self.contraction_mapping = json.loads(contractions_json_data)

        # NOTE(review): eval() executes whatever these two files contain. It is
        # kept because the files may hold Python literals that json.loads would
        # reject -- only ever point this at trusted files.
        with open("emoticons.json",'r',encoding='utf-8') as outfile:
            emoticons_json_data = outfile.read()
        self.EMOTICONS = eval(emoticons_json_data)
        with open("emoji_word.json",'r',encoding='utf-8') as outfile:
            emoji_word_json_data = outfile.read()
        self.EMO_UNICODE = eval(emoji_word_json_data)
        # Reverse lookup: emoji character(s) -> descriptive name.
        self.UNICODE_EMO = {v: k for k, v in self.EMO_UNICODE.items()}
        self.emot_obj = emot.core.emot()

        # Union of the three stop-word sources, de-duplicated.
        combined_stopwords = set(stopwords.words('english'))
        combined_stopwords.update(self.sp.Defaults.stop_words)
        combined_stopwords.update(gensim.parsing.preprocessing.STOPWORDS)
        self.all_stopwords = list(combined_stopwords)
    
    def get_tokens(self,text):
        text_tokenized = []
        doc = self.nlp(text)
        word_tokenized_list = [token.text for token in doc]
        return word_tokenized_list
        
    def convert_to_lower_case(self,text):
        text_lower = text.lower()
        return text_lower

    def remove_html_tags_regex(self,text):
        html_pattern = r'<.*?>'
        text_without_html = re.sub(pattern=html_pattern, repl=' ', string=text)
        return text_without_html

    def remove_html_tags_beautifulsoup(self,text):
        """Parse *text* as HTML and return its text content, space separated."""
        soup = BeautifulSoup(text, "html.parser")
        return soup.get_text(separator = " ")

    def remove_urls_regex(self,text):
        url_pattern = r'https?://\S+|www\.\S+'
        text_without_urls = re.sub(pattern=url_pattern, repl=' ', string=text)
        return text_without_urls

    def remove_numbers_regex(self,text):
        number_pattern = r'\d+'
        text_without_number = re.sub(pattern=number_pattern, repl=" ", string=text)
        return text_without_number

    def convert_numbers_to_words(self,text):
        text_tokenized = self.get_tokens(text)
        for index in range(len(text_tokenized)):
            if text_tokenized[index].isdigit():
                text_tokenized[index] = num2words(text_tokenized[index])
        numbers_to_words = ' '.join(text_tokenized)
        return numbers_to_words


    def convert_accented_to_ascii(self,text):
        unaccented_text = str(unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore'))
        if unaccented_text == None:
            return ""
        else:
            return unaccented_text

    def convert_chat_abbreviations(self,text):
        text_tokenized = self.get_tokens(text)
        new_text = []
        for w in text_tokenized:
            if w.upper() in self.chat_words_list:
                new_text.append(self.chat_words_map_dict[w.upper()])
            else:
                new_text.append(w)
        expanded_text = " ".join(new_text)
        return expanded_text

    def expand_contractions(self,text):
        contractions_pattern = re.compile('({})'.format('|'.join(self.contraction_mapping.keys())), flags=re.IGNORECASE|re.DOTALL)
        def expand_match(contraction):
            match = contraction.group(0)
            first_char = match[0]
            expanded_contraction = self.contraction_mapping.get(match)                                    if self.contraction_mapping.get(match)                                    else self.contraction_mapping.get(match.lower())                       
            expanded_contraction = first_char+expanded_contraction[1:]
            if expanded_contraction == None:
                return ""
            else:
                return expanded_contraction
        expanded_text = contractions_pattern.sub(expand_match, text)
        expanded_text = re.sub("'", "", expanded_text)
        return expanded_text

    def remove_emoji_regex(self,text):
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  
                                   u"\U0001F300-\U0001F5FF"  
                                   u"\U0001F680-\U0001F6FF"  
                                   u"\U0001F1E0-\U0001F1FF"  
                                   u"\U00002500-\U00002BEF"  
                                   u"\U00002702-\U000027B0"
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   u"\U0001f926-\U0001f937"
                                   u"\U00010000-\U0010ffff"
                                   u"\u2640-\u2642"
                                   u"\u2600-\u2B55"
                                   u"\u200d"
                                   u"\u23cf"
                                   u"\u23e9"
                                   u"\u231a"
                                   u"\ufe0f" 
                                   u"\u3030"
                                   "]+", flags=re.UNICODE)

        text_without_emoji = emoji_pattern.sub(r'',text)
        return text_without_emoji

    def remove_emoji_emot(self,text):
        for val in self.emot_obj.emoji(text)['value']:
            text = text.replace(val,'')
        return text

    def remove_emoticons_regex(self,text):
        emoticon_pattern = re.compile(u'(' + u'|'.join(k for k in self.EMOTICONS) + u')')
        text_without_emoticons = emoticon_pattern.sub(r'',text)
        return text_without_emoticons

    def remove_emoticons_emot(self,text):
        for val in self.emot_obj.emoticons(text)['value']:
            text = text.replace(val,'')
        return text

    def convert_emoji_to_word(self,text):
        for emot in self.UNICODE_EMO:
            emoji_pattern = r'('+emot+')'
            emoji_words = self.UNICODE_EMO[emot]
            replace_text = emoji_words.replace(",","")
            replace_text = replace_text.replace(":","")
            replace_text_list = replace_text.split()
            emoji_name = '_'.join(replace_text_list)
            text = re.sub(emoji_pattern, emoji_name, text)
        return text

    def convert_emoticons_to_word(self,text):
        for emot in self.EMOTICONS:
            emoticon_pattern = r'('+emot+')'
            emoticon_words = self.EMOTICONS[emot]
            replace_text = emoticon_words.replace(",","")
            replace_text = replace_text.replace(":","")
            replace_text_list = replace_text.split()
            emoticon_name = '_'.join(replace_text_list)
            text = re.sub(emoticon_pattern, emoticon_name, text)
        return text

    def remove_punctuations(self,text):
        text_without_punctuations = text.translate(str.maketrans('', '', punctuation))
        return text_without_punctuations

    def remove_stopwords(self,text):
        text_without_sw = []
        text_tokenized = self.get_tokens(text)
        for word in text_tokenized:
            if word not in self.all_stopwords:
                text_without_sw.append(word)
        text_without_stopwords = ' '.join(text_without_sw)
        return text_without_stopwords

    def remove_extra_whitespaces(self,text):
        space_pattern = r'\s+'
        text_without_space = re.sub(pattern=space_pattern, repl=" ", string=text)
        return text_without_space
    

In [315]:
import os
import warnings
warnings.filterwarnings("ignore")
import keras
from tqdm import tqdm
tqdm.pandas()
import xgboost as xgb
import numpy as np
import tensorflow as tf
from attention import Attention
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import TextVectorization
from sklearn import model_selection, naive_bayes, svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support

In [316]:
class Classification:
    def __init__(self,dataset_path):
        """Remember *dataset_path* and build the text cleaner.

        Args:
            dataset_path: location of the CSV later read by ``load_data``.
        """
        self.dataset_path = dataset_path
        cleaner = NoiseClean()
        self.noise_clean_ob = cleaner
            
    def load_data(self):
        self.df = pd.read_csv(self.dataset_path)
        
    def clean_text(self,text):
        text = self.noise_clean_ob.convert_accented_to_ascii(text)
        text = self.noise_clean_ob.remove_urls_regex(text)
        text = self.noise_clean_ob.remove_html_tags_regex(text)
        text = self.noise_clean_ob.remove_html_tags_beautifulsoup(text)
        text = self.noise_clean_ob.convert_chat_abbreviations(text)
        text = self.noise_clean_ob.expand_contractions(text)
        text = self.noise_clean_ob.remove_emoji_emot(text)
        text = self.noise_clean_ob.remove_emoji_regex(text)
        text = self.noise_clean_ob.remove_emoticons_emot(text)
        text = self.noise_clean_ob.remove_emoticons_regex(text)
        #text = self.noise_clean_ob.convert_numbers_to_words(text)
        #text = self.noise_clean_ob.remove_punctuations(text)
        text = self.noise_clean_ob.remove_extra_whitespaces(text)
        return text
    
    def clean_data(self,column_name):
        """Clean every row's *column_name* text into ``cleaned_tweet_text``.

        Args:
            column_name: name of the ``self.df`` column holding the raw text.
        """
        print("\nCleaning Data ---->")

        def _clean_row(row):
            return self.clean_text(row[column_name])

        # progress_apply is the tqdm-instrumented variant of DataFrame.apply.
        self.df['cleaned_tweet_text'] = self.df.progress_apply(_clean_row, axis=1)
        
    def get_class_distribution(self):
        print("\nClass Distribution of Dataset ----> {0}".format(self.dataset_path))
        self.class_distribution_df = self.df.groupby(['label_text']).count().drop(columns=self.df.columns.tolist()[:len(self.df.columns.tolist())-2]).reset_index()
        self.class_distribution_df.rename(columns={self.class_distribution_df.columns[1]:'count'},inplace=True)
        self.class_distribution_df.plot.bar(x='label_text',y='count',rot=0,figsize=(20,10))

    def data_split(self):
        """Encode the labels and split ``self.df`` into train/test/validation.

        80% of the rows go to training (stratified on the encoded label); the
        remaining 20% is halved into test and validation. The text and label
        columns of each part are exposed as the lists ``X_*`` / ``y_*``.
        """
        print("\nSplitting Data into Train, Test and Validation Sample ----> ")
        encoder = LabelEncoder()
        self.label_encoder_target = encoder
        self.df['label_text_transformed'] = encoder.fit_transform(self.df['label_text'])
        # Encoded id -> original label text.
        encoded_ids = encoder.transform(encoder.classes_)
        self.label_encoder_mappings_dict = dict(zip(encoded_ids, encoder.classes_))

        self.trainD, holdout = train_test_split(self.df, test_size=0.2, random_state=100, stratify=self.df["label_text_transformed"])
        self.testD, self.valD = train_test_split(holdout, test_size=0.5, random_state=100)
        print("Training Sample Shape ----> ", self.trainD.shape)
        print("Testing Sample Shape ----> ", self.testD.shape)
        print("Validation Sample Shape ----> ", self.valD.shape)

        text_column, label_column = "cleaned_tweet_text", "label_text_transformed"
        self.X_train = self.trainD[text_column].tolist()
        self.y_train = self.trainD[label_column].tolist()
        self.X_test = self.testD[text_column].tolist()
        self.y_test = self.testD[label_column].tolist()
        self.X_val = self.valD[text_column].tolist()
        self.y_val = self.valD[label_column].tolist()

    def convert_data_into_tensorflow_format(self):
        """Wrap the three splits as batched, prefetched ``tf.data`` datasets.

        Only the training dataset is shuffled.
        """
        print("\nConverting Data into Tensorflow Format ----> ")
        BUFFER_SIZE = 8000
        BATCH_SIZE = 32

        def _slices(texts, labels):
            return tf.data.Dataset.from_tensor_slices((texts, labels))

        train_slices = _slices(self.X_train, self.y_train)
        test_slices = _slices(self.X_test, self.y_test)
        val_slices = _slices(self.X_val, self.y_val)
        self.train_dataset = train_slices.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
        self.test_dataset = test_slices.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
        self.val_dataset = val_slices.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

    def vectorize_tweets(self,ngram):
        """Build TF-IDF features for the train and test texts.

        Args:
            ngram: ``(min_n, max_n)`` tuple passed to ``TfidfVectorizer`` as
                ``ngram_range``.

        Stores the fitted vectorizer in ``self.vectorizer_model`` and the
        feature matrices in ``self.X_train_idf`` / ``self.X_test_idf``.
        """
        print("\nVectorizing Tweet using {0},{1} ngram ----> ".format(ngram[0],ngram[1]))
        self.vectorizer_model = TfidfVectorizer(ngram_range=ngram)
        # Fit on the training texts only: learning the vocabulary and IDF
        # weights from the whole frame would leak test/validation data into
        # the features the models are evaluated on.
        self.X_train_idf = self.vectorizer_model.fit_transform(self.X_train)
        self.X_test_idf = self.vectorizer_model.transform(self.X_test)
    
    def get_precision_recall_f1score(self,y_true,y_predicted,average_across_all_classes=True):
        """Compute precision, recall and F1 and store them in ``self.average``.

        Args:
            y_true: true encoded labels.
            y_predicted: predicted encoded labels.
            average_across_all_classes: when true, ``self.average`` holds the
                mean of each metric over the classes; otherwise it holds the
                per-class values as a list of three lists (precision, recall,
                F1).
        """
        class_ids = self.label_encoder_target.transform(self.label_encoder_target.classes_)
        # Ask for every known class explicitly: otherwise only the classes
        # present in y_true/y_predicted are scored and the column labels below
        # no longer match the number of scores.
        scores = precision_recall_fscore_support(y_true, y_predicted, labels=class_ids)
        prf1_df = pd.DataFrame(scores,columns=class_ids)
        # The last row is the support (sample count per class), not a metric.
        prf1_df = prf1_df.iloc[:-1]
        if average_across_all_classes:
            self.average = prf1_df.mean(axis=1)
        else:
            self.average = prf1_df.values.tolist()
    
    def train_naive_bayes(self):
        """Fit Multinomial Naive Bayes on the TF-IDF features and report test scores."""
        print("\nTraining Multinomial Naive Bayes Algorithm ----> ")
        classifier = naive_bayes.MultinomialNB()
        classifier.fit(self.X_train_idf,self.y_train)
        test_predictions = classifier.predict(self.X_test_idf)
        self.get_precision_recall_f1score(self.y_test,test_predictions)
        accuracy_pct = accuracy_score(test_predictions, self.y_test)*100
        print("\nNaive Bayes Accuracy ----> ",accuracy_pct)

    def train_support_vector_machines(self):
        """Fit an SVC on the TF-IDF features and report test scores."""
        print("\nTraining Support Vector Machine Algorithm ----> ")
        classifier = svm.SVC(random_state=29)
        classifier.fit(self.X_train_idf,self.y_train)
        test_predictions = classifier.predict(self.X_test_idf)
        self.get_precision_recall_f1score(self.y_test,test_predictions)
        accuracy_pct = accuracy_score(test_predictions, self.y_test)*100
        print("\nSVM Accuracy ----> ",accuracy_pct)

    def train_random_forest(self):
        """Fit a random forest on the TF-IDF features and report test scores."""
        print("\nTraining Random Forest Algorithm ----> ")
        classifier = RandomForestClassifier(random_state=29)
        classifier.fit(self.X_train_idf,self.y_train)
        test_predictions = classifier.predict(self.X_test_idf)
        self.get_precision_recall_f1score(self.y_test,test_predictions)
        accuracy_pct = accuracy_score(test_predictions, self.y_test)*100
        print("\nRandom Forest Accuracy ----> ",accuracy_pct)

    def train_decision_tree(self):
        """Fit a decision tree on the TF-IDF features and report test scores."""
        print("\nTraining Decision Tree Algorithm ----> ")
        classifier = DecisionTreeClassifier(random_state=29)
        classifier.fit(self.X_train_idf,self.y_train)
        test_predictions = classifier.predict(self.X_test_idf)
        self.get_precision_recall_f1score(self.y_test,test_predictions)
        accuracy_pct = accuracy_score(test_predictions, self.y_test)*100
        print("\nDecision Tree Accuracy ----> ",accuracy_pct)

    def train_logistic_regression(self):
        """Fit logistic regression on the TF-IDF features and report test scores."""
        print("\nTraining Logistic Regression Algorithm ----> ")
        classifier = LogisticRegression(random_state=29)
        classifier.fit(self.X_train_idf,self.y_train)
        test_predictions = classifier.predict(self.X_test_idf)
        self.get_precision_recall_f1score(self.y_test,test_predictions)
        accuracy_pct = accuracy_score(test_predictions, self.y_test)*100
        print("\nLogistic Regression Accuracy ----> ",accuracy_pct)

    def train_knn(self):
        """Fit k-nearest neighbours on the TF-IDF features and report test scores."""
        print("\nTraining K Nearest Neighbor Algorithm ----> ")
        classifier = KNeighborsClassifier()
        classifier.fit(self.X_train_idf,self.y_train)
        test_predictions = classifier.predict(self.X_test_idf)
        self.get_precision_recall_f1score(self.y_test,test_predictions)
        accuracy_pct = accuracy_score(test_predictions, self.y_test)*100
        print("\nK Nearest Neighbor Accuracy ----> ",accuracy_pct)

    def train_gradient_boosting(self):
        """Fit gradient boosting on the TF-IDF features and report test scores."""
        print("\nTraining Gradient Boosting Algorithm ----> ")
        classifier = GradientBoostingClassifier(random_state=29)
        classifier.fit(self.X_train_idf,self.y_train)
        test_predictions = classifier.predict(self.X_test_idf)
        self.get_precision_recall_f1score(self.y_test,test_predictions)
        accuracy_pct = accuracy_score(test_predictions, self.y_test)*100
        print("\nGradient Boosting Accuracy ----> ",accuracy_pct)

    def train_xgboost(self):
        """Train an XGBoost multi-class model on the TF-IDF features and report test scores.

        Requires ``vectorize_tweets`` and ``data_split`` to have run first.
        """
        print("\nTraining XGBoost Algorithm ----> ")
        # Use this instance's data rather than whatever a global variable
        # happens to reference.
        xg_train = xgb.DMatrix(self.X_train_idf, label=self.y_train)
        xg_test = xgb.DMatrix(self.X_test_idf, label=self.y_test)
        param = {
            'objective': 'multi:softmax',
            'eta': 0.1,
            'max_depth': 6,
            'nthread': 4,
            # Derived from the fitted label encoder so the model matches the
            # dataset actually loaded.
            'num_class': len(self.label_encoder_target.classes_),
        }
        watchlist = [(xg_train, 'train'), (xg_test, 'test')]
        num_round = 5
        xgb_model = xgb.train(param, xg_train, num_round, watchlist)
        # multi:softmax yields class ids as floats; cast them to integer labels.
        predicted_val = xgb_model.predict(xg_test).astype(int)
        self.get_precision_recall_f1score(self.y_test,predicted_val)
        print("\nXGBoost Accuracy ----> ",accuracy_score(predicted_val, self.y_test)*100)

    def train_catboost_classifier(self):
        """Fit a CatBoost classifier on the TF-IDF features and report test scores."""
        print("\nTraining CatBoost Algorithm ----> ")
        classifier = CatBoostClassifier(random_state=29,iterations=100,learning_rate=0.01,eval_metric='Accuracy',task_type='CPU')
        classifier.fit(self.X_train_idf,self.y_train)
        test_predictions = classifier.predict(self.X_test_idf)
        self.get_precision_recall_f1score(self.y_test,test_predictions)
        accuracy_pct = accuracy_score(test_predictions, self.y_test)*100
        print("\nCatBoost Accuracy ----> ",accuracy_pct)

    
    def plot_graphs(self,history, metric):
        """Plot a training metric and its ``val_`` counterpart per epoch.

        Args:
            history: object whose ``history`` mapping holds the recorded
                metric series.
            metric: name of the metric to plot, e.g. ``'accuracy'``.
        """
        validation_key = 'val_'+metric
        series = history.history
        plt.plot(series[metric])
        plt.plot(series[validation_key], '')
        plt.xlabel("Epochs")
        plt.ylabel(metric)
        plt.legend([metric, validation_key])
    
    def train_lstm_without_embedding(self):
        """Train an LSTM classifier with a trainable embedding and report test scores.

        Uses the datasets built by ``convert_data_into_tensorflow_format`` and
        the label encoder fitted by ``data_split``.
        """
        print("\nTraining LSTM without Embeddings ----> ")
        VOCAB_SIZE = 10000
        # One logit per class: the predictions are decoded with argmax below,
        # which would always return 0 for a single-unit output layer.
        num_classes = len(self.label_encoder_target.classes_)
        encoder_model = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
        encoder_model.adapt(self.train_dataset.map(lambda text, label: text))
        lstm_model = tf.keras.Sequential([encoder_model,
            tf.keras.layers.Embedding(input_dim=len(encoder_model.get_vocabulary()),output_dim=128,mask_zero=True),
            tf.keras.layers.LSTM(64, dropout=0.2),
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(num_classes)])
        # Integer class ids + raw logits -> sparse categorical cross-entropy.
        lstm_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                      optimizer=tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0),
                      metrics=['accuracy'])
        # The History returned by fit() can be passed to self.plot_graphs to
        # draw the accuracy/loss curves.
        lstm_model.fit(self.train_dataset, epochs=5,validation_data=self.val_dataset)
        lstm_model.evaluate(self.test_dataset)
        predictions = lstm_model.predict(self.test_dataset)
        predicted_val = np.argmax(predictions, axis=-1)
        self.get_precision_recall_f1score(self.y_test,predicted_val)
        print("\nLSTM without Embeddings Accuracy ----> ",accuracy_score(predicted_val, self.y_test)*100)
    
    def train_lstm_with_glove(self):
        """Train an attention LSTM on frozen GloVe embeddings and report test scores.

        Reads 100-dimensional vectors from ``glove.6B.100d.txt`` in the working
        directory. Uses the datasets built by
        ``convert_data_into_tensorflow_format`` and the label encoder fitted by
        ``data_split``.
        """
        print("\nTraining LSTM with Glove ----> ")
        # One logit per class: the predictions are decoded with argmax below,
        # which would always return 0 for a single-unit output layer.
        num_classes = len(self.label_encoder_target.classes_)
        vectorizer_model = TextVectorization(max_tokens=14000, output_sequence_length=200)
        vectorizer_model.adapt(self.train_dataset.map(lambda text, label: text))
        vocab = vectorizer_model.get_vocabulary()
        word_index_dict = dict(zip(vocab, range(len(vocab))))

        # Load the pre-trained vectors: one "<word> <v1> ... <v100>" per line.
        glove_file_path = "glove.6B.100d.txt"
        embeddings_index_dict = {}
        # Explicit UTF-8 so the words decode the same way on every platform.
        with open(glove_file_path, encoding='utf-8', errors='ignore') as file:
            for data in file:
                word, coefs = data.split(maxsplit=1)
                embeddings_index_dict[word] = np.fromstring(coefs, "f", sep=" ")
        print("Found {0} Word Embeddings".format(len(embeddings_index_dict)))

        # Rows for vocabulary words without a GloVe vector stay all-zero.
        num_tokens = 14000 + 2
        embedding_dimension = 100
        hits = 0
        misses = 0
        embedding_matrix = np.zeros((num_tokens, embedding_dimension))
        for word, i in word_index_dict.items():
            embedding_vector = embeddings_index_dict.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                hits += 1
            else:
                misses += 1
        print("Converted {0} Words ({1} Misses)".format(hits, misses))

        embedding_layer = Embedding(num_tokens,
                                    embedding_dimension,
                                    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
                                    trainable=False,
                                    name="Embeddings")
        lstm_glove_model = tf.keras.Sequential()
        lstm_glove_model.add(vectorizer_model)
        lstm_glove_model.add(embedding_layer)
        lstm_glove_model.add(tf.keras.layers.LSTM(128, dropout=0.2, return_sequences=True))
        lstm_glove_model.add(Attention())
        lstm_glove_model.add(tf.keras.layers.Dense(128, activation='relu'))
        lstm_glove_model.add(tf.keras.layers.Dropout(0.2))
        lstm_glove_model.add(tf.keras.layers.Dense(num_classes))
        # Integer class ids + raw logits -> sparse categorical cross-entropy.
        lstm_glove_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                      optimizer=tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0),
                      metrics=['accuracy'])
        tf.keras.utils.plot_model(lstm_glove_model)
        # The History returned by fit() can be passed to self.plot_graphs to
        # draw the accuracy/loss curves.
        lstm_glove_model.fit(self.train_dataset, epochs=9,validation_data=self.val_dataset)
        lstm_glove_model.evaluate(self.test_dataset)
        predictions = lstm_glove_model.predict(self.test_dataset)
        predicted_val = np.argmax(predictions, axis=-1)
        self.get_precision_recall_f1score(self.y_test,predicted_val)
        print("\nLSTM with GLOVE Accuracy ----> ",accuracy_score(predicted_val, self.y_test)*100)

    def train_lstm_with_attention(self):
        """Train an LSTM with an attention layer and report test scores.

        Uses the datasets built by ``convert_data_into_tensorflow_format`` and
        the label encoder fitted by ``data_split``.
        """
        print("\nTraining LSTM with Attention ----> ")
        VOCAB_SIZE = 10000
        # One logit per class: the predictions are decoded with argmax below,
        # which would always return 0 for a single-unit output layer.
        num_classes = len(self.label_encoder_target.classes_)
        encoder_model = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
        encoder_model.adapt(self.train_dataset.map(lambda text, label: text))
        lstm_attention_model = tf.keras.Sequential()
        lstm_attention_model.add(encoder_model)
        lstm_attention_model.add(tf.keras.layers.Embedding(
                input_dim=len(encoder_model.get_vocabulary()),
                output_dim=128,
                mask_zero=True))
        # return_sequences=True: the attention layer is fed every time step.
        lstm_attention_model.add(tf.keras.layers.LSTM(64, dropout=0.2, return_sequences=True))
        lstm_attention_model.add(Attention())
        lstm_attention_model.add(tf.keras.layers.Dense(256, activation='relu'))
        lstm_attention_model.add(tf.keras.layers.Dropout(0.5))
        lstm_attention_model.add(tf.keras.layers.Dense(num_classes))
        # Integer class ids + raw logits -> sparse categorical cross-entropy.
        lstm_attention_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                      optimizer=tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0),
                      metrics=['accuracy'])
        # The History returned by fit() can be passed to self.plot_graphs to
        # draw the accuracy/loss curves.
        lstm_attention_model.fit(self.train_dataset, epochs=5,validation_data=self.val_dataset)
        lstm_attention_model.evaluate(self.test_dataset)
        predictions = lstm_attention_model.predict(self.test_dataset)
        predicted_val = np.argmax(predictions, axis=-1)
        self.get_precision_recall_f1score(self.y_test,predicted_val)
        print("\nLSTM with ATTENTION Accuracy ----> ",accuracy_score(predicted_val, self.y_test)*100)
        
    def train_bilstm_without_embedding(self):
        """Train a bidirectional LSTM with a trainable embedding and report test scores.

        Uses the datasets built by ``convert_data_into_tensorflow_format`` and
        the label encoder fitted by ``data_split``.
        """
        print("\nTraining Bidirectional LSTM without Embeddings ----> ")
        VOCAB_SIZE = 10000
        # One logit per class: the predictions are decoded with argmax below,
        # which would always return 0 for a single-unit output layer.
        num_classes = len(self.label_encoder_target.classes_)
        encoder_model = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
        encoder_model.adapt(self.train_dataset.map(lambda text, label: text))
        bilstm_model = tf.keras.Sequential([encoder_model,
                       tf.keras.layers.Embedding(input_dim=len(encoder_model.get_vocabulary()),output_dim=128,mask_zero=True),
                       tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, dropout=0.2)),
                       tf.keras.layers.Dense(256, activation='relu'),
                       tf.keras.layers.Dropout(0.5),
                       tf.keras.layers.Dense(num_classes)])
        # Integer class ids + raw logits -> sparse categorical cross-entropy.
        bilstm_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                      optimizer=tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0),
                      metrics=['accuracy'])
        # The History returned by fit() can be passed to self.plot_graphs to
        # draw the accuracy/loss curves.
        bilstm_model.fit(self.train_dataset, epochs=5,validation_data=self.val_dataset)
        bilstm_model.evaluate(self.test_dataset)
        predictions = bilstm_model.predict(self.test_dataset)
        predicted_val = np.argmax(predictions, axis=-1)
        self.get_precision_recall_f1score(self.y_test,predicted_val)
        print("\nBIDIRECTIONAL LSTM without EMBEDDINGS Accuracy ----> ",accuracy_score(predicted_val, self.y_test)*100)
        
    def train_bilstm_with_glove(self):
        """Train an attention BiLSTM on frozen GloVe embeddings and report test scores.

        Reads 100-dimensional vectors from ``glove.6B.100d.txt`` in the working
        directory. Uses the datasets built by
        ``convert_data_into_tensorflow_format`` and the label encoder fitted by
        ``data_split``.
        """
        print("\nTraining BIDIRECTIONAL LSTM with Glove ----> ")
        # One logit per class: the predictions are decoded with argmax below,
        # which would always return 0 for a single-unit output layer.
        num_classes = len(self.label_encoder_target.classes_)
        vectorizer_model = TextVectorization(max_tokens=14000, output_sequence_length=200)
        vectorizer_model.adapt(self.train_dataset.map(lambda text, label: text))
        vocab = vectorizer_model.get_vocabulary()
        word_index_dict = dict(zip(vocab, range(len(vocab))))

        # Load the pre-trained vectors: one "<word> <v1> ... <v100>" per line.
        glove_file_path = "glove.6B.100d.txt"
        embeddings_index_dict = {}
        # Explicit UTF-8 so the words decode the same way on every platform.
        with open(glove_file_path, encoding='utf-8', errors='ignore') as file:
            for data in file:
                word, coefs = data.split(maxsplit=1)
                embeddings_index_dict[word] = np.fromstring(coefs, "f", sep=" ")
        print("Found {0} Word Embeddings".format(len(embeddings_index_dict)))

        # Rows for vocabulary words without a GloVe vector stay all-zero.
        num_tokens = 14000 + 2
        embedding_dimension = 100
        hits = 0
        misses = 0
        embedding_matrix = np.zeros((num_tokens, embedding_dimension))
        for word, i in word_index_dict.items():
            embedding_vector = embeddings_index_dict.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                hits += 1
            else:
                misses += 1
        print("Converted {0} Words ({1} Misses)".format(hits, misses))

        embedding_layer = Embedding(num_tokens,
                                    embedding_dimension,
                                    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
                                    trainable=False,
                                    name="Embeddings")
        bilstm_glove_model = tf.keras.Sequential()
        bilstm_glove_model.add(vectorizer_model)
        bilstm_glove_model.add(embedding_layer)
        bilstm_glove_model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, dropout=0.2, return_sequences=True)))
        bilstm_glove_model.add(Attention())
        bilstm_glove_model.add(tf.keras.layers.Dense(128, activation='relu'))
        bilstm_glove_model.add(tf.keras.layers.Dropout(0.2))
        bilstm_glove_model.add(tf.keras.layers.Dense(num_classes))
        # Integer class ids + raw logits -> sparse categorical cross-entropy.
        bilstm_glove_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                      optimizer=tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0),
                      metrics=['accuracy'])
        tf.keras.utils.plot_model(bilstm_glove_model)
        # The History returned by fit() can be passed to self.plot_graphs to
        # draw the accuracy/loss curves.
        bilstm_glove_model.fit(self.train_dataset, epochs=9,validation_data=self.val_dataset)
        bilstm_glove_model.evaluate(self.test_dataset)
        predictions = bilstm_glove_model.predict(self.test_dataset)
        predicted_val = np.argmax(predictions, axis=-1)
        self.get_precision_recall_f1score(self.y_test,predicted_val)
        print("\nBIDIRECTIONAL LSTM with GLOVE Accuracy ----> ",accuracy_score(predicted_val, self.y_test)*100)

    def train_bilstm_with_attention(self):
        """Train a bidirectional LSTM with an attention layer and report test scores.

        Uses the datasets built by ``convert_data_into_tensorflow_format`` and
        the label encoder fitted by ``data_split``.
        """
        print("\nTraining BIDIRECTIONAL LSTM with Attention ----> ")
        VOCAB_SIZE = 10000
        # One logit per class: the predictions are decoded with argmax below,
        # which would always return 0 for a single-unit output layer.
        num_classes = len(self.label_encoder_target.classes_)
        encoder_model = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
        encoder_model.adapt(self.train_dataset.map(lambda text, label: text))
        bilstm_attention_model = tf.keras.Sequential()
        bilstm_attention_model.add(encoder_model)
        bilstm_attention_model.add(tf.keras.layers.Embedding(
                input_dim=len(encoder_model.get_vocabulary()),
                output_dim=128,
                mask_zero=True))
        # return_sequences=True: the attention layer is fed every time step.
        bilstm_attention_model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, dropout=0.2, return_sequences=True)))
        bilstm_attention_model.add(Attention())
        bilstm_attention_model.add(tf.keras.layers.Dense(256, activation='relu'))
        bilstm_attention_model.add(tf.keras.layers.Dropout(0.5))
        bilstm_attention_model.add(tf.keras.layers.Dense(num_classes))
        # Integer class ids + raw logits -> sparse categorical cross-entropy.
        bilstm_attention_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                      optimizer=tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0),
                      metrics=['accuracy'])
        # The History returned by fit() can be passed to self.plot_graphs to
        # draw the accuracy/loss curves.
        bilstm_attention_model.fit(self.train_dataset, epochs=5,validation_data=self.val_dataset)
        bilstm_attention_model.evaluate(self.test_dataset)
        predictions = bilstm_attention_model.predict(self.test_dataset)
        predicted_val = np.argmax(predictions, axis=-1)
        self.get_precision_recall_f1score(self.y_test,predicted_val)
        print("\nBIDIRECTIONAL LSTM with ATTENTION Accuracy ----> ",accuracy_score(predicted_val, self.y_test)*100)
    
    

In [280]:
# Benchmark driver: for every disaster dataset, train each classical model on
# each n-gram range and each LSTM variant, then write one CSV report per model
# family. Trainers are looked up by method name at call time so the call
# sequence on ``cf`` matches a plain if/elif dispatch.
classical_trainers = {
    'KNN': 'train_knn',
    'Logistic Regression': 'train_logistic_regression',
    'Naive Bayes': 'train_naive_bayes',
    'Decision Tree': 'train_decision_tree',
    'Random Forest': 'train_random_forest',
    'SVM': 'train_support_vector_machines',
    'Gradient Boosting': 'train_gradient_boosting',
    'XGBoost': 'train_xgboost',
    'CatBoost': 'train_catboost_classifier',
}
# Insertion order fixes the row order of the deep-learning report.
deep_trainers = {
    ('LSTM', 'Without Embedding'): 'train_lstm_without_embedding',
    ('LSTM', 'With Embedding'): 'train_lstm_with_glove',
    ('LSTM', 'With Attention'): 'train_lstm_with_attention',
    ('Bi-LSTM', 'Without Embedding'): 'train_bilstm_without_embedding',
    ('Bi-LSTM', 'With Embedding'): 'train_bilstm_with_glove',
    ('Bi-LSTM', 'With Attention'): 'train_bilstm_with_attention',
}
ngram_ranges = [(1, 1), (2, 2), (3, 3), (1, 2), (2, 3), (1, 3)]

for dataset_path in ["hurricane.csv", "flood.csv", "earthquake.csv", "wildfire.csv"]:
    # Load, clean and split the dataset once; every model below reuses it.
    cf = Classification(dataset_path)
    cf.load_data()
    cf.get_class_distribution()
    cf.clean_data(column_name='tweet_text')
    cf.data_split()
    cf.convert_data_into_tensorflow_format()

    classical_ml_report_list = []
    for model_name, trainer_name in classical_trainers.items():
        for ngram_range in ngram_ranges:
            cf.vectorize_tweets(ngram_range)
            getattr(cf, trainer_name)()
            # The (1, 3) range is reported under the label (1, 2, 3).
            ngram_label = (1, 2, 3) if ngram_range == (1, 3) else ngram_range
            # cf.average is read right after training; presumably it holds the
            # precision/recall/F1 values for the model just trained.
            classical_ml_report_list.append([model_name, ngram_label, *cf.average])
    classical_ml_report_df = pd.DataFrame(
        classical_ml_report_list,
        columns=['Classifier', 'Ngram', 'Precision', 'Recall', 'F1 Score'],
    )
    classical_ml_report_df.to_csv(f'classical_ml_report_{dataset_path}', index=False)

    deep_nn_report_list = []
    for (model_name, embedding_kind), trainer_name in deep_trainers.items():
        getattr(cf, trainer_name)()
        deep_nn_report_list.append([model_name, embedding_kind, *cf.average])
    deep_nn_report_df = pd.DataFrame(
        deep_nn_report_list,
        columns=['Classifier', 'Embedding', 'Precision', 'Recall', 'F1 Score'],
    )
    deep_nn_report_df.to_csv(f'deep_nn_report_{dataset_path}', index=False)