In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# NLP Prediction of Disaster Tweets

## Table of Contents

* 1. [Data Loading](#dataloading)
    * 1.1 [Libraries](#libs)
    * 1.2 [NLTK and Spacy Language Load](#lang)
    * 1.3 [Load CSV](#loadcsv)
* 2. [Data Preprocessing and EDA](#preprocessing)
    * 2.1 [Duplicates Removal](#dupl)
    * 2.2 [Missing Values Removal](#misvals)
    * 2.3 [Lowercasing](#lowercase)
    * 2.4 [Most Common Punctuation](#punct)
    * 2.5 [Most Common Locations](#toploc)
    * 2.6 [Basic Cleaning](#basiccl)
        * 2.6.1 [Tokenization, and Punctuations, Digits, URLs, Non-ASCII, Emails Removal](#basiccl1)
        * 2.6.2 [Other Special Characters Removal](#basiccl2) 
    * 2.7 [Advanced Cleaning (Optional)](#advcl)
        * 2.7.1 [Tokenization, Lemmatization, and Punctuations, Stop Words, Digits, URLs, Non-ASCII, Emails removal](#advcl1)
        * 2.7.2 [Other Special Characters Removal](#advcl2)
        * 2.7.3 [Non-English Words Removal](#advcl3)
    * 2.8 [Duplicates Removal](#dupl2)
* 3. [More of EDA](#eda)
    * 3.1 [Word Clouds](#wordclouds)
    * 3.2 [Word Frequencies](#wordfreqs)
    * 3.3 [N-Gram Analysis](#ngrams)
        * 3.3.1 [2-Grams](#twograms)
        * 3.3.2 [3-Grams](#threegrams)
    * 3.4 [Distribution of Characters](#charactersdist)
    * 3.5 [Distribution of Words](#wordsdist)
* 4. [Modeling](#modeling)
    * 4.1 [Datasets Load](#datasetclass)
    * 4.2 [Single Models](#singlemodel)
    * 4.3 [Ensemble](#ensemble)
    * 4.4 [Train/Validation/Test Split](#datasplit)
    * 4.5 [Train and Test](#traintest)
    * 4.6 [Predict](#predict)
    * 4.7 [Run](#run)

<a class="anchor" id="dataloading"></a>
## 1. Data Loading

### 1.1 Libraries <a class="anchor" id="libs"></a>

In [None]:
import warnings
import pandas as pd
import numpy as np
import seaborn as sns
import spacy
import joblib
import sys
import nltk
import gc
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay, classification_report
import torch
from torch import nn
from torch.optim import Adam
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence
from transformers import AdamW, BertTokenizer, RobertaTokenizer, XLMRobertaTokenizer, BertModel, RobertaModel, XLMRobertaModel
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm

In [None]:
# Install a blanket 'ignore' filter: no warning of any category is shown from here on.
# NOTE(review): this also hides deprecation / FutureWarning messages from the libraries used below.
warnings.filterwarnings('ignore')

### 1.2 NLTK and Spacy Language Load <a class="anchor" id="lang"></a>

In [None]:
# Download the NLTK 'words' corpus (read by the optional non-English word filter in section 2.7.3).
nltk.download('words')
# Load spaCy's small English pipeline; the spacy_* helper functions call it through `nlp`.
nlp = spacy.load("en_core_web_sm")

### 1.3 Import All Data <a class="anchor" id="loadcsv"></a>

In [None]:
# Read the competition's train and test CSV files from the Kaggle input directory.
train_df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

In [None]:
# Report the number of rows in each split.
n_train_rows, n_test_rows = train_df.shape[0], test_df.shape[0]
print('There are', n_train_rows, 'training samples.')
print('There are', n_test_rows, 'testing samples.')

## 2. Data Preprocessing and EDA <a class="anchor" id="preprocessing"></a>
#### I prefer to do certain data preprocessing before EDA or concurrently.

### 2.1 Remove duplicate tweets (if any) in the training dataset <a class="anchor" id="dupl"></a>

In [None]:
# Keep only the first occurrence of every distinct tweet text.
repeated_text = train_df.duplicated(subset='text', keep='first')
train_df = train_df[~repeated_text]
print('Duplicate tweets has been removed!')
print('There are', train_df.shape[0], 'training samples now.')

### Distribution of samples among given classes

In [None]:
# Count the samples per class and draw them as bars; the largest class is drawn in green.
train_text_samples = train_df['target'].value_counts()
sns.set(rc={'figure.figsize': (6, 6)})

largest_class_size = train_text_samples.max()
colors = []
for class_size in train_text_samples:
    colors.append('yellowgreen' if class_size >= largest_class_size else 'salmon')
class_ax = sns.barplot(x=train_text_samples.index, y=train_text_samples, palette=colors)

class_ax.set_xlabel('Classes')
class_ax.set_ylabel('# of Samples')
plt.suptitle('Distribution of Training Tweets')
n_non_disaster = int((train_df['target'] == 0).sum())
n_disaster = int((train_df['target'] == 1).sum())
print('There are', n_non_disaster, 'samples are labeled as non-disaster.')
print('There are', n_disaster, 'samples are labeled as disaster.')

#### We observe a slight class imbalance, but it is not critical. Therefore, we do not need to apply any rebalancing techniques to the training data.

### 2.2 Find and replace all NaNs <a class="anchor" id="misvals"></a>

In [None]:
# Share of missing values per column, as a percentage of all rows.
null_counts = train_df.isnull().sum()
percent_missing = null_counts * 100 / len(train_df)
missing_vals = pd.DataFrame(dict(col_name=train_df.columns, percent=percent_missing))

In [None]:
# Bar chart of the missing-value percentage per column.
# The axis labels are set before seaborn draws, exactly as in the original ordering.
sns.set(rc={'figure.figsize': (6, 6)})
cols = list(missing_vals['col_name'])
percent = list(missing_vals['percent'])
missing_ax = plt.gca()
missing_ax.set_ylabel('% of Missing Values')
missing_ax.set_xlabel('Column Name')
plt.suptitle('Percentage of Missing Train Values Among Columns')
sns.barplot(x=cols, y=percent)
#plt.savefig("./plots/missing-values.png")
plt.show()

In [None]:
# Replace missing locations / keywords with the placeholder string 'None' in both splits.
for frame in (train_df, test_df):
    for column_name in ('location', 'keyword'):
        frame[column_name] = frame[column_name].fillna('None')

### 2.3 Lowercasing <a class="anchor" id="lowercase"></a>

In [None]:
# Lowercase the tweet text of both splits and the training locations.
# NOTE(review): test_df['location'] is not lowercased here.
train_df["text"] = train_df["text"].map(str.lower)
test_df["text"] = test_df["text"].map(str.lower)
train_df["location"] = train_df["location"].map(str.lower)

### 2.4 Visualization of the most common punctuation characters in the train dataset <a class="anchor" id="punct"></a>

In [None]:
# I use SpaCy library to find all punctuations
def spacy_punct(text):
    """Return the lemmas of all punctuation tokens that spaCy finds in ``text``.

    Args:
        text: String to run through the module-level ``nlp`` pipeline.

    Returns:
        list: ``token.lemma_`` of every token whose ``is_punct`` flag is set,
        in document order.
    """
    return [tok.lemma_ for tok in nlp(text) if tok.is_punct]

# Collect the punctuation of every tweet as one space-separated string per row.
train_df['punct'] = train_df['text'].apply(spacy_punct)
train_df['punct'] = train_df['punct'].map(lambda marks: ' '.join(str(mark) for mark in marks))

# Count single characters across all of those strings, most frequent first.
punct_col = train_df['punct'].tolist()
punct_list = [char for joined in punct_col for char in joined]
punct_freq = dict(Counter(punct_list).most_common())
# The blank is only the join separator, not punctuation.
del punct_freq[' ']
punct_keys = list(punct_freq)
punct_vals = list(punct_freq.values())

### 2.5 Top Locations <a class="anchor" id="toploc"></a>

In [None]:
# location column needs some preprocessing for more precise results
def spacy_location(text):
    """Tokenize a location string with spaCy and return the lemmas worth keeping.

    Args:
        text: Location string to run through the module-level ``nlp`` pipeline.

    Returns:
        list: Lemmas of the ASCII tokens that are not punctuation, digits,
        URL-like or e-mail-like, in document order.
    """
    kept = []
    for tok in nlp(text):
        if tok.is_punct or tok.is_digit or tok.like_url or tok.like_email:
            continue
        if tok.is_ascii:
            kept.append(tok.lemma_)
    return kept
# Lemmatize every location and re-join the surviving tokens with single blanks.
train_df['location_names'] = train_df['location'].apply(spacy_location)
train_df['location_names'] = [' '.join(map(str, l)) for l in train_df['location_names']]

# Strip leftover noise. `regex=` is passed explicitly because pandas >= 2.0 treats the
# pattern as a literal string by default, which silently disables the character classes.
train_df['location_names'] = (
    train_df['location_names']
    .str.replace(r'[^\w\s]', '', regex=True)      # punctuation and symbols ('-' and '@' included)
    .str.replace(r'[0-9]', '', regex=True)        # digits
    .str.replace(r' [a-z] ', ' ', regex=True)     # stand-alone letters; keep one blank so neighbours are not glued together
    # Two-letter tokens such as " ny" (presumably state/country codes). The \b keeps the
    # pattern from eating the first letters of longer words ("new york" used to become "newrk").
    .str.replace(r' [a-z]{2}\b', '', regex=True)
    .str.replace('_', '', regex=False)            # '_' counts as \w, so it needs its own pass
)

# Frequency of each cleaned location string, most common first.
location_col = train_df['location_names'].tolist()
location_freq = dict(Counter(location_col).most_common())
# Remove placeholder / empty entries; pop() with a default does not fail when a key is absent.
for placeholder in (' ', '', 'none'):
    location_freq.pop(placeholder, None)
location_keys = list(location_freq.keys())
location_vals = list(location_freq.values())

In [None]:
# Side-by-side bar charts: punctuation frequencies (left) and the 20 most common locations (right).
sns.set(rc={'figure.figsize': (20, 8)})
fig, ax = plt.subplots(1, 2)
punct_ax, location_ax = ax

sns.barplot(ax=punct_ax, x=punct_keys, y=punct_vals)
punct_ax.set_title('Common Punctuations')
punct_ax.set_ylabel('Frequency')
punct_ax.set_xlabel('Punctuations')

top_n = 20
sns.barplot(ax=location_ax, x=location_keys[:top_n], y=location_vals[:top_n])
location_ax.set_title('Top Locations')
location_ax.set_ylabel('Frequency')
location_ax.set_xlabel('Location Name')
# Tilt the location names so that long labels do not overlap.
location_ax.set_xticklabels(location_ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Drop the helper columns (punct and location_names) as we don't need them anymore
train_df = train_df.drop(columns=['punct', 'location_names'])

### 2.6 Basic Data Cleaning <a class="anchor" id="basiccl"></a>

#### 2.6.1 Tokenization and lemmatization; removal of punctuation, digits, URLs, non-ASCII tokens and e-mail addresses <a class="anchor" id="basiccl1"></a>

In [None]:
def spacy_clean(text):
    """Tokenize ``text`` with spaCy and return the lemmas of the tokens worth keeping.

    Args:
        text: String to run through the module-level ``nlp`` pipeline.

    Returns:
        list: Lemmas of the ASCII tokens that are not punctuation, digits,
        URL-like or e-mail-like, in document order.
    """
    def _keep(tok):
        noisy = tok.is_punct or tok.is_digit or tok.like_url or tok.like_email
        return tok.is_ascii and not noisy

    return [tok.lemma_ for tok in nlp(text) if _keep(tok)]

# Clean both splits, then glue each token list back into one space-separated string.
for frame in (train_df, test_df):
    frame['new_text'] = frame['text'].apply(spacy_clean)
    frame['new_text'] = [' '.join(map(str, tokens)) for tokens in frame['new_text']]

train_df

#### 2.6.2 Remove other special characters <a class="anchor" id="basiccl2"></a>

In [None]:
def _strip_special_chars(texts):
    """Remove special characters from a Series of cleaned tweets.

    The same chain is applied to train and test so both splits are preprocessed
    identically (the previous per-split chains used different '&amp' patterns).

    Args:
        texts: pandas Series of strings.

    Returns:
        pandas Series of strings with HTML-escaped ampersands, punctuation,
        digits, stand-alone single letters and underscores removed.
    """
    return (
        texts
        # Must run before the punctuation pass, which would otherwise delete the '&' first.
        .str.replace('&amp', '', regex=False)
        # `regex=True` is explicit: pandas >= 2.0 defaults to literal matching, which
        # silently turned the character classes below into no-ops.
        .str.replace(r'[^\w\s]', '', regex=True)    # punctuation and symbols ('-' and '@' included)
        .str.replace(r'[0-9]', '', regex=True)      # digits
        .str.replace(r' [a-z] ', ' ', regex=True)   # stand-alone letters; keep one blank so neighbours are not glued together
        .str.replace('_', '', regex=False)          # '_' counts as \w, so it needs its own pass
    )
    # NOTE(review): the optional cleaning in section 2.7.1 removes a bare ' amp ' token instead,
    # which suggests spaCy may already have split '&amp;' apart — confirm whether that is needed here too.


train_df['new_text'] = _strip_special_chars(train_df['new_text'])
test_df['new_text'] = _strip_special_chars(test_df['new_text'])

### 2.7 Advanced Data Cleaning (Optional) <a class="anchor" id="advcl"></a>

#### 2.7.1 Tokenization, stop words, punctuation, digits, URLs, non-ascii and emails removal, lemmatization using Spacy <a class="anchor" id="advcl1"></a>

In [None]:
# Optional step, disabled by wrapping it in a string literal: remove the surrounding triple quotes to run it.
"""def spacy_clean(text):
    preprocessed = []
    doc = nlp(text)
    preprocessed = [token.lemma_ for token in doc if not token.is_stop and not nlp.vocab[token.lemma_].is_stop and not token.is_punct and not token.is_digit and not token.like_url and not token.like_email and token.is_ascii]
    return preprocessed

train_df['new_text'] = train_df['text'].apply(spacy_clean)
test_df['new_text'] = test_df['text'].apply(spacy_clean)
train_df['new_text'] = [' '.join(map(str, l)) for l in train_df['new_text']]
test_df['new_text'] = [' '.join(map(str, l)) for l in test_df['new_text']]
train_df['new_text'] = train_df["new_text"].str.replace('[^\w\s]', "").str.replace('[0-9]', "").str.replace(' [a-z] ', "").str.replace('-', "").str.replace('_', "").str.replace(' amp ', "").str.replace('@', "")
test_df['new_text'] = test_df["new_text"].str.replace('[^\w\s]', "").str.replace('[0-9]', "").str.replace(' [a-z] ', "").str.replace('-', "").str.replace('_', "").str.replace(' amp ', "").str.replace('@', "")
train_df"""

#### 2.7.2 Remove other special characters <a class="anchor" id="advcl2"></a>

In [None]:
# Optional step, disabled by wrapping it in a string literal: remove the surrounding triple quotes to run it.
"""
train_df['new_text'] = train_df["new_text"].str.replace('[^\w\s]', "").str.replace('[0-9]', "").str.replace(' [a-z] ', "").str.replace('-', "").str.replace('_', "").str.replace('&amp', "").str.replace('@', "")
test_df['new_text'] = test_df["new_text"].str.replace('[^\w\s]', "").str.replace('[0-9]', "").str.replace(' [a-z] ', "").str.replace('-', "").str.replace('_', "").str.replace('&amp ', "").str.replace('@', "")
"""

# Optional
#train_df = train_df.drop('text', axis=1)
#test_df = test_df.drop('text', axis=1)

In [None]:
# Preview the first rows of the training frame, including the new_text column.
train_df.head()

In [None]:
# Preview the first rows of the test frame, including the new_text column.
test_df.head()

#### 2.7.3 non-English words removal using NLTK <a class="anchor" id="advcl3"></a>

In [None]:
# Optional step, disabled by wrapping it in a string literal: remove the surrounding triple quotes to run it.
"""nltk_words = set(nltk.corpus.words.words())
preprocessed = []
for i in train_df['new_text']:
    doc = nltk.wordpunct_tokenize(i)
    preprocessed.append(" ".join(w for w in doc if w.lower() in nltk_words or not w.isalpha()))
train_df['new_text'] = preprocessed
train_df"""

### 2.8 Duplicate tweets removal <a class="anchor" id="dupl2"></a>

In [None]:
# target = 0: how often each cleaned text occurs among non-disaster tweets
non_disaster_rows = train_df['target'] == 0
train_tweets_freq0 = train_df.loc[non_disaster_rows, 'new_text'].value_counts()
train_tweets_freq0

In [None]:
# target = 1: how often each cleaned text occurs among disaster tweets
disaster_rows = train_df['target'] == 1
train_tweets_freq1 = train_df.loc[disaster_rows, 'new_text'].value_counts()
train_tweets_freq1

In [None]:
# Keep the first row for every distinct cleaned text.
repeated_new_text = train_df.duplicated(subset='new_text', keep='first')
train_df_new = train_df[~repeated_new_text]
train_df_new

## 3. More of EDA <a class="anchor" id="eda"></a>

### 3.1 Word Clouds <a class="anchor" id="wordclouds"></a>

#### Word Cloud of the text samples that are labeled as disaster i.e. target = 1

In [None]:
from skimage import io
import requests

# Image whose shape is used as the drawing mask for the word clouds.
mask = io.imread('/kaggle/input/tweeter-mask/twitter_mask.png')

# Concatenate all disaster tweets (target = 1), raw and cleaned, and render one cloud for each.
disaster_rows = train_df_new[train_df_new['target'] == 1]
word_cloud_before1 = '  '.join(disaster_rows['text'])
word_cloud_after1 = '  '.join(disaster_rows['new_text'])
word_cloud_before1 = WordCloud(mask=mask, colormap='Set1', background_color='white', width=400, height=300).generate(word_cloud_before1)
word_cloud_after1 = WordCloud(mask=mask, colormap='Set1', background_color='white', width=400, height=300).generate(word_cloud_after1)

fig, ax = plt.subplots(1, 2, figsize=(16, 14))
panels = zip(ax,
             (word_cloud_before1, word_cloud_after1),
             ('Before Text Preprocessing\n', 'After Text Preprocessing\n'))
for panel, cloud, heading in panels:
    panel.imshow(cloud, interpolation="bilinear")
    panel.title.set_text(heading)
    panel.axis('off')
#ax[0].figure.savefig('./word_cloud_1.png')
plt.show()

#### Word Clouds of the text samples that are not labeled as disaster i.e. target = 0

In [None]:
# Concatenate all non-disaster tweets (target = 0), raw and cleaned, and render one cloud for each.
non_disaster_rows = train_df_new[train_df_new['target'] == 0]
word_cloud_before0 = '  '.join(non_disaster_rows['text'])
word_cloud_after0 = '  '.join(non_disaster_rows['new_text'])
word_cloud_before0 = WordCloud(mask=mask, background_color='white', width=400, height=300).generate(word_cloud_before0)
word_cloud_after0 = WordCloud(mask=mask, background_color='white', width=400, height=300).generate(word_cloud_after0)

fig, ax = plt.subplots(1, 2, figsize=(16, 14))
panels = zip(ax,
             (word_cloud_before0, word_cloud_after0),
             ('Before Text Preprocessing\n', 'After Text Preprocessing\n'))
for panel, cloud, heading in panels:
    panel.imshow(cloud, interpolation="bilinear")
    panel.title.set_text(heading)
    panel.axis('off')
#ax[0].figure.savefig('./word_cloud_0.png')
plt.show()

### 3.2 Observed frequencies of words in both classes <a class="anchor" id="wordfreqs"></a>

In [None]:
# Split every cleaned tweet into words and count them separately per class.
train_words_freq1 = train_df_new.loc[train_df_new['target'] == 1, 'new_text'].str.split(expand=True).stack().value_counts()
train_words_freq0 = train_df_new.loc[train_df_new['target'] == 0, 'new_text'].str.split(expand=True).stack().value_counts()

sns.set(rc={'figure.figsize': (12, 9)})
fig, ax = plt.subplots(1, 2)

# Horizontal bars for the 30 most frequent words of each class.
panels = zip(ax, (train_words_freq1, train_words_freq0), ('Target = 1', 'Target = 0'))
for panel, word_counts, heading in panels:
    sns.barplot(ax=panel, x=word_counts[:30], y=word_counts.index[:30])
    panel.set_title(heading)
    panel.set_xlabel('Frequency')
    panel.set_ylabel('Words')
#ax[0].figure.savefig("./plots/words-freqs.png")
plt.show()

### 3.3 N-gram Analysis <a class="anchor" id="ngrams"></a>
#### 3.3.1 Bi-grams <a class="anchor" id="twograms"></a>

In [None]:
def bigram(corpus, n = None):
    """Rank the two-word sequences of ``corpus`` by how often they occur.

    Args:
        corpus: Collection of text documents accepted by ``CountVectorizer``.
        n: Maximum number of entries to return; ``None`` returns all of them.

    Returns:
        list: ``(bigram, count)`` tuples sorted by count in descending order.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2))
    vectorizer.fit(corpus)
    # Column totals of the document-term matrix = corpus-wide count of each bigram.
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = sorted(
        ((term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

# Horizontal bar chart of the 20 most frequent bi-grams in the cleaned training tweets.
sns.set(rc={'figure.figsize': (7, 7)})
plt.suptitle('Bi-grams')
plt.gca().set_xlabel('Frequency')
top_bigrams = bigram(train_df_new['new_text'], n=20)
# x receives the n-gram strings, y their counts.
x, y = (list(column) for column in zip(*top_bigrams))
sns.barplot(x=y, y=x)
#plt.savefig("./plots/bigrams.png")
plt.show()

#### 3.3.2 3-grams <a class="anchor" id="threegrams"></a>

In [None]:
def threegram(corpus, n = None):
    """Rank the three-word sequences of ``corpus`` by how often they occur.

    Args:
        corpus: Collection of text documents accepted by ``CountVectorizer``.
        n: Maximum number of entries to return; ``None`` returns all of them.

    Returns:
        list: ``(threegram, count)`` tuples sorted by count in descending order.
    """
    vectorizer = CountVectorizer(ngram_range=(3, 3))
    vectorizer.fit(corpus)
    # Column totals of the document-term matrix = corpus-wide count of each 3-gram.
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = sorted(
        ((term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

# Horizontal bar chart of the 20 most frequent 3-grams in the cleaned training tweets.
sns.set(rc={'figure.figsize': (7, 7)})
plt.suptitle('3-grams')
plt.gca().set_xlabel('Frequency')
# NOTE: the variable keeps its historical name although it holds 3-grams here.
top_bigrams = threegram(train_df_new['new_text'], n=20)
# x receives the n-gram strings, y their counts.
x, y = (list(column) for column in zip(*top_bigrams))
sns.barplot(x=y, y=x)
#plt.savefig("./plots/threegrams.png")
plt.show()

### 3.4 Distribution of Characters <a class="anchor" id="charactersdist"></a>

#### Distribution of characters in disaster tweets (target = 1)

In [None]:
sns.set(rc={'figure.figsize': (12, 5)})
fig, ax = plt.subplots(1, 2)

# Character counts of disaster tweets: raw text (train_df) vs. cleaned text (train_df_new).
text_len1 = train_df.loc[train_df['target'] == 1, 'text'].str.len()
text_len2 = train_df_new.loc[train_df_new['target'] == 1, 'new_text'].str.len()

panels = zip(ax, (text_len1, text_len2), ('Before preprocessing', 'After preprocessing'))
for panel, lengths, heading in panels:
    panel.hist(lengths, color="salmon")
    panel.set_title(heading)
    panel.set_xlabel('Number of Characters')
    panel.set_ylabel('Frequency')
fig.suptitle('Distribution of Characters in Disaster Tweets (target = 1)')
#ax[0].figure.savefig("./plots/distribution-characters1.png")
plt.show()

#### Distribution of characters in non-disaster tweets (target = 0)

In [None]:
sns.set(rc={'figure.figsize': (12, 5)})
fig, ax = plt.subplots(1, 2)

# Character counts of non-disaster tweets: raw text (train_df) vs. cleaned text (train_df_new).
text_len1 = train_df.loc[train_df['target'] == 0, 'text'].str.len()
text_len2 = train_df_new.loc[train_df_new['target'] == 0, 'new_text'].str.len()

panels = zip(ax, (text_len1, text_len2), ('Before preprocessing', 'After preprocessing'))
for panel, lengths, heading in panels:
    panel.hist(lengths, color="yellowgreen")
    panel.set_title(heading)
    panel.set_xlabel('Number of Characters')
    panel.set_ylabel('Frequency')
fig.suptitle('Distribution of Characters in Non-Disaster Tweets (target = 0)')
#ax[0].figure.savefig("./plots/distribution-characters0.png")
plt.show()

### 3.5 Distribution of Words <a class="anchor" id="wordsdist"></a>

#### Distribution of words in disaster tweets (target = 1)

In [None]:
sns.set(rc={'figure.figsize': (12, 5)})
fig, ax = plt.subplots(1, 2)

# Whitespace-separated word counts of disaster tweets: raw text vs. cleaned text.
text_len1 = train_df.loc[train_df['target'] == 1, 'text'].str.split().map(len)
text_len2 = train_df_new.loc[train_df_new['target'] == 1, 'new_text'].str.split().map(len)

panels = zip(ax, (text_len1, text_len2), ('Before preprocessing', 'After preprocessing'))
for panel, lengths, heading in panels:
    panel.hist(lengths, color="salmon")
    panel.set_title(heading)
    panel.set_xlabel('Number of Words')
    panel.set_ylabel('Frequency')
fig.suptitle('Distribution Words in Disaster Tweets (target = 1)')
#ax[0].figure.savefig("./plots/distribution-words1.png")
plt.show()

#### Distribution of words in non-disaster tweets (target = 0)

In [None]:
sns.set(rc={'figure.figsize': (12, 5)})
fig, ax = plt.subplots(1, 2)

# Whitespace-separated word counts of non-disaster tweets: raw text vs. cleaned text.
text_len1 = train_df.loc[train_df['target'] == 0, 'text'].str.split().map(len)
text_len2 = train_df_new.loc[train_df_new['target'] == 0, 'new_text'].str.split().map(len)

panels = zip(ax, (text_len1, text_len2), ('Before preprocessing', 'After preprocessing'))
for panel, lengths, heading in panels:
    panel.hist(lengths, color="yellowgreen")
    panel.set_title(heading)
    panel.set_xlabel('Number of Words')
    panel.set_ylabel('Frequency')
fig.suptitle('Distribution Words in Non-Disaster Tweets (target = 0)')
#ax[0].figure.savefig("./plots/distribution-words0.png")
plt.show()

## 4. Modeling - BERT, RoBERTa, XLMRoBERTa and ensembles <a class="anchor" id="modeling"></a>

### 4.1 Dataset Load <a class="anchor" id="datasetclass"></a>

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Tweets tokenized up front, paired with their integer labels.

    Args:
        df: Frame providing a ``'new_text'`` column (texts) and a ``'target'``
            column (labels).
        pretrained: Name or path handed to ``BertTokenizer.from_pretrained``.
        max_length: Number of tokens every text is padded / truncated to.
            Defaults to 512, the value that used to be hard-coded; a smaller
            value can be passed for short texts.
    """

    def __init__(self, df, pretrained = 'bert-base-uncased', max_length = 512):
        self.labels = df['target'].to_list()
        self.tokenizer = BertTokenizer.from_pretrained(pretrained)
        #self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        #self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
        self.max_length = max_length
        # Every text is tokenized once here; each entry is the tokenizer output for one
        # text, returned as PyTorch tensors (return_tensors = 'pt').
        self.texts = [self.tokenizer(text, padding = 'max_length', max_length = max_length, truncation = True,
                                return_tensors = 'pt') for text in df['new_text']]

    def classes(self):
        """Return the list of all labels."""
        return self.labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label at position ``idx`` wrapped in a NumPy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the tokenizer output stored at position ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the ``(tokenized_text, label)`` pair at position ``idx``."""
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

### 4.2 Single Models <a class="anchor" id="singlemodel"></a>

In [None]:
class BertClassifier(nn.Module):
    """BERT encoder followed by dropout, a 768 -> 2 linear layer and a ReLU.

    Args:
        dropout: Dropout probability applied to the pooled encoder output.
        pretrained: Name or path handed to ``BertModel.from_pretrained``.
    """

    def __init__(self, dropout = 0.5, pretrained = 'bert-base-uncased'):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained)
        #self.roberta = RobertaModel.from_pretrained('roberta-base')
        #self.xlmroberta = XLMRobertaModel.from_pretrained('xlm-roberta-base')
        self.dropout = nn.Dropout(p=dropout)
        self.linear = nn.Linear(in_features=768, out_features=2)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Return the two class scores for each sample.

        Args:
            input_id: Token ids passed to the encoder as ``input_ids``.
            mask: Attention mask passed to the encoder as ``attention_mask``.
        """
        # With return_dict = False the encoder returns a tuple; only the second
        # element (the pooled output) is used.
        _sequence_output, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        # NOTE(review): the ReLU clamps the final scores to be non-negative before they
        # reach the loss — confirm this is intended for the output layer.
        return self.relu(self.linear(self.dropout(pooled_output)))

In [None]:
class Model1(nn.Module):
    """Encoder branch for the ensemble: BERT, dropout, a 768 -> 768 linear layer and a ReLU.

    Args:
        dropout: Dropout probability applied to the pooled encoder output.
        pretrained: Name or path handed to ``BertModel.from_pretrained``.
    """

    def __init__(self, dropout = 0.5, pretrained = 'bert-base-uncased'):
        super().__init__()
        self.model1 = BertModel.from_pretrained(pretrained)
        #self.model1 = XLMRobertaModel.from_pretrained('xlm-roberta-base')
        self.dropout = nn.Dropout(p=dropout)
        self.linear = nn.Linear(in_features=768, out_features=768)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Return a 768-feature representation for each sample.

        Args:
            input_id: Token ids passed to the encoder as ``input_ids``.
            mask: Attention mask passed to the encoder as ``attention_mask``.
        """
        # Only the second element of the encoder's tuple output (the pooled output) is used.
        _sequence_output, pooled_output = self.model1(input_ids=input_id, attention_mask=mask, return_dict=False)
        return self.relu(self.linear(self.dropout(pooled_output)))

In [None]:
class Model2(nn.Module):
    """Encoder branch for the ensemble: BERT, dropout, a 768 -> 768 linear layer and a ReLU.

    Args:
        dropout: Dropout probability applied to the pooled encoder output.
        pretrained: Name or path handed to ``BertModel.from_pretrained``.
    """

    def __init__(self, dropout = 0.5, pretrained = 'bert-base-uncased'):
        super().__init__()
        self.model2 = BertModel.from_pretrained(pretrained)
        #self.model2 = XLMRobertaModel.from_pretrained('xlm-roberta-base')
        self.dropout = nn.Dropout(p=dropout)
        self.linear = nn.Linear(in_features=768, out_features=768)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Return a 768-feature representation for each sample.

        Args:
            input_id: Token ids passed to the encoder as ``input_ids``.
            mask: Attention mask passed to the encoder as ``attention_mask``.
        """
        # Only the second element of the encoder's tuple output (the pooled output) is used.
        _sequence_output, pooled_output = self.model2(input_ids=input_id, attention_mask=mask, return_dict=False)
        return self.relu(self.linear(self.dropout(pooled_output)))

### 4.3 Ensemble <a class="anchor" id="ensemble"></a>

In [None]:
class MyEnsemble(nn.Module):
    """Two-branch ensemble: both branch outputs are concatenated and classified by one linear layer.

    Args:
        modelA: First branch, called as ``modelA(input_ids, mask)``.
        modelB: Second branch, called the same way. The flattened outputs of both
            branches must add up to 1536 features per sample (the classifier's input size).
    """

    def __init__(self, modelA, modelB):
        super().__init__()
        self.modelA, self.modelB = modelA, modelB
        # Set an identity `fc` attribute on each branch.
        # NOTE(review): Model1 / Model2 never read `fc` in their forward pass — confirm it is needed.
        for branch in (self.modelA, self.modelB):
            branch.fc = nn.Identity()
        self.classifier = nn.Linear(1536, 2)

    def forward(self, x, mask):
        """Return the two class scores for each sample.

        Args:
            x: Token ids handed (as a clone) to both branches.
            mask: Attention mask handed (as a clone) to both branches.
        """
        flattened = []
        for branch in (self.modelA, self.modelB):
            branch_out = branch(x.clone(), mask.clone())
            # Collapse everything after the batch dimension into one feature axis.
            flattened.append(branch_out.view(branch_out.size(0), -1))
        combined = torch.cat(flattened, dim=1)
        return self.classifier(F.relu(combined))

In [None]:
# Clean GPU cache if necessary:
# run Python's garbage collector first, then ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
print('CUDA:', use_cuda)
device = torch.device('cuda' if use_cuda else 'cpu')
# torch.cuda.get_device_name() only accepts CUDA devices and raises for the CPU,
# so it is queried only when CUDA is actually in use.
print('You are using:', torch.cuda.get_device_name(device) if use_cuda else 'CPU')

### 4.4 Train/Validation/Test Split <a class="anchor" id="datasplit"></a>

In [None]:
# Shuffle once with a fixed seed, then cut the rows 80 % / 10 % / 10 %.
# Plain positional slices replace np.split(), which on a DataFrame goes through the
# deprecated DataFrame.swapaxes; the resulting rows are the same.
shuffled_df = train_df_new.sample(frac = 1, random_state = 42)
train_end = int(.8 * len(train_df_new))
val_end = int(.9 * len(train_df_new))
train_data = shuffled_df.iloc[:train_end]
val_data = shuffled_df.iloc[train_end:val_end]
test_data = shuffled_df.iloc[val_end:]
print('============= Train/Validation/Test Split =============')
print('Train/Validation/Test dataset size: ', len(train_data), '/', len(val_data), '/', len(test_data))

### 4.5 Train and Test <a class="anchor" id="traintest"></a>

In [None]:
class Train():
    """Fine-tunes a classifier and reports training / validation metrics after every epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_data: Frame handed to ``Dataset`` to build the training set.
        val_data: Frame handed to ``Dataset`` to build the validation set.
        criterion: Loss called as ``criterion(output, labels)``.
        optimizer: Optimizer stepped once per training batch.
        epochs: Number of passes over the training set.
        batch_size: Batch size used by both data loaders.
    """

    def __init__(self, model, train_data, val_data, criterion, optimizer, epochs, batch_size):
        self.model = model
        self.train_data = train_data
        self.val_data = val_data
        self.criterion = criterion
        self.optimizer = optimizer
        self.epochs = epochs
        self.batch_size = batch_size

    def train_plots(self, epochs_list, train_losses, val_losses, train_accs, val_accs):
        """Plot loss (left) and accuracy (right) curves for the epochs completed so far.

        All five arguments are equally long lists with one entry per epoch.
        """
        # The `%matplotlib inline` magic that used to sit here was removed: it is IPython-only
        # syntax inside a method body, and notebooks already render figures inline.
        sns.set(rc={'figure.figsize':(19, 9)})
        fig, ax = plt.subplots(1,2)
        ax[0].plot(epochs_list, train_losses, label = 'Training Loss', marker='o')
        ax[0].plot(epochs_list, val_losses, label = 'Validation Loss', marker='o')
        ax[0].set_title('Loss Values')
        ax[0].set_xlabel('Epoch')
        ax[0].set_ylabel('Value')
        ax[1].plot(epochs_list, train_accs, label = 'Training Accuracy', marker='o')
        ax[1].plot(epochs_list, val_accs, label = 'Validation Accuracy', marker='o')
        ax[1].set_title('Accuracy Values')
        ax[1].set_xlabel('Epoch')
        ax[1].set_ylabel('Percent (%)')
        ax[0].legend()
        ax[1].legend()
        plt.show()
        #fig.savefig('./plots/train-val-loss-accs.png')

    def _forward_batch(self, batch_input, batch_label, device):
        """Run one batch through the model.

        Returns:
            tuple: ``(loss tensor, number of correctly classified samples)``.
        """
        batch_label = batch_label.type(torch.LongTensor)
        batch_label = batch_label.to(device)
        mask = batch_input['attention_mask'].to(device)
        # Drop the singleton dimension at position 1 of the stored token ids.
        input_id = batch_input['input_ids'].squeeze(1).to(device)

        output = self.model(input_id, mask)
        batch_loss = self.criterion(output, batch_label)
        n_correct = (output.argmax(dim = 1) == batch_label).sum().item()
        return batch_loss, n_correct

    def start_train(self):
        """Train for ``self.epochs`` epochs, validating, plotting and printing metrics after each one."""
        train, val = Dataset(self.train_data), Dataset(self.val_data)
        train_dataloader = torch.utils.data.DataLoader(train, self.batch_size, shuffle = True)
        val_dataloader = torch.utils.data.DataLoader(val, self.batch_size)

        use_cuda = torch.cuda.is_available()
        print('CUDA:', use_cuda)
        device = torch.device('cuda' if use_cuda else 'cpu')
        # get_device_name() raises for a CPU device, so only query it when CUDA is in use.
        print('You are using:', torch.cuda.get_device_name(device) if use_cuda else 'CPU')

        # scheduler.step() runs once per batch, so the schedule length is the number of
        # batches times epochs (it used to be the number of samples times epochs, which
        # stretched the decay far beyond the end of training).
        total_steps = len(train_dataloader) * self.epochs
        scheduler = get_linear_schedule_with_warmup(self.optimizer,  num_warmup_steps = 0, num_training_steps = total_steps)

        if use_cuda:
            self.model = self.model.cuda()
            self.criterion = self.criterion.cuda()

        train_losses = []
        val_losses = []
        train_accs = []
        val_accs = []
        epochs_list = []

        for epoch_num in range(self.epochs):
            print('\n====================== Epoch {:} / {:} =====================\n'.format(epoch_num + 1, self.epochs))
            total_loss_train = 0
            total_acc_train = 0
            self.model.train()
            for train_input, train_label in tqdm(train_dataloader):
                batch_loss, acc_tr = self._forward_batch(train_input, train_label, device)
                total_loss_train += batch_loss.item()
                total_acc_train += acc_tr

                self.model.zero_grad()
                batch_loss.backward()
                # Clip the gradient norm to 1.0 before the optimizer step.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                self.optimizer.step()
                scheduler.step()

            ############ Validation ###############
            total_acc_val = 0
            total_loss_val = 0
            self.model.eval()
            with torch.no_grad():
                for val_input, val_label in val_dataloader:
                    batch_loss, acc_val = self._forward_batch(val_input, val_label, device)
                    total_loss_val += batch_loss.item()
                    total_acc_val += acc_val

            # Losses
            # NOTE(review): per-batch losses are summed and divided by the number of samples;
            # if the criterion already averages within a batch this under-reports the loss
            # by roughly the batch size — confirm the criterion's reduction.
            final_train_loss = total_loss_train/len(self.train_data)
            final_val_loss = total_loss_val/len(self.val_data)
            # Histories stay in epoch order. The training losses used to be re-sorted in
            # descending order here, which made the plotted curve look monotonically
            # decreasing regardless of the real values.
            train_losses.append(final_train_loss)
            val_losses.append(final_val_loss)

            # Accuracies
            final_train_acc = (total_acc_train/len(self.train_data))*100
            final_val_acc = (total_acc_val/len(self.val_data))*100
            train_accs.append(final_train_acc)
            val_accs.append(final_val_acc)
            epochs_list.append(epoch_num + 1)

            # Plots
            self.train_plots(epochs_list, train_losses, val_losses, train_accs, val_accs)

            print(f'Train Loss: {final_train_loss: .3f} | Train Accuracy: {final_train_acc: .3f}%')
            print(f'Validation Loss: {final_val_loss: .3f} | Validation Accuracy: {final_val_acc: .3f}%\n')

            # Save the model
            #torch.save(model.state_dict(), '/kaggle/input/nlpdisastertweetsbertmodel/nlp_disaster_tweets_bert2.pth')
            #print('Model Has Been Saved!')

In [None]:
class Test():
    """Evaluate a classifier on a labelled hold-out set.

    Runs the model over ``test_data`` in batches, prints a classification
    report and the overall accuracy, and draws a ROC curve and a confusion
    matrix.
    """

    def __init__(self, model, test_data, batch_size):
        """Store the evaluation inputs.

        Args:
            model: Module invoked as ``model(input_ids, attention_mask)``.
            test_data: Labelled data handed to ``Dataset``; its ``len()`` is
                the denominator of the reported accuracy.
            batch_size: Batch size passed to the ``DataLoader``.
        """
        self.model = model
        self.test_data = test_data
        self.batch_size = batch_size

    def plot_metrics(self, labels, outputs):
        """Print the classification report and draw ROC / confusion-matrix plots.

        Args:
            labels: List of per-batch label tensors (concatenated along dim 0).
            outputs: List of per-batch raw model outputs (concatenated along
                dim 0); a softmax over dim 1 turns them into probabilities.
        """
        labels = torch.cat(labels, dim = 0).cpu().numpy()
        outputs = torch.cat(outputs, dim = 0)
        probs = F.softmax(outputs, dim = 1).cpu().numpy()
        # Column 1 is used as the score of class 1 for the ROC curve.
        preds = probs[:, 1]

        # ROC (the thresholds returned by roc_curve are not needed here)
        fpr, tpr, _ = roc_curve(labels, preds)
        roc_auc = auc(fpr, tpr)

        # Classification Report
        y_pred = np.where(preds > 0.5, 1, 0)
        print('\nClassification Report:\n', classification_report(labels, y_pred))

        # Confusion Matrix
        cm = confusion_matrix(labels, y_pred)

        # NOTE: the former `%matplotlib inline` line magic was removed from
        # this method body; magics are IPython-only syntax and do not belong
        # inside a function. Set the backend once in a top-level cell if needed.
        sns.set(rc={'figure.figsize':(9, 6)})
        plt.title('Receiver Operating Characteristic')
        plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
        plt.legend(loc = 'lower right')
        plt.plot([0, 1], [0, 1],'r--')
        plt.xlim([0, 1])
        plt.ylim([0, 1])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.show()

        cm_disp = ConfusionMatrixDisplay(confusion_matrix = cm)
        cm_disp.plot()
        # Show the confusion matrix explicitly instead of relying on the
        # notebook flushing open figures at the end of the cell.
        plt.show()

    def start_test(self):
        """Run the model over the test set, then report metrics and accuracy."""
        test = Dataset(self.test_data)
        test_dataloader = torch.utils.data.DataLoader(test, self.batch_size)

        use_cuda = torch.cuda.is_available()
        device = torch.device('cuda' if use_cuda else 'cpu')

        if use_cuda:
            self.model = self.model.cuda()

        self.model.eval()
        total_acc_test = 0
        test_outputs = []
        test_labels = []
        with torch.no_grad():
            for test_input, test_label in test_dataloader:
                test_label = test_label.long().to(device)
                mask = test_input['attention_mask'].to(device)
                input_id = test_input['input_ids'].squeeze(1).to(device)

                output = self.model(input_id, mask)
                total_acc_test += (output.argmax(dim = 1) == test_label).sum().item()

                # Keep the collected tensors on the CPU so the accumulated
                # results do not occupy device memory until plotting.
                test_labels.append(test_label.cpu())
                test_outputs.append(output.cpu())
        self.plot_metrics(test_labels, test_outputs)
        print(f'Test Accuracy: {(total_acc_test / len(self.test_data))*100: .3f} %')

### 4.6 Predict <a class="anchor" id="predict"></a>

In [None]:
class UnseenDataset(torch.utils.data.Dataset):
    """Unlabelled dataset that tokenizes the ``new_text`` column up front."""

    def __init__(self, df, pretrained = 'bert-base-uncased'):
        """Tokenize every entry of ``df['new_text']``.

        Args:
            df: Frame-like object indexable by ``'new_text'``.
            pretrained: Name handed to ``BertTokenizer.from_pretrained``.
        """
        self.tokenizer = BertTokenizer.from_pretrained(pretrained)
        encoded = []
        for tweet in df['new_text']:
            # Each text is padded / truncated to 512 tokens and returned as
            # PyTorch tensors.
            encoded.append(
                self.tokenizer(
                    tweet,
                    padding = 'max_length',
                    max_length = 512,
                    truncation = True,
                    return_tensors = 'pt',
                )
            )
        self.texts = encoded

    def __len__(self):
        """Number of tokenized texts."""
        return len(self.texts)

    def get_batch_texts(self, idx):
        """Return the tokenizer output stored at position ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the tokenized text at ``idx`` (no label is attached)."""
        return self.get_batch_texts(idx)

In [None]:
class Predict():
    """Load saved weights into a model and write predictions for unlabelled data."""

    def __init__(self, model, model_path, unseen_data, batch_size,
                 output_path = '/kaggle/input/nlpdisastertweetssubmission/submission-bert.csv'):
        """Store the prediction inputs.

        Args:
            model: Module invoked as ``model(input_ids, attention_mask)``; the
                saved state dict is loaded into it.
            model_path: Path of the state-dict file read by ``torch.load``.
            unseen_data: DataFrame with at least ``id`` and ``new_text``
                columns; a ``target`` column is added to it in place.
            batch_size: Batch size passed to the ``DataLoader``.
            output_path: Destination of the submission CSV. The default is the
                previously hard-coded location; the notebook header describes
                ``/kaggle/input`` as read-only, so pass a writable path
                (e.g. under ``/kaggle/working``) when that applies.
        """
        self.model = model
        self.model_path = model_path
        self.unseen_data = unseen_data
        self.batch_size = batch_size
        self.output_path = output_path

    def start_predict(self):
        """Predict a class for every row of ``unseen_data`` and save ``id``/``target``."""
        test = UnseenDataset(self.unseen_data)
        # shuffle = False keeps predictions aligned with the DataFrame rows.
        test_dataloader = torch.utils.data.DataLoader(test, self.batch_size, shuffle = False)

        use_cuda = torch.cuda.is_available()
        device = torch.device('cuda' if use_cuda else 'cpu')

        # NOTE: torch.load unpickles the file; only load checkpoints from a
        # trusted source.
        self.model.load_state_dict(torch.load(self.model_path, map_location = 'cpu'))

        if use_cuda:
            self.model = self.model.cuda()

        predictions = []
        # Use the instance's model rather than a notebook-level global so the
        # class works with whatever model it was constructed with.
        self.model.eval()
        with torch.no_grad():
            for test_input in test_dataloader:
                mask = test_input['attention_mask'].to(device)
                input_id = test_input['input_ids'].squeeze(1).to(device)
                output = self.model(input_id, mask)
                predictions.append(output.cpu().numpy())
        predictions = np.concatenate(predictions, axis = 0)
        # The predicted class is the index of the largest output per row.
        self.unseen_data['target'] = predictions.argmax(axis = 1).astype(int)
        predicted_data = self.unseen_data[['id', 'target']]
        predicted_data.to_csv(self.output_path, index = False)

### 4.7 Run <a class="anchor" id="run"></a>

In [None]:
# --- Ensemble run: two base models wrapped by MyEnsemble ---
model1 = Model1()
model2 = Model2()
# Collect the parameters of both base models (before they are wrapped) for the optimizer.
# NOTE(review): only these parameters are optimized; if MyEnsemble defines layers
# of its own they would not be updated -- confirm against the MyEnsemble definition.
parameters = [*model1.parameters(), *model2.parameters()]
model = MyEnsemble(model1, model2)
optimizer = AdamW(parameters, lr = 3e-6, eps = 1e-8)

# --- Single-model alternative (inert string below; not executed) ---
"""
model = BertClassifier()
optimizer = AdamW(model.parameters(), lr = 3e-6, eps = 1e-8)

"""
# --- Settings shared by training, testing and prediction ---
loss_func = nn.CrossEntropyLoss()
epochs, batch_size = 4, 2
model_path = '/kaggle/input/nlp-getting-started/nlp_disaster_tweets_bert.pth'
# Data passed to Predict in the mode-selection cell.
unseen_data = test_df

In [None]:
# Interactive entry point: choose between training + testing, prediction
# with an existing saved model, or exiting.
if __name__ == '__main__':
    path = Path(model_path)
    print('============= Mode Selection =============')
    # Normalize the answer so stray whitespace or an upper-case letter does
    # not make a valid choice fall through unnoticed.
    user_input = input('Press t to start training and testing\nPress p to make predictions using the existing BERT model\nPress q to exit\n').strip().lower()
    if user_input == 't':
        print('============= Train/Validation/Test Split =============')
        print('Train/Validation/Test dataset size: ', len(train_data), '/', len(val_data), '/', len(test_data))
        print('============= Training Started =============')
        train = Train(model, train_data, val_data, loss_func, optimizer, epochs, batch_size)
        train.start_train()
        print('Training Completed!')
        print('============= Testing Started =============')
        test = Test(model, test_data, batch_size)
        test.start_test()
        print('Testing Completed!')
    elif user_input == 'p':
        # Prediction needs a saved state dict at model_path.
        if path.is_file():
            print('============= Making Prediction =============')
            predict = Predict(model, model_path, unseen_data, batch_size)
            predict.start_predict()
            print('Predictions Made and Saved!')
        else:
            print('OOPS! THERE IS NO EXISTING BERT MODEL FOUND. PLEASE TRAIN AND TEST ONE IN ORDER TO HAVE ONE :)')
    elif user_input == 'q':
        sys.exit()
    else:
        # Previously any other answer was ignored without feedback.
        print(f'Unrecognised option {user_input!r}. Please enter t, p or q.')

In [None]:
# Read the saved submission CSV back in and preview its first five rows.
submission = pd.read_csv(
    filepath_or_buffer = '/kaggle/input/nlpdisastertweetssubmission/submission-bert.csv'
)
submission.head(n = 5)

## Results

### The results shown below were obtained using a randomly sampled training dataset of 5530 samples, a validation dataset of 691 samples, and a testing dataset of 692 samples. The accuracies may vary depending on the randomly sampled training, validation, and test datasets.
### I did 3 runs for each model, i.e. on 3 different randomly sampled datasets. It is therefore hard to say whether BERT clearly outperforms RoBERTa and their ensembles on this data, but both BERT and RoBERTa outperformed XLMRoBERTa in all 3 runs. The test accuracies for BERT and RoBERTa were always between approximately 81% and 83.09% in all 3 runs.
### In the case of XLMRoBERTa, the other models outperformed it in all 3 runs.
### In the case of the ensembles, BERT seems to be the weakest, as it had the lowest accuracy in all 3 runs. The RoBERTa and XLMRoBERTa ensembles performed nearly the same.
### Varying the loss function, optimizer, learning rate, etc. did not produce much improvement or decrease in accuracy.
### I used BERT with the test accuracy of 83.09% as my final submission that scored 0.82684.

### Code available on my Github https://github.com/alite13/NLP-Disaster-Tweets-Classification

| Model | Loss Function | Optimizer | Epochs | Accuracy |
| --- | --- | --- | --- | --- |
| BERT | nn.CrossEntropyLoss() | AdamW | 4 | 83.09% |
| RoBERTa | nn.CrossEntropyLoss() | AdamW | 4 | 81.79% |
| XLMRoBERTa | nn.CrossEntropyLoss() | AdamW | 4 | 79.89% |
| BERT + BERT (Ensemble) | nn.CrossEntropyLoss() | AdamW | 4 | 82.94% |
| RoBERTa + RoBERTa (Ensemble) | nn.CrossEntropyLoss() | AdamW | 4 | 82.37% |
| XLMRoBERTa + XLMRoBERTa (Ensemble) | nn.CrossEntropyLoss() | AdamW | 4 | 81.2% |
