# NLP Disaster Tweets - Classification #2 - Pretrained Embeddings in Linear NN

This kernel includes code and ideas from:
- https://medium.com/@dhartidhami/understanding-bert-word-embeddings-7dc4d2ea54ca

Others:
- Compatible with Google Colab and Kaggle as runtime
- CUDA support

# Import Libraries

In [None]:
import os
import torch
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Running on {DEVICE}')

# running in google colab
if 'google.colab' in str(get_ipython()):
    BASE_PATH = './drive/MyDrive/Colab/data/'
    BASE_PATH_PRETRAINED = './drive/MyDrive/Colab/pretrained/'
    from google.colab import drive
    drive.mount('/content/drive')
    import nltk
    nltk.download('stopwords')
    !pip install tokenizers

# running interactively in kaggle
elif get_ipython().config.IPKernelApp.connection_file.startswith('/root/.local/share'):
    BASE_PATH = '/kaggle/input/'
    BASE_PATH_PRETRAINED = '/kaggle/input/'
    
# running as background job in kaggle
elif 'SHLVL' in os.environ:
    BASE_PATH = '/kaggle/input/'
    BASE_PATH_PRETRAINED = '/kaggle/input/'

else:
    BASE_PATH = '../data/'
    BASE_PATH_PRETRAINED = '../pretrained/'

In [None]:
import random
import pprint
import string
from collections import Counter, defaultdict
import locale
locale.setlocale(locale.LC_ALL, locale='')  # for thousands separator via ... print(f'{value:n}')"
import re
from pprint import pprint
import requests
from typing import Callable
import json

import sklearn
import numpy as np
from tqdm import tqdm
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
import torchtext
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import matplotlib.ticker
from matplotlib.axes._axes import Axes
import nltk
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents, Lowercase
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.processors import TemplateProcessing
from transformers import BertModel
import gensim

# Seed every random number generator the notebook draws from so that runs
# are repeatable.
my_seed = 42
random.seed(my_seed)
# pandas' DataFrame.sample (used for the train/validation shuffle) falls back
# to NumPy's global RNG, which the original cell left unseeded.
np.random.seed(my_seed)
torch.manual_seed(my_seed);

# Load Data

In [None]:
# Read the training CSV from the "nlp-getting-started" folder below BASE_PATH.
df_source = pd.read_csv(BASE_PATH + 'nlp-getting-started/train.csv')

In [None]:
# Shuffle all rows. Passing the notebook seed makes the train/validation
# split identical on every run, independent of what else consumed random
# numbers before this cell.
df_randomized = df_source.sample(frac=1, random_state=my_seed)

# Hold out the last 15% of the shuffled rows for validation.
NUM_VAL = int(len(df_randomized) * 0.15)
num_train = len(df_randomized) - NUM_VAL

# Positional slicing on the split point (rather than [:-NUM_VAL]) stays
# correct even when NUM_VAL is 0, where [:-0] would yield an empty frame.
df_train_source = df_randomized.iloc[:num_train]
df_val_source = df_randomized.iloc[num_train:]

def replace_nan(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with missing keyword/location values blanked.

    The input frame is left untouched; in the copy, every NaN in the
    ``keyword`` and ``location`` columns is replaced by an empty string.
    """
    cleaned = df.copy()
    for column in ('keyword', 'location'):
        cleaned[column] = cleaned[column].fillna('')
    return cleaned

# Blank out missing keyword/location values in both splits; the *_source
# frames themselves stay unchanged because replace_nan works on a copy.
df_train = replace_nan(df_train_source)
df_val = replace_nan(df_val_source)

# notebook display of the cleaned training frame
df_train

# Preprocess
For more details and explanations, see the Classification #1 notebook.

In [None]:
# Text normalizer used as the first preprocessing step (via normalize_str):
# the three normalizers below run in sequence.
normalizer = normalizers.Sequence([
    NFD(),   # NFD unicode normalization
    Lowercase(),
    StripAccents()  # remove accents (relies on the NFD decomposition above)
])

# A '#' that starts a whitespace-delimited token, followed by the tag text.
REGEX_HASHTAG_BEFORE = r'(?<!\S)#(\S+)'
# Replacement: keep only the captured tag text.
REGEX_HASHTAG_AFTER = r'\1'
def strip_hashtags(tweet: str) -> str:
    """Drop the leading '#' of hashtags while keeping the tag text itself.

    Only a '#' at the start of a token (start of string or after whitespace)
    is removed; a '#' in the middle of a token is left alone.
    """
    hashtag_pattern = re.compile(REGEX_HASHTAG_BEFORE)
    return hashtag_pattern.sub(REGEX_HASHTAG_AFTER, tweet)

# Anything that is neither a word character nor whitespace.
punct = re.compile(r'[^\w\s]')
def remove_punctuations(text: str) -> str:
    """Delete every character that is not a word character or whitespace."""
    return re.sub(punct, '', text)

dl_url = "https://raw.githubusercontent.com/hyperreality/American-British-English-Translator/master/data/british_spellings.json"
# Download the British -> American spelling table when this cell runs.
# The timeout keeps a stalled connection from hanging the notebook forever,
# and raise_for_status turns an HTTP error into a clear exception instead of
# a confusing JSON decoding failure on an error page.
_spelling_response = requests.get(dl_url, timeout=30)
_spelling_response.raise_for_status()
british_to_american_map = _spelling_response.json()
def americanize(text: str) -> str:
    """Replace British spellings in ``text`` with their American form.

    The text is split with NLTK's word tokenizer; every token found in
    ``british_to_american_map`` is swapped for its mapped value, all other
    tokens are kept. The tokens are re-joined with single spaces.
    """
    tokenized = nltk.tokenize.word_tokenize(text)
    return ' '.join(british_to_american_map.get(w, w) for w in tokenized)

# English stop words from NLTK, stored as a set for fast membership tests.
stop_words = set(nltk.corpus.stopwords.words('english'))
def remove_stopwords(text: str):
    """Return ``text`` without English stop words.

    The text is tokenized with NLTK; a token is dropped when its lower-cased
    form is in ``stop_words``. Remaining tokens are joined by single spaces.
    """
    kept = []
    for token in nltk.tokenize.word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

# Apostrophe-less contractions mapped to their correctly spelled form.
# (The 'shouldnt' entry previously mapped to the misspelling "shoudn't".)
spelling_dict = {
    'didnt': "didn't",
    'doesnt': "doesn't",
    'isnt': "isn't",
    'aint': "ain't",
    'wasnt': "wasn't",
    'shouldnt': "shouldn't",
    'im': "i'm",
}
def rectify_spelling(text: str) -> str:
    """Restore the apostrophe in common contractions written without it.

    The text is tokenized with NLTK; tokens listed in ``spelling_dict`` are
    replaced by their corrected form, all others are kept unchanged. The
    result is the tokens joined by single spaces.
    """
    tokenized = nltk.tokenize.word_tokenize(text)
    corrected = [spelling_dict.get(w, w) for w in tokenized]
    return ' '.join(corrected)
    
# Load the contraction -> full form table. JSON text is UTF-8, so the
# encoding is stated explicitly instead of relying on the platform default.
with open(BASE_PATH + "contractions/english_contractions.json", "r", encoding="utf-8") as f:
    contraction_mapping = json.load(f)
def replace_contraction(tweet: str) -> str:
    """Expand contractions in ``tweet`` using ``contraction_mapping``.

    Every key of the mapping is applied in turn as a regular-expression
    pattern and each match is replaced by the mapped full form.
    """
    # NOTE(review): keys are passed to re.sub unescaped, i.e. they are
    # interpreted as regex patterns - confirm that the JSON file is written
    # with that in mind.
    for contraction, full_form in contraction_mapping.items():
        tweet = re.sub(contraction, full_form, tweet)
    return tweet

In [None]:
# Cleaning steps, applied to every tweet in exactly this order.
preprocessing_steps = [
    normalizer.normalize_str,
    strip_hashtags,
    remove_punctuations,
    americanize,
    remove_stopwords,
    rectify_spelling,
    replace_contraction,
]

# training tweets
ser_train = df_train['text']
for step in preprocessing_steps:
    ser_train = ser_train.apply(step)

# validation tweets go through the identical pipeline
ser_val = df_val['text']
for step in preprocessing_steps:
    ser_val = ser_val.apply(step)

ser_train

# Word2vec pretrained Embeddings
For more details and embedding coverage, see the Classification #1 notebook.

In [None]:
# Location of the pretrained GoogleNews word2vec file (binary format).
path = f'{BASE_PATH_PRETRAINED}googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'

# Load the pretrained word vectors as gensim KeyedVectors.
wv = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)

### Create Vocabulary and Custom Embeddings

In [None]:
# All tokens of the preprocessed training tweets, duplicates included.
flat_words = [word for text in ser_train for word in nltk.tokenize.word_tokenize(text)]
distinct_words = set(flat_words)
print(f'{len(distinct_words)} distinct words.')

# Iterate in sorted order: the iteration order of a set of strings differs
# between interpreter sessions, which would otherwise hand out different
# token indices on every run.
sorted_words = sorted(distinct_words)
words_with_embeddings = [w for w in sorted_words if w in wv.key_to_index]
words_without_embeddings = [w for w in sorted_words if w not in wv.key_to_index]
print(f'{len(words_with_embeddings)} words with pretrained word vectors.')
print(f'{len(words_without_embeddings)} words without pretrained word vectors. We will ignore them.')

# Our own vocabulary only contains words that have a pretrained vector.
token_to_index = {token: index for index, token in enumerate(words_with_embeddings)}
index_to_token = {index: token for token, index in token_to_index.items()}

# wrapper for token-to-index mapping
# torchtext.vocab.vocab reads the dict values as token frequencies and drops
# every token below min_freq (default 1). Our values are indices, so the
# token with index 0 would silently vanish from the vocabulary; min_freq=0
# keeps every token.
vocab = torchtext.vocab.vocab(token_to_index, min_freq=0)

In [None]:
PAD_TOKEN = '<pad>'

# Start from an all-zero matrix: one row per vocabulary token, as many
# columns as the pretrained vectors have.
embedding_dim = wv.vectors.shape[1]
embeddings = torch.zeros(len(token_to_index), embedding_dim)
print(embeddings.shape)

# our own indices of the words for which a pretrained vector exists
indices_with_embeddings = [token_to_index[word] for word in words_with_embeddings]

# our index -> row of the same token in the pretrained matrix
index_to_pretrained_index = {}
for own_index in indices_with_embeddings:
    index_to_pretrained_index[own_index] = wv.key_to_index[index_to_token[own_index]]

# copy the pretrained rows into our matrix (ndarray -> tensor)
for own_index, pretrained_index in index_to_pretrained_index.items():
    embeddings[own_index] = torch.Tensor(wv.vectors[pretrained_index])

# register the padding token as one extra all-zero row at the end
# (it is needed later to bring all tweets to the same length)
if PAD_TOKEN not in token_to_index:
    PAD_TOKEN_IDX = len(embeddings)
    token_to_index[PAD_TOKEN] = PAD_TOKEN_IDX
    index_to_token[PAD_TOKEN_IDX] = PAD_TOKEN
    pad_row = torch.zeros(1, embedding_dim)
    embeddings = torch.cat([embeddings, pad_row])

embeddings

# Tokenize

In [None]:
# Fixed number of tokens per tweet: shorter tweets are padded and longer
# ones are cut to this length during tokenization.
MAX_WORDS = 50

In [None]:
def tokenize_if_in_vocab(text: str) -> list[str]:
    """Tokenize ``text`` into exactly ``MAX_WORDS`` vocabulary tokens.

    The text is split with NLTK's word tokenizer and tokens that are not in
    ``vocab`` are discarded. The result is then cut to ``MAX_WORDS`` tokens
    or filled up with ``PAD_TOKEN`` so that every tweet has the same length.
    """
    # Truncate with MAX_WORDS (the original used a hard-coded 50, which would
    # break the fixed length as soon as MAX_WORDS is changed).
    tokens = [w for w in nltk.tokenize.word_tokenize(text) if w in vocab][:MAX_WORDS]
    # The multiplier is 0 when the tweet already has MAX_WORDS tokens.
    return tokens + [PAD_TOKEN] * (MAX_WORDS - len(tokens))

# Turn every preprocessed tweet into a fixed-length token list.
tokenized_train = ser_train.apply(tokenize_if_in_vocab)
print(f'tokenized_train is a {type(tokenized_train)} of shape {tokenized_train.shape}')

# Same for the validation tweets; words unseen in training are not in the
# vocabulary and are therefore dropped by tokenize_if_in_vocab.
tokenized_val = ser_val.apply(tokenize_if_in_vocab)
print(f'tokenized_val is a {type(tokenized_val)} of shape {tokenized_val.shape}')

In [None]:
# tokenized_train is a pd.Series with each element being a list of size MAX_WORDS containing words (or PAD_TOKEN)
# we need to convert that to a np.Array of size [n, MAX_WORDS]

def convert_token_to_index(tokens: list[str]) -> list[int]:
    """Map each token to its vocabulary index via ``token_to_index``.

    Returns a plain list of ints in the same order as ``tokens`` (not a
    tensor, as the previous annotation claimed).

    Raises:
        KeyError: if a token is not a key of ``token_to_index``.
    """
    return [token_to_index[token] for token in tokens]
    
# Replace the tokens of every tweet by their vocabulary indices.
indexed_train = tokenized_train.apply(convert_token_to_index) 
indexed_val = tokenized_val.apply(convert_token_to_index) 

# Stack the per-tweet index lists into one 2-D array per split.
x_train_arr = np.stack(indexed_train.values)  # array of size (n_train, MAX_WORDS)
x_val_arr = np.stack(indexed_val.values)  # array of size (n_val, MAX_WORDS)

In [None]:
# Labels come from the 'target' column of the same splits the features
# were derived from.
ser_y_train = df_train_source['target']
ser_y_val = df_val_source['target']

# sanity check: one label per feature row
assert x_train_arr.shape[0] == ser_y_train.shape[0]
assert x_val_arr.shape[0] == ser_y_val.shape[0]

# Tensorize

In [None]:
# Token-index matrices as tensors on the target device; the integer dtype is
# inherited from the NumPy arrays.
x_train = torch.tensor(x_train_arr).to(DEVICE)  # [n_train, MAX_WORDS]
x_val = torch.tensor(x_val_arr).to(DEVICE)   # [n_val, MAX_WORDS]

In [None]:
# Label vectors as tensors on the target device.
# presumably integer 0/1 labels (int64) - confirm against the CSV
y_train = torch.tensor(ser_y_train.values).to(DEVICE)  # [n_train]
y_val = torch.tensor(ser_y_val.values).to(DEVICE)  # [n_val]

# DataLoader

In [None]:
# Number of training samples per optimizer step (used by the DataLoader).
BATCH_SIZE = 64

In [None]:
# Pair every row of token indices with its label.
train_dataset = torch.utils.data.TensorDataset(x_train, 
                                               y_train)

# we don't need a DataLoader for validation data; we're going to predict
# with validation data as a whole, without batches
# Training batches are re-shuffled on every pass over the data.
train_loader = torch.utils.data.DataLoader(train_dataset, 
                                           batch_size=BATCH_SIZE, 
                                           shuffle=True)

# Model

In [None]:
EMBEDDINGS_DIM = 300

class Classifier(nn.Module):
    """Feed-forward tweet classifier on top of pretrained word embeddings.

    A batch of token indices is embedded, the per-token vectors of each
    sample are concatenated into one long vector, and two linear layers with
    a ReLU in between produce one score per class (2 classes).

    Args:
        token_embedding: Initial embedding matrix of shape
            [vocabulary size, embedding dimension]. It is fine-tuned during
            training (``freeze=False``).
        max_words: Number of tokens per input sample. Defaults to the
            notebook-wide ``MAX_WORDS`` when omitted.
    """

    def __init__(self, token_embedding: torch.Tensor, max_words: "int | None" = None):
        super().__init__()
        if max_words is None:
            max_words = MAX_WORDS
        # Take the width from the matrix itself instead of the hard-coded
        # EMBEDDINGS_DIM, so embeddings of any dimension work.
        embedding_dim = token_embedding.shape[1]

        self.token_embedding = nn.Embedding.from_pretrained(token_embedding,
                                                            freeze=False)
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(embedding_dim * max_words, 128)
        self.relu1 = nn.ReLU()
        self.linear2 = nn.Linear(128, 2)

    def forward(self, x):
        """Return raw class scores (logits) of shape [batch_size, 2].

        No Sigmoid/Softmax is applied here: nn.CrossEntropyLoss expects
        unnormalized logits and applies log-softmax itself. Squashing the
        scores into (0, 1) first, as before, flattens the loss and slows
        training. ``argmax`` over the logits gives the same predicted class
        as it did over the squashed scores.
        """
        x = self.token_embedding(x)   # [batch, words] -> [batch, words, dim]
        x = self.flatten(x)           # -> [batch, words * dim]
        x = self.linear1(x)
        x = self.relu1(x)
        return self.linear2(x)

# Training

In [None]:
# Step size for the Adam optimizer.
LEARNING_RATE = 0.001

# Model (initialised with the pretrained embedding matrix), optimizer over
# all of its parameters, and the classification loss.
classifier = Classifier(token_embedding=embeddings.to(DEVICE)).to(DEVICE)
optimizer = torch.optim.Adam(classifier.parameters(), lr=LEARNING_RATE)
# nn.CrossEntropyLoss takes per-class scores and integer class labels.
loss_fn =  nn.CrossEntropyLoss()  # nn.BCELoss() 

In [None]:
def compute_metrics(classifier: Classifier, 
                    loss_fn: Callable,
                    x: torch.Tensor, 
                    y: torch.Tensor
                   )->tuple[float, float, float]:
    """Evaluate ``classifier`` on the inputs ``x`` against the labels ``y``.

    Returns:
        A ``(loss, accuracy, f1_score)`` tuple: the value of ``loss_fn`` on
        the model output, the fraction of samples whose highest-scoring
        class equals the label, and scikit-learn's F1 score of those
        predictions.
    """
    scores = classifier(x)
    predicted = scores.argmax(dim=1)  # highest-scoring class per sample

    loss = loss_fn(scores, y).item()
    accuracy = (predicted == y).type(torch.FloatTensor).mean().item()
    # scikit-learn works on host memory, so move both tensors to the CPU
    f1_score = sklearn.metrics.f1_score(y_true=y.cpu(),
                                        y_pred=predicted.cpu())

    return loss, accuracy, f1_score

In [None]:
# Number of full passes over the training data.
NUM_EPOCHS = 10

# One row per epoch, filled in at the end of each epoch below.
metrics = pd.DataFrame(columns=['loss_train', 'accuracy_train', 'f1_train', 
                                'loss_val', 'accuracy_val', 'f1_val'],
                       index=range(NUM_EPOCHS))

classifier.train()
for epoch in tqdm(range(NUM_EPOCHS)):

    # x_train_batch: [batch_size, MAX_WORDS] token indices
    # y_train_batch: [batch_size] class labels
    for x_train_batch, y_train_batch in train_loader:
        
        # clear the gradients left over from the previous step
        optimizer.zero_grad()
        
        y_pred_batch = classifier(x_train_batch)  # [batch_size, 2] of dtype torch.float32
        
        loss = loss_fn(y_pred_batch, y_train_batch)

        # backpropagate and update the parameters
        loss.backward()
        optimizer.step()

    # After each epoch, measure loss / accuracy / F1 on the complete training
    # and validation sets; no gradients are needed for that.
    with torch.no_grad():
        loss_train, accuracy_train, f1_score_train = compute_metrics(classifier, loss_fn, x_train, y_train)
        loss_val, accuracy_val, f1_score_val = compute_metrics(classifier, loss_fn, x_val, y_val)
        metrics.iloc[epoch] = [loss_train, accuracy_train, f1_score_train,
                               loss_val, accuracy_val, f1_score_val]

# Evaluation

In [None]:
# notebook display: per-epoch training and validation metrics
metrics

In [None]:
epochs = range(NUM_EPOCHS)
epoch_ticks = np.arange(0, NUM_EPOCHS)

# 2x2 grid with a shared x-axis; the fourth panel is not used for a curve.
fig, ((ax1, ax2), (ax3, _)) = plt.subplots(nrows=2, ncols=2, figsize=(15,5), sharex=True)

# One panel per metric: (axes, column prefix in `metrics`, display name).
panels = [
    (ax1, 'loss', 'Loss'),
    (ax2, 'accuracy', 'Accuracy'),
    (ax3, 'f1', 'F1-Score'),
]
for axis, column_prefix, display_name in panels:
    # training and validation curve of the same metric share a panel
    axis.plot(epochs, metrics[f'{column_prefix}_train'], label=f'Training {display_name}')
    axis.plot(epochs, metrics[f'{column_prefix}_val'], label=f'val {display_name}')
    axis.set_ylabel(display_name)
    axis.legend(loc='best')

# x-axis labelling of the bottom-left panel
ax3.set_xlabel('Epochs')
ax3.set_xticks(epoch_ticks)

# figure title, plus x-axis labelling of the current (last created) axes
plt.suptitle('Training and Validation Metrics')
plt.xlabel('Epochs')
plt.xticks(epoch_ticks)

plt.show()

# Submission

In [None]:
# Read the test CSV and blank out missing keyword/location values, exactly
# as was done for the training data.
df_test_source = pd.read_csv(BASE_PATH + 'nlp-getting-started/test.csv')
df_test = replace_nan(df_test_source)
df_test  # no target col

In [None]:
# Run the test tweets through the same cleaning steps, in the same order,
# as the training and validation tweets.
ser_test = df_test['text']
for cleaning_step in (
    normalizer.normalize_str,
    strip_hashtags,
    remove_punctuations,
    americanize,
    remove_stopwords,
    rectify_spelling,
    replace_contraction,
):
    ser_test = ser_test.apply(cleaning_step)

ser_test

In [None]:
# Fixed-length token lists for the test tweets (out-of-vocabulary words are
# dropped, short tweets padded).
tokenized_test = ser_test.apply(tokenize_if_in_vocab)
tokenized_test

In [None]:
# Replace every token by its vocabulary index.
indexed_test = tokenized_test.apply(convert_token_to_index)
indexed_test

In [None]:
# Stack the per-tweet index lists into one 2-D array.
x_test_arr = np.stack(indexed_test.values)  # array of size (n_test, MAX_WORDS)
x_test_arr

In [None]:
# Test inputs as a tensor on the target device; the integer dtype is
# inherited from the NumPy array.
x_test = torch.tensor(x_test_arr).to(DEVICE)  # [n_test, MAX_WORDS]
x_test

In [None]:
# Predict the whole test set in one forward pass; gradients are not needed.
with torch.no_grad():
    y_pred_logits = classifier(x_test)
    # the predicted class is the one with the highest score
    y_pred = y_pred_logits.argmax(dim=1)
y_pred

In [None]:
# Move the predictions to host memory and wrap them in a pandas Series.
ser_pred = pd.Series(y_pred.cpu().numpy())
ser_pred

In [None]:
# Submission frame: tweet id plus predicted class.
# NOTE(review): the two columns are aligned on their index, so this relies on
# df_test having the same default 0..n-1 index as ser_pred - confirm.
df_pred = pd.DataFrame({'id': df_test['id'],
                       'target': ser_pred})
df_pred

In [None]:
# How many test tweets were assigned to each class.
df_pred['target'].value_counts()

In [None]:
# Write the submission file into the working directory, without the
# DataFrame index column.
df_pred.to_csv('submission.csv',
               index=False)