# NLP Disaster Tweets - Classification #1 - Pretrained Embeddings with LR and RandomForest

This kernel includes code and ideas from the resources below.
- https://medium.com/@dhartidhami/understanding-bert-word-embeddings-7dc4d2ea54ca

# Import Libraries

In [None]:
# Pick the compute device and work out where the data and the pretrained
# models live, depending on the environment the notebook is running in.
import os
import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Running on {DEVICE}')

# Running in Google Colab: mount Google Drive, download the NLTK stopword
# list and install the `tokenizers` package.
if 'google.colab' in str(get_ipython()):
    # NOTE(review): these are relative paths while the drive is mounted at
    # '/content/drive' -- presumably the working directory is '/content'; confirm.
    BASE_PATH = './drive/MyDrive/Colab/data/'
    BASE_PATH_PRETRAINED = './drive/MyDrive/Colab/pretrained/'
    from google.colab import drive
    drive.mount('/content/drive')
    import nltk
    nltk.download('stopwords')
    !pip install tokenizers

# Running interactively in Kaggle (detected via the kernel connection file location).
elif get_ipython().config.IPKernelApp.connection_file.startswith('/root/.local/share'):
    BASE_PATH = '/kaggle/input/'
    BASE_PATH_PRETRAINED = '/kaggle/input/'
    
# Running as a background job in Kaggle (detected via the SHLVL environment variable).
elif 'SHLVL' in os.environ:
    BASE_PATH = '/kaggle/input/'
    BASE_PATH_PRETRAINED = '/kaggle/input/'

# Fallback: data and pretrained models are expected in sibling directories.
else:
    BASE_PATH = '../data/'
    BASE_PATH_PRETRAINED = '../pretrained/'

In [None]:
import random
import pprint
import string
from collections import Counter, defaultdict
import locale
locale.setlocale(locale.LC_ALL, locale='')  # for thousands separator via ... print(f'{value:n}')"
import re
from pprint import pprint
import requests

from tqdm import tqdm
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
import torchtext
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import matplotlib.ticker
from matplotlib.axes._axes import Axes
import nltk
import numpy as np
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents, Lowercase
#from tokenizers import Tokenizer
#from tokenizers.models import WordPiece
#from tokenizers.trainers import WordPieceTrainer
#from tokenizers.processors import TemplateProcessing
# from transformers import BertModel
import gensim


# Seed every random number generator this notebook relies on so that runs are
# reproducible. NumPy's global generator has to be seeded too: pandas
# (`DataFrame.sample`) and scikit-learn fall back to it whenever no explicit
# `random_state` is passed.
my_seed = 42
random.seed(my_seed)
np.random.seed(my_seed)
torch.manual_seed(my_seed);  # trailing ';' suppresses the notebook cell output

# Load Data

In [None]:
# Load the labelled training tweets and shuffle the rows once.
# The seed is passed explicitly so that the shuffle -- and with it the
# train/validation split derived from it -- is identical on every run.
df_source = pd.read_csv(BASE_PATH + 'nlp-getting-started/train.csv')
df_randomized = df_source.sample(frac=1, random_state=my_seed)

In [None]:
# Number of rows held out for validation: 30% of the shuffled data, rounded down.
NUM_VAL = int(0.3 * len(df_randomized))

In [None]:
# Positional split of the shuffled frame: the last NUM_VAL rows form the
# validation set, everything before them the training set.
# An explicit split position is used instead of `[:-NUM_VAL]` / `[-NUM_VAL:]`,
# because for NUM_VAL == 0 those slices would produce an empty training set
# and put every row into validation.
_split_at = len(df_randomized) - NUM_VAL
df_train_source = df_randomized.iloc[:_split_at]
df_val_source = df_randomized.iloc[_split_at:]
df_train_source

## Replace NaN

In [None]:
def replace_nan(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with missing keywords and locations blanked.

    NaN entries in the ``keyword`` and ``location`` columns are replaced by the
    empty string. The frame passed in is left untouched.
    """
    return df.assign(
        keyword=df['keyword'].fillna(''),
        location=df['location'].fillna(''),
    )

# Blank out the missing keyword/location values in both splits.
df_train, df_val = (
    replace_nan(frame) for frame in (df_train_source, df_val_source)
)
df_train

# Preprocessing

We will ignore keyword and location (see EDA workbook) and only use the tweets themselves.

In [None]:
# Start from the raw tweet text of the training split.
ser_train = df_train['text']

## Normalize Strings
using HuggingFace's Normalizer

In [None]:
# Normalization chain: NFD unicode decomposition, lowercasing, then accent
# stripping.
normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

# Show the effect on a few accented sample strings.
unnormalized = ["Héllò hôw are ü?", "éàù", "kožušček", "François"]
print("Some string normalizing examples:")
for s in unnormalized:
    normalized = normalizer.normalize_str(s)
    print(f'{s :<30} -> {normalized}')

In [None]:
# Run every training tweet through the normalizer.
ser_train = ser_train.apply(normalizer.normalize_str)

## Strip Hashtags

In [None]:
# A '#' that starts a whitespace-delimited token and is followed by at least
# one non-whitespace character; group 1 captures the tag text.
REGEX_HASHTAG_BEFORE = r'(?<!\S)#(\S+)'
# Replacement template: keep only the captured tag text.
REGEX_HASHTAG_AFTER = r'\1'


def strip_hashtags(tweet: str) -> str:
    """Drop the leading '#' of every hashtag in ``tweet`` and keep the tag text."""
    hashtag = re.compile(REGEX_HASHTAG_BEFORE)
    return hashtag.sub(REGEX_HASHTAG_AFTER, tweet)

# Demonstrate the hashtag stripping on one sample sentence.
example_sentence = 'Our Deeds are the Reason of this #earthquake'
print('Example:')
print(f'{example_sentence :<80} -> {strip_hashtags(example_sentence)}')

In [None]:
# Strip the hashtag markers from every training tweet.
ser_train = ser_train.apply(strip_hashtags)

## Remove Punctuation

In [None]:
# Any single character that is neither a word character nor whitespace.
punct = re.compile(r'[^\w\s]')


def remove_punctuations(text: str) -> str:
    """Delete every character of ``text`` that matches ``punct``."""
    return re.sub(punct, '', text)

# Apply the punctuation removal to every training tweet. (This section is
# about punctuation; applying strip_hashtags here again would leave the
# punctuation in place.)
ser_train = ser_train.apply(remove_punctuations)

## Americanize
⚠ This step changes more than just the spelling: each tweet is tokenized and the tokens are re-joined with single spaces.

In [None]:
# Download a JSON object that maps British spellings to American spellings.
dl_url = "https://raw.githubusercontent.com/hyperreality/American-British-English-Translator/master/data/british_spellings.json"
# A timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() fails loudly instead of trying to parse an HTTP error page.
_response = requests.get(dl_url, timeout=30)
_response.raise_for_status()
british_to_american_map = _response.json()
print(len(british_to_american_map))
print(british_to_american_map['colour'])
print(british_to_american_map['traumatise'])

In [None]:
def americanize(text: str):
    """Replace British spellings in ``text`` by their American counterparts.

    The text is split with NLTK's word tokenizer, every token that is a key of
    ``british_to_american_map`` is swapped for its mapped value, and the tokens
    are joined again with single spaces.
    """
    tokens = nltk.tokenize.word_tokenize(text)
    return ' '.join(british_to_american_map.get(token, token) for token in tokens)

ser_train = ser_train.apply(americanize)

## Remove Stopwords
⚠ This step changes more than just the stopwords: each tweet is tokenized and the remaining tokens are re-joined with single spaces.

In [None]:
# NLTK's English stopword list, kept as a set for fast membership tests.
stop_words = set(nltk.corpus.stopwords.words('english'))
print(f'Loaded {len(stop_words)} NLTK Stopwords')

In [None]:
def remove_stopwords(text: str):
    """Remove stopwords from ``text``.

    The text is split with NLTK's word tokenizer; tokens whose lowercase form
    is in ``stop_words`` are dropped and the rest is joined with single spaces.
    """
    kept = []
    for word in nltk.tokenize.word_tokenize(text):
        if word.lower() in stop_words:
            continue
        kept.append(word)
    return ' '.join(kept)

ser_train = ser_train.apply(remove_stopwords)

## Spelling

In [None]:
# rectify some common spelling mistakes
# Common misspellings (contractions written without the apostrophe) mapped to
# their correct form. The corrected forms must match the keys of
# contraction_mapping, otherwise they cannot be expanded afterwards.
spelling_dict = {
    'didnt': "didn't",
    'doesnt': "doesn't",
    'isnt': "isn't",
    'aint': "ain't",
    'wasnt': "wasn't",
    'shouldnt': "shouldn't",
    'im': "i'm",
}

def rectify_spelling(text: str):
    """Correct the misspellings listed in ``spelling_dict``.

    The text is split with NLTK's word tokenizer, every token that is a key of
    ``spelling_dict`` is replaced by its corrected form, and the tokens are
    joined again with single spaces.
    """
    tokenized = nltk.tokenize.word_tokenize(text)
    corrected = [spelling_dict.get(w, w) for w in tokenized]
    return ' '.join(corrected)

# Correct the listed misspellings in every training tweet.
ser_train = ser_train.apply(rectify_spelling)

## Replace Contractions
Replace contractions (e.g. "don't", "it's") with their full forms

In [None]:
# Lookup table: English contraction -> expanded full form.
# replace_contraction() searches tweets for the keys and substitutes the values.
# NOTE(review): the capitalised keys ("I'd", "I'm", ...) presumably never match
# once the text has been lowercased by the normalizer -- confirm.
contraction_mapping = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "could've": "could have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "here's": "here is",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "i'd": "i would",
    "I'd've": "I would have",
    "i'd've": "i would have",
    "I'll": "I will",
    "i'll": "i will",
    "i'll've": "i will have",
    "I'll've": "I will have",
    "i'm": "i am",
    "I'm": "I am",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "I've": "I have",
    "i've": "i have",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "might've": "might have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "must've": "must have",
    "needn't": "need not",
    "needn't've": "need not have",
    "n't": "not",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "should've": "should have",
    "so's": "so as",
    "so've": "so have",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "this's": "this is",
    "to've": "to have",
    "'ve": "have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "weren't": "were not",
    "we've": "we have",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "would've": "would have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

In [None]:
from functools import lru_cache


@lru_cache(maxsize=8)
def _contraction_pattern(contractions: tuple):
    """Compile one alternation over ``contractions``, longest alternative first."""
    longest_first = sorted(contractions, key=len, reverse=True)
    return re.compile('|'.join(re.escape(c) for c in longest_first))


def replace_contraction(tweet: str, mapping=None):
    """Expand the contractions in ``tweet`` to their full forms.

    All contractions are replaced in a single pass, and at every position the
    longest matching contraction wins. Replacing the keys one after another in
    dictionary order would let a short key consume part of a longer one, e.g.
    "shouldn't've" -> "should not've" -> "should nothave".

    Args:
        tweet: The text to expand.
        mapping: Optional contraction -> full-form dictionary. Defaults to the
            module-level ``contraction_mapping``.

    Returns:
        ``tweet`` with every occurrence of a key replaced by its value.
    """
    if mapping is None:
        mapping = contraction_mapping
    if not mapping:
        # An empty alternation would match the empty string everywhere.
        return tweet
    pattern = _contraction_pattern(tuple(mapping))
    # A callable replacement inserts the value verbatim (no template escapes).
    return pattern.sub(lambda match: mapping[match.group(0)], tweet)

# Expand the contractions in every training tweet.
ser_train = ser_train.apply(replace_contraction)

### Apply Transformation Pipeline to Train and Validation

In [None]:
def _run_text_pipeline(raw_text: pd.Series) -> pd.Series:
    """Send a Series of raw tweets through every preprocessing step, in order."""
    steps = (
        normalizer.normalize_str,  # normalize
        strip_hashtags,
        remove_punctuations,
        remove_stopwords,
        americanize,
        rectify_spelling,
        replace_contraction,
    )
    processed = raw_text
    for step in steps:
        processed = processed.apply(step)
    return processed


# Rebuild both splits from the raw tweet text with the identical chain of steps.
ser_train = _run_text_pipeline(df_train['text'])
ser_val = _run_text_pipeline(df_val['text'])

# Word2vec Pretrained Model

In [None]:
# Load the pretrained GoogleNews word2vec vectors from their binary file.
path = BASE_PATH_PRETRAINED + 'googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'
wv = gensim.models.KeyedVectors.load_word2vec_format(path, 
                                                     binary=True)

In [None]:
# the canonical example...
# Sanity check of the loaded vectors: the three words closest to
# "woman" + "king" - "man".
wv.most_similar(positive=['woman', 'king'], negative=['man'])[:3]

# Embedding Coverage

Embedding coverage is the percentage of the words in our data that are covered by the embeddings vocabulary. Words that are not covered by the embeddings vocabulary are effectively not used for classification, so we should use preprocessing to get as many words covered as possible.

In [None]:
def compute_embeddings_coverage(ser: pd.Series):
    """Measure how well the pretrained vectors ``wv`` cover the words in ``ser``.

    Every text is split with NLTK's word tokenizer and each token is looked up
    in ``wv.key_to_index``.

    Args:
        ser: Series of texts.

    Returns:
        A tuple ``(missing_words, vocab_coverage, text_coverage)``:
        ``missing_words`` maps every token without a vector to its number of
        occurrences, ``vocab_coverage`` is the fraction of distinct words that
        have a vector, and ``text_coverage`` is the fraction of all word
        occurrences that have a vector. Both fractions are 0.0 when ``ser``
        contains no words at all (instead of dividing by zero).
    """
    # get <<all>> and <<all distinct>> words in the data
    flat_words = [word for sentence in ser for word in nltk.word_tokenize(sentence)]
    distinct_words = set(flat_words)
    print(f'Found a total of {len(flat_words) :n} words, with {len(distinct_words) :n} distinct words.')

    missing_words = defaultdict(int)
    known_words = wv.key_to_index  # hoisted out of the loop
    for word in flat_words:
        if word not in known_words:
            missing_words[word] += 1

    print(f'Found {len(missing_words)} words missing in embeddings.')

    if not flat_words:
        # Nothing to cover: avoid the division by zero below.
        return missing_words, 0.0, 0.0

    embeddings_vocab_coverage = (len(distinct_words) - len(missing_words)) / len(distinct_words)
    total_words_missing = sum(missing_words.values())
    embeddings_text_coverage = (len(flat_words) - total_words_missing) / len(flat_words)

    return missing_words, embeddings_vocab_coverage, embeddings_text_coverage

In [None]:
# Report the coverage of the preprocessed training tweets as percentages.
missing_words, embeddings_vocab_coverage, embeddings_text_coverage = compute_embeddings_coverage(ser=ser_train)
print(f'Embeddings Vocab Coverage: {embeddings_vocab_coverage :.2%}')
print(f'Embeddings Text Coverage: {embeddings_text_coverage :.2%}')

In [None]:
# Rank the uncovered words by how often they occur, most frequent first, and
# show the top 25.
missing_words_list = list(missing_words.items())
sorted_missing_words = sorted(missing_words_list, key=lambda item: item[1], reverse=True)
print('Most frequently used missing words:')
pprint(sorted_missing_words[:25])

### Create Vocabulary and Custom Embeddings

In [None]:
# we won't encode all words, but only the n most common words
flat_words = [word for text in ser_train for word in nltk.tokenize.word_tokenize(text)]
distinct_words = set(flat_words)
print(f'{len(distinct_words)} distinct words.')

In [None]:
# word to index
words_with_embeddings = [w for w in distinct_words if w in wv.key_to_index]
print(f'{len(words_with_embeddings)} words with pretrained word vectors.')

In [None]:
# Distinct training words that have no pretrained vector.
words_without_embeddings = [w for w in distinct_words if w not in wv.key_to_index]
print(f'{len(words_without_embeddings)} words without pretrained word vectors. We will ignore them.')

In [None]:
# Index the covered words: token -> row index of our embedding matrix, and back.
token_to_index = {token: index for index, token in enumerate(words_with_embeddings)}
index_to_token = {index: token for token, index in token_to_index.items()}

# Vocabulary wrapper used for "is this token known?" checks.
# torchtext's `vocab()` factory interprets the dictionary values as token
# frequencies and drops every token whose frequency is below `min_freq`
# (default 1). Passing `token_to_index` directly would therefore silently drop
# the token with index 0, so every token gets the frequency 1 instead. The
# insertion order is the same as in `token_to_index`.
vocab = torchtext.vocab.vocab({token: 1 for token in token_to_index})

In [None]:
# Create initiual embeddings with all-zeros for our 80.000 words with 300 dimensions (like pretrained embeddings)
embeddings = torch.zeros(len(token_to_index), wv.vectors.shape[1])
embeddings.shape

In [None]:
# we use the known words' embeddings in our model
indices_with_embeddings = [token_to_index[w] for w in words_with_embeddings]

# map from "new" to "old" index (i.e. pretrained index)
index_to_pretrained_index = {index: wv.key_to_index[index_to_token[index]] for index in indices_with_embeddings}

In [None]:
# Copy each token's pretrained vector into its row of our embedding matrix.
for index, pretrained_index in index_to_pretrained_index.items():
    embeddings[index] = torch.Tensor(wv.vectors[pretrained_index])  # ndarray to tensor

In [None]:
# Display the filled embedding matrix.
embeddings

# Tokenize

In [None]:
def tokenize_if_in_vocab(text: str) -> list[str]:
    """Tokenize ``text`` with NLTK and keep only the tokens contained in ``vocab``."""
    return [w for w in nltk.tokenize.word_tokenize(text) if w in vocab]

# Tokenize the preprocessed training tweets (one token list per tweet).
tokenized_train = ser_train.apply(tokenize_if_in_vocab)
print(f'tokenized_train is a {type(tokenized_train)} of shape {tokenized_train.shape}')

# Same for the validation tweets.
tokenized_val = ser_val.apply(tokenize_if_in_vocab)
print(f'tokenized_val is a {type(tokenized_val)} of shape {tokenized_val.shape}')

# Compute Average Feature Vector per Tweet

In [None]:
def compute_average_feature_vector(tokens: list[str]) -> torch.Tensor:
    """Average the embedding vectors of ``tokens`` into one feature vector.

    Each token is looked up in ``token_to_index`` and the matching rows of
    ``embeddings`` are averaged. The computation stays in torch throughout (no
    NumPy/torch mixing) and takes the vector length from ``embeddings`` rather
    than from a hard-coded constant.

    Args:
        tokens: Tokens of one tweet; every token must be a key of
            ``token_to_index``.

    Returns:
        A 1-D tensor with one entry per embedding dimension. For an empty
        token list this is the all-zero vector.

    Raises:
        KeyError: If a token is not a key of ``token_to_index``.
    """
    if not tokens:
        return torch.zeros(embeddings.shape[1], dtype=embeddings.dtype)

    rows = torch.tensor([token_to_index[token] for token in tokens], dtype=torch.long)
    return embeddings[rows].mean(dim=0)

In [None]:
# One averaged feature vector per tweet, for the training and validation split.
train_average_feature_vectors = tokenized_train.apply(compute_average_feature_vector)
val_average_feature_vectors = tokenized_val.apply(compute_average_feature_vector)

In [None]:
# train_average_feature_vectors is a pd.Series of size 7613 with each element being a Tensor of size [300]
# we need to convert that to a np.Array of size (7613, 300)
ser_train_arr = train_average_feature_vectors.apply(lambda x: x.numpy())  # series of arrays
x_train = np.stack(ser_train_arr.values)  # array of size (7613, 300)

ser_val_arr = val_average_feature_vectors.apply(lambda x: x.numpy())
x_val = np.stack(ser_val_arr.values)

In [None]:
# Labels of both splits, taken from the 'target' column.
y_train, y_val = df_train_source['target'], df_val_source['target']

# Features and labels must have the same number of rows.
assert len(x_train) == len(y_train)
assert len(x_val) == len(y_val)

# Classify using LogisticRegression and RandomForestClassifier

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Baseline: logistic regression with scikit-learn's default hyperparameters.
lr = LogisticRegression()
lr.fit(x_train, y_train)

# Score the model on the held-out validation split.
y_pred_val = lr.predict(x_val)
val_f1 = metrics.f1_score(y_true=y_val, y_pred=y_pred_val)
val_accuracy = metrics.accuracy_score(y_true=y_val, y_pred=y_pred_val)

print(f'Validation F1-Score: {val_f1}')
print(f'Validation Accuracy: {val_accuracy}')

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyperparameters. The seed is passed explicitly so
# that the fitted forest -- and with it the reported scores -- is reproducible
# from run to run.
rfc = RandomForestClassifier(random_state=my_seed)
rfc.fit(x_train, y_train)
y_pred_val = rfc.predict(x_val)

print(f'Validation F1-Score: {metrics.f1_score(y_true=y_val, y_pred=y_pred_val)}')
print(f'Validation Accuracy: {metrics.accuracy_score(y_true=y_val, y_pred=y_pred_val)}')

# Submission

In [None]:
# Load the unlabelled test tweets and blank out missing keyword/location values.
df_test_source = pd.read_csv(BASE_PATH + 'nlp-getting-started/test.csv')
df_test = replace_nan(df_test_source)
df_test  # no target col

In [None]:
# Send the raw test tweets through the same preprocessing steps, in the same
# order, as the training and validation tweets.
ser_test = df_test['text']
for _step in (
    normalizer.normalize_str,
    strip_hashtags,
    remove_punctuations,
    remove_stopwords,
    americanize,
    rectify_spelling,
    replace_contraction,
):
    ser_test = ser_test.apply(_step)

ser_test

In [None]:
# Turn the preprocessed test tweets into a feature matrix: tokenize, average
# the token embeddings per tweet, convert to NumPy and stack.
tokenized_test = ser_test.apply(tokenize_if_in_vocab)
test_average_feature_vectors = tokenized_test.apply(compute_average_feature_vector)
ser_test_arr = test_average_feature_vectors.apply(lambda x: x.numpy())  # series of arrays
x_test = np.stack(ser_test_arr.values)  # 2-D array with one row per test tweet

In [None]:
# the LogisticRegressor scored slightly better than the RandomForestClassifier in terms of F1-score
y_pred = lr.predict(x_test)
y_pred

In [None]:
# Build the submission frame: test tweet id next to the predicted target.
# NOTE(review): the two columns are aligned by index; this presumably relies on
# df_test still having the default 0..n-1 index -- confirm.
ser_pred = pd.Series(y_pred)
df_pred = pd.DataFrame({'id': df_test['id'],
                       'target': ser_pred})
df_pred

In [None]:
# Distribution of the predicted classes.
df_pred['target'].value_counts()

In [None]:
# Write the submission file without the DataFrame index column.
df_pred.to_csv('submission.csv',
               index=False)