This notebook is a code-along notebook that follows the notebook below by Pei-Yi Hong, with additional comments to clarify each step for beginner learners like me :-)

https://www.kaggle.com/hongpeiyi/bert-with-pytorch-and-fastai 

In [None]:
#Importing the libraries
import numpy as np
import pandas as pd
from fastai.text.all import *
import re

In [None]:
# Read the competition's train and test CSV files into pandas DataFrames.
dir_path = "/kaggle/input/nlp-getting-started/"
train_df, test_df = (
    pd.read_csv(dir_path + csv_name) for csv_name in ("train.csv", "test.csv")
)

In [None]:
# Keep only the text and target columns.
# The result is assigned back to `train_df`: the original bound it to a
# misspelled `traind_df`, so the trimmed frame was never used by later cells.
# errors="ignore" keeps the cell safe to re-run once the columns are gone.
train_df = train_df.drop(columns=["id", "keyword", "location"], errors="ignore")
traind_df = train_df  # alias kept so anything using the old name still works
train_df["target"].value_counts()

In [None]:
# Cleaning the text data: removing URLs, html code and emoji
def remove_URL(text):
    """Return `text` with every ``http(s)://...`` or ``www....`` URL removed.

    A URL is matched up to (not including) the next whitespace character and
    replaced by the empty string, so the whitespace around it is left as-is.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_html(text):
    """Return `text` with every ``<...>`` span removed.

    The match is non-greedy (shortest span between ``<`` and ``>``) and does
    not cross line breaks, because ``.`` does not match a newline here.
    """
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, '', text)

def remove_emoji(text):
    """Return `text` with every run of characters in the ranges below removed.

    NOTE(review): the last range (U+24C2 to U+1F251) is very wide. Besides
    enclosed/pictographic symbols it also covers, for example, CJK
    ideographs, so such text is stripped too.
    """
    codepoint_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
    )
    # One character class built from all ranges; "+" removes whole runs at once.
    emoji_pattern = re.compile("[" + "".join(codepoint_ranges) + "]+", flags=re.UNICODE)
    return emoji_pattern.sub('', text)

# Run the three cleaners in order (URLs, then HTML tags, then emoji) over the
# "text" column of the training frame, and then of the test frame.
for frame in (train_df, test_df):
    for cleaner in (remove_URL, remove_html, remove_emoji):
        frame["text"] = frame["text"].apply(cleaner)

In [None]:
# Histogram of tweet lengths, measured in whitespace-separated words.
words_per_tweet = train_df["text"].apply(lambda tweet: len(tweet.split()))
words_per_tweet.plot(kind="hist")

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Load the pretrained tokenizer that matches the case-sensitive BERT checkpoint.
checkpoint_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

In [None]:
# Tokenize every training tweet into a fixed-length row of token ids:
# each sequence is truncated / padded to exactly 30 tokens and the batch is
# returned as a PyTorch tensor. Only the "input_ids" entry is kept.
train_encoding = tokenizer(
    train_df["text"].tolist(),
    padding="max_length",
    truncation=True,
    max_length=30,
    return_tensors="pt",
)
train_tensor = train_encoding["input_ids"]

In [None]:
# Create a custom class to prepare the training data to be 
# in model input form (tuple of tokenized text sequence and tensor of target)

class TweetDataset:
    """Map-style dataset over a subset of the tokenized tweets.

    Args:
        tensors: token-id rows for all tweets; indexed with `ids`.
        targ: targets for all tweets; must support ``targ[ids]`` followed by
            ``reset_index(drop=True)`` (e.g. a pandas Series).
        ids: indices of the rows that belong to this subset.
    """

    def __init__(self, tensors, targ, ids):
        self.text = tensors[ids]
        # Re-number the selected targets from 0 so that position `i` here
        # lines up with row `i` of `self.text`.
        selected_targets = targ[ids]
        self.targ = selected_targets.reset_index(drop=True)

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        # One sample: (token ids, target wrapped as a tensor).
        return self.text[idx], tensor(self.targ[idx])

In [None]:
# Split the row indices into a random training and validation set
train_ids, valid_ids = RandomSplitter()(train_df)

# Separate the y / target into a variable
target = train_df["target"]

# Create the two datasets from the splitter's ids, reusing the pre-tokenized tensor
train_ds = TweetDataset(train_tensor, target, train_ids)
valid_ds = TweetDataset(train_tensor, target, valid_ids)

# Reshuffle the training set every epoch so batches differ between epochs
# (without shuffle=True the loader replays the same fixed order each epoch).
# The validation loader stays in order; it is only used for evaluation.
train_dl = DataLoader(train_ds, bs=64, shuffle=True)
valid_dl = DataLoader(valid_ds, bs=512)

dls = DataLoaders(train_dl, valid_dl).to("cuda")

In [None]:
# Instantiate model object (Bert model with classification output)

bert = AutoModelForSequenceClassification.from_pretrained("bert-base-cased")

# Replacement classification head: hidden state -> 300 -> 2 logits.
# The input width is read from the model config instead of hard-coding 768,
# so the head still fits if the checkpoint is swapped for another size.
classifier = nn.Sequential(
    nn.Linear(bert.config.hidden_size, 300),
    nn.ReLU(),
    nn.BatchNorm1d(300),
    nn.Dropout(0.5),
    nn.Linear(300, 2)
)

bert.classifier = classifier

# Switch to train mode and move to the GPU only AFTER the head is attached,
# so the new layers end up on the same device as the rest of the model.
bert = bert.train().to("cuda")

class BertClassifier(Module):
    """Wrap a Hugging Face sequence-classification model so it returns logits only.

    Args:
        bert: the wrapped model; called with a batch of token ids and expected
            to return an object with a ``.logits`` attribute.
    """

    def __init__(self, bert):
        # NOTE(review): no super().__init__() call, as in the original; this
        # presumably relies on fastai's `Module` handling it -- confirm.
        self.bert = bert

    def forward(self, x):
        """Return the logits for a batch `x` of padded token ids."""
        # The inputs are padded to a fixed length, so build an attention mask
        # that is 0 on padding positions; without it the model attends to the
        # [PAD] tokens as if they were real text.
        pad_id = getattr(getattr(self.bert, "config", None), "pad_token_id", None)
        attention_mask = None if pad_id is None else (x != pad_id).long()
        return self.bert(x, attention_mask=attention_mask).logits


model = BertClassifier(bert)
    

In [None]:
# Set up the fastai learner from the data loaders and the model instantiated
# in the previous step (cross-entropy loss; accuracy and F1 as metrics), then
# run the learning-rate finder.

criterion = nn.CrossEntropyLoss()
reported_metrics = [accuracy, F1Score()]
learn = Learner(dls, model, loss_func=criterion, metrics=reported_metrics)
learn.lr_find()

In [None]:
# Train for 4 epochs with the one-cycle schedule
# (peak learning rate 5e-5, weight decay 0.8).
learn.fit_one_cycle(n_epoch=4, lr_max=5e-5, wd=0.8)

In [None]:
# Search for the decision threshold on the predicted probability of class 1
# ("disaster tweet") that gives the best F1 score. A tweet is predicted as
# class 1 only when that probability is above the threshold; otherwise it is
# predicted as 0 ("not a disaster tweet").

from sklearn.metrics import f1_score

# With no arguments, fastai's get_preds evaluates the validation loader.
preds, targs = learn.get_preds()

# Probability of class 1 for every sample. It does not depend on the
# threshold, so compute it once instead of on every loop iteration.
disaster_probs = F.softmax(preds, dim=1)[:, 1]

min_threshold = None  # best threshold found so far (name kept for later cells)
max_f1 = -float("inf")
thresholds = np.linspace(0.3, 0.7, 50)
for threshold in thresholds:
    f1 = f1_score(targs, disaster_probs > threshold)
    if f1 > max_f1:
        # Track the running best score. The original stored it in `min_f1`,
        # so `max_f1` never changed and the last threshold always won.
        min_threshold = threshold
        max_f1 = f1
    print(f"thresholds:{threshold:.4f} - f1:{f1:.4f}")

In [None]:
# Tokenize every test tweet with the same settings as the training set:
# truncated / padded to exactly 30 tokens, returned as a PyTorch tensor, and
# only the "input_ids" entry is kept.

test_encoding = tokenizer(
    test_df["text"].tolist(),
    padding="max_length",
    truncation=True,
    max_length=30,
    return_tensors="pt",
)
test_tensor = test_encoding["input_ids"]

In [None]:
# Create a custom class to prepare the test data in the input form required for the model: a tuple
# of text sequence and a tensor of zero

class TestDS:
    """Map-style dataset over the tokenized test tweets.

    The test set has no labels, so every sample is paired with a constant
    placeholder target of 0 to keep the (input, target) shape.

    Args:
        tensors: token-id rows, one per test tweet.
    """

    def __init__(self, tensors):
        self.tensors = tensors

    def __len__(self):
        return len(self.tensors)

    def __getitem__(self, idx):
        # One sample: (token ids, placeholder target).
        return self.tensors[idx], tensor(0)

# Batch the test dataset 128 samples at a time.
test_ds = TestDS(test_tensor)
test_dl = DataLoader(test_ds, bs=128)

In [None]:
# Get test predictions from the learner model, using the test data loader.
# The whole return value is kept; the submission cell reads element [0] of it
# and applies softmax to that itself.

test_preds = learn.get_preds(dl=test_dl)

In [None]:
# Submit output to Kaggle
# Label a tweet 1 when its class-1 probability is above the tuned threshold,
# otherwise 0. The result is converted explicitly to a NumPy integer array
# before it is written into the DataFrame, instead of relying on pandas to
# interpret a torch tensor assigned to a column.
prediction = (F.softmax(test_preds[0], dim=1)[:, 1] > min_threshold).int().cpu().numpy()
# NOTE(review): assumes sample_submission.csv lists rows in the same order as
# test.csv -- confirm.
sub = pd.read_csv(dir_path + "sample_submission.csv")
sub["target"] = prediction
sub.to_csv("submission.csv", index=False)