### Reference: https://mccormickml.com/2019/07/22/BERT-fine-tuning/

# Manual training and evaluation

### Train

In [None]:
from train import train_model_on_train_data


# Parameters
TRAIN_DATA_PATH = "../data/train.csv"
MODEL_NAME = "bert-base-cased"
BATCH_SIZE = 32
NUM_EPOCHS = 1

model, training_stats = train_model_on_train_data(TRAIN_DATA_PATH, MODEL_NAME, BATCH_SIZE, NUM_EPOCHS)

training_stats

### Plot learning curve

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

df_training = pd.DataFrame(training_stats)

plt.figure(figsize=(12,6))
sns.set(style='darkgrid')
sns.set(font_scale=1.5)

plt.plot(df_training['training_loss'], 'b-o', label="Training")
plt.plot(df_training['validation_loss'], 'g-o', label="Validation")

plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks([1, 2, 3, 4])

plt.show()

### Evaluate test data

In [None]:
from evaluate import evaluate_on_test_data


TEST_DATA_PATH = "../data/eval.csv"


testing_stats = evaluate_on_test_data(model, TEST_DATA_PATH, MODEL_NAME, BATCH_SIZE)

testing_stats

# Improving Model F1 Score

In [1]:
from train import train_model_on_train_data


# Parameters
TRAIN_DATA_PATH = "../data/train.csv"
MODEL_NAME = "bert-base-cased"
BATCH_SIZE = 32
NUM_EPOCHS = 1
SEED = 42

model, training_stats = train_model_on_train_data(TRAIN_DATA_PATH, MODEL_NAME, BATCH_SIZE, NUM_EPOCHS, SEED)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 2070 with Max-Q Design
EPOCH 1/1

BATCH 100/831:	Training loss(0.6034036874771118)
BATCH 200/831:	Training loss(0.6421673893928528)
BATCH 300/831:	Training loss(0.685686469078064)
BATCH 400/831:	Training loss(0.4897633492946625)
BATCH 500/831:	Training loss(0.6986396908760071)
BATCH 600/831:	Training loss(0.6421265602111816)
BATCH 700/831:	Training loss(0.6367670297622681)
BATCH 800/831:	Training loss(0.5038076639175415)

Avg training loss:    0.567236914244513
Avg validation loss:  0.5310724544268782
F1 validation score:  {'f1': 0.7650404070637534}



In [4]:
import pandas as pd
from model_preparation import set_seed
from data_preparation import _load_dataset, _prepare_data, _create_dataloaders, _create_tensors
import re


data_path = "../data/eval.csv"
model_name = "bert-base-cased"
batch_size = 32
create_validation_set = False
SEED = 42

set_seed(SEED)


def custom_clean_text(row: pd.DataFrame) -> pd.DataFrame:
    # remove hyperlinks
    # src: https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python/11332580
    row['text'] = re.sub(r'\S*https?:\S*', "", row['text'])

    # remove mentions
    row['text'] = re.sub(r'@\w*', "", row['text'])

    # remove hashtags
    row['text'] = re.sub(r'#\w*', "", row['text'])

    # remove emojis
    pattern = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            "]+", flags=re.UNICODE)

    row['text'] = pattern.sub(r' ', row['text'])

    return row


df = _load_dataset(data_path)

final_df = df["tweet"].copy()

df = _prepare_data(df, custom_clean_text)

final_df = pd.concat([final_df, df[["text","label"]]], axis = 1)

input_ids, attention_masks, labels = _create_tensors(df, model_name)
dataloaders = _create_dataloaders(input_ids, attention_masks, labels, batch_size, create_validation_set)

final_df.head(10)

Unnamed: 0,tweet,text,label
0,Selsy mistakenly thinks the #Brownlow is like ...,Selsy mistakenly thinks the is like Mad Monda...,0
1,Obit vs mesin,Obit vs mesin,0
2,growing watercolor - Decatur Cemetery This pai...,growing watercolor - Decatur Cemetery This pai...,0
3,your new chick that's my ex bitch homie in fac...,your new chick that's my ex bitch homie in fac...,1
4,http://t.co/tTYrceHxzg @nationaljournal &amp;I...,&amp;Is Hillary Clinton Too Hawkish for Iowa...,0
5,They sleepin on you,They sleepin on you,0
6,Uh oh! Chris Weidman is ducking Vitor Belfort ...,Uh oh! Chris Weidman is ducking Vitor Belfort ...,0
7,"RT @andihfg: ""remember when i said hahmkyul ar...","RT : ""remember when i said hahmkyul are the on...",1
8,"""@LosMyTeddyBear: @TheCarlosPena\n#LosMeetMilo...",""": \n\nI need you please! \nWatch my videos+tw...",0
9,RT @trvpvv: @OsamaJames lol because its clear ...,RT : lol because its clear that they take pri...,1


In [3]:
from helper_functions import get_device
from datasets import load_metric
import torch
import numpy as np


device = get_device()
model = model.to(device)

pred_list = np.array([])

test_dataloader = dataloaders

testing_stats = []

try:
    total_test_loss = 0
    metric = load_metric("f1")

    model.eval()

    for n, batch in enumerate(test_dataloader):

        parameters = {
            "input_ids" : batch[0].to(device),
            "attention_mask" :  batch[1].to(device), 
            "labels" : batch[2].to(device)
        }
        with torch.no_grad():
            outputs = model(**parameters)
        
        logits = outputs.logits
        loss = outputs.loss
        total_test_loss += loss.item()

        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=parameters["labels"])

        if input_ids[n*32:(n+1)*32].equal(parameters["input_ids"].cpu()):
            pred_list = np.append(pred_list,predictions.cpu().numpy())

    testing_stats.append({
        "test_loss": total_test_loss/len(test_dataloader),
        "test_f1_score": metric.compute()
    })

    print(f"\nAvg test loss:  {testing_stats[0]['test_loss']}")
    print(f"F1 test score:  {testing_stats[0]['test_f1_score']}\n")

except RuntimeError as e:
    print(e)

final_df["prediction"] = pred_list
final_df["match"] = final_df.apply(lambda row: "" if row["label"] == row["prediction"] else "NO", axis=1)

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 2070 with Max-Q Design

Avg test loss:  0.5354963170759606
F1 test score:  {'f1': 0.7570361860999425}



In [7]:
# final_df.to_csv("predictions.csv",index=False)
final_df[final_df["match"]=="NO"].head(40)

Unnamed: 0,tweet,text,label,prediction,match
5,They sleepin on you,They sleepin on you,0,1.0,NO
6,Uh oh! Chris Weidman is ducking Vitor Belfort ...,Uh oh! Chris Weidman is ducking Vitor Belfort ...,0,1.0,NO
7,"RT @andihfg: ""remember when i said hahmkyul ar...","RT : ""remember when i said hahmkyul are the on...",1,0.0,NO
10,RT @RichardBarrow: Correspondent from Telegrap...,RT : Correspondent from Telegraph : Just got t...,1,0.0,NO
11,RT @SarahDJakes: If You're constantly asking G...,RT : If You're constantly asking God for somet...,0,1.0,NO
12,Are you ready today?haha xD zenam zenam..yuhuu...,Are you ready today?haha xD zenam zenam..yuhuu...,1,0.0,NO
16,THAT BUDWEISER COMMERCIAL. GOODBYE.,THAT BUDWEISER COMMERCIAL. GOODBYE.,0,1.0,NO
18,His dancing ass lol,His dancing ass lol,0,1.0,NO
19,I Got that fire u're searching for! rappers ch...,I Got that fire u're searching for! rappers ch...,0,1.0,NO
35,RT @JeezyJake10: LOL GPhi yes 👌👏😂,RT : LOL GPhi yes,1,0.0,NO
