### Reference: https://mccormickml.com/2019/07/22/BERT-fine-tuning/

# Manual training and evaluation

### Train

In [None]:
from train import train_model_on_train_data


# Training configuration for the baseline run.
TRAIN_DATA_PATH = "../data/train.csv"
MODEL_NAME = "bert-base-cased"
BATCH_SIZE = 32
NUM_EPOCHS = 1

# Fine-tune on the training file; the call yields the model plus a stats object
# that the next section turns into a DataFrame for plotting.
model, training_stats = train_model_on_train_data(
    TRAIN_DATA_PATH,
    MODEL_NAME,
    BATCH_SIZE,
    NUM_EPOCHS,
)

# Show the collected statistics as the cell output.
training_stats

### Plot learning curve

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# One row per epoch, as returned by the training cell.
df_training = pd.DataFrame(training_stats)

# Epochs are reported 1-based, so plot against 1..N instead of the 0-based
# DataFrame index; otherwise the curve and the "Epoch" axis disagree.
epochs = list(range(1, len(df_training) + 1))

plt.figure(figsize=(12,6))
sns.set(style='darkgrid')
sns.set(font_scale=1.5)

plt.plot(epochs, df_training['training_loss'], 'b-o', label="Training")
plt.plot(epochs, df_training['validation_loss'], 'g-o', label="Validation")

plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
# Ticks follow the number of epochs actually trained rather than a fixed 1..4.
plt.xticks(epochs)

plt.show()

### Evaluate test data

In [None]:
from evaluate import evaluate_on_test_data


# Held-out file used for the final evaluation.
TEST_DATA_PATH = "../data/eval.csv"

# Reuses `model`, MODEL_NAME and BATCH_SIZE from the training cell above,
# so that cell must have been run first.
testing_stats = evaluate_on_test_data(
    model,
    TEST_DATA_PATH,
    MODEL_NAME,
    BATCH_SIZE,
)

# Show the evaluation statistics as the cell output.
testing_stats

# Data Augmentation

In [5]:
# Data augmentation uses the EDA scripts from:
# https://github.com/jasonwei20/eda_nlp
import nltk

# Fetch the NLTK corpora into the local nltk_data directory (no-op when already
# up to date). NOTE(review): presumably these are what the EDA augmentation
# script needs at runtime — confirm against eda_nlp's requirements.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lisandro\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Lisandro\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\omw-1.4.zip.


True

In [1]:
import pandas as pd

# Build the tab-separated "<label>\t<sentence>" file (no header, no index):
# drop the raw tweet, rename the cleaned text / sentiment columns, and put the
# label first.
original_df = (
    pd.read_csv("data_before_going_to_model.csv")
    .drop(columns=['tweet'])
    .rename(columns={'text': 'sentence', 'sent': 'label'})
    .loc[:, ["label", "sentence"]]
)

original_df.to_csv(
    "eda_nlp-master/data/original_train.txt",
    sep="\t",
    header=False,
    index=False,
)

In [6]:
!python eda_nlp-master/code/augment.py --input=original_train.txt --output=augmented_train.txt

generated augmented sentences with eda for original_train.txt to augmented_train.txt with num_aug=9


In [11]:
import pandas as pd

# Load the augmented "<label>\t<text>" file produced by the augmentation step.
augmented_df = pd.read_csv("augmented_train.txt", names = ["label", "text"], sep = "\t")

# Shuffle all rows with a fixed seed so that re-running the notebook writes the
# same train_aug.csv (an unseeded shuffle changes the training file every run).
augmented_df = augmented_df.sample(frac=1, random_state=42).reset_index(drop=True)
augmented_df.to_csv("../data/train_aug.csv", index=False)

# Improving Model F1 Score

In [14]:
import pandas as pd

# Column-oriented accumulator for the model comparison table; the training cell
# further down appends one value per column for every configuration it runs.
results_dictionary = {
    column: []
    for column in (
        "model_name",
        "pipeline",
        "training_loss",
        "validation_loss",
        "validation_f1",
        "test_loss",
        "test_f1",
        "augmented",
    )
}

# Seed the accumulator with the rows saved by earlier runs. A column in the CSV
# that is not listed above raises KeyError.
old_df = pd.read_csv("2nd_models_comparison.csv")

for previous_run in old_df.to_dict(orient="records"):
    for column, value in previous_run.items():
        results_dictionary[column].append(value)

results_df = pd.DataFrame(results_dictionary)

# Show full cell contents (the pipeline strings are long).
pd.set_option('display.max_colwidth', None)
results_df.head(40)

Unnamed: 0,model_name,pipeline,training_loss,validation_loss,validation_f1,test_loss,test_f1,augmented
0,microsoft/deberta-base,"['hyperlinks', 'mentions', 'hashtags', 'retweet', 'repetitions', 'emojis', 'smileys', 'spaces']",0.539779,0.49822,{'f1': 0.7855407047387605},0.510469,{'f1': 0.7723367697594502},no
1,microsoft/deberta-base,"['lowercase', 'hyperlinks', 'mentions', 'hashtags', 'retweet', 'repetitions', 'emojis', 'smileys', 'spaces']",0.560275,0.507515,{'f1': 0.7865235539654144},0.519109,{'f1': 0.7712455516014236},no
2,microsoft/deberta-base,"['hyperlinks', 'mentions', 'hashtags', 'repetitions', 'emojis', 'smileys', 'spaces']",0.544365,0.501752,{'f1': 0.7833935018050541},0.512311,{'f1': 0.7747542384955122},no
3,microsoft/deberta-base,"['hyperlinks', 'mentions', 'hashtags', 'retweet', 'emojis', 'smileys', 'spaces']",0.55502,0.518818,{'f1': 0.7694189602446483},0.504441,{'f1': 0.777521613832853},no


In [None]:
# results_dictionary = {
#         "model_name": [],
#         "pipeline": [],
#         "training_loss": [],
#         "validation_loss": [],
#         "validation_f1": [],
#         "test_loss": [],
#         "test_f1": []
#     }

In [16]:
import pandas as pd
from model_preparation import set_seed
from data_preparation import _load_dataset, _prepare_data, _create_dataloaders, _create_tensors
from preprocessor import Preprocessor
from helper_functions import get_device
from datasets import load_metric
import torch
import numpy as np
from model_preparation import Model, set_seed
from helper_functions import get_device
from datasets import load_metric


# Parameters
TRAIN_DATA_PATH = "../data/train_aug.csv"
TEST_DATA_PATH = "../data/eval.csv"
BATCH_SIZE = 16
NUM_EPOCHS = 1
SEED = 42

# The augmented training file is loaded as-is (the cleaning pipeline is only
# applied to non-augmented training data); the flag is also stored in the results.
IS_AUGMENTED = TRAIN_DATA_PATH == "../data/train_aug.csv"

set_seed(SEED)

device = get_device()


def _batch_to_inputs(batch, device):
    """Turn an (input_ids, attention_mask, labels) batch into model kwargs on `device`."""
    return {
        "input_ids": batch[0].to(device),
        "attention_mask": batch[1].to(device),
        "labels": batch[2].to(device),
    }


def _evaluate(model, dataloader, device):
    """Run `model` over `dataloader` without gradients.

    Returns a tuple `(mean_batch_loss, f1_result)` where `f1_result` is whatever
    the "f1" metric's `compute()` returns (a dict such as {'f1': ...}).
    """
    # NOTE(review): `datasets.load_metric` is deprecated in recent `datasets`
    # releases — confirm the pinned version before upgrading.
    metric = load_metric("f1")
    total_loss = 0

    model.eval()
    for batch in dataloader:
        parameters = _batch_to_inputs(batch, device)
        with torch.no_grad():
            outputs = model(**parameters)

        total_loss += outputs.loss.item()

        predictions = torch.argmax(outputs.logits, dim=-1)
        metric.add_batch(predictions=predictions, references=parameters["labels"])

    return total_loss / len(dataloader), metric.compute()


# Candidate checkpoints kept for reference (swap into the list below):
# 'microsoft/deberta-v2-xlarge'
# "bert-base-cased"
# "bert-base-uncased"
for MODEL_NAME in ['microsoft/deberta-base']:
    # Each inner list is one preprocessing pipeline to compare; commented entries
    # are optional steps that are currently switched off.
    for pipeline in [[
                # 'lowercase',
                'hyperlinks',
                # 'remove_hyperlinks',
                'mentions',
                # 'remove_mentions',
                'hashtags',
                # 'remove_hashtags',
                'retweet',
                'repetitions',
                'emojis',
                'smileys',
                # 'punctuation',
                'spaces',
                # 'tokenize'
            ],
            ]:

        def custom_clean_text(text: str) -> str:
            """Clean one text with the pipeline of the current loop iteration."""
            preprocessor = Preprocessor(pipeline)
            return preprocessor(text)

        # --- Data -----------------------------------------------------------
        df = _load_dataset(TRAIN_DATA_PATH)
        if not IS_AUGMENTED:
            df = _prepare_data(df, custom_clean_text)

        input_ids, attention_masks, labels = _create_tensors(df, MODEL_NAME)
        train_dataloader, validation_dataloader = _create_dataloaders(
            input_ids, attention_masks, labels, BATCH_SIZE, create_validation_set=True
        )

        df = _load_dataset(TEST_DATA_PATH)
        df = _prepare_data(df, custom_clean_text)
        input_ids, attention_masks, labels = _create_tensors(df, MODEL_NAME)
        test_dataloader = _create_dataloaders(
            input_ids, attention_masks, labels, BATCH_SIZE, create_validation_set=False
        )

        # --- Model ----------------------------------------------------------
        model_class = Model(MODEL_NAME, NUM_EPOCHS, len(train_dataloader))
        model, optimizer, lr_scheduler = model_class.get_model_optimizer_scheduler()
        model = model.to(device)

        # Values of the last completed epoch; they stay 0 when NUM_EPOCHS == 0.
        training_loss = 0
        val_loss = 0
        val_f1 = 0

        # --- Training + per-epoch validation ----------------------------------
        training_stats = []
        try:
            for epoch in range(NUM_EPOCHS):
                print(f"EPOCH {epoch+1}/{NUM_EPOCHS}\n")
                model.train()
                total_train_loss = 0

                for step, batch in enumerate(train_dataloader):
                    model.zero_grad()
                    outputs = model(**_batch_to_inputs(batch, device))

                    loss = outputs.loss
                    total_train_loss += loss.item()
                    loss.backward()

                    # Clip the gradient norm to 1.0 before the optimizer step.
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.zero_grad()

                    if step % 100 == 0 and step != 0:
                        print(f"BATCH {step}/{len(train_dataloader)}:\tTraining loss({loss.item()})")

                training_loss = total_train_loss/len(train_dataloader)
                val_loss, val_f1 = _evaluate(model, validation_dataloader, device)

                training_stats.append({
                    "epoch": epoch+1,
                    "training_loss": training_loss,
                    "validation_loss": val_loss,
                    "validation_f1_score": val_f1,
                })

                print(f"\nAvg training loss:    {training_loss}")
                print(f"Avg validation loss:  {val_loss}")
                print(f"F1 validation score:  {val_f1}\n")

        except RuntimeError as e:
            # Best effort across configurations: report the failure (e.g. out of
            # memory) and move on, but do NOT evaluate or record a model whose
            # training did not finish — that would log zero/partial losses as if
            # they were a real result.
            print(e)
            print(f"Training failed for {MODEL_NAME} with pipeline {pipeline}; skipping this configuration.")
            continue

        # --- Test evaluation --------------------------------------------------
        testing_stats = []
        try:
            test_loss, test_f1 = _evaluate(model, test_dataloader, device)
        except RuntimeError as e:
            print(e)
            continue

        testing_stats.append({
            "test_loss": test_loss,
            "test_f1_score": test_f1
        })

        print(f"\nAvg test loss:  {testing_stats[0]['test_loss']}")
        print(f"F1 test score:  {testing_stats[0]['test_f1_score']}\n")

        # --- Record the run ---------------------------------------------------
        # `results_dictionary` is created by the "Improving Model F1 Score" cell
        # above, which must have been run first.
        results_dictionary["model_name"].append(MODEL_NAME)
        results_dictionary["pipeline"].append(str(pipeline))
        results_dictionary["training_loss"].append(training_loss)
        results_dictionary["validation_loss"].append(val_loss)
        results_dictionary["validation_f1"].append(val_f1)
        results_dictionary["test_loss"].append(test_loss)
        results_dictionary["test_f1"].append(test_f1)
        results_dictionary["augmented"].append("yes" if IS_AUGMENTED else "no")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 2070 with Max-Q Design


Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.weight', 'pooler.d

EPOCH 1/1



	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  ..\torch\csrc\utils\python_arg_parser.cpp:882.)
  label_index = (labels >= 0).nonzero()


BATCH 100/3562:	Training loss(0.7088796496391296)
BATCH 200/3562:	Training loss(0.5513307452201843)
BATCH 300/3562:	Training loss(0.4055532217025757)
BATCH 400/3562:	Training loss(0.7488111257553101)
BATCH 500/3562:	Training loss(0.3334040641784668)
BATCH 600/3562:	Training loss(0.5062378644943237)
BATCH 700/3562:	Training loss(0.6252745985984802)
BATCH 800/3562:	Training loss(0.43599385023117065)
BATCH 900/3562:	Training loss(0.8467898368835449)
BATCH 1000/3562:	Training loss(0.405559241771698)
BATCH 1100/3562:	Training loss(0.46286144852638245)
BATCH 1200/3562:	Training loss(0.5424733757972717)
BATCH 1300/3562:	Training loss(0.5719776749610901)
BATCH 1400/3562:	Training loss(0.6917942762374878)
BATCH 1500/3562:	Training loss(0.725612998008728)
BATCH 1600/3562:	Training loss(0.43401408195495605)
BATCH 1700/3562:	Training loss(0.4001852869987488)
BATCH 1800/3562:	Training loss(0.42826128005981445)
BATCH 1900/3562:	Training loss(0.30031853914260864)
BATCH 2000/3562:	Training loss(0.3627

In [17]:
# Materialise the accumulated runs and overwrite the comparison file so the next
# session can reload them.
results_df = pd.DataFrame(results_dictionary)
results_df.to_csv("2nd_models_comparison.csv", index=False)

# Preview with untruncated columns (the pipeline strings are long).
pd.set_option('display.max_colwidth', None)
results_df.head(40)

Unnamed: 0,model_name,pipeline,training_loss,validation_loss,validation_f1,test_loss,test_f1,augmented
0,microsoft/deberta-base,"['hyperlinks', 'mentions', 'hashtags', 'retweet', 'repetitions', 'emojis', 'smileys', 'spaces']",0.539779,0.49822,{'f1': 0.7855407047387605},0.510469,{'f1': 0.7723367697594502},no
1,microsoft/deberta-base,"['lowercase', 'hyperlinks', 'mentions', 'hashtags', 'retweet', 'repetitions', 'emojis', 'smileys', 'spaces']",0.560275,0.507515,{'f1': 0.7865235539654144},0.519109,{'f1': 0.7712455516014236},no
2,microsoft/deberta-base,"['hyperlinks', 'mentions', 'hashtags', 'repetitions', 'emojis', 'smileys', 'spaces']",0.544365,0.501752,{'f1': 0.7833935018050541},0.512311,{'f1': 0.7747542384955122},no
3,microsoft/deberta-base,"['hyperlinks', 'mentions', 'hashtags', 'retweet', 'emojis', 'smileys', 'spaces']",0.55502,0.518818,{'f1': 0.7694189602446483},0.504441,{'f1': 0.777521613832853},no
4,microsoft/deberta-base,"['hyperlinks', 'mentions', 'hashtags', 'retweet', 'repetitions', 'emojis', 'smileys', 'spaces']",0.418791,0.194744,{'f1': 0.9275726630007856},0.307244,{'f1': 0.8927886742368382},yes


# Dataframe preprocessing steps

In [4]:
import pandas as pd
from model_preparation import set_seed
from data_preparation import _load_dataset, _prepare_data, _create_dataloaders, _create_tensors
from preprocessor import Preprocessor


# Settings for this inspection run; the "Check error cases" cell below reuses
# data-independent names from here (model_name, batch_size, create_validation_set).
data_path = "../data/eval.csv"
model_name = 'microsoft/deberta-base'
batch_size = 16
create_validation_set = False
SEED = 42

set_seed(SEED)


def custom_clean_text(text: str) -> str:
    """Clean a single text by running it through a Preprocessor built from the
    fixed list of steps below."""
    steps = ['hyperlinks', 'mentions', 'hashtags', 'retweet', 'repetitions', 'emojis', 'smileys', 'spaces']
    return Preprocessor(steps)(text)


df = _load_dataset(data_path)

# Snapshot the raw tweets before the frame is handed to _prepare_data, so raw
# and cleaned text can be shown side by side.
raw_tweets = df["tweet"].copy()

df = _prepare_data(df, custom_clean_text)

# Raw tweet | cleaned text | label, aligned on the row index.
final_df = pd.concat([raw_tweets, df[["text", "label"]]], axis=1)

pd.set_option('display.max_colwidth', None)
final_df.head(10)

Unnamed: 0,tweet,text,label
0,Selsy mistakenly thinks the #Brownlow is like Mad Monday and wears costume. #SelwoodJoelSelwood http://t.co/H3Prjyjlyj,Selsy mistakenly thinks the Brownlow is like Mad Monday and wears costume. SelwoodJoelSelwood url,0
1,Obit vs mesin,Obit vs mesin,0
2,"growing watercolor - Decatur Cemetery This painting was done twice, the first time did not turn out well at... http://t.co/YiRp9mEK2y","growing watercolor - Decatur Cemetery This painting was done twice, the first time did not turn out well at.. url",0
3,your new chick that's my ex bitch homie in fact I'm still logged into her Netflix homie,your new chick that's my ex bitch homie in fact I'm still logged into her Netflix homie,1
4,http://t.co/tTYrceHxzg @nationaljournal &amp;Is Hillary Clinton Too Hawkish for Iowa Democrats? http://t.co/pbEZJ0obmK,url mention &amp;Is Hillary Clinton Too Hawkish for Iowa Democrats? url,0
5,They sleepin on you,They sleepin on you,0
6,Uh oh! Chris Weidman is ducking Vitor Belfort hahahahahaha but I don't blame him 😅,Uh oh! Chris Weidman is ducking Vitor Belfort hahahahahaha but I don't blame him emoji,0
7,"RT @andihfg: ""remember when i said hahmkyul are the only normal ones in the group? ... http://t.co/adioIdSjq4"" lee qri? normal? http://t.co…","retweet mention : ""remember when i said hahmkyul are the only normal ones in the group? .. url lee qri? normal? url",1
8,"""@LosMyTeddyBear: @TheCarlosPena\n#LosMeetMilou\nI need you please! \nWatch my videos+tweets ♡\n:""( \n73""",""" mention : mention LosMeetMilou I need you please! Watch my videos+tweets ♡ smiley 73""",0
9,RT @trvpvv: @OsamaJames lol because its clear that they take pride in taking pictures like this and exploiting females,retweet mention : mention lol because its clear that they take pride in taking pictures like this and exploiting females,1


# Check error cases

In [None]:
from helper_functions import get_device
from datasets import load_metric
import torch
import numpy as np

# Tokenise the prepared evaluation frame and build its dataloader.
input_ids, attention_masks, labels = _create_tensors(df, model_name)
dataloaders = _create_dataloaders(input_ids, attention_masks, labels, batch_size, create_validation_set)


device = get_device()
# NOTE(review): `model` is not created in this section — it is whatever model an
# earlier training cell left in the kernel. Run a training cell first.
model = model.to(device)

# With create_validation_set=False the helper's result is used directly as the
# test dataloader.
test_dataloader = dataloaders

testing_stats = []
batch_predictions = []

try:
    total_test_loss = 0
    metric = load_metric("f1")

    model.eval()

    for n, batch in enumerate(test_dataloader):

        parameters = {
            "input_ids" : batch[0].to(device),
            "attention_mask" :  batch[1].to(device),
            "labels" : batch[2].to(device)
        }
        with torch.no_grad():
            outputs = model(**parameters)

        logits = outputs.logits
        loss = outputs.loss
        total_test_loss += loss.item()

        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=parameters["labels"])

        # Predictions are attached to final_df by position, so every batch must be
        # exactly the next `batch_size` rows of input_ids. The slice has to use the
        # dataloader's batch size (a hard-coded 32 never matches 16-row batches,
        # which left the prediction list empty).
        start = n * batch_size
        expected_ids = input_ids[start:start + batch_size]
        if not expected_ids.equal(parameters["input_ids"].cpu()):
            raise ValueError(
                f"Batch {n} does not match rows {start}:{start + batch_size} of input_ids; "
                "the dataloader is not iterating in dataset order, so predictions cannot be "
                "aligned with final_df."
            )
        batch_predictions.append(predictions.cpu().numpy())

    testing_stats.append({
        "test_loss": total_test_loss/len(test_dataloader),
        "test_f1_score": metric.compute()
    })

    print(f"\nAvg test loss:  {testing_stats[0]['test_loss']}")
    print(f"F1 test score:  {testing_stats[0]['test_f1_score']}\n")

except RuntimeError as e:
    print(e)

# Single concatenation instead of growing an array with np.append per batch.
pred_list = np.concatenate(batch_predictions) if batch_predictions else np.array([])

final_df["prediction"] = pred_list
# Flag misclassified rows with "NO"; correct rows get an empty string.
final_df["match"] = final_df.apply(lambda row: "" if row["label"] == row["prediction"] else "NO", axis=1)

# final_df.to_csv("predictions.csv",index=False)
final_df[final_df["match"]=="NO"].head(40)