In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found beneath the Kaggle input
# directory, then print the paths one per line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**In this notebook we will fine-tune a BERT model to classify natural disaster tweets using HuggingFace.**




In [None]:
!pip install transformers
!pip install datasets

We will use the **datasets** library by HuggingFace for streamlined data handling as well as for metrics.

In [None]:
from datasets import load_dataset, load_metric

Now we will load the CSV using the `load_dataset` function.

In [None]:
# Read the competition's training CSV and request its "train" split.
train_dataset = load_dataset(
    path='csv',
    data_files='../input/nlp-getting-started/train.csv',
    split="train",
)

We will load the metric required for the evaluation.
**GLUE** is a benchmark which consists of many NLP tasks and their evaluation methods, used for scoring how well a model generalizes across tasks.
Sentiment classification is one of them; as ours is also a binary classification problem, we will use that.

In the code below the sst2 indicates sentiment classification.

In [None]:
# Load the metric for the SST-2 configuration of the GLUE benchmark.
metric = load_metric(path='glue', config_name='sst2')

In [None]:
# Display the loaded metric object as this cell's output.
metric

Now, for evaluation, we will split the train data into train and eval sets; for that we will use the **train_test_split** function. The 0.1 means we are holding out 10% of the data for testing.

The method we are using expects `sentence` and `label` columns, so we rename the columns accordingly using the **rename_columns** function, whose syntax is similar to renaming columns in pandas.



In [None]:
# Rename the columns to `sentence` / `label`, then hold out 10% of the rows as
# the "test" split (used for evaluation during training).
# A fixed seed makes the shuffle, and therefore the split, reproducible
# across kernel restarts.
# NOTE: this cell rebinds `train_dataset` to the split result, so it can only
# be run once per load of the CSV.
train_dataset = train_dataset.rename_columns({"target" : "label","text" : "sentence"})
train_dataset = train_dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
# Show the first example of the "train" split as a quick sanity check.
train_dataset['train'][0]

We will use **bert-base-uncased** and fine-tune it with a batch size of 32.

In [None]:
# Number of examples per device in each training / evaluation batch.
batch_size = 32
# Identifier of the pretrained checkpoint that is fine-tuned below.
model_checkpoint = "bert-base-uncased"

A **tokenizer** is in charge of preparing the inputs for a model. The library contains tokenizers for all the models.

In [None]:
from transformers import AutoTokenizer
    
# Load the tokenizer that matches the chosen checkpoint, asking for the
# fast implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)

Now we will create a function which takes sentence as input and returns the tokenized output.

**truncation=True** means that if a string is longer than the maximum length, it is truncated to that maximum length.

In [None]:
def preprocess_function(examples):
  """Tokenize the ``sentence`` field of ``examples`` with truncation enabled.

  Returns whatever the module-level ``tokenizer`` produces for that input.
  """
  sentences = examples['sentence']
  encoded = tokenizer(sentences, truncation=True)
  return encoded

Here we are tokenizing the whole train and eval dataset. As you can see, it is easy to do. This is one of the advantages of using **datasets**: you do not have to apply the processing separately for train and eval.


In [None]:
# Apply the tokenization function to every split, in batches.
encoded_dataset = train_dataset.map(
    function=preprocess_function,
    batched=True,
)

By default the model does not have an architecture for the specific task, so we load it through a task-specific class which does that work. Below we use **AutoModelForSequenceClassification**, which takes bert-base-uncased and makes it ready for classification.

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer 

# Two output classes, matching the binary label column.
num_labels = 2

# Load the pretrained checkpoint wrapped for sequence classification with
# `num_labels` output classes.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
)

Now we set some hyperparameters for training the model.


In [None]:
# Use the last path component of the checkpoint id to name the output folder.
model_name = model_checkpoint.split("/")[-1]
# Key returned by `compute_metrics` that identifies the best checkpoint.
metric_name = 'accuracy'
args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-dst_clf",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and save a checkpoint at the end of every epoch; the two
    # strategies must match for `load_best_model_at_end` to work.
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    num_train_epochs=3,
    weight_decay=0.01,
    learning_rate=2e-5,
    # `metric_for_best_model` only takes effect together with
    # `load_best_model_at_end`; without it the final-epoch weights are kept
    # regardless of the evaluation accuracy.
    metric_for_best_model =metric_name,
    load_best_model_at_end=True,
    # Disable logging integrations (e.g. wandb) here, where the reporting
    # targets are chosen, instead of relying on an environment variable that
    # is set only after the Trainer has been constructed.
    report_to="none",
)

We will create a function which calculates accuracy. It will be run at the end of every epoch to check how well our model is generalising.

In [None]:
import numpy as np
def compute_metrics(eval_pred):
    """Score a (predictions, labels) pair with the module-level ``metric``.

    The predictions are reduced to class indices by taking the argmax along
    axis 1 before being compared with the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    result = metric.compute(predictions=predicted_classes, references=references)
    return result

In [None]:
# Bundle the model, training arguments, tokenized splits, tokenizer and
# metric function into a Trainer.
train_split = encoded_dataset["train"]
eval_split = encoded_dataset['test']
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

By default the training logs are reported to wandb; we will disable it for now. **Wandb** is used to track the loss and other model metrics, which are useful for model versioning.

In [None]:
import os
# Turn off Weights & Biases logging via its environment switch.
# NOTE(review): this runs after the Trainer has already been constructed;
# confirm the variable is still honoured at that point.
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
# Run fine-tuning with the Trainer built earlier; the value returned by
# `train()` is shown as this cell's output.
trainer.train()

We will get the final metrics on our evaluation dataset.

In [None]:
# Evaluate on the `eval_dataset` given to the Trainer; the returned metrics
# are shown as this cell's output.
trainer.evaluate()

We will move the model to CPU for prediction

In [None]:
# Move the model to the CPU. Binding the return value to a name keeps the
# cell from displaying it; `temp_output` is not used by any later visible cell.
temp_output = model.cpu()

In [None]:
import pandas as pd

def predict(text):
  """Return the predicted class index for a single piece of text.

  The text is tokenized (with truncation) using the module-level
  ``tokenizer`` and passed through the module-level ``model``; the index of
  the largest logit is returned.

  Args:
    text: the raw string to classify.

  Returns:
    The argmax over the logits of the first (only) row, as a NumPy integer.
  """
  import torch  # local import: torch is not imported at the top of the notebook

  token_output = tokenizer(text, truncation=True,return_tensors='pt')
  # Make sure dropout is disabled, independent of which cells ran before.
  model.eval()
  # No gradients are needed for inference; skipping autograd saves memory
  # and time on every call.
  with torch.no_grad():
    # Call the module itself rather than `.forward` so registered hooks run.
    output = model(input_ids=token_output['input_ids'],attention_mask=token_output['attention_mask'])
  return np.argmax(output['logits'].numpy(), axis=1)[0]
  

# Read the competition test set, attach a predicted `target` for every row's
# text, and write the id/target pairs out as the submission file.
test_csv_path = "../input/nlp-getting-started/test.csv"
eval_df = pd.read_csv(test_csv_path).assign(
    target=lambda frame: frame['text'].apply(predict)
)
submission_columns = ['id', 'target']
eval_df[submission_columns].to_csv("submission.csv", index=False)