# **Classification With Bert**

In my previous notebook (https://www.kaggle.com/satoshiss/nlp-with-disaster-tweets?scriptVersionId=81728347), I tried several feature-engineering approaches with an XGB classifier, but I could not see any improvement; my best score was 0.78394. Meanwhile, I checked the competition discussion and saw that many people mentioned BERT. In this notebook I apply BERT, referring to the following resources:
(https://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/ , https://github.com/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb, https://www.analyticsvidhya.com/blog/2021/06/why-and-how-to-use-bert-for-nlp-text-classification/)

In [None]:
import numpy as np
import pandas as pd
import torch

import transformers as ppb # pytorch transformers
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import transformers
from transformers import AutoModel, BertTokenizerFast

import torch.nn as nn
from sklearn.metrics import classification_report

import os
# Print the full path of every file found under /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import warnings
# Suppress all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Run on the GPU when one is available and fall back to the CPU otherwise,
# so every later `.to(device)` call also works in a CPU-only session.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Read the competition's train and test CSV files into DataFrames.
df_train = pd.read_csv("../input/nlp-getting-started/train.csv")
df_test = pd.read_csv("../input/nlp-getting-started/test.csv")

In [None]:
# Hugging Face checkpoint used for both the encoder and the tokenizer.
model_name = "bert-base-uncased"

# NOTE(review): `max_langth` looks like a typo of "max_length" and is not
# referenced by any other visible cell (tokenization uses max_length=25).
max_langth = 15

# Convenience handles on the text and target columns of the training frame.
texts = df_train["text"]
labels = df_train["target"]



In [None]:
from transformers import BertTokenizerFast

# Pretrained encoder; return_dict=False is passed so calls return plain tuples.
bert = AutoModel.from_pretrained(model_name, return_dict=False)

# Fast tokenizer loaded from the same checkpoint name.
tokenizer = BertTokenizerFast.from_pretrained(model_name)




In [None]:
# Two toy sentences used to inspect what the tokenizer produces.
text = [
    'this is a bert model tutorial',
    'we will fine-tune a bert model',
]

# Encode the batch with padding enabled.
send_id = tokenizer.batch_encode_plus(text, padding=True)

# Show the encoded batch.
print(send_id)

In [None]:
# Number of whitespace-separated tokens in every training tweet.
seq_len = [len(tweet.split()) for tweet in df_train.text]

# Plot the distribution of those counts as a 30-bin histogram.
pd.Series(seq_len).hist(bins=30)

In [None]:
#df_train['input_ids'] = df_train.text.apply(lambda x:tokenizer(x)['input_ids'])
#df_train['token_type_id'] = df_train.text.apply(lambda x:tokenizer(x)['token_type_ids'])
#df_train['attention_mask'] = df_train.text.apply(lambda x:tokenizer(x)['attention_mask'])

#df_test['input_ids'] = df_test.text.apply(lambda x:tokenizer(x)['input_ids'])
#df_test['token_type_id'] = df_test.text.apply(lambda x:tokenizer(x)['token_type_ids'])
#df_test['attention_mask'] = df_test.text.apply(lambda x:tokenizer(x)['attention_mask'])

In [None]:
# Add a column holding each tweet's whitespace-separated word count.
df_train['word_count'] = df_train['text'].apply(lambda tweet: len(str(tweet).split()))

# Summary statistics of the new column.
df_train.word_count.describe()


In [None]:
# Hold out 30% of the training rows for validation. A fixed seed keeps the
# split, and therefore the reported validation loss, reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    df_train,
    df_train.target,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Tokenize each split to a fixed length of 25 tokens: longer tweets are
# truncated and shorter ones are padded. `padding='max_length'` replaces the
# deprecated `pad_to_max_length=True` flag.
tokens_train = tokenizer.batch_encode_plus(
    X_train.text.tolist(),
    max_length=25,
    padding='max_length',
    truncation=True,
)

tokens_val = tokenizer.batch_encode_plus(
    X_valid.text.tolist(),
    max_length=25,
    padding='max_length',
    truncation=True,
)

tokens_test = tokenizer.batch_encode_plus(
    df_test.text.tolist(),
    max_length=25,
    padding='max_length',
    truncation=True,
)


In [None]:
# Convert the tokenizer output (Python lists) and the labels to tensors.

# training split
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(y_train.tolist())

# validation split
val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(y_valid.tolist())

# test split (no labels)
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])



In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Number of samples per batch for both loaders.
batch_size = 32

# Bundle (input ids, attention mask, label) per sample for each split.
train_data = TensorDataset(train_seq, train_mask, train_y)
val_data = TensorDataset(val_seq, val_mask, val_y)

# Training batches are drawn in random order; validation keeps dataset order.
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

# Batch iterators consumed by train() and evaluate().
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [None]:
# Freeze all encoder parameters so that backpropagation does not update them.
# The attribute has to be spelled `requires_grad`: assigning to any other name
# merely attaches an unused attribute and leaves the weights trainable.
for param in bert.parameters():
    param.requires_grad = False

In [None]:
class BERT_Arch(nn.Module):
    """Classification head on top of a BERT-style encoder.

    The encoder's pooled output goes through
    Linear -> ReLU -> Dropout -> Linear -> LogSoftmax, so ``forward`` returns
    per-class log-probabilities.

    Args:
        bert: Encoder called as ``bert(sent_id, attention_mask=mask)``; the
            element at index 1 of its output is used as the sentence vector.
        hidden_dim: Width of the intermediate dense layer (default 512).
        num_classes: Number of output classes (default 2).
        dropout: Dropout probability applied after the ReLU (default 0.1).
    """

    def __init__(self, bert, hidden_dim=512, num_classes=2, dropout=0.1):
        super().__init__()

        self.bert = bert

        # Read the encoder width from its config when it exposes one, instead
        # of hard-coding it; 768 remains the fallback.
        config = getattr(bert, "config", None)
        encoder_dim = getattr(config, "hidden_size", 768)

        # dropout layer
        self.dropout = nn.Dropout(dropout)

        # relu activation function
        self.relu = nn.ReLU()

        # dense layer 1
        self.fc1 = nn.Linear(encoder_dim, hidden_dim)

        # dense layer 2 (output layer)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

        # log-softmax over the class dimension
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return log-probabilities for a batch.

        Args:
            sent_id: Token-id tensor passed to the encoder.
            mask: Attention mask passed to the encoder as ``attention_mask``.

        Returns:
            Tensor with one row per input row and one column per class.
        """
        outputs = self.bert(sent_id, attention_mask=mask)

        # Index instead of tuple-unpacking: this works for the plain tuple
        # returned with return_dict=False and does not depend on the output
        # having exactly two elements.
        # NOTE(review): transformers' ModelOutput objects also support integer
        # indexing per its docs - confirm for the installed version.
        cls_hs = outputs[1]

        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)

        # output layer followed by log-softmax
        x = self.fc2(x)
        return self.softmax(x)

In [None]:
# Wrap the encoder with the classification head and move it to the device.
model = BERT_Arch(bert).to(device)


In [None]:
# Use PyTorch's AdamW: the `transformers.AdamW` class is deprecated and is not
# available in recent transformers releases. `eps` and `weight_decay` are set
# explicitly to the defaults of the transformers implementation so the update
# rule stays the same.
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Balanced weights for the labels of the training split. `classes` and `y` are
# passed by keyword because current scikit-learn releases reject them as
# positional arguments.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train,
)

print("Class Weights:", class_weights)

In [None]:
# Class weights as a float tensor placed on the training device.
weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Weighted negative log-likelihood loss; the model emits log-probabilities.
cross_entropy = nn.NLLLoss(weight=weights)

# Number of training epochs.
epochs = 10

In [None]:
# function to train the model
def train():
    """Run one training pass over ``train_dataloader``.

    Relies on the module-level ``model``, ``optimizer``, ``cross_entropy``,
    ``train_dataloader`` and ``device``.

    Returns:
        tuple: ``(avg_loss, total_preds)`` - the loss averaged over batches
        and the model outputs of all batches concatenated along axis 0 as a
        NumPy array.
    """
    model.train()

    running_loss = 0
    batch_outputs = []
    num_batches = len(train_dataloader)

    for step, batch in enumerate(train_dataloader):

        # progress report every 50 batches, skipping the very first one
        if step != 0 and step % 50 == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, num_batches))

        # move the batch tensors to the training device
        sent_id, mask, labels = [tensor.to(device) for tensor in batch]

        # reset gradients left over from the previous step
        model.zero_grad()

        # forward pass and loss
        preds = model(sent_id, mask)
        loss = cross_entropy(preds, labels)
        running_loss += loss.item()

        # backward pass; clip the gradient norm to 1.0 before the update
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # keep this batch's outputs on the CPU as a NumPy array
        batch_outputs.append(preds.detach().cpu().numpy())

    # mean loss over the batches of this epoch
    avg_loss = running_loss / num_batches

    # stack the per-batch outputs into one (samples, classes) array
    total_preds = np.concatenate(batch_outputs, axis=0)

    return avg_loss, total_preds

In [None]:
# function for evaluating the model
def evaluate():
    """Compute loss and outputs over ``val_dataloader`` without training.

    Relies on the module-level ``model``, ``cross_entropy``,
    ``val_dataloader`` and ``device``.

    Returns:
        tuple: ``(avg_loss, total_preds)`` - the loss averaged over batches
        and the model outputs of all batches concatenated along axis 0 as a
        NumPy array.
    """
    print("\nEvaluating...")

    # eval mode deactivates the dropout layers
    model.eval()

    running_loss = 0
    batch_outputs = []
    num_batches = len(val_dataloader)

    for step, batch in enumerate(val_dataloader):

        # progress report every 50 batches, skipping the very first one
        if step != 0 and step % 50 == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, num_batches))

        # move the batch tensors to the evaluation device
        sent_id, mask, labels = [tensor.to(device) for tensor in batch]

        # no gradients are needed for validation
        with torch.no_grad():
            preds = model(sent_id, mask)
            loss = cross_entropy(preds, labels)

        running_loss += loss.item()
        batch_outputs.append(preds.detach().cpu().numpy())

    # mean loss over the validation batches
    avg_loss = running_loss / num_batches

    # stack the per-batch outputs into one (samples, classes) array
    total_preds = np.concatenate(batch_outputs, axis=0)

    return avg_loss, total_preds

In [None]:
#def format_time(t):
#    """Return time object (t) as a formatted string"""
#    return '%.2d:%.2d:%.2d' % (t.hour, t.minute, t.second)

#import time

In [None]:
# Lowest validation loss seen so far; starts at infinity so the first epoch
# always produces a checkpoint.
best_valid_loss = float('inf')

# Per-epoch loss history.
train_losses = []
valid_losses = []

for epoch in range(epochs):

    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    # one training pass followed by one validation pass
    train_loss, _ = train()
    valid_loss, _ = evaluate()

    # checkpoint the weights whenever the validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    # record this epoch's losses
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

In [None]:
# Load the weights of the best checkpoint. `map_location` remaps the stored
# tensors onto the current device, so a checkpoint written on a GPU can also
# be loaded in a CPU-only session.
path = 'saved_weights.pt'
model.load_state_dict(torch.load(path, map_location=device))

In [None]:
# Get predictions for the test data.
# Switch to eval mode explicitly so dropout is disabled regardless of which
# cell (train or evaluate) touched the model last.
model.eval()
with torch.no_grad():
    preds = model(test_seq.to(device), test_mask.to(device))
    preds = preds.detach().cpu().numpy()

In [None]:
# Display the NumPy array of model outputs computed for the test set.
preds




In [None]:
# Predicted class per test row: the index of the largest model output.
preds1 = [row.argmax() for row in preds]

# Put the predictions into the sample submission's target column and save it.
sample_submission = pd.read_csv("../input/nlp-getting-started/sample_submission.csv")
sample_submission['target'] = preds1
sample_submission.to_csv('submission.csv', index=False)
