In [None]:
!pip install transformers -q

In [None]:
import numpy as np
import pandas as pd
import transformers
import torch
import re
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torch import nn
import warnings
warnings.filterwarnings("ignore")

In [None]:
from transformers import AutoModel, AutoTokenizer

In [None]:
# Tokenizer for the "allenai/scibert_scivocab_uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")


In [None]:
# Encoder loaded from the same checkpoint name as the tokenizer; it is later
# wrapped by BERT_Arch and fine-tuned together with the classification head.
bert = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")

#### Loading the dataset and pre-processing

In [None]:
data = pd.read_csv('/kaggle/input/kaggle/train.csv')

In [None]:
data

In [None]:
data.columns

In [None]:
data.isnull().sum()

In [None]:
data.drop(['doi','url','publication month', 'publication year','publisher', 'data_index'], axis =1 , inplace = True)
data

In [None]:
data.isnull().sum()

In [None]:
data = data.dropna()
data.isnull().sum()

In [None]:
data.shape

In [None]:
data

In [None]:
data["text"] = data["title"] + data["abstract"]

In [None]:
data

In [None]:
def NLP_cleaning(text):
    """Normalise an iterable of raw strings.

    Each string is processed in three steps:
      1. HTML-like tags (``<...>``) are removed.
      2. Every character that is not an ASCII letter or digit is replaced by a space.
      3. The result is lower-cased.

    Args:
        text: iterable of ``str`` (a progress bar labelled 'Cleaning' is shown
            while iterating).

    Returns:
        list[str]: the cleaned strings, in input order.
    """
    tag_pattern = re.compile('<[^>]*>')
    # The range must be `A-Z`. The previous `A-z` also spans the ASCII characters
    # between 'Z' and 'a' ( [ \ ] ^ _ ` ), so those were silently kept in the text.
    non_alnum_pattern = re.compile('[^a-zA-Z0-9]')

    text_corpus = []
    for sent in tqdm(text, desc='Cleaning'):
        sent = tag_pattern.sub('', sent)
        sent = non_alnum_pattern.sub(' ', sent)
        text_corpus.append(sent.lower())

    return text_corpus

In [None]:
# Plain Python list of the combined title+abstract strings.
text = data.text.values.tolist()

In [None]:
text[0]

In [None]:
# Cleaned version of every training text (same order as `text`).
text_corpus = NLP_cleaning(text)

In [None]:
text_corpus[0]

In [None]:
# Overwrite the raw combined text with its cleaned version.
data['text'] = text_corpus

In [None]:
# Clean title and author as well. NOTE(review): 'abstract' is left uncleaned in
# the training frame; only the 'text' column is tokenized later in this notebook.
data['title'] = NLP_cleaning(data.title.values.tolist())
data['author'] = NLP_cleaning(data.author.values.tolist())

In [None]:
data

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map the string labels to integer ids; the encoder is fitted on the training labels.
label_encoder = LabelEncoder()
data['label_number'] = label_encoder.fit_transform(data['label'])
data

In [None]:
label_encoder

In [None]:
val_df = pd.read_csv('/kaggle/input/kaggle/val.csv')
val_df

In [None]:
val_df.drop(['doi','url','publication month', 'publication year','publisher', 'data_index'], axis =1 , inplace = True)
val_df = val_df.dropna()


In [None]:
val_df['label_number'] = label_encoder.transform(val_df['label'])
val_df

In [None]:
val_df["text"] = val_df["title"] + val_df["abstract"]

In [None]:
val_df

In [None]:
val_df['title'] = NLP_cleaning(val_df.title.values.tolist())
val_df['author'] = NLP_cleaning(val_df.author.values.tolist())
val_df['abstract'] = NLP_cleaning(val_df.abstract.values.tolist())
val_df['text'] = NLP_cleaning(val_df.text.values.tolist())
val_df

In [None]:
# Features and labels are kept as one-column DataFrames (double brackets).
X=val_df[['text']].copy()
y=val_df[['label_number']].copy()

In [None]:
X

In [None]:
# Split the validation file: 60% stays as validation, 40% becomes the held-out
# test set. The fixed random_state makes the split repeatable.
val_text, test_text, val_labels, test_labels = train_test_split(X,y,random_state=2018,test_size=0.4,shuffle=True)

In [None]:
test_text

In [None]:
# Text columns of the training frame; only 'text' is tokenized below.
train_text = data[["abstract","author","title","text"]].copy()

In [None]:
train_text

In [None]:
# Small tokenizer demo. NOTE: this rebinds the notebook-level name `text`,
# which earlier held the list of training texts.
text = ["this is a bert model tutorial", "we will fine-tune a bert model"]
sent_id = tokenizer.batch_encode_plus(text, padding=True)
print(sent_id)

In [None]:
val_text

In [None]:
MAX_SEQ_LEN = 512  # every sequence is padded/truncated to this many tokens


def encode_texts(texts):
    """Tokenize a list of strings into fixed-length encodings.

    Args:
        texts: list of str.

    Returns:
        The tokenizer's batch encoding; its 'input_ids' and 'attention_mask'
        entries are read when the tensors are built.
    """
    return tokenizer.batch_encode_plus(
        texts,
        max_length=MAX_SEQ_LEN,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        truncation=True,
    )


tokens_train = encode_texts(train_text['text'].tolist())

tokens_val = encode_texts(val_text['text'].tolist())

tokens_test = encode_texts(test_text['text'].tolist())

In [None]:
# Number of distinct encoded labels in the training frame.
# NOTE(review): `num` is not referenced again in the visible notebook; the
# classifier head hard-codes its output size instead.
num = len(pd.unique(data['label_number']))

In [None]:
# Token ids, attention masks and integer labels as tensors, one triple per split.
# `train_text` is a column copy of `data`, so its rows line up with data['label_number'].
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(data['label_number'].tolist())

val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels['label_number'].tolist())

test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels['label_number'].tolist())

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 16

# Training batches are drawn in random order.
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches are visited in their stored order.
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)

In [None]:
import torch

def print_model_layers(model):
    """Print one line per parameter of `model`: its name and its size.

    Args:
        model: any object exposing `named_parameters()` that yields
            (name, parameter) pairs, where each parameter has a `size()` method.
    """
    for layer_name, tensor in model.named_parameters():
        print("Layer Name: {}, Size: {}".format(layer_name, tensor.size()))

# List every encoder parameter with its size.
print_model_layers(bert)

In [None]:
# Show which encoder parameters will receive gradients (requires_grad flag).
for name, param in bert.named_parameters():
    print(f"{name}: {param.requires_grad}")

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel

class BERT_Arch(nn.Module):
    """Encoder plus a two-layer classification head that outputs log-probabilities.

    The representation of the first token of the encoder's first output is passed
    through Linear -> ReLU -> Dropout -> Linear -> LogSoftmax. Because the output
    is already log-softmaxed it is meant to be used with `nn.NLLLoss`.

    Args:
        bert: encoder module, called as `bert(input_ids=..., attention_mask=...)`;
            element 0 of its output is indexed as (batch, token, feature).
        num_classes: number of output classes (default 123, the previously
            hard-coded value).
        hidden_size: feature width of the encoder output. When None it is read
            from `bert.config.hidden_size` if available, otherwise 768 (the
            previously hard-coded value).
    """

    def __init__(self, bert, num_classes=123, hidden_size=None):
        super(BERT_Arch, self).__init__()
        self.bert = bert
        if hidden_size is None:
            config = getattr(bert, "config", None)
            hidden_size = getattr(config, "hidden_size", 768)
        self.dropout = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(hidden_size, 512)
        self.fc2 = nn.Linear(512, num_classes)
        self.log_softmax = nn.LogSoftmax(dim=1)  # normalise over the class dimension

    def forward(self, sent_id, mask):
        """Return log-probabilities of shape (batch, num_classes).

        Args:
            sent_id: token ids, a tensor (or nested list) of shape (batch, seq_len).
            mask: attention mask with the same shape as `sent_id`.
        """
        # Only convert non-tensor input. Re-wrapping an existing tensor with
        # torch.tensor() makes a needless copy on every step and triggers a
        # PyTorch copy-construct warning.
        if not torch.is_tensor(sent_id):
            sent_id = torch.as_tensor(sent_id)

        outputs = self.bert(input_ids=sent_id, attention_mask=mask)
        # First token ([CLS]) of the last hidden state is the sequence summary.
        last_hidden_state_cls = outputs[0][:, 0, :]

        x = self.fc1(last_hidden_state_cls)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.log_softmax(x)

        return x

In [None]:
model = BERT_Arch(bert)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

In [None]:
device

In [None]:
from transformers import AdamW

optimizer = AdamW(model.parameters(),
                  lr = 1e-4)

In [None]:
# Integer training labels, used below to derive per-class loss weights.
train_labels = data['label_number']

In [None]:
model

In [None]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# 'balanced' weights: one weight per distinct label value found in the training labels.
classes = np.unique(train_labels)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=train_labels)
print("Class Weights:", class_weights)

In [None]:
weights= torch.tensor(class_weights,dtype=torch.float)
weights = weights.to(device)
# NLLLoss is used because the model's forward pass already ends in LogSoftmax.
# NOTE(review): the weight vector has one entry per label seen in training; it
# must have as many entries as the model has output classes — confirm.
cross_entropy  = nn.NLLLoss(weight=weights)

# Number of passes over the training data.
epochs = 8

In [None]:
def train():
    """Run one training epoch over the notebook-level `train_dataloader`.

    Uses the notebook-level `model`, `device`, `cross_entropy` and `optimizer`.
    For every batch: forward pass, loss, backward pass, gradient-norm clipping
    to 1.0, optimizer step.

    Returns:
        tuple: (avg_loss, total_preds), where avg_loss is the mean batch loss
        (float) and total_preds is a numpy array with the model outputs of all
        batches concatenated along axis 0.
    """
    model.train()
    total_loss = 0.0
    total_preds = []
    num_batches = len(train_dataloader)

    for step, batch in enumerate(train_dataloader):

        # Progress line every 50 batches.
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, num_batches))

        # The batch tensors are moved to the device as-is; re-wrapping them with
        # torch.tensor() would only make a redundant copy each step.
        sent_id, mask, labels = [r.to(device) for r in batch]

        model.zero_grad()
        preds = model(sent_id, mask)
        loss = cross_entropy(preds, labels)
        total_loss += loss.item()
        loss.backward()

        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        total_preds.append(preds.detach().cpu().numpy())

    # Mean loss over the batches of this epoch.
    avg_loss = total_loss / num_batches

    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [None]:
import time
def evaluate():
    """Evaluate the notebook-level `model` on every batch of `val_dataloader`.

    Uses the notebook-level `device` and `cross_entropy`. The model is switched
    to eval mode and the forward passes run without gradient tracking.

    Returns:
        tuple: (avg_loss, total_preds), where avg_loss is the mean batch loss
        (float) and total_preds is a numpy array with the model outputs of all
        batches concatenated along axis 0.
    """
    print("\nEvaluating...")
    model.eval()
    total_loss = 0.0
    total_preds = []
    num_batches = len(val_dataloader)

    for step, batch in enumerate(val_dataloader):

        # Progress line every 50 batches.
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, num_batches))

        # Everything below must run once per batch. Previously it sat outside
        # the loop, so only the final batch was scored while the loss was still
        # divided by the total number of batches.
        sent_id, mask, labels = [t.to(device) for t in batch]

        with torch.no_grad():
            preds = model(sent_id, mask)
            loss = cross_entropy(preds, labels)

        total_loss += loss.item()
        total_preds.append(preds.detach().cpu().numpy())

    # Mean loss over all validation batches.
    avg_loss = total_loss / num_batches

    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [None]:
# Lowest validation loss seen so far; starts at +inf so the first epoch always wins.
best_valid_loss = float('inf')

# Per-epoch loss history.
train_losses=[]
valid_losses=[]

for epoch in range(epochs):

    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    # One pass over the training data, then one evaluation; predictions are discarded.
    train_loss, _ = train()
    valid_loss, _ = evaluate()
    # Keep the weights of the epoch with the lowest validation loss on disk.
    # NOTE(review): 'saved_weights.pt' is never loaded back in the visible
    # notebook, so later cells use the last-epoch weights held in memory.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

In [None]:
# Final checkpoint: the whole model object plus the model and optimizer state dicts.
# NOTE(review): storing the model object pickles the class by reference, so the
# BERT_Arch definition must be importable wherever this file is loaded.
checkpoint = {'model': model,
              'state_dict': model.state_dict(),
              'optimizer' : optimizer.state_dict()}

torch.save(checkpoint, 'scibert_final_uf.pth')

In [None]:
def Predict(text):
    """Predict the encoded class id for a single text.

    Uses the notebook-level `tokenizer`, `model` and `device`. The text is
    tokenized to a fixed length of 512 tokens (padded/truncated) and the index
    of the highest model output is returned.

    Args:
        text: the input string.

    Returns:
        int: index of the predicted class (the encoded label number).
    """
    encoded_review = tokenizer.encode_plus(
        text,
        max_length=512,
        add_special_tokens=True,
        return_token_type_ids=False,
        truncation=True,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    # Inference only: make sure dropout is disabled and no autograd graph is
    # built, regardless of the mode the model was left in by earlier cells.
    model.eval()
    with torch.no_grad():
        output = model(input_ids, attention_mask)

    _, prediction = torch.max(output, dim=1)
    return prediction.item()

In [None]:
# Predict one test text at a time with the in-memory model.
# NOTE: the loop variable rebinds the notebook-level name `text`.
y_pred = []
for text in test_text['text']:
    val = Predict(text)
    y_pred.append(val)

In [None]:
len(y_pred)

In [None]:
test_labels

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, matthews_corrcoef, classification_report
import numpy as np

# `test_labels` is the one-column ('label_number') DataFrame produced by the split;
# `y_pred` is a plain list of predicted class ids in the same row order.
true_labels = test_labels
predicted_labels = y_pred

accuracy = accuracy_score(true_labels, predicted_labels)
print(f"Accuracy: {accuracy:.4f}")

In [None]:
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(true_labels, y_pred))