In [None]:
!pip install transformers

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from transformers import AutoModel, AutoTokenizer, BertTokenizer, BertForSequenceClassification, BertPreTrainedModel, BertModel
from transformers import Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [None]:
df = pd.read_csv('/content/drive/MyDrive/손진석/new_data.txt', sep='\t', header=None)
df_train, df_test = train_test_split(df, test_size=.2, stratify=df[16], random_state=0)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Count how many rows carry each value of column 16 (the column later used as the label).
from collections import Counter
Counter(df[16])

In [None]:
# Build two variants of the text in column 14: one with the (stripped) string
# from column 3 replaced by '[MASK]', one with the string from column 8 replaced.
# NOTE(review): presumably columns 3 and 8 hold the two entity mentions — confirm.
# str.replace substitutes every occurrence, so a repeated mention yields several masks.
X1 = df.apply(lambda x : x[14].replace(x[3].strip(), '[MASK]'), axis=1).tolist()
X2 = df.apply(lambda x : x[14].replace(x[8].strip(), '[MASK]'), axis=1).tolist()
label_encoder = LabelEncoder()
# Keep the labels on the CPU: the Trainer moves each batch to the model's
# device itself, and dataset items that already live on the GPU break on
# CPU-only runtimes and conflict with DataLoader memory pinning.
y = torch.tensor(label_encoder.fit_transform(df[16]), dtype=torch.long)

In [None]:
# Show the original label values in encoded-index order (index i <-> classes_[i]).
label_encoder.classes_

In [None]:
# Load the tokenizer that matches the SciBERT (scivocab, cased) checkpoint.
# NOTE(review): BertTokenizer may lower-case input by default; for a cased
# checkpoint, confirm whether do_lower_case=False should be passed.
tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_cased")

In [None]:
# tokenizer.save_vocabulary('./vocab.txt')

In [None]:
class MySequenceClassification(BertPreTrainedModel):
    """BERT classifier over the first-token state plus two [MASK] states.

    For every example the last-layer hidden states at position 0 and at the
    two [MASK] positions are concatenated (3 * hidden_size features), passed
    through a linear layer + tanh + dropout, and classified. BERT's pooled
    output is not used.

    Each example must contain exactly two [MASK] tokens; otherwise the
    selected states cannot be regrouped per example.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        pooled_size = config.hidden_size * 3  # first token + two [MASK] positions

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.tanh = nn.Tanh()
        # The attribute name (including its spelling) is kept as-is because it
        # is part of the state-dict keys of any saved checkpoint.
        self.hiden_layer = nn.Linear(pooled_size, pooled_size)
        self.classifier = nn.Linear(pooled_size, config.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
    ):
        """Run the encoder and classify from the selected token states.

        Args:
            input_ids: (batch, seq) token ids. Required, because the [MASK]
                positions are located by comparing against the mask token id.
            attention_mask, token_type_ids, position_ids, head_mask,
            inputs_embeds, output_attentions, output_hidden_states:
                forwarded unchanged to the underlying ``BertModel``.
            labels: optional targets; when given, a loss is prepended to the
                returned tuple (MSE if ``num_labels == 1``, else cross-entropy).

        Returns:
            Tuple ``((loss), logits, (hidden_states), (attentions))``.

        Raises:
            ValueError: if ``input_ids`` is missing, or if any example does
                not contain exactly two [MASK] tokens.
        """
        if input_ids is None:
            raise ValueError(
                "input_ids is required: the [MASK] positions are located from the token ids"
            )

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        # Prefer an id stored on the config (so the model does not depend on a
        # notebook global); fall back to the module-level tokenizer as before.
        mask_token_id = getattr(self.config, "mask_token_id", None)
        if mask_token_id is None:
            mask_token_id = tokenizer.mask_token_id

        # Boolean (batch, seq) selector: every [MASK] position plus position 0.
        check = input_ids == mask_token_id
        check[:, 0] = True

        # Boolean indexing flattens the selected states across the batch; the
        # reshape below regroups them per example only if every row
        # contributes exactly three positions. Fail loudly instead of silently
        # mixing states from different examples.
        selected_per_example = check.sum(dim=1)
        if not bool((selected_per_example == 3).all()):
            raise ValueError(
                "every example must contain exactly two [MASK] tokens; "
                "selected positions per example: " + str(selected_per_example.tolist())
            )

        sequence_output = outputs[0]
        # Use the encoder's actual width rather than a hard-coded 768.
        hidden_size = sequence_output.size(-1)
        output = torch.reshape(sequence_output[check], (-1, 3 * hidden_size))
        output = self.hiden_layer(output)
        output = self.tanh(output)
        output = self.dropout(output)
        logits = self.classifier(output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)

In [None]:
# Alternative (disabled): the custom [CLS]+[MASK] classifier defined in this notebook.
# model = MySequenceClassification.from_pretrained("allenai/scibert_scivocab_cased", num_labels=10).to("cuda")
# Stock BERT sequence classifier initialised from the SciBERT weights, moved to the GPU.
# NOTE(review): num_labels=10 is hard-coded — presumably equal to len(label_encoder.classes_); confirm.
# NOTE(review): torchscript=True looks intended for the (commented-out) torch.jit.trace export — confirm it is wanted during training.
model = BertForSequenceClassification.from_pretrained("allenai/scibert_scivocab_cased", num_labels=10, torchscript=True).to("cuda")
# Put the module in training mode.
model.train()
# model.eval()
print('done')

In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    Args:
        encodings: mapping from field name (e.g. ``input_ids``) to an
            indexable of per-example values (tensor or nested list).
        labels: indexable of per-example labels (tensor or list).

    ``dataset[idx]`` returns a dict with one tensor per encoding field plus a
    ``'labels'`` tensor. The returned tensors are copies, so mutating an item
    never touches the stored encodings.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of ``value``."""
        # torch.tensor(existing_tensor) works but emits a copy-construct
        # UserWarning on every call; detach().clone() is the equivalent,
        # warning-free form.
        if isinstance(value, torch.Tensor):
            return value.detach().clone()
        return torch.tensor(value)

    def __getitem__(self, idx):
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
# Split both masked-text variants and the labels in a single call so all three
# share exactly the same stratified 80/20 partition (seeded for reproducibility).
X1_train, X1_test, X2_train, X2_test, y_train, y_test = train_test_split(
    X1, X2, y,
    test_size=.2,
    stratify=df[16],
    random_state=0,
)

In [None]:
# First tokenization pass: encode (X1, X2) as sentence pairs, padding each
# split to its own longest sequence, so the padded lengths can be inspected.
# NOTE(review): no truncation argument is passed, so max_length=512 may have no effect here — confirm.
encodings_train = tokenizer(X1_train, X2_train,  return_tensors='pt', padding=True, max_length=512)
encodings_test = tokenizer(X1_test, X2_test,  return_tensors='pt', padding=True, max_length=512)

In [None]:
# Print the padded sequence length (second dimension of input_ids) of each split.
print(encodings_train['input_ids'].shape[1], encodings_test['input_ids'].shape[1])
# Use the larger of the two numbers as the common max_length.

In [None]:
# Second tokenization pass: pad both splits to one common width. The width is
# the larger of the two padded lengths measured from the first pass, computed
# here instead of being copied in by hand so it stays correct when the data or
# tokenizer changes. Re-running this cell is safe: both encodings are then
# already at that width.
max_len = max(encodings_train['input_ids'].shape[1], encodings_test['input_ids'].shape[1])
encodings_train = tokenizer(X1_train, X2_train, return_tensors='pt', padding='max_length', max_length=max_len)
encodings_test = tokenizer(X1_test, X2_test, return_tensors='pt', padding='max_length', max_length=max_len)

In [None]:
# Wrap the encodings and their labels as datasets for the Trainer.
train_dataset = MyDataset(encodings_train, y_train)
test_dataset = MyDataset(encodings_test, y_test)

In [None]:
def compute_metrics(pred):
    """Score a batch of predictions for the Trainer.

    Args:
        pred: object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; argmax over the last axis
            gives the predicted class id).

    Returns:
        Dict with ``accuracy`` and support-weighted ``f1``, ``precision``
        and ``recall``. A per-class classification report is printed as a
        side effect.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)

    # Returns (precision, recall, f1, support); support is not needed here.
    weighted = precision_recall_fscore_support(y_true, y_hat, average='weighted')
    overall_accuracy = accuracy_score(y_true, y_hat)

    print(classification_report(y_true, y_hat, digits=3))

    return dict(
        accuracy=overall_accuracy,
        f1=weighted[2],
        precision=weighted[0],
        recall=weighted[1],
    )

In [None]:
# Hyper-parameters for fine-tuning; checkpoints go to ./results and logs to ./logs.
# NOTE(review): no evaluation strategy is set, so eval_steps=252 may never
# trigger an evaluation during training — confirm whether periodic evaluation was intended.
# NOTE(review): 252 looks like a steps-per-epoch figure for this dataset at batch size 16 — confirm.
training_args = TrainingArguments(
    output_dir='./results',
    do_train=True,
    do_eval=True,
    #evaluate_during_training=False,
    num_train_epochs=10,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=252,
    weight_decay=0.01,
  #   fp16=True,
    logging_dir='./logs',
    eval_steps=252
)

# Trainer wiring: trains on train_dataset, evaluates on test_dataset and
# reports the metrics produced by compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [None]:
# Run the fine-tuning loop configured by training_args.
trainer.train()

In [None]:
# model = BertForSequenceClassification.from_pretrained("results/checkpoint-2500")

In [None]:
pred = trainer.predict(test_dataset)
# df_test and test_dataset come from splits that use the same test_size,
# stratify column and random_state, so prediction i belongs to row i of df_test.
assert len(pred.predictions) == len(df_test), "predictions and df_test are misaligned"
# assign() returns a new frame instead of writing into the slice produced by
# train_test_split, which avoids pandas' SettingWithCopyWarning and keeps the
# cell idempotent on re-run.
df_test = df_test.assign(
    pred=label_encoder.inverse_transform(np.argmax(pred.predictions, axis=1))
)
df_test.head()

In [None]:
# dummy_input = [encodings_train['input_ids'][2951:2953].cpu(), encodings_train['attention_mask'][2951:2953].cpu(), encodings_train['token_type_ids'][2951:2953].cpu()]

# model.eval()

# # If you are instantiating the model with `from_pretrained` you can also easily set the TorchScript flag
# # model = BertForSequenceClassification.from_pretrained("results/checkpoint-2500", torchscript=True)
# # model.eval()

# # Creating the trace
# traced_model = torch.jit.trace(model.cpu(), dummy_input)
# torch.jit.save(traced_model, "./traced_bert2.pt")

In [None]:
# Decode the arg-max class ids back to the original label values and store them
# in a 'pred' column. NOTE(review): this repeats the assignment made right after
# trainer.predict(); it only changes anything if `pred` was recomputed in between.
df_test['pred'] = label_encoder.inverse_transform(np.argmax(pred.predictions, axis=1))
df_test.head()