In [None]:
# Install the Hugging Face Transformers package into the runtime.
# NOTE(review): the version is unpinned; the recorded output of this cell shows
# transformers 4.11.2 being installed — consider pinning it for reproducibility.
!pip install transformers

Collecting transformers
  Downloading transformers-4.11.2-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 5.4 MB/s 
Collecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 1.4 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 36.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 43.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 48.0 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
  

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from transformers import AutoModel, AutoTokenizer, BertTokenizer, BertForSequenceClassification, BertPreTrainedModel, BertModel
from transformers import Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [None]:
# Paths under /content/drive only exist once Google Drive is mounted. Mount it here so the
# notebook runs top to bottom on a fresh runtime; calling mount again on an already-mounted
# path just reports that it is already mounted.
from google.colab import drive
drive.mount('/content/drive')

# Tab-separated file without a header row, so columns are addressed by integer position.
df = pd.read_csv('/content/drive/MyDrive/손진석/new_data.txt', sep='\t', header=None)
# Hold out 20% of the rows, stratified on column 16. Later cells split the model inputs
# with the same test_size / stratify / random_state, which is what keeps df_test
# row-aligned with the test encodings.
df_train, df_test = train_test_split(df, test_size=.2, stratify=df[16], random_state=0)

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# Files under /content/drive are only readable once the drive is mounted.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Preview the first rows of the loaded table (columns are unnamed integer positions).
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,30445307,32,,P2X7R,GENE,,Bacteroidetes=ORGANISM; Verrucomicrobia=ORGANI...,,egr-1,GENE,,POSITIVE,ACTIVE,inhibit,"Furthermore, P2X7R blockade inhibited MEK1/2-E...",Directed Link,Negative Decrease
1,30445307,10,,P2X7R,GENE,,Bacteroidetes=ORGANISM; Verrucomicrobia=ORGANI...,,alcoholic liver disease,PHENOTYPE,,POSITIVE,ACTIVE,treat,These studies strongly suggest that P2X7R bloc...,Directed Link,Negative Decrease
2,30445312,1,,oridonin,COMPOUND,,NA;,,autophagy,BIOLOGICAL PROCESS,,POSITIVE,ACTIVE,induce,"In our previous study, we demonstrated that or...",Directed Link,Positive Increase
3,30445312,1,,oridonin,COMPOUND,,NA;,,phagocytosis,BIOLOGICAL PROCESS,,POSITIVE,ACTIVE,enhance,"In our previous study, we demonstrated that or...",Directed Link,Positive Increase
4,30445312,3,,oridonin,COMPOUND,,NA;,,Toll-like receptor 4,GENE,,POSITIVE,ACTIVE,activate,"Herein, we showed that Toll-like receptor 4 (T...",Directed Link,Positive Increase


In [None]:
# Count how many rows carry each distinct value of column 16
# (the column that is label-encoded as the classification target).
from collections import Counter
Counter(df[16])

Counter({'Directed Link': 835,
         'Negative Cause': 702,
         'Negative Correlation': 150,
         'Negative Decrease': 365,
         'Negative Increase': 249,
         'Positive Cause': 1088,
         'Positive Correlation': 285,
         'Positive Decrease': 133,
         'Positive Increase': 279,
         'Undirected Link': 959})

In [None]:
# Build two views of each sentence (column 14): in X1 every occurrence of the entity in
# column 3 is replaced by '[MASK]', in X2 every occurrence of the entity in column 8.
# NOTE(review): str.replace substitutes ALL occurrences, so a sentence that repeats an
# entity ends up with more than one '[MASK]' in that view.
X1 = df.apply(lambda x : x[14].replace(x[3].strip(), '[MASK]'), axis=1).tolist()
X2 = df.apply(lambda x : x[14].replace(x[8].strip(), '[MASK]'), axis=1).tolist()
label_encoder = LabelEncoder()
# Keep the labels on the CPU. Dataset items must be CPU tensors so the DataLoader used by
# Trainer can pin/collate them (pinning a CUDA tensor raises a RuntimeError); Trainer moves
# each batch to the training device itself. This also removes the hard dependency on a GPU.
y = torch.tensor(label_encoder.fit_transform(df[16]), dtype=torch.long)

In [None]:
# Show the class names in encoded order: position i holds the string for encoded label i.
label_encoder.classes_

array(['Directed Link', 'Negative Cause', 'Negative Correlation',
       'Negative Decrease', 'Negative Increase', 'Positive Cause',
       'Positive Correlation', 'Positive Decrease', 'Positive Increase',
       'Undirected Link'], dtype=object)

In [None]:
# Load the tokenizer of the "allenai/scibert_scivocab_cased" checkpoint
# (the same checkpoint name is used when the model is loaded).
tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_cased")

Downloading:   0%|          | 0.00/217k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

In [None]:
# tokenizer.save_vocabulary('./vocab.txt')

In [None]:
class MySequenceClassification(BertPreTrainedModel):
    """BERT classifier fed by the hidden states at position 0 and at the [MASK] tokens.

    For every example the encoder's last hidden states at position 0 (presumably the
    [CLS] token) and at each position whose id equals ``tokenizer.mask_token_id`` are
    concatenated, passed through a dense + tanh + dropout layer and then classified.

    The head is sized for exactly three selected tokens per example
    (``3 * config.hidden_size``), i.e. position 0 plus two [MASK] tokens.

    Note: the mask-token id is read from the module-level ``tokenizer`` object, so that
    name must be defined before ``forward`` is called.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.tanh = nn.Tanh()
        # The attribute name `hiden_layer` (sic) is kept on purpose: it is part of the
        # parameter names in the state dict, so renaming it would break saved checkpoints.
        self.hiden_layer = nn.Linear(config.hidden_size * 3, config.hidden_size * 3)
        self.classifier = nn.Linear(config.hidden_size * 3, config.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
    ):
        """Run the encoder and classify from the position-0 and [MASK] hidden states.

        Args:
            input_ids: token ids, indexed as (batch, sequence). Required, because the
                [MASK] positions are located by comparing these ids.
            attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds,
            output_attentions, output_hidden_states: forwarded unchanged to the
                underlying ``BertModel``.
            labels: optional targets. With ``num_labels == 1`` an MSE loss is computed,
                otherwise a cross-entropy loss.

        Returns:
            Tuple ``(loss), logits, (hidden_states), (attentions)`` where the
            parenthesised entries are present only when available.

        Raises:
            ValueError: if ``input_ids`` is missing, or if any example does not select
                exactly three tokens (position 0 plus two [MASK] tokens).
        """
        if input_ids is None:
            raise ValueError(
                "MySequenceClassification needs `input_ids` to locate the [MASK] tokens; "
                "`inputs_embeds` alone is not supported."
            )

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

#         pooled_output = outputs[1]
#         output = self.dropout(pooled_output)
#         logits = self.classifier(pooled_output)

        # Boolean selector over (batch, sequence): every [MASK] position plus position 0.
        check = input_ids == tokenizer.mask_token_id
        check[:, 0] = True

        # Boolean indexing flattens the selected rows across the batch, so the reshape
        # below is only meaningful when EVERY example contributes exactly three tokens.
        # Otherwise tokens of neighbouring examples would be mixed silently (or the
        # reshape would fail with an opaque size error).
        selected_per_row = check.sum(dim=1)
        if not bool(torch.all(selected_per_row == 3)):
            raise ValueError(
                "Each example must contain exactly two [MASK] tokens (plus position 0); "
                f"found between {int(selected_per_row.min()) - 1} and "
                f"{int(selected_per_row.max()) - 1} [MASK] tokens per example."
            )

        # Use the configured hidden size instead of a hard-coded 768 so the head stays
        # consistent with the layers built in __init__ for any encoder width.
        output = torch.reshape(outputs[0][check], (-1, 3 * self.config.hidden_size))
        output = self.hiden_layer(output)
        output = self.tanh(output)
        output = self.dropout(output)
        logits = self.classifier(output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)

In [None]:
# model = MySequenceClassification.from_pretrained("allenai/scibert_scivocab_cased", num_labels=10).to("cuda")
# Size the classification head from the fitted label encoder instead of repeating the
# literal 10, and fall back to the CPU when no GPU is available.
# NOTE(review): torchscript=True is kept as in the original; presumably it is there for the
# (commented-out) torch.jit.trace export at the end of the notebook — confirm it is wanted
# during training.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = BertForSequenceClassification.from_pretrained(
    "allenai/scibert_scivocab_cased",
    num_labels=len(label_encoder.classes_),
    torchscript=True,
).to(device)
model.train()
# model.eval()
print('done')

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/scibert_scivocab_cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were

done


In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: mapping from field name (e.g. ``input_ids``) to an indexable
            collection with one entry per example.
        labels: indexable collection with one label per example.

    Each item is a dict holding one independent CPU tensor per encoding field plus a
    ``'labels'`` entry.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_cpu_tensor(value):
        """Return an independent CPU tensor holding ``value``."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) emits a copy-construct UserWarning and keeps
            # the source device; detach().clone() is the supported way to copy. Items are
            # moved to the CPU because a DataLoader can only pin/collate CPU tensors.
            return value.detach().clone().cpu()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of tensors."""
        item = {key: self._as_cpu_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_cpu_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

In [None]:
# Split both text views and the labels in ONE call so the three stay row-aligned by
# construction (the original ran two separate splits and computed y_train / y_test twice).
# test_size / stratify / random_state match the df_train / df_test split, which keeps
# df_test aligned with the test encodings.
X1_train, X1_test, X2_train, X2_test, y_train, y_test = train_test_split(
    X1, X2, y, test_size=.2, stratify=df[16], random_state=0
)

In [None]:
# First pass: tokenize each (X1, X2) sentence pair, padding to the longest sequence of the
# respective split. The resulting widths are inspected in the next cell.
# NOTE(review): no truncation is requested, so max_length=512 presumably has no effect
# here — confirm against the tokenizer documentation.
encodings_train = tokenizer(X1_train, X2_train,  return_tensors='pt', padding=True, max_length=512)
encodings_test = tokenizer(X1_test, X2_test,  return_tensors='pt', padding=True, max_length=512)



In [None]:
# Print the padded sequence length of the train and of the test encodings.
print(encodings_train['input_ids'].shape[1], encodings_test['input_ids'].shape[1])
# Use the larger of the two numbers as the common max_length.

277 265


In [None]:
# Second pass: pad every example of both splits to one common length. The length is taken
# from the longest-padded encodings computed earlier rather than from a hand-copied number,
# so it stays correct when the data changes. Re-running this cell is safe: the encodings
# then already have that common width.
max_len = max(encodings_train['input_ids'].shape[1], encodings_test['input_ids'].shape[1])
encodings_train = tokenizer(X1_train, X2_train,  return_tensors='pt', padding='max_length', max_length=max_len)
encodings_test = tokenizer(X1_test, X2_test,  return_tensors='pt', padding='max_length', max_length=max_len)

In [None]:
# Wrap the encodings and labels of each split in a MyDataset for use with Trainer.
train_dataset = MyDataset(encodings_train, y_train)
test_dataset = MyDataset(encodings_test, y_test)

In [None]:
def compute_metrics(pred):
    """Score a prediction object and print a per-class report.

    Args:
        pred: object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        dict with ``accuracy`` and the support-weighted ``f1``, ``precision`` and
        ``recall``. As a side effect the full classification report is printed.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)

    # Index 0/1/2 of the result are precision/recall/f1; the support entry is unused.
    weighted = precision_recall_fscore_support(y_true, y_hat, average='weighted')
    metrics = {
        'accuracy': accuracy_score(y_true, y_hat),
        'f1': weighted[2],
        'precision': weighted[0],
        'recall': weighted[1],
    }

    print(classification_report(y_true, y_hat, digits=3))
    return metrics

In [None]:
# Training configuration for the Hugging Face Trainer.
# The recorded run reports 2530 optimization steps for 10 epochs (253 per epoch), so the
# value 252 used for warmup_steps / eval_steps is roughly one epoch.
training_args = TrainingArguments(
    # Directory the Trainer writes its outputs to.
    output_dir='./results',
    do_train=True,
    do_eval=True,
    #evaluate_during_training=False,
    num_train_epochs=10,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=252,
    weight_decay=0.01,
  #   fp16=True,
    logging_dir='./logs',
    # NOTE(review): no evaluation_strategy is passed, so eval_steps may have no effect —
    # confirm against the TrainingArguments docs for the installed version.
    eval_steps=252
)

# The held-out split doubles as the evaluation set; compute_metrics is called on its
# predictions whenever the Trainer evaluates or predicts with labels.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

***** Running training *****
  Num examples = 4036
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2530
  import sys
  


RuntimeError: ignored

In [None]:
# model = BertForSequenceClassification.from_pretrained("results/checkpoint-2500")

In [None]:
# Score the held-out set; compute_metrics prints the per-class report as a side effect.
pred = trainer.predict(test_dataset)

# Turn per-class scores into class ids, then back into the original label strings.
# This relies on df_test being row-aligned with test_dataset, i.e. on both having been
# produced by splits with identical test_size / stratify / random_state.
predicted_class_ids = np.argmax(pred.predictions, axis=1)
df_test['pred'] = label_encoder.inverse_transform(predicted_class_ids)
df_test.head()

HBox(children=(FloatProgress(value=0.0, description='Prediction', max=16.0, style=ProgressStyle(description_wi…


              precision    recall  f1-score   support

           0      0.867     0.862     0.865       167
           1      0.879     0.886     0.883       140
           2      0.684     0.433     0.531        30
           3      0.893     0.918     0.905        73
           4      0.800     0.898     0.846        49
           5      0.846     0.885     0.865       218
           6      0.636     0.614     0.625        57
           7      0.621     0.667     0.643        27
           8      0.655     0.655     0.655        55
           9      0.800     0.771     0.785       192

    accuracy                          0.815      1008
   macro avg      0.768     0.759     0.760      1008
weighted avg      0.814     0.815     0.813      1008



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,pred
4474,15236635,4,,anaesthesia,PHENOTYPE,,NA;,,midazolam,COMPOUND,,POSITIVE,PASSIVE,perform,Caudal block was performed after induction of...,Undirected Link,Positive Correlation,Positive Correlation
1017,30448542,1,,Prg4,GENE,,NA;,,weight gain,PHENOTYPE,,POSITIVE,ACTIVE,contribute,Proteoglycan 4 (Prg4) has emerged from human a...,Directed Link,Directed Link,Directed Link
14,30445412,8,,dexmedetomidine,COMPOUND,,NA;,,hypertension,PHENOTYPE,,POSITIVE,ACTIVE,reduce,Dexmedetomidine vs. Placebo: High to moderate ...,Directed Link,Negative Cause,Negative Cause
1811,30452920,19,,SLC1A5,PROTEIN,,Mice=ORGANISM; Human=ORGANISM; Serum=ORGAN; so...,,ATF4,PROTEIN,,POSITIVE,ACTIVE,correlate,Levels of SLC1A5 were decreased in inflamed in...,Directed Link,Directed Link,Positive Increase
3967,15261105,2,,brain injury,PHENOTYPE,,NA;,,cerebral stroke,PHENOTYPE,,POSITIVE,ACTIVE,reduce,"Here, we investigated the neuroprotective eff...",Undirected Link,Undirected Link,Positive Correlation


In [None]:
# dummy_input = [encodings_train['input_ids'][2951:2953].cpu(), encodings_train['attention_mask'][2951:2953].cpu(), encodings_train['token_type_ids'][2951:2953].cpu()]

# model.eval()

# # If you are instantiating the model with `from_pretrained` you can also easily set the TorchScript flag
# # model = BertForSequenceClassification.from_pretrained("results/checkpoint-2500", torchscript=True)
# # model.eval()

# # Creating the trace
# traced_model = torch.jit.trace(model.cpu(), dummy_input)
# torch.jit.save(traced_model, "./traced_bert2.pt")

In [None]:
# Recompute the decoded prediction column from the stored prediction output
# (same computation as in the prediction cell) and preview the result.
decoded_labels = label_encoder.inverse_transform(np.argmax(pred.predictions, axis=1))
df_test['pred'] = decoded_labels
df_test.head()