In [1]:
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))


else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3090


In [3]:
import pandas as pd
import numpy as np

df_train = pd.read_csv("./fine_tuning.csv")

train_data_domain = df_train.domain.values
train_data_label = df_train.label.values
train_data_label = train_data_label.tolist()
train_data_label = [0 if item == 2 else 1 for item in train_data_label]
train_data_label = np.array(train_data_label)

In [4]:
from transformers import BertTokenizer

tokenizer = BertTokenizer(vocab_file="./bert_tokenizer/vocab.txt")

In [5]:
input_ids_train = []
attention_masks_train = []

for sent in train_data_domain:
    encoded_dict = tokenizer.encode_plus(
        sent,                      # Sentence to encode.
        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
        max_length = 64,           # Pad & truncate all sentences.
        pad_to_max_length = True,
        return_attention_mask = True,   # Construct attn. masks.
        return_tensors = 'pt',     # Return pytorch tensors.
    )
    # Add the encoded sentence to the list.
    input_ids_train.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks_train.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids_train = torch.cat(input_ids_train, dim=0)
attention_masks_train = torch.cat(attention_masks_train, dim=0)
labels_train = torch.tensor(train_data_label)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [6]:
from torch.utils.data import TensorDataset, random_split

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
# Calculate the number of samples to include in each set.
train_size = int(0.7 * len(dataset_train))
test_size = len(dataset_train) - train_size

train_dataset, test_dataset = random_split(dataset_train, [train_size, test_size])
print('{:>5,} training samples'.format(train_size))
print('{:>5,} test samples'.format(test_size))

111,998 training samples
48,000 test samples


In [7]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32

train_dataloader = DataLoader(
    train_dataset,  # The training samples.
    sampler = RandomSampler(train_dataset), # Select batches randomly
    batch_size = batch_size # Trains with this batch size.
)

test_dataloader = DataLoader(
    test_dataset, # The test samples.
    sampler = SequentialSampler(test_dataset), # Pull out batches sequentially.
    batch_size = batch_size # Test with this batch size.
)

In [8]:
from torch import nn

EmbeddingPath = "./FedBert/FedTransformer.pt"
TransformerPath = "./FedBert/FedEmbedding.pt"

In [9]:
from transformers import (
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,DataCollatorForLanguageModeling,HfArgumentParser,Trainer,TrainingArguments,set_seed,
)

config_kwargs = {
    "cache_dir": None,
    "revision": 'main',
    "use_auth_token": None,
    "hidden_dropout_prob": 0.2,
    "vocab_size": 1000
}

config = AutoConfig.from_pretrained('./bert-base-uncased-model/', **config_kwargs)
print(config)

model = AutoModelForMaskedLM.from_config(
    config=config,
)
model.resize_token_embeddings(config_kwargs["vocab_size"])

BertConfig {
  "_name_or_path": "./bert-base-uncased-model/",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.2,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.34.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 1000
}



Embedding(1000, 768, padding_idx=0)

In [10]:
import copy

embedding = model.bert.embeddings

class Bert_Embedding(nn.Module):
    def __init__(self):
        super(Bert_Embedding, self).__init__()
        self.embeddings = copy.deepcopy(embedding)

    def forward(self, input_ids, mask):
        embedding_output = self.embeddings(input_ids, mask)
        return embedding_output

embedding_model = Bert_Embedding()
embedding_model.load_state_dict(torch.load(EmbeddingPath))

<All keys matched successfully>

In [11]:
encoder = model.bert.encoder
cls = model.cls

class Bert_Encoder(nn.Module):
    def __init__(self):
        super(Bert_Encoder, self).__init__()
        self.encoder = copy.deepcopy(encoder)
        self.cls = copy.deepcopy(cls)

    def forward(self, embedding_output):
        output_encoder = self.encoder(embedding_output).last_hidden_state
        return output_encoder
encoder_model = Bert_Encoder()
encoder_model.load_state_dict(torch.load(TransformerPath))

<All keys matched successfully>

In [12]:
from transformers.models.bert.modeling_bert import BertPooler
class Pooler_Config:
    def __init__(self, entries: dict={}):
        for k, v in entries.items():
            if isinstance(v, dict):
                self.__dict__[k] = Pooler_Config(v)
            else:
                self.__dict__[k] = v

config_pooler = {"hidden_size": 768}
config_pooler = Pooler_Config(config_pooler)
pooler = BertPooler(config_pooler)
print(pooler)

BertPooler(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (activation): Tanh()
)


In [13]:
class MyModel(nn.Module):
    def __init__(self, hidden_size=768, num_classes=2, freeze_bert=False):
        super(MyModel, self).__init__()
        self.embedding = Bert_Embedding()
        self.encoder = Bert_Encoder()
        self.pooler = copy.deepcopy(pooler)
        if freeze_bert:
            for p in self.embedding.parameters():
                p.requires_grad = False
            for p in self.encoder.parameters():
                p.requires_grad = False
        self.fc = nn.Sequential(
            nn.Dropout(p=0.5),
            nn.Linear(hidden_size, num_classes, bias=False),
            nn.Sigmoid(),
        )

    def forward(self, input_ids, mask):
        embedding_outputs = self.embedding(input_ids, mask)
        encoder_outputs = self.encoder(embedding_outputs)
        pooler_outputs = self.pooler(encoder_outputs)

        logits = self.fc(pooler_outputs)
        return logits

In [14]:
from transformers import get_linear_schedule_with_warmup
from transformers import AdamW

model = MyModel()
model.encoder.cls = nn.Sequential()

optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8
                  )


epochs = 30


total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)




In [15]:
model.cuda()

MyModel(
  (embedding): Bert_Embedding(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(1000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
  )
  (encoder): Bert_Encoder(
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), 

In [16]:
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
import time
import datetime
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

def flat_accuracy(preds, labels):
    return np.sum(preds == labels) / len(labels)

def tpr_calculate(preds, labels):
    return recall_score(labels, preds, zero_division=1)

def fpr_calculate(preds, labels):
    conf_matrix = confusion_matrix(labels, preds)
    fp = conf_matrix[0, 1]
    tn = conf_matrix[0, 0]
    fpr = fp / (fp + tn)
    return fpr

def f1_score_calculate(preds, labels):
    return f1_score(labels, preds)

def AUC_calculate(preds, labels):
    return roc_auc_score(labels, preds)

def roc_curve_calculate(preds, labels):
    return roc_curve(labels, preds)

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [17]:
import random

seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = []

total_t0 = time.time()

criterion = torch.nn.BCELoss()

test_save = {}
test_save["loss"] = []
test_save["acc"] = []
test_save["tpr"] = []
test_save["fpr"] = []
test_save["f1"] = []
test_save["auc"] = []

for epoch_i in range(0, epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    loss_batch = []
    acc_batch = []
    tpr_batch = []
    fpr_batch = []
    f1_batch = []
    auc_batch = []

    t0 = time.time()

    total_train_loss = 0

    model.train()
    
    for step, batch in enumerate(train_dataloader):

        if step % 500 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)

            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()

        logits = model.forward(b_input_ids, b_input_mask)

        b_labels = b_labels.unsqueeze(1)
        b_labels = b_labels.repeat(1,2)
        for i in range(len(b_labels)):
            b_labels[i][1] = 1-b_labels[i][0]
        BCELoss = criterion(logits, b_labels.float())

        total_train_loss += BCELoss.item()

        BCELoss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        
    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.20f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))


    print("")
    print("Running test...")

    t0 = time.time()

    model.eval()

    # Tracking variables
    total_test_accuracy = 0
    total_test_loss = 0
    nb_test_steps = 0

    # Test data for one epoch
    for batch in test_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        with torch.no_grad():
            logits = model.forward(b_input_ids, b_input_mask)

        b_labels = b_labels.unsqueeze(1)
        b_labels = b_labels.repeat(1,2)
        for i in range(len(b_labels)):
            b_labels[i][1] = 1-b_labels[i][0]

        BCELoss = criterion(logits, b_labels.float())
        total_test_loss += BCELoss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        logits = np.argmax(logits, axis=1).flatten()
        label_ids = np.argmax(label_ids,axis=1).flatten()

        accuracy = flat_accuracy(logits, label_ids)
        tpr = tpr_calculate(logits, label_ids)
        fpr = fpr_calculate(logits, label_ids)
        f1 = f1_score_calculate(logits, label_ids)
        if len(set(label_ids)) == 1:
            pass
        else:
            auc = AUC_calculate(logits, label_ids)
            auc_batch.append(auc)
        
        loss_batch.append(BCELoss.item())
        acc_batch.append(accuracy)
        tpr_batch.append(tpr)
        fpr_batch.append(fpr)
        f1_batch.append(f1)
        
        total_test_accuracy += flat_accuracy(logits, label_ids)
    
    test_save["loss"].append(sum(loss_batch)/len(loss_batch))
    test_save["acc"].append(sum(acc_batch)/len(acc_batch))
    test_save["tpr"].append(sum(tpr_batch)/len(tpr_batch))
    test_save["fpr"].append(sum(fpr_batch)/len(fpr_batch))
    test_save["f1"].append(sum(f1_batch)/len(f1_batch))
    test_save["auc"].append(sum(auc_batch)/len(auc_batch))
    
    avg_test_accuracy = total_test_accuracy / len(test_dataloader)
    print("Loss:{:.10f}\tAcc:{:.10f}\tTpr:{:.10f}\tFpr:{:.10f}\tF1:{:.10f}\tAuc:{:.10f}".format(sum(loss_batch)/len(loss_batch), sum(acc_batch)/len(acc_batch), sum(tpr_batch)/len(tpr_batch), sum(fpr_batch)/len(fpr_batch), sum(f1_batch)/len(f1_batch), sum(auc_batch)/len(auc_batch)))

    # Calculate the average loss over all of the batches.
    avg_test_loss = total_test_loss / len(test_dataloader)

    # Measure how long the test run took.
    test_time = format_time(time.time() - t0)

    print("  test Loss: {0:.20f}".format(avg_test_loss))
    print("  test took: {:}".format(test_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_test_loss,
            'Valid. Accur.': avg_test_accuracy,
            'Training Time': training_time,
            'test Time': test_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch   500  of  3,500.    Elapsed: 0:00:44.
  Batch 1,000  of  3,500.    Elapsed: 0:01:28.
  Batch 1,500  of  3,500.    Elapsed: 0:02:12.
  Batch 2,000  of  3,500.    Elapsed: 0:02:56.
  Batch 2,500  of  3,500.    Elapsed: 0:03:41.
  Batch 3,000  of  3,500.    Elapsed: 0:04:25.

  Average training loss: 0.27645259898315582836
  Training epcoh took: 0:05:09

Running test...
Loss:0.1870613099	Acc:0.9285208333	Tpr:0.9298040527	Fpr:0.0732483519	F1:0.9263041437	Auc:0.9282778504
  test Loss: 0.18706130990261832236
  test took: 0:00:44

Training...
  Batch   500  of  3,500.    Elapsed: 0:00:44.
  Batch 1,000  of  3,500.    Elapsed: 0:01:28.
  Batch 1,500  of  3,500.    Elapsed: 0:02:12.
  Batch 2,000  of  3,500.    Elapsed: 0:02:57.
  Batch 2,500  of  3,500.    Elapsed: 0:03:41.
  Batch 3,000  of  3,500.    Elapsed: 0:04:25.

  Average training loss: 0.19068384832476398261
  Training epcoh took: 0:05:09

Running test...
Loss:0.1729152758	Acc:0.9351041667	Tpr:0.9242910315	Fpr:0

In [18]:
import xlwt
f = xlwt.Workbook('encoding = utf-8')
sheet1 = f.add_sheet('sheet1',cell_overwrite_ok=True)
for i in range(len(test_save["loss"])):
    sheet1.write(i+1,0,test_save["loss"][i])
for i in range(len(test_save["acc"])):
    sheet1.write(i+1,1,test_save["acc"][i])
for i in range(len(test_save["tpr"])):
    sheet1.write(i+1,2,test_save["tpr"][i])
for i in range(len(test_save["fpr"])):
    sheet1.write(i+1,3,test_save["fpr"][i])
for i in range(len(test_save["f1"])):
    sheet1.write(i+1,4,test_save["f1"][i])
for i in range(len(test_save["auc"])):
    sheet1.write(i+1,5,test_save["auc"][i])

f.save('test_save.xls')