In [1]:
import os
import transformers
from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM, BertForMaskedLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling, DataCollatorForWholeWordMask
from transformers import EarlyStoppingCallback, IntervalStrategy, SchedulerType
import math
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import torch

In [2]:
model_checkpoint = "m3rg-iitd/matscibert"
tokenizer_checkpoint = "m3rg-iitd/matscibert"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
config = AutoConfig.from_pretrained(model_checkpoint)
#model = AutoModelForMaskedLM.from_config(config)
model = BertForMaskedLM.from_pretrained(model_checkpoint)
model = BertForMaskedLM.from_pretrained('model_save')

In [3]:
model.resize_token_embeddings(len(tokenizer))

Embedding(31090, 768, padding_idx=0)

In [4]:
geo_df = pd.read_csv('./datasets/Geo_Dataset/Training_paras_for_BERT.csv')
#geo_df = geo_df.loc[geo_df['Source']!='Onepetro']
#geo_df = geo_df.reset_index(drop=True)
print(geo_df)

                                                     Text    Source
0       Further reading == Chen, Gang. Nanoscale Energ...      Wiki
1       Cased hole completion === This involves runnin...      Wiki
2       With a roar like a hundred express trains raci...      Wiki
3       Reflection seismology === Seismic reflection i...      Wiki
4       Oil wells === The question of what constituted...      Wiki
...                                                   ...       ...
199317  Seismic curvature attributes, as being second-...  Onepetro
199318  A 10-years research program at the U. of Stava...  Onepetro
199319  A case study of one of the reservoirs of X Fie...  Onepetro
199320  In El Huemul field, four main subvertical feat...  Onepetro
199321  The problem of radial crack propagation from a...  Onepetro

[199322 rows x 2 columns]


In [5]:
def tokenize_function(examples):
    return tokenizer(examples["Text"])

def group_texts(examples, block_size=512):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [6]:
train, val = train_test_split(geo_df, test_size=0.2, random_state=100)
train = train['Text']
val = val['Text']
train.to_csv('./datasets/Geo_Dataset/Train.csv', index=False)
val.to_csv('./datasets/Geo_Dataset/Val.csv', index=False)

In [7]:
data_files = {}
data_files["train"] = './datasets/Geo_Dataset/Train.csv'
data_files["validation"] = './datasets/Geo_Dataset/Val.csv'
extension='csv'
raw_datasets = load_dataset(extension, data_files=data_files)  

Using custom data configuration default-3bfa6b1dd7c5cf2a


Downloading and preparing dataset csv/default to /home/jupyter/.cache/huggingface/datasets/csv/default-3bfa6b1dd7c5cf2a/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/jupyter/.cache/huggingface/datasets/csv/default-3bfa6b1dd7c5cf2a/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True, num_proc=8, remove_columns=["Text"])
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

                

#2:   0%|          | 0/20 [00:00<?, ?ba/s]

#1:   0%|          | 0/20 [00:00<?, ?ba/s]

#0:   0%|          | 0/20 [00:00<?, ?ba/s]

#4:   0%|          | 0/20 [00:00<?, ?ba/s]

#5:   0%|          | 0/20 [00:00<?, ?ba/s]

#3:   0%|          | 0/20 [00:00<?, ?ba/s]

#7:   0%|          | 0/20 [00:00<?, ?ba/s]

#6:   0%|          | 0/20 [00:00<?, ?ba/s]

                

#1:   0%|          | 0/5 [00:00<?, ?ba/s]

#0:   0%|          | 0/5 [00:00<?, ?ba/s]

#2:   0%|          | 0/5 [00:00<?, ?ba/s]

#4:   0%|          | 0/5 [00:00<?, ?ba/s]

#5:   0%|          | 0/5 [00:00<?, ?ba/s]

#3:   0%|          | 0/5 [00:00<?, ?ba/s]

#6:   0%|          | 0/5 [00:00<?, ?ba/s]

#7:   0%|          | 0/5 [00:00<?, ?ba/s]

        

#0:   0%|          | 0/40 [00:00<?, ?ba/s]

#1:   0%|          | 0/40 [00:00<?, ?ba/s]

#2:   0%|          | 0/40 [00:00<?, ?ba/s]

#3:   0%|          | 0/40 [00:00<?, ?ba/s]

        

#0:   0%|          | 0/10 [00:00<?, ?ba/s]

#1:   0%|          | 0/10 [00:00<?, ?ba/s]

#2:   0%|          | 0/10 [00:00<?, ?ba/s]

#3:   0%|          | 0/10 [00:00<?, ?ba/s]

In [9]:
NGPU = torch.cuda.device_count()
EPOCHS=19
TRAIN_BATCHSIZE = 6
VAL_BATCHSIZE = 6
TRAIN_SIZE = len(lm_datasets["train"])
EVAL_SIZE = len(lm_datasets["validation"])
GRADACCUM = int(256/(TRAIN_BATCHSIZE * NGPU))
total_steps = TRAIN_SIZE/(TRAIN_BATCHSIZE * NGPU * GRADACCUM) * EPOCHS
print('Train size:', TRAIN_SIZE,', Eval size:',EVAL_SIZE, ', Steps:',total_steps, ', Gradient accum:', GRADACCUM)

Train size: 123236 , Eval size: 30828 , Steps: 9756.183333333334 , Gradient accum: 5


In [10]:
#data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
data_collator = DataCollatorForWholeWordMask(tokenizer=tokenizer, mlm_probability=0.15)

In [12]:
training_args = TrainingArguments(
    f"{model_checkpoint}-geo",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    #evaluation_strategy = IntervalStrategy.STEPS,
    num_train_epochs=EPOCHS, #default 3
    per_device_train_batch_size=TRAIN_BATCHSIZE, #default 8
    per_device_eval_batch_size=VAL_BATCHSIZE, #default 8
    gradient_accumulation_steps=GRADACCUM, #default 1
    warmup_ratio=0.048,
    learning_rate=1e-4,
    weight_decay=1e-2,
    adam_beta1=0.9,
    adam_beta2=0.98,
    adam_epsilon=1e-6,
    max_grad_norm=0.0,
    push_to_hub=False,
    logging_steps=100,
    load_best_model_at_end=True
    #lr_scheduler_type=SchedulerType.LINEAR
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
    data_collator=data_collator
)

In [None]:
trainer.train()
model.save_pretrained('model_save')

***** Running training *****
  Num examples = 123236
  Num Epochs = 19
  Instantaneous batch size per device = 6
  Total train batch size (w. parallel, distributed & accumulation) = 240
  Gradient Accumulation steps = 5
  Total optimization steps = 9747


Epoch,Training Loss,Validation Loss
0,1.9459,1.856643
1,1.8839,1.804067
2,1.8401,1.76936
3,1.8083,1.742321
4,1.782,1.725365
5,1.7635,1.70751
6,1.7474,1.697418
7,1.7247,1.681129
8,1.7075,1.667323
9,1.7015,1.661141


***** Running Evaluation *****
  Num examples = 30828
  Batch size = 48
Saving model checkpoint to m3rg-iitd/matscibert-geo/checkpoint-513
Configuration saved in m3rg-iitd/matscibert-geo/checkpoint-513/config.json
Model weights saved in m3rg-iitd/matscibert-geo/checkpoint-513/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 30828
  Batch size = 48
Saving model checkpoint to m3rg-iitd/matscibert-geo/checkpoint-1026
Configuration saved in m3rg-iitd/matscibert-geo/checkpoint-1026/config.json
Model weights saved in m3rg-iitd/matscibert-geo/checkpoint-1026/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 30828
  Batch size = 48
Saving model checkpoint to m3rg-iitd/matscibert-geo/checkpoint-1539
Configuration saved in m3rg-iitd/matscibert-geo/checkpoint-1539/config.json
Model weights saved in m3rg-iitd/matscibert-geo/checkpoint-1539/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 30828
  Batch size = 48
Saving model checkpoint to m3rg-iitd/

Model weights saved in m3rg-iitd/matscibert-geo/checkpoint-7695/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 30828
  Batch size = 48
Saving model checkpoint to m3rg-iitd/matscibert-geo/checkpoint-8208
Configuration saved in m3rg-iitd/matscibert-geo/checkpoint-8208/config.json
Model weights saved in m3rg-iitd/matscibert-geo/checkpoint-8208/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 30828
  Batch size = 48
Saving model checkpoint to m3rg-iitd/matscibert-geo/checkpoint-8721
Configuration saved in m3rg-iitd/matscibert-geo/checkpoint-8721/config.json
Model weights saved in m3rg-iitd/matscibert-geo/checkpoint-8721/pytorch_model.bin


In [None]:
'''train_dataset = lm_datasets["train"]
eval_dataset = lm_datasets["validation"]

train_output = trainer.evaluate(train_dataset)
eval_output = trainer.evaluate()
print(train_output)
print('----')
print(eval_output)'''