In [21]:
import os
from pathlib import Path
from tqdm import tqdm
import transformers
from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM, BertForMaskedLM, set_seed, BertTokenizer
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling, DataCollatorForWholeWordMask
from transformers import EarlyStoppingCallback, IntervalStrategy, SchedulerType
import math
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import random
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

In [2]:
model_name = 'm3rg-iitd/matscibert'
tokenizer = AutoTokenizer.from_pretrained(model_name)
scibert_model = BertForMaskedLM.from_pretrained('allenai/scibert_scivocab_uncased')
matsci_model = BertForMaskedLM.from_pretrained(model_name)
geosci_model = BertForMaskedLM.from_pretrained('/home/ppillai6/Desktop/BERT_training/geoscibert/checkpoint-20190/')

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClass

In [12]:
train_df = pd.read_csv('/home/ppillai6/Desktop/BERT_training/datasets/Geo_Dataset/Train.csv')
paras = train_df['Text'].tolist()

text = paras[-1] #500,250
print(text)

The key for determining electrofacies is core and log data integration. Recognition of electrofacies in a wide variety of depositional environments can be achieved through inductive and deductive methodologies. In two different fields located in Argentina


In [13]:
# Tokenized input
tokenized = tokenizer(text, padding='max_length', max_length=512, truncation=True)
inp = tokenized['input_ids']
print(len(inp))
inp = [x for x in inp if x!=0]
tokenizer.decode(inp)

512


'[CLS] the key for determining electrofacies is core and log data integration. recognition of electrofacies in a wide variety of depositional environments can be achieved through inductive and deductive methodologies. in two different fields located in argentina [SEP]'

In [14]:
# Masked input
aa = DataCollatorForWholeWordMask(tokenizer=tokenizer, mlm=0.15)
masked_inp = aa([tokenized])['input_ids'][0]
masked_inp = [x for x in masked_inp if x!=0]
masked_text = tokenizer.decode(masked_inp)
tokenizer.decode(masked_inp)

'[CLS] the key for determining electrofacies [MASK] core [MASK] log data integration. recognition of electrofacies in a [MASK] varietytise [MASK] [MASK] environments can be achieved through inductive and deductive methodologies. in two different unnecessary located in argentina [SEP]'

In [15]:
# Prepare data
inputs = tokenized
inputs['labels'] = inputs['input_ids']
inputs['input_ids'] =list(aa([tokenized])['input_ids'][0].numpy())
inputs = {x:torch.tensor([inputs[x]]) for x in inputs.keys()}

In [39]:
def compute_bert_loss(prediction_scores, labels, vocab_size):
    loss_fct = CrossEntropyLoss()        
    loss = loss_fct(prediction_scores.view(-1, vocab_size), labels.view(-1))
    return loss 

In [40]:
# SciBERT Model predictions
output = scibert_model(**inputs)
loss = output.loss
print('Loss:', loss)
print('Computed loss', compute_bert_loss(output[1], inputs['labels'], tokenizer.vocab_size))

predictions = output[1].detach().numpy()[0,:,:]
tok_preds = predictions.argmax(axis=1)
tokenizer.decode(tok_preds)

Loss: tensor(16.5412, grad_fn=<NllLossBackward0>)
Computed loss tensor(16.5412, grad_fn=<NllLossBackward0>)


'. the key for determining electrofacies is core and log data integration. recognition of electrofacies in a wide variety of depositional environments can be achieved through core and log analysis methodologies. in the different exped sites in argentina, the mining sites were conducted, core log log minings, the the deposition sites in. the deposition deposition sites in argentina. core core the electroies, core log log data integration the of these the environments deposition sites in argentina. depositional environments can be achieved through core log log mining analysis.. the mining industrial sites in.. mining and log of electro... a........ the...... the.. of... a.., argentina........ the core log log mining analysis. the the... conducted the core log mining techniques, the the exped sites sites argentina..al have core log log recognition recognition, core core log log analysis, the deposition deposition sites argentina. the exped sites.. the log sitess. the the exped sites sites

In [41]:
# MatSciBERT Model predictions
output = matsci_model(**inputs)
loss = output.loss
print('Loss:', loss)
print('Computed loss', compute_bert_loss(output[1], inputs['labels'], tokenizer.vocab_size))

predictions = output[1].detach().numpy()[0,:,:]
tok_preds = predictions.argmax(axis=1)
tokenizer.decode(tok_preds)

Loss: tensor(15.2873, grad_fn=<NllLossBackward0>)
Computed loss tensor(15.2873, grad_fn=<NllLossBackward0>)


'and the key for determining electrofacies is core and log data integration. recognition of electrofacies in a wide variety of depositional environments can be achieved through core and logmetric methodologies. in the different depositions in argentina. the core data.. in core log logss conclusions the the environments in in, the deposition depositionss argentina. core core the analysis analysis through core log log data integration recognition recognition the theies in a wide variety variety depositional environments can be achieved core core log logmetric methods. the two deposition depositionss argentina. the the the and of of the the the the the of of the the the the the of the the the the the the the the the the the the the the the of and the the the the and the the the the the the data methods the introduction the - the the the the the - the in the. the the and methodss in the the the core log logfacmetric. introduction the the inmetrics the two deposition depositions the. the de

In [42]:
# Geosci Model predictions
output = geosci_model(**inputs)
loss = output.loss
print('Loss:', loss)
print('Computed loss', compute_bert_loss(output[1], inputs['labels'], tokenizer.vocab_size))

predictions = output[1].detach().numpy()[0,:,:]
tok_preds = predictions.argmax(axis=1)
tokenizer.decode(tok_preds)

Loss: tensor(16.9911, grad_fn=<NllLossBackward0>)
Computed loss tensor(16.9911, grad_fn=<NllLossBackward0>)


'. the key for determining electrofacies is core and log data integration. recognition of electrofacies in a wide variety of depositional environments can be achieved through core and loggraphic methodologies. in the different facs in argentina. of oil froms the. sediment and sediments in. of of environments... the fac facs in. from core for reservoirfacies is core and data data integration the recognition thefacies in in wide variety variety depositional environments can be achieved sediment core core loggraphic methodologies. the many deposition facs. argentina.. of log datasfac.. the the fac facs the.. of core data integration... core log data integration... the fac information determiningfacfac accurately from core log data fac fac through integration and data data integration the recognition the - types in the recognition thefac, in a variety variety deposition deposition facies wide variety variety core core log logfac methodologies. these many sediment loggraphic. the many depos

In [43]:
# Vanilla BERT predictions
bert_tokenized = bert_tokenizer(text, padding='max_length', max_length=512, truncation=True)
bert_inputs = bert_tokenized
bert_inputs['labels'] = bert_inputs['input_ids']
bert_inputs['input_ids'] = bert_tokenizer(masked_text, padding='max_length', max_length=512, truncation=True)['input_ids']
bert_inputs = {x:torch.tensor([bert_inputs[x]]) for x in bert_inputs.keys()}

output = bert_model(**bert_inputs)
loss = output.loss
print('Loss:', loss)
print('Computed loss', compute_bert_loss(output[1], bert_inputs['labels'], bert_tokenizer.vocab_size))

predictions = output[1].detach().numpy()[0,:,:]
tok_preds = predictions.argmax(axis=1)
bert_tokenizer.decode(tok_preds)

Loss: tensor(13.4599, grad_fn=<NllLossBackward0>)
Computed loss tensor(13.4599, grad_fn=<NllLossBackward0>)


'.. the key for determining electrofacies is the and and data integration. recognition of electrofas in a different in of of and and environment can be achieved through inductive and deductive methodologies. in two different and and in a " " in and and and and and different : the electro electro : is is and and and and. recognition the electro the and and a different in of and and and and, the, the the in electro the and and and and and in, and and environments in and and the a the in the the the the the the the the the the the the the the the the the the the and and and the the the the the a the in and and and and and the and also the the the electro electro per is is and and and and ) the recognition the electro vas in different different in in and and and and and different different environment achieved in through both induc, and de - method and and different and different electro environments and environments and in a different in in and of in and and and and achieved through induc