# All Needed Modules

In [1]:
# !pip install torch
# !pip install pdfminer
# !pip install pytesseract
# !pip install tensorflow
# !pip install pdf2image
# !pip install transformers
# !pip install spacy
# !spacy download en_core_web_lg
# !pip install opencv-python
# !pip install seqeval
# !pip install evaluate

In [39]:
from transformers import BertConfig
bert_config = BertConfig.from_pretrained('bert-base-cased')
print(bert_config)

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}



In [2]:
#PDF Miner (PDF Scraping)
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import HTMLConverter, TextConverter, XMLConverter
from io import StringIO
from pdfminer.pdfpage import PDFPage

#OCR
from pdf2image import convert_from_path
import cv2
import pytesseract
pytesseract.pytesseract.tesseract_cmd=r'Tesseract-OCR\tesseract.exe'


#NLP Preprocessing
import nltk
import re
import regex
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


#NLP Processing TensorFLow
import tensorflow as tf
from tensorflow.keras import Model, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional

#NLP Processing Spacy
import spacy
# import spacy_transformers
# spc = spacy.load('en_core_web_trf')
spc = spacy.load('en_core_web_lg')
# spc.add_pipe('sentencizer')

#Model Training and Evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

import torch

#Basic
import numpy as np
import pandas as pd
import pickle
import glob
import os
# import docker
import string
import matplotlib.pyplot as plt

pc = '!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~'

# Text Preprocessing 1 & 2 (V2.0)

In [3]:
def get_pdf_file_content_Text(path_to_pdf):
    '''
    path_to_pdf: is the parameter that will give access to the PDF File 
    we want to extract the content.
    '''

    '''
    PDFResourceManager is used to store shared resources such as fonts or images that 
    we might encounter in the files. 
    '''

    resource_manager = PDFResourceManager(caching=True)

    '''
    create a string object that will contain the final text the representation of the pdf. 
    '''
    out_text = StringIO()

    '''
    UTF-8 is one of the most commonly used encodings, and Python often defaults to using it.
    In our case, we are going to specify in order to avoid some encoding errors.
    '''
    codec = 'utf-8'

    """
    LAParams is the object containing the Layout parameters with a certain default value. 
    """
    laParams = LAParams(line_overlap=0.3, detect_vertical=True)

    '''
    Create a TextConverter Object, taking :
    - ressource_manager,
    - out_text 
    - layout parameters.
    '''
    text_converter = TextConverter(resource_manager, out_text, laparams=laParams)
    fp = open(path_to_pdf, 'rb')

    '''
    Create a PDF interpreter object taking: 
    - ressource_manager 
    - text_converter
    '''
    interpreter = PDFPageInterpreter(resource_manager, text_converter)

    '''
    We are going to process the content of each page of the original PDF File
    '''
    try:
        for page in PDFPage.get_pages(fp, pagenos=set(), maxpages=0, password="", caching=True, check_extractable=True):
            interpreter.process_page(page)
    except:
        for page in PDFPage.get_pages(fp, pagenos=set(), maxpages=0, password=b"", caching=True, check_extractable=True):
            interpreter.process_page(page)

    '''
    Retrieve the entire contents of the “file” at any time 
    before the StringIO object’s close() method is called.
    '''
    text = out_text.getvalue()

    '''
    Closing all the ressources we previously opened
    '''
    fp.close()
    text_converter.close()
    out_text.close()

    '''
    Return the final variable containing all the text of the PDF
    '''
    return text

def group_read_pdfminer(folder_path):
    all_extracted_text={}
    for i,filename in enumerate(glob.glob(os.path.join(folder_path, '*.pdf'))):
        try:
            all_extracted_text[re.search(r'(\w+).pdf$',filename).group(1) +'_'+ str(i)] = get_pdf_file_content_Text(filename).lower()
        except:
            pass
        
def get_pdf_file_content_Text_ocr(pdf_path):
    #Convert pdf to images
    images = convert_from_path(pdf_path, 500,poppler_path=r'C:\Program Files\poppler-0.68.0\bin')
    output_path = 'ocr_out/'
    for i, image in enumerate(images):
        fname = output_path + 'image_'+str(i)+'.png'
        image.save(fname, "PNG")

    extracted_text = []
    #images to text
    for i,filename in enumerate(glob.glob(os.path.join(output_path, '*.png'))):
        try:
            img1 = cv2.imread(filename)
            extracted_text.append(pytesseract.image_to_string(img1))
            print('successful'+str(i))
        except:
            print('unsuccessful'+str(i))

    text = ' '.join(extracted_text)
    
    return text

def group_read_ocr(folder_path):
    all_extracted_text={}
    for i,filename in enumerate(glob.glob(os.path.join(folder_path, '*.pdf'))):
        try:
            all_extracted_text[re.search(r'(\w+).pdf$',filename).group(1) +'_'+ str(i)] = get_pdf_file_content_Text_ocr(filename).lower()
        except:
            pass
        
def preprocessing_1(text_pdfmine):
    #Punctuation of '-' because make a newline 
    text_punew = re.sub(r'(-\n?\s)','',text_pdfmine.lower())
    
    #Remove spacing n dot '\n' & '•'
    if bool(regex.search(r'(?<=(\w\w))[\n•](?=(\w\w))',text_punew))==True:
        text_punew_spdo = re.sub(r'[\n•]', ' ', text_punew)
    else:
        text_punew_spdo = re.sub(r'[\n•]', '', text_punew)
        
    #Adjustment in Germany alphabetic
    text_punew_spdo_ger = re.sub(r'(?<=[oua])(¨)','e',text_punew_spdo)
    text_punew_spdo_ger = re.sub(r'ß','ss',text_punew_spdo_ger)
    
#     #Adjusment in Over-Spacing
#     text_punew_spdo_ger_ovsp = re.sub(r'(\s)(?=\s+)','',text_punew_spdo_ger)
    
    #Delete et al. + lower
    text_punew_spdo_ger_ovsp_etal = regex.sub(r'(\w+\s+et al.)',' ',text_punew_spdo_ger)
    
    #Eliminate fig and table
    text_punew_spdo_ger_ovsp_etal = re.sub(r'ﬁ','fi',text_punew_spdo_ger_ovsp_etal)
    text_punew_spdo_ger_ovsp_etal_fig = re.sub(r'(table|tables|figure|figures|fig.?|figs.)\s+(\d+|\d+.)', ' ', text_punew_spdo_ger_ovsp_etal)
    
    #Eliminate email
    text_punew_spdo_ger_ovsp_etal_fig_emai = regex.sub(r'\b[\w-.]+?@\w+?.\w+[\w\.\-]+\b',' ',text_punew_spdo_ger_ovsp_etal_fig)

    #Eliminate website
    text_punew_spdo_ger_ovsp_etal_fig_emai_web = regex.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w\s_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?',' ',text_punew_spdo_ger_ovsp_etal_fig_emai)
    
    #Adjusment in Over-Spacing
    text_punew_spdo_ger_ovsp_etal_fig_emai_web = re.sub(r'(\s)(?=\s+)','',text_punew_spdo_ger_ovsp_etal_fig_emai_web)
    
    return text_punew_spdo_ger_ovsp_etal_fig_emai_web

def preprocessing_2(text_punew_spdo_ger_ovsp_etal_fig_emai_web):
    #Splitted into sentences by Spacy
    spc_text = spc(text_punew_spdo_ger_ovsp_etal_fig_emai_web)

    #Preparation
    prepro1 = []
    col_intro = []
    stop_ack_stc = []
    stop_ref_stc = []
    name = []
    prepro2=[]
    
    #List all possible name with spaCy
    for i in spc_text.ents:
        if i.label_.lower() == 'person':
            name.append(i.text)
        else:
            pass
    
    #Pattern for combination of alphabet and digit in word
    pca = '!"#$%&\*+/:;<=>?@[\]^_`{|}~'
    pattern_comb = regex.compile(r'([a-z]+[\d]+|\d+[a-z]+|\d+[{}]+|[a-z]+[{}]+|[{}]+[a-z]+|[{}]+[\d]+)'.format(pca,pca,pca,pca))
    
    #List of Sentences with application of lemmatization
    for i,j in enumerate(spc_text.sents):

        #Eliminate word - number - punc-chars combination
        sent_lem = regex.sub(pattern_comb,' ',j.text)

        #delete name
        if len(name) != 0:
            try:
                sent_lem = regex.sub(r",*(\s*\b(?:{}))\b".format("|".join(name)),'',sent_lem)
            except:
                pass
        else:
            pass

        prepro1.append(sent_lem)

        #Search for the start pf acknowledgements and references
        if bool(re.search('(acknowledgments|acknowledgment)',sent_lem)) == True:
            stop_ack_stc.append(i)

        if bool(re.search('(references|reference)',sent_lem)) == True:
            stop_ref_stc.append(i)

        elif bool(re.search('(introduction)',sent_lem)) == True:
            col_intro.append(i)

    #Cut parts before Introduction and after the acknowledgments or references
    try:
        if len(stop_ack_stc) == 0:
            try:
                prepro1 = prepro1[col_intro[0]:stop_ref_stc[-1]]
            except:
                prepro1 = prepro1[:stop_ref_stc[-1]]
        else:
            try:
                prepro1 = prepro1[col_intro[0]:stop_ack_stc[-1]]
            except:
                prepro1 = prepro1[:stop_ack_stc[-1]]
    except:
        pass

    #Pattern for Remove Citation
    pattern1 = regex.compile(r'\(([\w\s\d{}]+)\)'.format(pc))

    #Pattern for unused space first n last
    pattern3 = regex.compile(r'((^\s+)(?=.)|(?<=.)(\s+$))')

    #Pattern for Apply over-space
    pattern4 = regex.compile(r'(\s)(?=\s+)')

    #Pattern Additional compiler
    pattern_add = regex.compile(r'(\x01|\x0c|\b[a-z]\b|°|^\b(\w\s)|I|)')

    #Pattern for combined word and number
    for i,j in enumerate(prepro1):
        if bool(regex.search(pattern1,j)) == True:
            if bool(regex.search(r'\d',regex.search(pattern1,j).group())) == True:
                try:
                    layer = regex.findall(pattern1,j)
                    if type(layer[0]) == tuple:
                        layer = list(filter(None, [i for i in layer[0]]))
                        prepro2.append(re.sub(layer[0],' ',j))
                    else:
                        prepro2.append(re.sub(regex.findall(pattern1,j)[0],' ',j))
                except:
                    prepro2.append(j)
            else:
                prepro2.append(j)
        else:
            prepro2.append(j)

        #Weird character of ﬂ
        layer = regex.sub(r'ﬂ','fl',prepro2[i])

        #With Removing number and punctuations    
        layer = regex.sub(r'([^A-Za-zöäüéíáúóðèñæýßôþ\s])',' ',layer) 

        #With Removing Additional compiler
        layer = regex.sub(pattern_add,'',layer)
        
        #With Removing unused space first n las
        layer = regex.sub(pattern3,'',layer)
        
        #With Removing over-space
        prepro2[i] = regex.sub(pattern4,'',layer) + '.'

    #Final filtering for less than three words sentence
    d = lambda y : None if len(y.split(' '))<=4 else y
    prepro2 = list(filter(d, prepro2))
    
    return prepro2

def group_read_all(folder_path):
    prepro3 = []
    for i,filename in enumerate(glob.glob(os.path.join(folder_path, '*.pdf'))):
        try:
            text_pdfmine = get_pdf_file_content_Text(filename).lower()
        except:
            text_pdfmine = get_pdf_file_content_Text_ocr(filename).lower()
        
        text_punew_spdo_ger_ovsp_etal_fig_emai_web = preprocessing_1(text_pdfmine)
        prepro2 = preprocessing_2(text_punew_spdo_ger_ovsp_etal_fig_emai_web)
        prepro3 = prepro2 + prepro3
    return prepro3

In [4]:
# prepro2 = group_read_all('PDF Data/combine/')
import re
prepro2 = []
with open('data_prepro2_set.txt', 'r') as f:
    lines = f.readlines()
    for i in lines:
        prepro2.append(re.sub(r'\.\n','',i))

In [5]:
prepro2

['par la ré côté and konrad introduction heat transfer and frost action analyses in pavements require the knowledge of the thermal properties of each layer of the pavement structure including subgrade soils.',
 'among the various thermal properties thermal conductivity is one of the most important input parameters in heat transfer modelling.',
 'it is well known that the thermal conductivity of received march.',
 'published on the nrc research press web site at université laval sainte foy qc canada.',
 'industrial chair on the operation of infrastructures submitted to frost action.',
 'soil is strongly influenced by its density and water content because of contrasting values of its basic components.',
 'for instance the thermal conductivity of solid particles generally varies from to and those for water ice and air are and respectively.',
 'moreover several other factors such as grain mineralogy and fabric also need to be considered.',
 'in the last number of decades many studies were 

# Model Building
## Tag - Entities Library

In [6]:
#Auto Read entities
dict_entities={}
path='list_entities - V2/'
for filename in glob.glob(os.path.join(path, '*.txt')):
    gist_file = open(filename, "r", encoding = "ISO-8859-1")
    try:
        content = gist_file.read().lower()
        dict_entities[re.search(r'(\w+).txt$',filename).group(1)] = list(filter(None, content.split("\n")))
    finally:
        gist_file.close()

In [7]:
dict_entities

{'GeoLoc': ['democratic republic of the congo',
  'united saint tes minor outlying islands',
  'united states minor outlying islands',
  'referred to as gondwana island',
  'saint inbrueche e bad orb',
  'alter saint inbruch am hirschberg',
  'saint inbrueche e bad orb',
  'sint eustatius saba bonaire',
  'british indian ocean territory',
  'plurinational saint te bolivia',
  'southwest german molasse basin',
  'molokai or niihau island',
  'oblast of nishni novgorod',
  'southeast corner of þingvallavatn',
  'slightly west of votuklettar',
  'variscan batholith of corsica',
  'tethyan neyriz opholite complex',
  'gulf of mexico basin',
  'santa rosa de tastil',
  'los humeros geothermal field',
  'western canada sedimentary basin',
  'steinbrüche e bad orb',
  'alter steinbruch am hirschberg',
  'altes kalkwerk firma ickes',
  'acker auf dem mühlberg',
  'the taupo volcanic zone',
  'south and west australia',
  'sainte croix en plaine',
  'waington ams dst col',
  'cap rock gulf coas

In [8]:
list(dict_entities.values())

[['democratic republic of the congo',
  'united saint tes minor outlying islands',
  'united states minor outlying islands',
  'referred to as gondwana island',
  'saint inbrueche e bad orb',
  'alter saint inbruch am hirschberg',
  'saint inbrueche e bad orb',
  'sint eustatius saba bonaire',
  'british indian ocean territory',
  'plurinational saint te bolivia',
  'southwest german molasse basin',
  'molokai or niihau island',
  'oblast of nishni novgorod',
  'southeast corner of þingvallavatn',
  'slightly west of votuklettar',
  'variscan batholith of corsica',
  'tethyan neyriz opholite complex',
  'gulf of mexico basin',
  'santa rosa de tastil',
  'los humeros geothermal field',
  'western canada sedimentary basin',
  'steinbrüche e bad orb',
  'alter steinbruch am hirschberg',
  'altes kalkwerk firma ickes',
  'acker auf dem mühlberg',
  'the taupo volcanic zone',
  'south and west australia',
  'sainte croix en plaine',
  'waington ams dst col',
  'cap rock gulf coast',
  'new

## Entities Map

In [9]:
ent_map = {'B-GeoTime': 1,
 'O': 0,
 'I-GeoTime': 2,
 'B-GeoLoc': 3,
 'I-GeoLoc': 4,
 'B-GeoMeth': 5,
 'I-GeoMeth': 6,
 'B-GeoPetro': 7,
 'I-GeoPetro': 8, 'None': -100}

In [10]:
reverse_ent_map = {i:j for j,i in ent_map.items()}
reverse_ent_map

{1: 'B-GeoTime',
 0: 'O',
 2: 'I-GeoTime',
 3: 'B-GeoLoc',
 4: 'I-GeoLoc',
 5: 'B-GeoMeth',
 6: 'I-GeoMeth',
 7: 'B-GeoPetro',
 8: 'I-GeoPetro',
 -100: 'None'}

## Data Input NER

In [11]:
#Grab all needed sentences that has Entities
all_sents = []

for i in prepro2:

    ent_list = i.split(' ')
    
    for j,k in enumerate(list(dict_entities.values())):
        pat = r",*(\b(?:{})e*s*)\b".format("|".join(k))
        sear = re.findall(pat,str(i))
        
        try:
            if type(sear[0]) == tuple:
                sear = list(filter(None, [d for d in sear[0]]))
            else:
                pass
        except:
            pass
        
        if bool(sear) == True:
            all_sents.append(i)

# TRANSFER LEARNING: BERT HUGGINGFACE

## Build Database with BERT Preprocessing (Tokenizer) in BIO-Encoding

In [12]:
df = pd.DataFrame(list(zip(all_sents)), columns=['sent'])
df

Unnamed: 0,sent
0,par la ré côté and konrad introduction heat tr...
1,among the various thermal properties thermal c...
2,it is well known that the thermal conductivity...
3,it is well known that the thermal conductivity...
4,published on the nrc research press web site a...
...,...
3774,diorite clb clb fineto mediumgrained texture p...
3775,appendix inverse geochemical approach the dire...
3776,appendix inverse geochemical approach the dire...
3777,using published individual melt partition coef...


In [13]:
from transformers import AutoTokenizer

model_checkpoint = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [14]:
def get_training_corpus():
    for start_idx in df.sent.values.tolist():
        yield start_idx
        
#Ready for NLP NER V3.0 (BIO Encoding V2)
def bio_encoding(input_ids, input_sent): 
    #Check entities in the sentence.
    for j,k in enumerate(list(dict_entities.values())):
        pat = r",*(\b(?:{})e*s*)\b".format("|".join(k))
        se = re.sub(r'\.', '', input_sent)
        sear = re.findall(pat,se)

        try:
            if type(sear[0]) == tuple:
                sear = list(filter(None, [d for d in sear[0]]))
            else:
                pass
        except:
            pass
        
        #Grab all entities for BIO Encoding
        if bool(sear) == True:
            sent = input_sent.split(' ')

            for l in sear:
                inp_array = np.array(input_ids)
                
                #Entities with more than one words
                if len(l.split(' ')) > 1:
                    try:
                        first = l.split(' ')[0]
                        id_num = sent.index(first)

                        if len(np.where(inp_array == id_num)[0].tolist())==1:
                            input_ids = np.where(inp_array == id_num,'B-' + list(dict_entities.keys())[j],inp_array).tolist()

                        elif len(np.where(inp_array == id_num)[0].tolist())>1:
                            k = 1
                            for i in np.where(inp_array == id_num)[0].tolist():
                                if k == 1:
                                    input_ids[i] = 'B-' + list(dict_entities.keys())[j]
                                    k = k+1
                                else:
                                    input_ids[i] = 'I-' + list(dict_entities.keys())[j]

                        else:
                            pass

                        for b in range(1,len(l.split(' '))):
                                input_ids = np.where(inp_array == id_num+b,'I-' + list(dict_entities.keys())[j],np.array(input_ids).tolist())
                    except:
                        pass
                    
                #Entities with one word
                elif len(l.split(' ')) == 1:
                    try:
                        id_num = sent.index(l)
                        if len(np.where(inp_array == id_num)[0].tolist())==1:
                            input_ids = np.where(inp_array == id_num,'B-' + list(dict_entities.keys())[j],inp_array).tolist()

                        elif len(np.where(inp_array == id_num)[0].tolist())>1:
                            k = 1
                            for i in np.where(inp_array == id_num)[0].tolist():
                                if k == 1:
                                    input_ids[i] = 'B-' + list(dict_entities.keys())[j]
                                    k = k+1
                                else:
                                    input_ids[i] = 'I-' + list(dict_entities.keys())[j]
                        else:
                            pass

                        sent[id_num] = 'B-' + list(dict_entities.keys())[j]
                    except:
                        pass

                #No Entities
                else:
                    pass
        else:
            pass

    list_map = list(ent_map.keys())
    for i,j in enumerate(input_ids):
        if j in list(list_map) or j == None:
            pass
        else:
            input_ids[i] = 'O'
            
    return input_ids

def ner_encoded(input_ids):
    return [ent_map[str(i)] for i in input_ids]

In [15]:
#Update Tokenizer
old_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(), 52000)

#Build Dataframe
df_tok = df
d = df_tok['sent'].apply(lambda x: tokenizer(x))
df_tok.sent_tokens = d.apply(lambda x: x.tokens())
df_tok['inp_id_tok'] = d.apply(lambda x: x.word_ids())
df_tok['input_ids'] = d.apply(lambda x: x.data['input_ids'])
df_tok['attention_mask'] = d.apply(lambda x: x.data['attention_mask'])
df_tok['token_type_ids'] = d.apply(lambda x: x.data['token_type_ids'])
df_tok['ner_tags'] = df_tok.apply(lambda x: bio_encoding(x['inp_id_tok'], x['sent']), axis=1)
df_tok['labels'] = df_tok['ner_tags'].apply(lambda x: ner_encoded(x))

  df_tok.sent_tokens = d.apply(lambda x: x.tokens())


In [16]:
df_tok

Unnamed: 0,sent,inp_id_tok,input_ids,attention_mask,token_type_ids,ner_tags,labels
0,par la ré côté and konrad introduction heat tr...,"[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...","[2, 651, 285, 138, 5163, 80, 3769, 2140, 148, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, O, O, O, O, O, O, O, B-GeoMeth, O, O, O...","[-100, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, ..."
1,among the various thermal properties thermal c...,"[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...","[2, 1954, 62, 916, 153, 1308, 153, 189, 108, 7...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, O, O, O, B-GeoMeth, O, B-GeoMeth, I-Geo...","[-100, 0, 0, 0, 5, 0, 5, 6, 0, 0, 0, 0, 0, 0, ..."
2,it is well known that the thermal conductivity...,"[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, N...","[2, 321, 108, 460, 1569, 160, 62, 153, 189, 73...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[None, O, O, B-GeoLoc, O, O, O, B-GeoMeth, I-G...","[-100, 0, 0, 3, 0, 0, 0, 5, 6, 0, 0, 0, 0, -100]"
3,it is well known that the thermal conductivity...,"[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, N...","[2, 321, 108, 460, 1569, 160, 62, 153, 189, 73...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[None, O, O, B-GeoLoc, O, O, O, B-GeoMeth, I-G...","[-100, 0, 0, 3, 0, 0, 0, 5, 6, 0, 0, 0, 0, -100]"
4,published on the nrc research press web site a...,"[None, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[2, 2136, 166, 62, 1911, 1548, 259, 5518, 2218...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...
3774,diorite clb clb fineto mediumgrained texture p...,"[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...","[2, 1089, 870, 870, 7008, 7058, 2347, 696, 629...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, B-GeoPetro, O, O, O, O, O, B-GeoPetro, ...","[-100, 7, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 7, 0, ..."
3775,appendix inverse geochemical approach the dire...,"[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...","[2, 5544, 4688, 872, 2946, 62, 1659, 872, 2946...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3776,appendix inverse geochemical approach the dire...,"[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...","[2, 5544, 4688, 872, 2946, 62, 1659, 872, 2946...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3777,using published individual melt partition coef...,"[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...","[2, 415, 2136, 1631, 1424, 2248, 1320, 80, 878...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, O, O, O, O, O, B-GeoMeth, O, O, O, O, O...","[-100, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, ..."


In [17]:
# for i,j in enumerate(df_tok.sent_tokens):
#     print(j)
#     print(df_tok.ner_tags[i])
#     print('---------')  

# Data Input Preparation

## Splitting Data (Train, Test, and Validation) in Dataset Dictionary format

In [18]:
from datasets.dataset_dict import DatasetDict
from datasets import Dataset

train_dataset, validation_dataset = train_test_split(df_tok[["attention_mask", "input_ids", "labels", "token_type_ids"]], test_size=0.30, random_state=42)
validation_dataset, test_dataset = train_test_split(validation_dataset, test_size=0.2, random_state=42)

train_dataset = Dataset.from_dict(train_dataset)
test_dataset = Dataset.from_dict(test_dataset)
validation_dataset =  Dataset.from_dict(validation_dataset)
my_dataset_dict = DatasetDict({"train":train_dataset,"test":test_dataset, "validation": validation_dataset})

In [25]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer
)

In [26]:
batch = data_collator([my_dataset_dict["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    0,    5,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    5,    0,    0,    5,    0,    0,    0,    5,    0,    5,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0, -100],
        [-100,    0,    0,    0,    0,    5,    0,    0,    7,    0,    0,    0,
            0,    0,    0,    0,    5,    0,    0,    0,    0,    0,    0,    0,
            0,    0, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]])

In [27]:
tf_train_dataset = my_dataset_dict["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=5,
)

tf_eval_dataset = my_dataset_dict["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=5,
)

  tensor = as_tensor(value)


In [28]:
id2label = {
 '0': 'O',   
 '1': 'B-GeoTime',
 '2': 'I-GeoTime',
 '3': 'B-GeoLoc',
 '4': 'I-GeoLoc',
 '5': 'B-GeoMeth',
 '6': 'I-GeoMeth',
 '7': 'B-GeoPetro',
 '8': 'I-GeoPetro'}

In [29]:
label2id = {
 'O': '0',
 'B-GeoTime': '1',
 'I-GeoTime': '2',
 'B-GeoLoc': '3',
 'I-GeoLoc': '4',
 'B-GeoMeth': '5',
 'I-GeoMeth': '6',
 'B-GeoPetro': '7',
 'I-GeoPetro': '8'}

In [30]:
from transformers import TFAutoModelForTokenClassification

model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
    from_pt=True
)

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
model.get_output_embeddings

<bound method TFPreTrainedModel.get_output_embeddings of <transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification object at 0x0000014F42CA6280>>

In [32]:
from transformers import create_optimizer
import tensorflow as tf

# Train in mixed-precision float16
# Comment this line out if you're using a GPU that will not benefit from this
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_epochs = 3
num_train_steps = len(tf_train_dataset) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [33]:
# from transformers.keras_callbacks import PushToHubCallback

# callback = PushToHubCallback(output_dir="bert-finetuned-ner", tokenizer=tokenizer)

model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    epochs=num_epochs,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x14f42941490>

In [34]:
model

<transformers.models.bert.modeling_tf_bert.TFBertForTokenClassification at 0x14f42ca6280>

In [35]:
import evaluate

metric = evaluate.load("seqeval")

In [36]:
label_names = list(label2id.keys())

In [37]:
import numpy as np

all_predictions = []
all_labels = []
for batch in tf_eval_dataset:
    logits = model.predict(batch)["logits"]
    labels = batch["labels"]
    predictions = np.argmax(logits, axis=-1)
    for prediction, label in zip(predictions, labels):
        for predicted_idx, label_idx in zip(prediction, label):
            if label_idx == -100:
                continue
            all_predictions.append(label_names[predicted_idx])
            all_labels.append(label_names[label_idx])
metric.compute(predictions=[all_predictions], references=[all_labels])

  tensor = as_tensor(value)






{'GeoLoc': {'precision': 0.7730769230769231,
  'recall': 0.8104838709677419,
  'f1': 0.7913385826771654,
  'number': 248},
 'GeoMeth': {'precision': 0.9568892645815723,
  'recall': 0.9609507640067911,
  'f1': 0.9589157136806439,
  'number': 1178},
 'GeoPetro': {'precision': 0.9340245051837889,
  'recall': 0.9411206077872745,
  'f1': 0.9375591296121096,
  'number': 1053},
 'GeoTime': {'precision': 1.0,
  'recall': 0.875,
  'f1': 0.9333333333333333,
  'number': 88},
 'overall_precision': 0.9302595893064703,
 'overall_recall': 0.9353330736268017,
 'overall_f1': 0.9327894327894327,
 'overall_accuracy': 0.986333081066643}

In [38]:
set(all_labels)

{'B-GeoLoc',
 'B-GeoMeth',
 'B-GeoPetro',
 'B-GeoTime',
 'I-GeoLoc',
 'I-GeoMeth',
 'I-GeoPetro',
 'I-GeoTime',
 'O'}