# Load and prepare the data

In [1]:
import os
import pandas as pd

In [2]:
path_working_dir = "/content/drive/MyDrive/NLP/innoscripta"
path_data_dir = os.path.join(path_working_dir, "data")
path_models_dir = os.path.join(path_working_dir, "models")
print("{}:\n{}".format(path_working_dir, os.listdir(path_data_dir)))

/content/drive/MyDrive/NLP/innoscripta:
['keywords_dataset.csv', '.ipynb_checkpoints', 'df_clean.pkl', 'cleaned.csv', 'df_fixed.pkl', 'train_df.pkl', 'test_df.pkl']


In [3]:
df_to_open = "df_clean.pkl"
df_fixed = pd.read_pickle(os.path.join(path_data_dir, df_to_open))
print(df_fixed.dtypes, "\n")
print(df_fixed.head(5), "\n")
print(df_fixed.shape, "\n")

sentence_id    object
words          object
labels         object
dtype: object 

  sentence_id                words labels
0         112    Gebäudevermessung  B-KEY
1         112                  von      O
2         112  Mehrfamilienhäusern      O
3         112                    ,      O
4         112      Einkaufszentren      O 

(157108, 3) 



In [4]:
labels = df_fixed["labels"].unique().tolist()
print("Labels in the dataset: ", labels)

Labels in the dataset:  ['B-KEY', 'O', 'I-KEY']


In [5]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df_fixed, test_size=0.15, random_state = 42)
df_train, df_val = train_test_split(df_train, test_size=0.15, random_state = 42)
print("Train: ", df_train.shape)
print("Validate: ", df_val.shape)
print("Test: ", df_test.shape)

Train:  (113509, 3)
Validate:  (20032, 3)
Test:  (23567, 3)


In [6]:
# train_data_name = "train_df.pkl"
# df_train = pd.read_pickle(os.path.join(path_data_dir, train_data_name))

# test_data_name = "test_df.pkl"
# df_test = pd.read_pickle(os.path.join(path_data_dir, test_data_name))

# Init, train and evaluate the model

In [7]:
!pip install simpletransformers > /dev/null
from simpletransformers.ner import NERModel, NERArgs
# import warnings
# warnings.filterwarnings('ignore')

In [8]:
transformers_dict = {
  0: "distilbert-base-german-cased",
  1: "deepset/gelectra-base",
  2: "deepset/gelectra-large",
  3: "deepset/gbert-base",
  4: "deepset/gbert-large",
  5: "xlm-roberta-large-finetuned-conll03-german",
  6: "bert-base-german-cased",
  7: "xlm-roberta-base",
  8: "xlm-roberta-large-finetuned-conll03-german"
}

In [9]:
model_name = transformers_dict[3]
model_dir_name = model_name.replace("/","-") + "-df_fixed"
model_path_dir = os.path.join(path_models_dir, model_dir_name)
print("Model to be trained:\t{}\nModel to be saved to:\t{}".format(model_name, model_path_dir))

Model to be trained:	deepset/gbert-base
Model to be saved to:	/content/drive/MyDrive/NLP/innoscripta/models/deepset-gbert-base-df_fixed


In [10]:
args = NERArgs()
args.num_train_epochs = 8
args.learning_rate = 1e-4
args.train_batch_size = 16
args.eval_batch_size = 16
args.output_dir = model_path_dir
args.overwrite_output_dir = True
args.evaluate_during_training = True
args.evaluate_during_training_verbose = True
#args.labels_list = df_clean["labels"].unique().tolist()

In [11]:
print(model_name)

deepset/gbert-base


In [12]:
%%capture
model = NERModel('bert', model_name, args = args, labels=labels)
#model.get_named_parameters()

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gb

In [13]:
model.train_model(df_train, eval_data=df_val, verbose=True)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=7121.0), HTML(value='')))




HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=8.0), HTML(value='')))

HBox(children=(HTML(value='Running Epoch 0 of 8'), FloatProgress(value=0.0, max=446.0), HTML(value='')))








HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5624.0), HTML(value='')))




HBox(children=(HTML(value='Running Evaluation'), FloatProgress(value=0.0, max=352.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 1 of 8'), FloatProgress(value=0.0, max=446.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5624.0), HTML(value='')))

Exception ignored in: <bound method _MultiProcessingDataLoaderIter.__del__ of <torch.utils.data.dataloader._MultiProcessingDataLoaderIter object at 0x7f9f50efff60>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py", line 1203, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py", line 1177, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/lib/python3.6/multiprocessing/process.py", line 122, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process





HBox(children=(HTML(value='Running Evaluation'), FloatProgress(value=0.0, max=352.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 2 of 8'), FloatProgress(value=0.0, max=446.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5624.0), HTML(value='')))




HBox(children=(HTML(value='Running Evaluation'), FloatProgress(value=0.0, max=352.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 3 of 8'), FloatProgress(value=0.0, max=446.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5624.0), HTML(value='')))




HBox(children=(HTML(value='Running Evaluation'), FloatProgress(value=0.0, max=352.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 4 of 8'), FloatProgress(value=0.0, max=446.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5624.0), HTML(value='')))

Exception ignored in: <bound method _MultiProcessingDataLoaderIter.__del__ of <torch.utils.data.dataloader._MultiProcessingDataLoaderIter object at 0x7f9fbfff4588>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py", line 1203, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py", line 1177, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/lib/python3.6/multiprocessing/process.py", line 122, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process





HBox(children=(HTML(value='Running Evaluation'), FloatProgress(value=0.0, max=352.0), HTML(value='')))





HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5624.0), HTML(value='')))




HBox(children=(HTML(value='Running Evaluation'), FloatProgress(value=0.0, max=352.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 5 of 8'), FloatProgress(value=0.0, max=446.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5624.0), HTML(value='')))




HBox(children=(HTML(value='Running Evaluation'), FloatProgress(value=0.0, max=352.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 6 of 8'), FloatProgress(value=0.0, max=446.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5624.0), HTML(value='')))

Exception ignored in: <bound method _MultiProcessingDataLoaderIter.__del__ of <torch.utils.data.dataloader._MultiProcessingDataLoaderIter object at 0x7f9f4f63e128>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py", line 1203, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py", line 1177, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/lib/python3.6/multiprocessing/process.py", line 122, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process





HBox(children=(HTML(value='Running Evaluation'), FloatProgress(value=0.0, max=352.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 7 of 8'), FloatProgress(value=0.0, max=446.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5624.0), HTML(value='')))




HBox(children=(HTML(value='Running Evaluation'), FloatProgress(value=0.0, max=352.0), HTML(value='')))





(3568,
 {'eval_loss': [0.2548997637859429,
   0.24221539519401797,
   0.25561478603537,
   0.3201824469936334,
   0.32021264039047476,
   0.3121258786038549,
   0.38082995094113026,
   0.43877809755263475,
   0.4444126656591875],
  'f1_score': [0.7705534658785599,
   0.7885774168194918,
   0.801526717557252,
   0.8024014617593318,
   0.8070735274564552,
   0.8162111215834118,
   0.8121935056328695,
   0.8160358697085586,
   0.8165915716936125],
  'global_step': [446, 892, 1338, 1784, 2000, 2230, 2676, 3122, 3568],
  'precision': [0.7353846153846154,
   0.7359413202933985,
   0.7511100148001973,
   0.7464788732394366,
   0.7631380437515716,
   0.7805820242080865,
   0.7658085478630342,
   0.7660311958405546,
   0.7698650674662668],
  'recall': [0.809255079006772,
   0.8493227990970654,
   0.8591986455981941,
   0.8673814898419865,
   0.8563769751693002,
   0.8552483069977427,
   0.8645598194130926,
   0.8730248306997742,
   0.8693566591422122],
  'train_loss': [0.0015014158561825752,
  

In [14]:
result, model_outputs, preds_list = model.eval_model(df_test)
result

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5886.0), HTML(value='')))




HBox(children=(HTML(value='Running Evaluation'), FloatProgress(value=0.0, max=368.0), HTML(value='')))




{'eval_loss': 0.40869809420373415,
 'f1_score': 0.8270833333333334,
 'precision': 0.7837245009870586,
 'recall': 0.875520705709385}

In [15]:
my_text = "Gebäudevermessung von Mehrfamilienhäusern Einkaufszentren Außenanlagen und bereiten ihn auf 3D Scan vor."
prediction, model_output = model.predict([my_text])
prediction

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Running Prediction'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




[[{'Gebäudevermessung': 'B-KEY'},
  {'von': 'O'},
  {'Mehrfamilienhäusern': 'O'},
  {'Einkaufszentren': 'O'},
  {'Außenanlagen': 'O'},
  {'und': 'O'},
  {'bereiten': 'O'},
  {'ihn': 'O'},
  {'auf': 'O'},
  {'3D': 'B-KEY'},
  {'Scan': 'I-KEY'},
  {'vor.': 'O'}]]

In [16]:
#model = NERModel('bert', model_name = "/content/outputs/best_model")