Um den Anforderungen gerecht zu werden, welche die gestellte Aufgabe mit sich bringt, soll in diesem Teil ein Transformer-Modell eingesetzt werden. Konkret wird ein BERT-Modell eingesetzt.


In [3]:
# install the transformer package
!pip install -U transformers==4.9.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
# Attach Google Drive; the data and model paths used below live under it
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import re
import tqdm
from datetime import datetime

import pandas as pd
import numpy as np
import seaborn as sns

import tensorflow as tf

from sklearn import metrics
from sklearn.model_selection import train_test_split

from transformers import TFBertModel,  BertConfig, BertTokenizerFast

from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

"\ntqdm.tqdm.pandas()\nsns.set_style('whitegrid')\n\npd.options.display.max_colwidth = 600\npd.options.display.max_rows = 400\n"

In [4]:
# Read the labelled examples from the Drive folder and show (rows, columns)
dataset_path = 'drive/MyDrive/Colab Notebooks NLP/data/Total_AlyMan.csv'
df = pd.read_csv(dataset_path)
df.shape

(5841, 2)

In [5]:
# Keep only the first character of every label
df["Kennzeichen"] = df["Kennzeichen"].str[:1]
# Number of distinct trimmed labels (a missing label counts as its own value)
df["Kennzeichen"].nunique(dropna=False)

22

In [6]:
# Drop every label that occurs in fewer than min_labelEntrys rows
min_labelEntrys = 5


def _has_enough_rows(label_group):
    """Return True when a label group holds at least min_labelEntrys rows."""
    return len(label_group) >= min_labelEntrys


df = df.groupby('Kennzeichen').filter(_has_enough_rows)
df.shape

(5835, 2)

## BERT

In [7]:
# Work on an independent copy: the following cells overwrite columns of `data`,
# and a plain alias would silently change `df` as well, so the label cells
# above could no longer be re-run.
data = df.copy()

In [8]:
# Encode the label letters as a categorical and keep them in their own column
label_categorical = pd.Categorical(data['Kennzeichen'])
data['Kennzeichen_label'] = label_categorical

# Overwrite the label column with the numeric category codes
data['Kennzeichen'] = label_categorical.codes

In [87]:
# Map each numeric class code back to its label letter. A category's code is
# its position in `.cat.categories`, so enumerating the categories yields the
# complete mapping, sorted by code and with plain int keys, without zipping
# over every row of the frame.
labels_index = dict(enumerate(data['Kennzeichen_label'].cat.categories))
labels_index

{6: 'G',
 11: 'P',
 4: 'E',
 13: 'R',
 1: 'B',
 5: 'F',
 16: 'U',
 15: 'T',
 14: 'S',
 12: 'Q',
 8: 'K',
 18: 'X',
 7: 'H',
 10: 'N',
 17: 'W',
 9: 'M',
 0: 'A',
 3: 'D',
 2: 'C'}

In [9]:
# Hold out 25 % of the rows as test set, stratified by the encoded label.
# `data` is rebound to the training part, so this cell must run only once.
data, data_test = train_test_split(
    data,
    test_size=0.25,
    stratify=data[['Kennzeichen']],
    random_state=2,
)

### Setup

In [10]:
# Pretrained checkpoint shared by config, tokenizer and model
model_name = 'bert-base-german-cased'
# Token length used for truncation/padding and for the model input shape
max_length = 150
# Fetch the checkpoint configuration and switch off the hidden-state outputs
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False
# Tokenizer belonging to the checkpoint
tokenizer = BertTokenizerFast.from_pretrained(model_name, config=config)
# Pretrained BERT weights as a TensorFlow model
transformer_model = TFBertModel.from_pretrained(model_name, config=config)

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/255k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/485k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/533M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-german-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-german-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


### Build

In [11]:
# TF Keras documentation: https://www.tensorflow.org/api_docs/python/tf/keras/Model
# Reuse the BERT main layer of the pretrained model as the encoder
bert = transformer_model.layers[0]
# Single model input: max_length token ids per example
input_ids = tf.keras.layers.Input(shape=(max_length,), dtype='int32', name='input_ids')
inputs = {'input_ids': input_ids}
# Element 1 of the BERT output is the pooled representation (pooler_output)
bert_model = bert(inputs)[1]
# NOTE(review): calling the layer with training=False keeps this Dropout in
# inference mode, so it presumably never drops anything, not even during
# fit() -- confirm this is intended.
dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob, name='pooled_output')
pooled_output = dropout(bert_model, training=False)
# Linear classification head with one unit per label code in the training data
num_classes = data['Kennzeichen'].nunique()
output = tf.keras.layers.Dense(
    units=num_classes,
    kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
    name='output',
)(pooled_output)
# Assemble the Keras model and print its layer overview
model = tf.keras.models.Model(inputs=inputs, outputs=output)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_ids (InputLayer)      [(None, 150)]             0         
                                                                 
 bert (TFBertMainLayer)      TFBaseModelOutputWithPoo  109081344 
                             ling(last_hidden_state=(            
                             None, 150, 768),                    
                              pooler_output=(None, 76            
                             8),                                 
                              hidden_states=None, att            
                             entions=None)                       
                                                                 
 pooled_output (Dropout)     (None, 768)               0         
                                                                 
 output (Dense)              (None, 19)                14611 

### Train and fit

In [12]:
# Set an optimizer
optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)
# The output layer has no activation, so the loss works on raw logits
loss = {'output': CategoricalCrossentropy(from_logits = True)}
# CategoricalAccuracy measures accuracy, not F1, so it is logged under the
# name 'accuracy' instead of the misleading 'f1-score'
metric = {'output' : CategoricalAccuracy('accuracy')}
# Compile the model
model.compile(
    optimizer = optimizer,
    loss = loss,
    metrics = metric)
# One-hot encode the training labels; the width is pinned to the size of the
# model's output layer instead of being inferred from the largest label code
y = to_categorical(data["Kennzeichen"], num_classes=model.output_shape[-1])

In [13]:
# Encode the training texts as fixed-length id sequences (takes some time).
# NOTE(review): no attention mask is requested, so the encoding holds input
# ids only; padding positions are presumably attended like real tokens --
# confirm this is acceptable.
train_texts = data['Beispiel'].to_list()
tokenizer_options = dict(
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding="max_length",
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=False,
    verbose=True,
)
x = tokenizer(text=train_texts, **tokenizer_options)

In [14]:
# Train for 10 epochs; 20 % of the training rows are held out for validation
train_inputs = {'input_ids': x['input_ids']}
train_targets = {'output': y}
history = model.fit(
    x=train_inputs,
    y=train_targets,
    validation_split=0.2,
    batch_size=64,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Evaluate

In [15]:
#######################################
### ----- Evaluate the model ------ ###
# One-hot encode the test labels. The width is taken from the model's output
# layer: left to infer it, to_categorical uses max(label) + 1, which produces
# too few columns whenever the highest class code is missing from the test
# split and then breaks evaluate() with a shape mismatch.
y_test = to_categorical(data_test["Kennzeichen"], num_classes=model.output_shape[-1])
# Encode the test texts with the same settings as the training texts
X_test = tokenizer(
    text=data_test['Beispiel'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding="max_length",
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = False,
    verbose = True)

In [16]:
# Score the trained model on the held-out test split
test_inputs = {'input_ids': X_test['input_ids']}
model_eval = model.evaluate(x=test_inputs, y=y_test)



Das Ergebnis auf den Testdaten zeigt, dass mit diesem Modell realistische Predictions auf neuen Daten gemacht werden können.

# Anwendung des Modells
In diesem Abschnitt wird gezeigt, wie das Modell nun für individuelle Anfragen verwendet werden kann.

In [98]:
# Predict the label of each input text with the trained model
def predict_values(betriebsmittel: list, labels_index: dict, max_length: int) -> list:
    """Classify equipment descriptions and return one label per text.

    Uses the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        betriebsmittel: Texts to classify.
        labels_index: Mapping from numeric class code to label string.
        max_length: Token length every text is truncated/padded to.

    Returns:
        The predicted labels in input order. An entry is ``None`` when the
        predicted class code is not a key of ``labels_index``. An empty input
        yields an empty list without calling the tokenizer or the model.
    """
    texts = list(betriebsmittel)
    if not texts:
        return []
    # Encode the texts with the same settings used for training
    encoded = tokenizer(
        text=texts,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=False,
        verbose=True)
    # One row of class scores per text
    scores = model.predict(x={'input_ids': encoded['input_ids']})
    # The highest-scoring column is the predicted class code
    class_codes = np.argmax(scores, axis=1)
    # int() turns the numpy scalar into a plain int before the dict lookup
    return [labels_index.get(int(code)) for code in class_codes]

In [107]:
# Example equipment names to classify
betriebsmittel = [
    "Schaltschrank",
    "Schneehöhensensor",
    "Blitzschutz",
]

In [108]:
# Classify the example texts and display the predicted labels
predictions = predict_values(
    betriebsmittel=betriebsmittel,
    labels_index=labels_index,
    max_length=max_length,
)
predictions



['U', 'B', 'F']

In [109]:
# Print every input text next to its predicted label; zip pairs the two lists
# directly, so no hand-maintained index counter is needed
for text, label in zip(betriebsmittel, predictions):
    print("Prediction for \"%s\": %s" % (text, label))

Prediction for "Schaltschrank": U
Prediction for "Schneehöhensensor": B
Prediction for "Blitzschutz": F


Nun ist das Projekt an einem Punkt, an welchem für die Hauptklassen realistische Predictions erstellt werden können. Der nächste Schritt wäre es, das Modell zu exportieren und auf einer Webseite einzubinden, so dass ein Kunde die Abfrage eines Betriebsmittels tätigen kann. Weiter könnten die Unterklassen in jeweils separaten Modellen trainiert werden, um auch diese angeben zu können. Ich persönlich bin begeistert, mit dem Transformer-Modell eine Antwort auf die Aufgabenstellung gefunden zu haben.

In [110]:
# Persist the trained model in the Drive folder
export_path = 'drive/MyDrive/Colab Notebooks NLP/model/BERT_DIN_81346-2'
model.save(export_path)



# Appendix: Auswertung der Feature-Engineering-Daten
In diesem Abschnitt wird abschliessend untersucht, ob die Daten inkl. Feature-Engineering zu einer Verbesserung der Predictions führen.

In [6]:
# Read the feature-engineered examples and show (rows, columns)
fen_dataset_path = 'drive/MyDrive/Colab Notebooks NLP/data/Total_FEN.csv'
dfEng = pd.read_csv(fen_dataset_path)
dfEng.shape

(9380, 2)

In [7]:
# Keep only the first character of every label
dfEng["Kennzeichen"] = dfEng["Kennzeichen"].str[:1]
# Number of distinct trimmed labels (a missing label counts as its own value)
dfEng["Kennzeichen"].nunique(dropna=False)

22

In [8]:
# Work on an independent copy: the following cells overwrite columns of `data`,
# and a plain alias would silently change `dfEng` as well, so the label cell
# above could no longer be re-run.
data = dfEng.copy()

In [9]:
# Encode the label letters as a categorical and keep them in their own column
fen_label_categorical = pd.Categorical(data['Kennzeichen'])
data['Kennzeichen_label'] = fen_label_categorical

# Overwrite the label column with the numeric category codes
data['Kennzeichen'] = fen_label_categorical.codes

In [10]:
# Hold out 25 % of the rows as test set, stratified by the encoded label.
# `data` is rebound to the training part, so this cell must run only once.
data, data_test = train_test_split(
    data,
    test_size=0.25,
    stratify=data[['Kennzeichen']],
    random_state=2,
)

In [12]:
# Pretrained checkpoint shared by config, tokenizer and model
model_name = 'bert-base-german-cased'
# Token length used for truncation/padding and for the model input shape
max_length = 150
# Fetch the checkpoint configuration and switch off the hidden-state outputs
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False
# Tokenizer belonging to the checkpoint
tokenizer = BertTokenizerFast.from_pretrained(model_name, config=config)
# Fresh copy of the pretrained BERT weights for the second model
transformer_model = TFBertModel.from_pretrained(model_name, config=config)

Some layers from the model checkpoint at bert-base-german-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-german-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [13]:
# TF Keras documentation: https://www.tensorflow.org/api_docs/python/tf/keras/Model
# Reuse the BERT main layer of the pretrained model as the encoder
bert = transformer_model.layers[0]
# Single model input: max_length token ids per example
input_ids = tf.keras.layers.Input(shape=(max_length,), dtype='int32', name='input_ids')
inputs = {'input_ids': input_ids}
# Element 1 of the BERT output is the pooled representation (pooler_output)
bert_model = bert(inputs)[1]
# NOTE(review): calling the layer with training=False keeps this Dropout in
# inference mode, so it presumably never drops anything, not even during
# fit() -- confirm this is intended.
dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob, name='pooled_output')
pooled_output = dropout(bert_model, training=False)
# Linear classification head with one unit per label code in the training data
num_classes_FEN = data['Kennzeichen'].nunique()
output = tf.keras.layers.Dense(
    units=num_classes_FEN,
    kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
    name='output',
)(pooled_output)
# Assemble the Keras model and print its layer overview
model_FEN = tf.keras.models.Model(inputs=inputs, outputs=output)
model_FEN.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_ids (InputLayer)      [(None, 150)]             0         
                                                                 
 bert (TFBertMainLayer)      TFBaseModelOutputWithPoo  109081344 
                             ling(last_hidden_state=(            
                             None, 150, 768),                    
                              pooler_output=(None, 76            
                             8),                                 
                              hidden_states=None, att            
                             entions=None)                       
                                                                 
 pooled_output (Dropout)     (None, 768)               0         
                                                                 
 output (Dense)              (None, 22)                16918 

In [14]:
# Set an optimizer
optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)
# The output layer has no activation, so the loss works on raw logits
loss = {'output': CategoricalCrossentropy(from_logits = True)}
# CategoricalAccuracy measures accuracy, not F1, so it is logged under the
# name 'accuracy' instead of the misleading 'f1-score'
metric = {'output' : CategoricalAccuracy('accuracy')}
# Compile the model
model_FEN.compile(
    optimizer = optimizer,
    loss = loss,
    metrics = metric)
# One-hot encode the training labels; the width is pinned to the size of the
# model's output layer instead of being inferred from the largest label code
y = to_categorical(data["Kennzeichen"], num_classes=model_FEN.output_shape[-1])

In [16]:
# Encode the training texts as fixed-length id sequences (takes some time).
# NOTE(review): no attention mask is requested, so the encoding holds input
# ids only; padding positions are presumably attended like real tokens --
# confirm this is acceptable.
fen_train_texts = data['Beispiel'].to_list()
fen_tokenizer_options = dict(
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding="max_length",
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=False,
    verbose=True,
)
x = tokenizer(text=fen_train_texts, **fen_tokenizer_options)

In [17]:
# Train the base-class model for 10 epochs; 20 % of the training rows are
# held out for validation
fen_train_inputs = {'input_ids': x['input_ids']}
fen_train_targets = {'output': y}
history = model_FEN.fit(
    x=fen_train_inputs,
    y=fen_train_targets,
    validation_split=0.2,
    batch_size=64,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:
#######################################
### ----- Evaluate the model ------ ###
# One-hot encode the test labels. The width is taken from the model's output
# layer: left to infer it, to_categorical uses max(label) + 1, which produces
# too few columns whenever the highest class code is missing from the test
# split and then breaks evaluate() with a shape mismatch.
y_test = to_categorical(data_test["Kennzeichen"], num_classes=model_FEN.output_shape[-1])
# Encode the test texts with the same settings as the training texts
X_test = tokenizer(
    text=data_test['Beispiel'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding="max_length",
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = False,
    verbose = True)

In [20]:
# Score the feature-engineering model on its held-out test split
fen_test_inputs = {'input_ids': X_test['input_ids']}
model_eval = model_FEN.evaluate(x=fen_test_inputs, y=y_test)



Die Daten, welche im Feature-Engineering aufbereitet wurden, konnten nicht zu einem besseren Ergebnis führen. Um an bessere Daten zu kommen, muss ein anderer Weg gesucht werden, z.B. mehr Beispiele aus Projekten, in welchen die Norm verwendet wurde.