###                                          **Supply Chain - NLP based Threat Detection**

This model uses the **BERT implementation from the TensorFlow Models** repository on GitHub (tensorflow/models/official/nlp/bert). It uses L=12 hidden layers (Transformer blocks), a hidden size of H=768, and A=12 attention heads.

**Configuration Section**

In [55]:

# Input CSV files: a master file and a node file for each of the training and
# test sets. The cleaning functions join each pair on the 'Id' column.
input_training_master_filepath = 'master_train.csv'
input_training_node_filepath = 'node_train.csv'
input_test_master_filepath = 'master_test.csv'
input_test_node_filepath = 'node_test.csv'

# Destination CSV written by predict_model().
Output_file_path = 'result_submission.csv'

# TF Hub handle of the BERT encoder loaded by load_bert_layer().
bert_module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2' # default module path
   
    

In [56]:
# import libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, Callback
import tensorflow_hub as hub
import tokenization



Helper Function 1 for `BERT Layer`

In [57]:

# Encode text into tokens, masks, and segment flags

def bert_encode(texts, tokenizer, max_len=128):
    """Convert raw texts into fixed-length BERT input arrays.

    Each text is tokenized, truncated so that the two special tokens still
    fit, wrapped as "[CLS]" ... "[SEP]" and right-padded with zeros up to
    ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every encoded row.

    Returns:
        Tuple ``(token_ids, masks, segment_ids)`` of NumPy arrays with one row
        per text. The mask is 1 on real tokens and 0 on padding; the segment
        ids are all zero.
    """
    id_rows, mask_rows, segment_rows = [], [], []

    for raw_text in texts:
        # Keep two positions free for the [CLS] and [SEP] markers.
        word_pieces = tokenizer.tokenize(raw_text)[:max_len - 2]
        sequence = ["[CLS]"] + word_pieces + ["[SEP]"]
        used = len(sequence)
        unused = max_len - used

        row_ids = tokenizer.convert_tokens_to_ids(sequence)
        row_ids += [0] * unused

        id_rows.append(row_ids)
        mask_rows.append([1] * used + [0] * unused)
        segment_rows.append([0] * max_len)

    return np.array(id_rows), np.array(mask_rows), np.array(segment_rows)


Helper Function 2 for `BERT Layer`

In [58]:

def build_model(bert_layer, max_len=128):
    """Build and compile a binary classifier on top of a BERT layer.

    Args:
        bert_layer: Callable Keras layer that takes the list
            ``[input_word_ids, input_mask, segment_ids]`` and returns a pair
            whose second element is the per-token sequence output.
        max_len: Sequence length of each of the three integer inputs.

    Returns:
        A compiled Keras ``Model`` with three int32 inputs of shape
        ``(max_len,)`` and a single sigmoid output, using binary
        cross-entropy loss and the accuracy metric.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Classify from the first position of the sequence output, which is the
    # "[CLS]" token that bert_encode() places at the start of every row.
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    model.compile(Adam(learning_rate=2e-6), loss='binary_crossentropy', metrics=['accuracy'])

    return model


`ClassificationReport`, which is similar to `sklearn.metrics.classification_report`, computes the **Precision, Recall and F1-Score** metrics after every epoch.

In [59]:

class ClassificationReport(Callback):
    """Keras callback that reports macro precision, recall and F1 after each epoch.

    At the end of every epoch the attached model predicts on both the training
    and the validation data, the predictions are rounded to 0/1, and the
    macro-averaged scores (with ``zero_division=1``) are printed and appended
    to the per-split history lists.

    Args:
        train_data: Pair ``(X_train, y_train)``. The default ``()`` cannot be
            unpacked, so a pair must be supplied in practice.
        validation_data: Pair ``(X_val, y_val)``; same remark as above.
    """

    def __init__(self, train_data=(), validation_data=()):
        # Initialise the Callback base class itself; super(Callback, self)
        # would skip Callback.__init__ and start one class too high.
        super().__init__()

        self.X_train, self.y_train = train_data
        self.train_precision_scores = []
        self.train_recall_scores = []
        self.train_f1_scores = []

        self.X_val, self.y_val = validation_data
        self.val_precision_scores = []
        self.val_recall_scores = []
        self.val_f1_scores = []

    def _macro_scores(self, features, labels):
        """Return (precision, recall, f1), macro-averaged, for one data split."""
        # Round the model outputs to hard 0/1 labels and flatten both arrays
        # so that scikit-learn receives plain 1-D label vectors.
        predictions = np.round(self.model.predict(features, verbose=0)).astype(int).ravel()
        targets = np.asarray(labels).ravel()

        precision = precision_score(targets, predictions, average='macro', zero_division=1)
        recall = recall_score(targets, predictions, average='macro', zero_division=1)
        f1 = f1_score(targets, predictions, average='macro', zero_division=1)
        return precision, recall, f1

    def on_epoch_end(self, epoch, logs=None):
        """Compute, store and print the train/validation scores for this epoch.

        Args:
            epoch: Zero-based epoch index; printed as ``epoch + 1``.
            logs: Metrics dict passed by Keras; not used here.
        """
        train_precision, train_recall, train_f1 = self._macro_scores(self.X_train, self.y_train)
        self.train_precision_scores.append(train_precision)
        self.train_recall_scores.append(train_recall)
        self.train_f1_scores.append(train_f1)

        val_precision, val_recall, val_f1 = self._macro_scores(self.X_val, self.y_val)
        self.val_precision_scores.append(val_precision)
        self.val_recall_scores.append(val_recall)
        self.val_f1_scores.append(val_f1)

        print('\nEpoch: {} - Training Precision: {:.6} - Training Recall: {:.6} - Training F1: {:.6}'.format(epoch + 1, train_precision, train_recall, train_f1))
        print('Epoch: {} - Validation Precision: {:.6} - Validation Recall: {:.6} - Validation F1: {:.6}'.format(epoch + 1, val_precision, val_recall, val_f1))
        

Data cleaning and formatting for `Training Data`. Create the `Train` and `Validation` sets.

In [60]:
def train_data_cleaning():
    """Load, merge and clean the training CSVs, then split into train/validation.

    Reads the master file at ``input_training_master_filepath`` and the node
    file at ``input_training_node_filepath``. For every master row, the node
    rows with the same (whitespace-stripped) 'Id' contribute:

    * 'City' and 'Node Type': space-separated values, skipping a value that is
      already contained in the text accumulated for that row;
    * 'Country' and 'Status': the last non-missing value among the nodes.

    'Title' is extended with 'Subject' when the two differ (ignoring spaces),
    then stripped of the row's country name and of the words
    'Incident ', 'Moderate ', 'Severe ', 'Minor ' and 'Extreme '. Finally the
    fields are concatenated into a 'text' column.

    Returns:
        tuple: ``(train_input, val_input, train_labels, val_labels)`` from
        ``train_test_split`` with ``test_size=0.2`` and ``random_state=42``;
        inputs come from the 'text' column, labels from 'Alert ID'.
    """
    train_csv = pd.read_csv(input_training_master_filepath)
    train_nodes_csv = pd.read_csv(input_training_node_filepath)

    count_title_subject_difference = 0
    total_count = 0

    # 'Id' field is the link between the 2 csv files. truncate left and right spaces
    train_nodes_csv['Id'] = train_nodes_csv['Id'].str.strip()
    train_csv['Id'] = train_csv['Id'].str.strip()

    words_to_remove = ('Incident ', 'Moderate ', 'Severe ', 'Minor ', 'Extreme ')

    # Per-row results are collected in lists and assigned as whole columns
    # after the loop, instead of writing cell by cell with .loc.
    cities, countries, node_types, statuses, titles = [], [], [], [], []

    for _, row in train_csv.iterrows():
        # Reset for every row so nothing leaks in from the previous row and
        # rows without any matching node are still well-defined.
        city_text = ''
        node_type_text = ''
        country_text = ''
        status_text = ''
        country_str = ''

        id_node = train_nodes_csv.loc[train_nodes_csv['Id'] == row['Id']]
        for _, node in id_node.iterrows():
            # Compare against the text accumulated so far for this row so a
            # repeated city / node type is only added once.
            city = str(node['City'])
            if city not in city_text:
                city_text = city_text + ' ' + city
            node_type = str(node['Node Type'])
            if node_type not in node_type_text:
                node_type_text = node_type_text + ' ' + node_type
            # Skip missing values: str(NaN) is 'nan', which would otherwise
            # become the country and later be stripped out of the title.
            if pd.notna(node['Country']) and len(str(node['Country'])) > 0:
                country_text = str(node['Country'])
                country_str = country_text.strip()
            if pd.notna(node['Status']) and len(str(node['Status'])) > 0:
                status_text = str(node['Status'])

        # check if 'Title' and 'Subject' fields contains same data. If different concatenate
        title = row['Title']
        if title.replace(' ', '') != row['Subject'].replace(' ', ''):
            count_title_subject_difference += 1
            title = title + '. ' + row['Subject']
        total_count += 1

        # clean 'Title' field
        if country_str:
            title = title.replace(country_str, '')
        for word in words_to_remove:
            title = title.replace(word, '')

        cities.append(city_text)
        countries.append(country_text)
        node_types.append(node_type_text)
        statuses.append(status_text)
        titles.append(title)

    train_csv['City'] = cities
    train_csv['Country'] = countries
    train_csv['Node Type'] = node_types
    train_csv['Status'] = statuses
    train_csv['Title'] = titles

    print("Total No. of rows in Training Data: ", total_count)
    print("No. of rows with Title & Subject different in train Data: ", count_title_subject_difference)

    # concatenate fields for creating text to be processed by BERT model
    train_csv['text'] = (
        train_csv['Severity'].map(str) + '. '
        + train_csv['Status'].map(str) + '. '
        + train_csv['Country'].map(str) + '.'
        + train_csv['City'].map(str) + '. '
        + train_csv['Category'].map(str) + '.'
        + train_csv['Node Type'].map(str) + '.'
        + train_csv['Title'].map(str) + '. '
        + train_csv['Summary'].map(str)
    )

    # create train and validation sets
    train = train_csv[['Alert ID', 'text']]
    Xtrain_input = train['text']
    Ytrain_labels = train['Alert ID']

    train_input, val_input, train_labels, val_labels = train_test_split(Xtrain_input, Ytrain_labels, test_size=0.2, random_state=42)

    return train_input, val_input, train_labels, val_labels


Data cleaning and formatting for `Test Data`. 

In [61]:
def test_data_cleaning():
    """Load, merge and clean the test CSVs and build the BERT input text.

    Reads the master file at ``input_test_master_filepath`` and the node file
    at ``input_test_node_filepath``. For every master row, the node rows with
    the same (whitespace-stripped) 'Id' contribute:

    * 'City' and 'Node Type': space-separated values, skipping a value that is
      already contained in the text accumulated for that row;
    * 'Country' and 'Status': the last non-missing value among the nodes.

    'Title' is extended with 'Subject' when the two differ (ignoring spaces),
    then stripped of the row's country name and of the words
    'Incident ', 'Moderate ', 'Severe ', 'Minor ' and 'Extreme '. Finally the
    fields are concatenated into a 'text' column.

    Returns:
        tuple: ``(test_input, test_csv)`` where ``test_input`` is a DataFrame
        holding only the 'text' column and ``test_csv`` is the full cleaned
        master DataFrame (including the added columns).
    """
    test_csv = pd.read_csv(input_test_master_filepath)
    test_nodes_csv = pd.read_csv(input_test_node_filepath)

    count_title_subject_difference = 0
    total_count = 0

    # 'Id' field is the link between the 2 csv files. truncate left and right spaces
    test_nodes_csv['Id'] = test_nodes_csv['Id'].str.strip()
    test_csv['Id'] = test_csv['Id'].str.strip()

    words_to_remove = ('Incident ', 'Moderate ', 'Severe ', 'Minor ', 'Extreme ')

    # Per-row results are collected in lists and assigned as whole columns
    # after the loop, instead of writing cell by cell with .loc.
    cities, countries, node_types, statuses, titles = [], [], [], [], []

    for _, row in test_csv.iterrows():
        # Reset for every row so nothing leaks in from the previous row and
        # rows without any matching node are still well-defined.
        city_text = ''
        node_type_text = ''
        country_text = ''
        status_text = ''
        country_str = ''

        id_node = test_nodes_csv.loc[test_nodes_csv['Id'] == row['Id']]
        for _, node in id_node.iterrows():
            # Compare against the text accumulated so far for this row so a
            # repeated city / node type is only added once.
            city = str(node['City'])
            if city not in city_text:
                city_text = city_text + ' ' + city
            node_type = str(node['Node Type'])
            if node_type not in node_type_text:
                node_type_text = node_type_text + ' ' + node_type
            # Skip missing values: str(NaN) is 'nan', which would otherwise
            # become the country and later be stripped out of the title.
            if pd.notna(node['Country']) and len(str(node['Country'])) > 0:
                country_text = str(node['Country'])
                country_str = country_text.strip()
            if pd.notna(node['Status']) and len(str(node['Status'])) > 0:
                status_text = str(node['Status'])

        # check if 'Title' and 'Subject' fields contains same data. If different concatenate
        title = row['Title']
        if title.replace(' ', '') != row['Subject'].replace(' ', ''):
            count_title_subject_difference += 1
            title = title + '. ' + row['Subject']
        total_count += 1

        # clean 'Title' field
        if country_str:
            title = title.replace(country_str, '')
        for word in words_to_remove:
            title = title.replace(word, '')

        cities.append(city_text)
        countries.append(country_text)
        node_types.append(node_type_text)
        statuses.append(status_text)
        titles.append(title)

    test_csv['City'] = cities
    test_csv['Country'] = countries
    test_csv['Node Type'] = node_types
    test_csv['Status'] = statuses
    test_csv['Title'] = titles

    print("Total No. of rows in Test Data: ", total_count)
    print("No. of rows with Title & Subject different in Test Data: ", count_title_subject_difference)

    # concatenate fields for creating text to be processed by BERT model
    test_csv['text'] = (
        test_csv['Severity'].map(str) + '. '
        + test_csv['Status'].map(str) + '. '
        + test_csv['Country'].map(str) + '.'
        + test_csv['City'].map(str) + '. '
        + test_csv['Category'].map(str) + '.'
        + test_csv['Node Type'].map(str) + '.'
        + test_csv['Title'].map(str) + '. '
        + test_csv['Summary'].map(str)
    )

    test_input = pd.DataFrame()
    test_input['text'] = test_csv['text']
    return test_input, test_csv


Load `BERT` layer

In [62]:
def load_bert_layer():
    """Load the BERT layer from ``bert_module_url`` and build its tokenizer.

    The vocabulary file path and the lower-casing flag are read from the
    loaded module itself and passed to ``tokenization.FullTokenizer``.

    Returns:
        tuple: ``(tokenizer, bert_layer)``; the layer is created with
        ``trainable=True``.
    """
    layer = hub.KerasLayer(bert_module_url, trainable=True)
    resolved = layer.resolved_object
    vocab_path = resolved.vocab_file.asset_path.numpy()
    lower_case = resolved.do_lower_case.numpy()
    return tokenization.FullTokenizer(vocab_path, lower_case), layer

Encode the text into tokens, masks, and segment flags (`BERT`)

In [63]:
def encode_tokens(tokenizer, train_input, val_input, test_input):
    """Encode the train, validation and test texts with ``bert_encode``.

    Args:
        tokenizer: Tokenizer forwarded to ``bert_encode``.
        train_input: Object whose ``.values`` holds the training texts.
        val_input: Object whose ``.values`` holds the validation texts.
        test_input: Object whose ``.text.values`` holds the test texts.

    Returns:
        tuple: The three ``bert_encode`` results (train, validation, test),
        each encoded with ``max_len=128``.
    """
    def _encode(raw_texts):
        return bert_encode(raw_texts, tokenizer, max_len=128)

    encoded_train = _encode(train_input.values)
    encoded_val = _encode(val_input.values)
    encoded_test = _encode(test_input.text.values)
    return encoded_train, encoded_val, encoded_test

`Build, train and validate model` 

In [64]:

def train_model_default_option(bert_layer, train_input, val_input, train_labels, val_labels):
    """Build the classifier, print its summary and fit it for three epochs.

    Args:
        bert_layer: BERT layer handed to ``build_model``.
        train_input: Encoded training inputs.
        val_input: Encoded validation inputs.
        train_labels: Training labels.
        val_labels: Validation labels.

    Returns:
        The fitted model. Training uses ``batch_size=32`` and a
        ``ClassificationReport`` callback that prints precision, recall and
        F1 after each epoch.
    """
    classifier = build_model(bert_layer, max_len=128)
    classifier.summary()

    validation_pair = (val_input, val_labels)
    report_callback = ClassificationReport(
        train_data=(train_input, train_labels),
        validation_data=validation_pair,
    )

    classifier.fit(
        train_input,
        train_labels,
        validation_data=validation_pair,
        epochs=3,
        batch_size=32,
        callbacks=[report_callback],
    )

    return classifier


`Model` Prediction

In [65]:

def predict_model(model, test_input, test_csv):
    """Predict on the test inputs and write the results to ``Output_file_path``.

    The helper columns added during cleaning are dropped from ``test_csv``
    in place, the raw model output is stored as 'Threat Level', its rounded
    value as 'Alert ID', and the frame is saved as CSV without the index.

    Args:
        model: Object exposing ``predict``.
        test_input: Encoded test inputs passed to ``model.predict``.
        test_csv: Cleaned test DataFrame; modified in place.
    """
    scores = model.predict(test_input)

    helper_columns = ['City', 'Country', 'Node Type', 'text', 'Status']
    test_csv.drop(columns=helper_columns, inplace=True, axis=1)

    test_csv['Threat Level'] = scores
    test_csv['Alert ID'] = np.round(scores)

    test_csv.to_csv(Output_file_path, index=False)


`Main Processing Section`

In [66]:

# End-to-end pipeline: clean data -> load BERT -> encode -> train -> predict.
# Each step consumes the results of the previous one, so the order matters.

# 1. Clean the training CSVs and split them into train / validation sets.
print("Train data being processed")
train_input, val_input, train_labels, val_labels = train_data_cleaning()
print("Train data processing completed")
# 2. Clean the test CSVs; test_csv is kept for writing the submission file.
test_input, test_csv = test_data_cleaning()
print("Test data processing completed")
# 3. Load the BERT layer and the tokenizer built from that module.
tokenizer, bert_layer =load_bert_layer()
print("BERT layer loaded")
# 4. Replace the raw texts with their encoded (ids, masks, segments) arrays.
train_input, val_input, test_input = encode_tokens(tokenizer, train_input, val_input, test_input)
print("BERT encode tokens completed")
# 5. Build and fit the classifier.
model =  train_model_default_option(bert_layer, train_input, val_input, train_labels, val_labels)
print("Model training completed")
# 6. Predict on the test set and write the result CSV.
predict_model(model, test_input, test_csv)
print("The model is trained and has predicted the values. File Path: ",Output_file_path)

    

Train data being processed
Total No. of rows in Training Data:  23518
No. of rows with Title & Subject different in train Data:  0
Train data processing completed
Total No. of rows in Test Data:  11861
No. of rows with Title & Subject different in Test Data:  0
Test data processing completed
BERT layer loaded
BERT encode tokens completed
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 128)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 128)]    