Notebook action: extracts named entities from an input text.
    
input: 
- .csv file containing the text of every document node that is to be processed.
- fields: node_id, text

output:
- .csv file containing all entities extracted from each document node
- fields: node_id, entities list. Example: [ 2180102, [ 'sap hana', 'sap ase'] ], [ 562662, [ 'sap ea designer', 'sap powerdesigner' ] ]


# EXECUTION PARAMETERS:

### 1) Input text: (.csv format, fields: id, text)

In [2]:
# Folder and file name of the input text table; the "LOAD DATA" cell reads
# INPUT_FOLDER + input_file with pandas.
INPUT_FOLDER = "ekg_enrichment/"
input_file = "input_5000_corpwiki_random.csv"

### 2) Model to be Inferenced:

In [15]:
# Folder holding the saved model files and the checkpoint file to load; the
# "Load model checkpoint" cell concatenates them into PATH.
MODELS_FOLDER = 'models_saved/'
selected_model = '2021-03-22_01-22-44.checkpoint'

### 3) Entity mapping table:

In [41]:
# NOTE(review): these two assignments overwrite the INPUT_FOLDER / input_file
# values set for the input text in section 1. On a top-to-bottom run the
# "LOAD DATA" cell would therefore read the mapping table instead of the input
# text. Consider dedicated names for the mapping table — TODO confirm and
# update the cell that loads the mapping accordingly.
INPUT_FOLDER = "sap-products/"
input_file = "Final_Entity_mapping.csv"

# 0) IMPORT LIBRARIES

In [3]:
import pandas as pd  
import numpy as np
import re

# 1) DEFINE CLEANING FUNCTIONS:

In [4]:
# -------------------------------------------------------------------------
# create function to remove numbers in brackets from text_string
# -------------------------------------------------------------------------
def generate_numbers_in_brackets_list(n=200):
    """Return the strings '[1]', '[2]', ..., '[n-1]' as a list.

    The numbers come from ``np.arange(1, n)``, so ``n`` itself is excluded and
    ``n <= 1`` yields an empty list.
    """
    return ['[' + str(number) + ']' for number in np.arange(1, n)]

# -------------------------------------------------------------------------
# create function to replace a string with another string
# -------------------------------------------------------------------------
def replace_string_from_texts_list(string, strings_list, replace_by):
    """Replace every occurrence of each entry of ``strings_list`` in ``string``.

    Parameters:
        string: text to process.
        strings_list: iterable of substrings to look for.
        replace_by: text that takes the place of every match.

    Returns:
        The text with all listed substrings replaced. An empty
        ``strings_list`` returns ``string`` unchanged.
    """
    # Accumulate the replacements: each pass must start from the result of
    # the previous one, otherwise only the last substring would be replaced.
    new_string = string
    for s in strings_list:
        new_string = new_string.replace(s, replace_by)
    return new_string

In [4]:
def smart_truncate(content, length=100, suffix=''):
    """Cut ``content`` at a space so the result is at most ``length`` characters.

    Text that already fits is returned unchanged. Otherwise the first
    ``length + 1`` characters are split on single spaces and the last piece
    (a possibly incomplete word) is dropped; ``suffix`` is appended to what
    remains. If there is no space in that window the result is just ``suffix``.
    """
    if len(content) <= length:
        return content
    # One extra character is inspected so that a word ending exactly at
    # `length` is kept whole.
    pieces = content[:length + 1].split(' ')[:-1]
    return ' '.join(pieces) + suffix


def truncate(s, max_len):
    """Split ``s`` into a cleaned head of at most ``max_len`` characters and the rest.

    Returns:
        (head, remainder): ``head`` is the word-boundary truncation of ``s``
        with surrounding whitespace stripped and runs of spaces collapsed;
        ``remainder`` is the untouched part of ``s`` that follows the cut.
    """
    head = smart_truncate(s, length=max_len, suffix='')

    # When no usable space exists inside the limit, smart_truncate yields an
    # empty (or blank) head and the remainder would be the whole input again,
    # so a caller looping "while len(s) > max_len" would never finish.
    # Fall back to a hard cut so every call consumes some input.
    if len(s) > max_len and not head.strip():
        head = s[:max(max_len, 1)]

    cleaned = re.sub(' {2,}', ' ', head.strip())  # collapse any run of spaces
    remainder = s[len(head):]
    return cleaned, remainder

In [5]:
def eliminate_spaces(s):
    """Strip surrounding whitespace and collapse every run of spaces to one space.

    A single regular expression is used so that runs of any length are
    collapsed; a fixed chain of 5/4/3/2-space replacements leaves a double
    space behind for some long runs.
    """
    return re.sub(' {2,}', ' ', s.strip())

In [6]:
def split_text_string_into_sentences_v2(text_string, max_len):
    """Split cleaned text into sentence-like chunks of at most ``max_len`` characters.

    The text is split on '.'. A fragment with five words or fewer is not
    emitted on its own; it is carried forward and prepended to the next
    fragment. A fragment longer than ``max_len`` is cut repeatedly with
    ``truncate`` until the remainder fits; that remainder is emitted if it has
    more than five words, otherwise it becomes the new carry. Whatever is
    still carried at the end is emitted as the last chunk.

    Parameters:
        text_string: text to split.
        max_len: maximum chunk length in characters (expected to be positive).

    Returns:
        List of non-empty chunks with surrounding and repeated spaces removed.
    """
    text_sentences = []
    carry = ''

    def _emit(fragment):
        # Skip fragments that are empty after clean-up (for instance the
        # blank piece that follows a trailing '.').
        cleaned = eliminate_spaces(fragment)
        if cleaned:
            text_sentences.append(cleaned)

    for raw_sentence in text_string.split('.'):
        s = carry + ' ' + raw_sentence

        if len(s.split()) <= 5:
            # Too short to stand alone: keep accumulating.
            carry = s
            continue

        # The previous carry is now part of `s`; clear it so it can never be
        # prepended a second time.
        carry = ''

        while len(s) > max_len:
            head, tail = truncate(s, max_len)
            if len(tail) >= len(s):
                # truncate() consumed nothing (no space to break on):
                # cut hard so the loop is guaranteed to terminate.
                cut = max(max_len, 1)
                head, tail = s[:cut], s[cut:]
            _emit(head)
            s = tail

        # `s` now fits into max_len (including the case len(s) == max_len).
        if len(s.split()) > 5:
            _emit(s)
        else:
            carry = s

    if carry:
        _emit(carry)

    return text_sentences

In [5]:
# -------------------------------------------------------------------------
# CLEAN TEXT AND BREAK INTO SENTENCES v2.0
# -------------------------------------------------------------------------
def clean_text_string_and_break_into_sentences_v2(text_string, max_len):
    """Lower-case and clean a raw document text, then split it into sentences.

    Cleaning steps, in order: remove @mentions and URLs, turn tabs into spaces
    and line breaks into sentence ends, replace 10-digit numbers by a sentence
    end, drop version numbers, replace '/' by a space (with special handling
    of "s/4" terms), drop hashtags, apostrophe suffixes and bracketed
    reference numbers, replace every character other than letters, digits and
    '.' by a space, delete tokens containing digits, delete a fixed list of
    file-extension fragments and finally collapse repeated whitespace.

    Parameters:
        text_string: raw text; None or NaN is treated as "no text".
        max_len: maximum sentence length, passed on to
            split_text_string_into_sentences_v2.

    Returns:
        List of cleaned sentences (empty for missing input).
    """
    # A missing value (None, or the float NaN pandas uses) has no text.
    if text_string is None or (isinstance(text_string, float) and text_string != text_string):
        return []

    s = str(text_string).lower()

    s = re.sub(r"@\S+", " ", s)       # remove twitter accounts
    s = re.sub(r"https*\S+", " ", s)  # remove urls
    s = s.replace("\t", " ")          # tabs become spaces
    s = s.replace("\n", ". ")         # line breaks end a sentence

    # REPLACE BEGINNING-OF-ARTICLE NUMBERS (10 digits) BY DOT SPACE
    # A backslash followed by a space is first reduced to a plain space.
    s = s.replace("\\ ", " ")
    for number in re.findall(r'\d{10}', s):
        s = s.replace(number, ". ")

    # DELETE VERSION NUMBERS
    # e.g. "sap hana 2.0 is the new version. sap s/4hana 15.11 is the latest s4 version."
    for version in re.findall(r'\b\d+\.\d+\b', s):
        s = s.replace(version, " ")

    # REPLACE '/' BY A SPACE, PROTECTING "s/4" TERMS WHILE DOING SO
    # NOTE(review): "s/4hana" is restored as "s4/hana", not "s/4hana"; together
    # with the later punctuation and digit removal this leaves "hana" in the
    # text. Presumably intentional — confirm before changing.
    s = s.replace("s/4hana", "s4-hana")
    s = s.replace("s/4", "s-4")
    s = s.replace("/", " ")
    s = s.replace("s4-hana", "s4/hana")
    s = s.replace("s-4", "s/4")

    # OTHER CHARACTER REPLACEMENTS
    s = s.replace("...", ". ")    # ellipsis becomes a sentence end
    s = s.replace(". ..", ". ")   # leftover of an ellipsis after a sentence end
    s = s.replace(":", " ")       # colons become spaces
    s = re.sub(r"#\S+", " ", s)   # remove hashtags
    s = re.sub(r"'\w+", '', s)    # remove an apostrophe and the word characters after it

    # Remove bracketed reference numbers such as "[12]".
    brackets_numbers_list = generate_numbers_in_brackets_list(n=200)
    s = replace_string_from_texts_list(s, brackets_numbers_list, '')

    s = re.sub(r'[^A-Za-z0-9.]+', ' ', s)  # keep only letters, digits and '.'
    s = re.sub(r'\w*\d+\w*', '', s)        # remove every token that contains a digit

    # Fixed list of fragments that must not be read as sentence ends.
    s = s.replace("i.e", " ")         # remove i.e
    s = s.replace("run.", " ")        # remove run.
    s = s.replace(".zip", " ")        # remove .zip
    s = s.replace("server.xml", " ")  # remove server.xml
    s = s.replace(".yml", " ")        # remove .yml
    s = s.replace(".pai", " ")        # remove .pai
    s = s.replace(".xml", " ")        # remove .xml
    s = s.replace(".run", " ")        # remove .run
    s = s.replace(".com", " ")        # remove .com

    s = re.sub(r'\s{2,}', " ", s)  # collapse repeated whitespace

    return split_text_string_into_sentences_v2(s, max_len)

# 2) LOAD DATA

In [7]:
# Read the input table and forward-fill missing cells (DataFrame.ffill()
# replaces the deprecated fillna(method='ffill') spelling).
# NOTE(review): INPUT_FOLDER / input_file are assigned a second time in the
# "Entity mapping table" parameter cell; make sure they still point at the
# input text when this cell runs.
# NOTE(review): forward-filling gives a row with missing text the text of the
# previous row — confirm this is intended.
data = pd.read_csv(INPUT_FOLDER+input_file,sep=",",encoding="latin1").ffill()
data.head(5)

Unnamed: 0,ID(n),n.text,r
0,1474086,Screenshot 2020-05-19 at 18.55.13.png,6.223421e-07
1,905576,image2019-1-22_17-48-10.png,1.260983e-06
2,1587998,Authorization_Harmonisation_JAN_2019.pptx,2.294894e-06
3,491131,image2020-9-24_9-31-29.png,2.637947e-06
4,368285,Screen Shot 2019-11-07 at 16.45.33.png,3.463602e-06


In [8]:
data.shape

(5000, 3)

In [9]:
# Rename the first two columns, selected by position, to 'ID' and 'Text'.
# The third column keeps its original name.
for position, new_name in enumerate(['ID', 'Text']):
    data = data.rename(columns={data.columns[position]: new_name})

In [11]:
data.head(5)

Unnamed: 0,ID,Text,R
0,1474086,Screenshot 2020-05-19 at 18.55.13.png,6.223421e-07
1,905576,image2019-1-22_17-48-10.png,1.260983e-06
2,1587998,Authorization_Harmonisation_JAN_2019.pptx,2.294894e-06
3,491131,image2020-9-24_9-31-29.png,2.637947e-06
4,368285,Screen Shot 2019-11-07 at 16.45.33.png,3.463602e-06


Create a data subset: df

In [12]:
# Work on the first 1000 rows of `data` only; the rest of the notebook
# processes `df`.
df = data[:1000]
df.head(5)

Unnamed: 0,ID,Text,R
0,1474086,Screenshot 2020-05-19 at 18.55.13.png,6.223421e-07
1,905576,image2019-1-22_17-48-10.png,1.260983e-06
2,1587998,Authorization_Harmonisation_JAN_2019.pptx,2.294894e-06
3,491131,image2020-9-24_9-31-29.png,2.637947e-06
4,368285,Screen Shot 2019-11-07 at 16.45.33.png,3.463602e-06


In [13]:
df.shape

(1000, 3)

# 4) Clean data and split text into sentences

In [14]:
# Maximum sentence length (in characters) handed to the cleaning function.
max_len = 256

# One [node id, list of cleaned sentences] pair per row of df.
id_sentences = [
    [row['ID'], clean_text_string_and_break_into_sentences_v2(row['Text'], max_len)]
    for _, row in df.iterrows()
]

id_sentences[:3]
    

[[1474086, ['screenshot at png']],
 [905576, ['png']],
 [1587998, ['authorization harmonisation jan pptx']]]

# 5) LOAD MODEL for Inference 

In [15]:
# MODELS_FOLDER = 'models_saved/'
# selected_model = '2021-03-22_01-22-44.checkpoint'

### Import Libraries

In [11]:
# NOTE(review): packages are installed from inside the notebook; the second
# install line carries no version pin.
print('install transformers')
!pip install --upgrade transformers==4.2.2 --quiet
!pip install transformers[sentencepiece] --quiet #transformers v4.x -->

print('import packages')
import torch
from transformers import BertModel, BertTokenizer, BertForTokenClassification, BertConfig
from transformers import AutoTokenizer
# Tokenizer of the pretrained 'bert-base-cased' model.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

# numpy and pandas are already imported in the "IMPORT LIBRARIES" cell.
import numpy as np
import pandas as pd

print('import BERT model')
# BertForTokenClassification is imported a second time here.
from transformers import BertForTokenClassification, AdamW
# Token-classification model initialised from 'bert-base-cased'; the fine-tuned
# weights are loaded later in the "Load model checkpoint" cell.
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    #num_labels=len(tag2idx),
    # 7 equals the number of tags in the tag2idx mapping printed further down.
    num_labels= 7,
    output_attentions = False,
    output_hidden_states = False)
print('done')

install transformers
import packages
import BERT model


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

done


### Set GPUs 

In [17]:
# Select the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

# Querying the device name raises on a machine without CUDA, so only do it
# when a GPU is actually present.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))

# Move the model parameters to the selected device (GPU or CPU).
model.to(device);

### Load model checkpoint

In [18]:
# selected_model = '2021-03-11_23-52-31'

PATH = MODELS_FOLDER+selected_model

# torch.load unpickles the file: only load checkpoints from a trusted source.
# map_location=device allows a checkpoint that was saved on a GPU to be
# restored on a machine without one.
checkpoint = torch.load(PATH, map_location=device)
model.load_state_dict(checkpoint)

# Put the model in evaluation mode for inference; as the last expression of
# the cell this also displays the model structure.
model.eval()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

### Get Model Parameters

In [19]:
# Get model parameters: derive the names from the configured folder and
# checkpoint file instead of slicing PATH at fixed character offsets, which
# only works for one folder-name length and one file-name length.
folder = MODELS_FOLDER
model_name = selected_model.rsplit('.', 1)[0]   # checkpoint file name without its extension
file_param = model_name + '_parameters'
param_path = folder + file_param

folder, model_name, file_param

('models_saved/', '2021-03-22_01-22-44', '2021-03-22_01-22-44_parameters')

### LOAD tag2idx, idx2tag, tag2name from file: 

In [20]:
selected_model

'2021-03-22_01-22-44.checkpoint'

In [21]:
# LOAD tag2idx, idx2tag, tag2name from file:
def tag_values_tag2idx_idx2tag_tag2name_from_model(model):
    """Load the tag mappings that were pickled next to a model checkpoint.

    The files are expected at
    ``MODELS_FOLDER + <first 19 characters of the checkpoint name> + '_' + <mapping name>``
    for the mapping names 'tag2idx', 'idx2tag' and 'tag2name'.

    Parameters:
        model: checkpoint file name. When something other than a string is
            passed, the global ``selected_model`` is used instead.

    Returns:
        (tags_vals, tag_values, tag2idx, idx2tag, tag2name) where tags_vals
        and tag_values are the same list holding the keys of tag2idx.
    """
    import pickle

    checkpoint_name = model if isinstance(model, str) else selected_model
    prefix = MODELS_FOLDER + checkpoint_name[:19] + '_'

    def _load(mapping_name):
        # pickle can execute arbitrary code while loading: these files must
        # come from a trusted source. The 'with' block closes the file handle.
        with open(prefix + mapping_name, 'rb') as file:
            return pickle.load(file)

    tag2idx = _load('tag2idx')
    idx2tag = _load('idx2tag')
    tag2name = _load('tag2name')

    tag_values = list(tag2idx.keys())
    tags_vals = tag_values

    return tags_vals, tag_values, tag2idx, idx2tag, tag2name



tags_vals, tag_values, tag2idx, idx2tag, tag2name = tag_values_tag2idx_idx2tag_tag2name_from_model(selected_model)

# Show every loaded object next to its name.
for caption, loaded in (
    ('tags_vals: ', tags_vals),
    ('tag_values: ', tag_values),
    ('tag2idx: ', tag2idx),
    ('idx2tag: ', idx2tag),
    ('tag2name: ', tag2name),
):
    print(caption, loaded)

tags_vals:  ['I-PROD', 'B-PROD', 'O', 'X', '[CLS]', '[SEP]', 'PAD']
tag_values:  ['I-PROD', 'B-PROD', 'O', 'X', '[CLS]', '[SEP]', 'PAD']
tag2idx:  {'I-PROD': 0, 'B-PROD': 1, 'O': 2, 'X': 3, '[CLS]': 4, '[SEP]': 5, 'PAD': 6}
idx2tag:  {0: 'I-PROD', 1: 'B-PROD', 2: 'O', 3: 'X', 4: '[CLS]', 5: '[SEP]', 6: 'PAD'}
tag2name:  {0: 'I-PROD', 1: 'B-PROD', 2: 'O', 3: 'X', 4: '[CLS]', 5: '[SEP]', 6: 'PAD'}


# 6) INFERENCE MODEL

## a) Define Function:

In [22]:
def inference_sap_bert(test_sentence, inference_model, tag2name):
    """Label every word of a sentence with the token-classification model.

    The sentence is lower-cased, tokenized with the 'bert-base-cased'
    tokenizer and passed through ``inference_model``. Word pieces starting
    with '##' are glued back onto the preceding token, which keeps the label
    predicted for its first piece.

    Parameters:
        test_sentence: text to label.
        inference_model: model to run; the input is placed on whatever device
            the model's parameters live on.
        tag2name: mapping from predicted label index to label name.

    Returns:
        DataFrame with columns 'Token' and 'Label', without the '[CLS]' and
        '[SEP]' rows.
    """
    # Loading the tokenizer is expensive, so do it on the first call only and
    # keep it on the function object for later calls.
    tokenizer = getattr(inference_sap_bert, '_tokenizer', None)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
        inference_sap_bert._tokenizer = tokenizer

    test_sentence = test_sentence.lower()
    tokenized_sentence = tokenizer.encode(test_sentence)

    # Follow the model's device instead of assuming a GPU is present.
    device = next(inference_model.parameters()).device
    input_ids = torch.tensor([tokenized_sentence], device=device)

    with torch.no_grad():
        output = inference_model(input_ids)
    # output[0] holds one score per label for every token; take the best label.
    label_indices = output[0].argmax(dim=2)[0].tolist()

    # join bpe split tokens
    tokens = tokenizer.convert_ids_to_tokens(tokenized_sentence)

    new_tokens, new_labels = [], []
    for token, label_idx in zip(tokens, label_indices):
        if token.startswith("##"):
            new_tokens[-1] = new_tokens[-1] + token[2:]
        else:
            new_labels.append(tag2name[label_idx])
            new_tokens.append(token)

    prediction_df = pd.DataFrame({"Token": new_tokens, "Label": new_labels})

    # Drop the special tokens added by the tokenizer.
    return prediction_df[~prediction_df['Token'].isin(['[CLS]', '[SEP]'])]

def inference_sap_bert_to_list(test_sentence, inference_model,tag2name):
    """Run inference_sap_bert and return its rows as a list of [token, label] lists."""
    return inference_sap_bert(test_sentence, inference_model, tag2name).values.tolist()

# b) Loop over NodeIDs-Text and Inference Model

In [23]:
# NOTE(review): this replaces the tag2name loaded from the model files, whose
# printed value maps 0 -> 'I-PROD' and 1 -> 'B-PROD'; here indices 0 and 1 are
# swapped. Presumably the pickled mapping does not match this checkpoint —
# TODO confirm which mapping is correct.
tag2name = {0: 'B-PROD', 1: 'I-PROD', 2: 'O', 3: 'X', 4: '[CLS]', 5: '[SEP]', 6: 'PAD'}

In [24]:
def remove_o_tokens_from_token_label_list(list):
    """Return the [token, label] entries whose label (second item) is not 'O'."""
    return [token_label for token_label in list if token_label[1] != 'O']

In [25]:
len(id_sentences)

1000

### b.1) Unit Testing:

In [26]:
# UNIT TESTING:
test_sentence = 'sap hana is the in-memory database from sap'

pred = inference_sap_bert_to_list(test_sentence, model,tag2name)
pred

[['sap', 'B-PROD'],
 ['hana', 'I-PROD'],
 ['is', 'O'],
 ['the', 'O'],
 ['in', 'O'],
 ['-', 'O'],
 ['memory', 'O'],
 ['database', 'O'],
 ['from', 'O'],
 ['sap', 'O']]

In [27]:
clean_pred = remove_o_tokens_from_token_label_list(pred)
clean_pred

[['sap', 'B-PROD'], ['hana', 'I-PROD']]

In [28]:
# for row in id_sentences[:3]:
#     node_id = row[0]
#     sentces = row[1]
#     print(node_id)
#     #print(sentces)
#     for i in sentces:
#         sentces_pred = [] 
#         pred = inference_sap_bert_to_list(i, model,tag2name)
#         clean_pred = remove_o_tokens_from_token_label_list(pred)
#         print(len(clean_pred))

### b.2) Real-Execution:

In [29]:
len(id_sentences)

1000

In [30]:
# For every node keep the non-'O' predictions of each of its sentences.
# Sentences without any prediction are skipped, and so are nodes for which
# no sentence produced a prediction.
id_predictions = []

for node_id, sentces in id_sentences:
    sentces_pred = []

    for sentence in sentces:
        pred = inference_sap_bert_to_list(sentence, model, tag2name)
        clean_pred = remove_o_tokens_from_token_label_list(pred)
        if len(clean_pred) > 0:
            sentces_pred.append(clean_pred)

    if len(sentces_pred) > 0:
        id_predictions.append([node_id, sentces_pred])

id_predictions[0]

[2180102,
 [[['hanamobilepreview', 'B-PROD']],
  [['sybase', 'B-PROD'], ['ase', 'I-PROD'], ['ase', 'I-PROD']]]]

In [31]:
len(id_predictions)

117

In [32]:
def save_df_to_file(filename, df):
    """Write ``df`` as a comma-separated file named ``filename`` under 'ekg_enrichment/'.

    The index column is not written.
    """
    df.to_csv('ekg_enrichment/' + filename, sep=',', index=False)

In [35]:
id_pred_df = pd.DataFrame(id_predictions)

save_df_to_file('node_pred_1_1000_nodes.csv', id_pred_df)


In [34]:
# NOTE(review): `stop` is not defined anywhere visible, so evaluating it raises
# NameError. Presumably this is used on purpose to halt "Run All" before the
# consolidation section — confirm, and prefer an explicit guard.
stop

NameError: name 'stop' is not defined

# 7) Consolidate Full-Entities from predictions:

## a) Define function to Get entities from token/label list

In [36]:
def get_entities_from_token_label_list(tkn_lbl):
    """Assemble entity strings from a sequence of [token, label] entries.

    Rules:
      * 'B-PROD' closes the entity being built (if any) and starts a new one.
      * 'I-PROD' extends the current entity: a token containing '##' is glued
        on without a space (the '##' marker is removed), any other token is
        appended after a single space. An 'I-PROD' that follows an 'O' label,
        or comes first, starts a new entity on its own.
      * 'O' closes the entity being built.
      * Any other label is ignored.

    Parameters:
        tkn_lbl: iterable of items whose first element is the token and whose
            second element is the label.

    Returns:
        List of entity strings in order of appearance (duplicates are kept).
    """
    entity_list = []
    entity = ''
    inside_entity = False   # True while the previous relevant label was B-PROD or I-PROD

    for item in tkn_lbl:
        token, label = item[0], item[1]

        if label == 'B-PROD':
            if entity != '':
                entity_list.append(entity)   # the previous entity is complete
            entity = token
            inside_entity = True

        elif label == 'I-PROD':
            piece = token.replace('##', '')
            if not inside_entity:
                entity = piece               # continuation without a start: begin a new entity
            elif '##' in token:
                entity = entity + piece      # word piece: no separating space
            else:
                entity = entity + ' ' + piece
            inside_entity = True

        elif label == 'O':
            if entity != '':
                entity_list.append(entity)
            entity = ''
            inside_entity = False

    if entity != '':
        entity_list.append(entity)

    return entity_list

## b) Prepare id_predictions list:

In [37]:
id_predictions[:5]

[[2180102,
  [[['hanamobilepreview', 'B-PROD']],
   [['sybase', 'B-PROD'], ['ase', 'I-PROD'], ['ase', 'I-PROD']]]],
 [1562662,
  [[['sap', 'B-PROD'],
    ['ea', 'I-PROD'],
    ['designer', 'I-PROD'],
    ['powerdesigner', 'B-PROD']],
   [['sap', 'B-PROD'],
    ['data', 'I-PROD'],
    ['warehouse', 'I-PROD'],
    ['cloud', 'I-PROD'],
    ['hana', 'B-PROD'],
    ['hana', 'B-PROD']],
   [['hana', 'B-PROD'],
    ['hana', 'B-PROD'],
    ['hana', 'B-PROD'],
    ['sap', 'B-PROD'],
    ['cloud', 'I-PROD'],
    ['platform', 'I-PROD']],
   [['sap', 'B-PROD'],
    ['cloud', 'I-PROD'],
    ['platform', 'I-PROD'],
    ['hana', 'B-PROD'],
    ['predictive', 'B-PROD'],
    ['analytics', 'I-PROD'],
    ['hybris', 'B-PROD'],
    ['sap', 'B-PROD'],
    ['bw', 'I-PROD']],
   [['intelligent', 'B-PROD']]]],
 [842808,
  [[['sap', 'B-PROD'], ['marketing', 'I-PROD']],
   [['sap', 'B-PROD'], ['marketing', 'I-PROD']],
   [['sap', 'B-PROD'], ['marketing', 'I-PROD']]]],
 [2517425,
  [[['scp', 'B-PROD'], ['scp', '

In [38]:
# Flatten the per-sentence predictions of every node into one continuous
# list of [token, label] entries per node.
id_predictions_cont = [
    [node_id, [tkn_pred for sent in node_pred for tkn_pred in sent]]
    for node_id, node_pred in id_predictions
]

print(len(id_predictions_cont))
id_predictions_cont[0]

117


[2180102,
 [['hanamobilepreview', 'B-PROD'],
  ['sybase', 'B-PROD'],
  ['ase', 'I-PROD'],
  ['ase', 'I-PROD']]]

In [39]:
# Turn the flattened token/label list of every node into its list of entities.
id_pred_entities = [
    [node_id, get_entities_from_token_label_list(pred)]
    for node_id, pred in id_predictions_cont
]

id_pred_entities

[[2180102, ['hanamobilepreview', 'sybase ase ase']],
 [1562662,
  ['sap ea designer',
   'powerdesigner',
   'sap data warehouse cloud',
   'hana',
   'hana',
   'hana',
   'hana',
   'hana',
   'sap cloud platform',
   'sap cloud platform',
   'hana',
   'predictive analytics',
   'hybris',
   'sap bw',
   'intelligent']],
 [842808, ['sap marketing', 'sap marketing', 'sap marketing']],
 [2517425, ['scp', 'scp', 'cpi', 'cloud platform']],
 [1310766, ['sybase', 'sybasegeneva', 'sybase', 'sybase', 'sybase']],
 [626553, ['mira']],
 [1263543, ['sap hana', 'netweaver']],
 [1877764,
  ['erp',
   'erp',
   'erp',
   'kw',
   'erp',
   'hcm',
   'erp',
   'erp',
   'erp',
   'erp',
   'kw',
   'hcm',
   'hcm',
   'erp',
   'hcm',
   'erp',
   'erp',
   'hcm',
   'hcm',
   'hcm',
   'erp']],
 [2596248, ['cf', 'cf']],
 [574195, ['hana', 'bw', 'hana', 'hana', 'hana', 'bw']],
 [2042638,
  ['bobobo',
   'sap sybase',
   'sybase',
   'bobj',
   'bobj',
   'bobj',
   'bobj',
   'bobj',
   'bobj',
   

# MAP FINAL ENTITIES:

In [40]:
id_pred_entities[:3]

[[2180102, ['hanamobilepreview', 'sybase ase ase']],
 [1562662,
  ['sap ea designer',
   'powerdesigner',
   'sap data warehouse cloud',
   'hana',
   'hana',
   'hana',
   'hana',
   'hana',
   'sap cloud platform',
   'sap cloud platform',
   'hana',
   'predictive analytics',
   'hybris',
   'sap bw',
   'intelligent']],
 [842808, ['sap marketing', 'sap marketing', 'sap marketing']]]

### Load Entity mapping table:

In [41]:
# INPUT_FOLDER = "sap-products/"
# input_file = "Final_Entity_mapping.csv"

In [42]:
# Read the entity mapping table (columns 'Source' and 'Target') and
# forward-fill missing cells (DataFrame.ffill() replaces the deprecated
# fillna(method='ffill') spelling).
# NOTE(review): forward-filling gives a row with a missing Target the Target
# of the previous row — confirm this is intended.
df_ent_map = pd.read_csv(INPUT_FOLDER+input_file,sep=",",encoding="latin1").ffill()
df_ent_map.head(5)

Unnamed: 0,Source,Target
0,sap 365 messaging hub,sap 365 messaging hub
1,sap 3d visual enterprise,sap 3d visual enterprise
2,sap a1s,sap a1s
3,sap accelerated trade promotion planning,sap accelerated trade promotion planning
4,sap accelerometer,sap accelerometer


In [43]:
list_ent_map = df_ent_map.values.tolist()

In [44]:
print(list_ent_map)

[['sap 365 messaging hub', 'sap 365 messaging hub'], ['sap 3d visual enterprise', 'sap 3d visual enterprise'], ['sap a1s', 'sap a1s'], ['sap accelerated trade promotion planning', 'sap accelerated trade promotion planning'], ['sap accelerometer', 'sap accelerometer'], ['sap access control', 'sap access control'], ['sap access violation management', 'sap access violation management'], ['sap account intelligence', 'sap account intelligence'], ['sap account substantiation and automation', 'sap account substantiation and automation'], ['sap accounting doc approve', 'sap accounting doc approve'], ['sap accounts payable', 'sap accounts payable'], ['sap accpak for data srvs', 'sap accpak for data srvs'], ['sap accpak inf steward', 'sap accpak inf steward'], ['sap acrobat connect', 'sap acrobat connect'], ['sap acs for sap s/4hana', 'sap acs for sap s/4hana'], ['sap act', 'sap act'], ['sap ada cloud etd', 'sap ada cloud etd'], ['sap adapter for quality center', 'sap adapter for quality center'

### Map Entities

#### a) Define function:

In [45]:
# UNIT TESTING:
entity_in = 'hana'

ent_mapping = df_ent_map.loc[df_ent_map['Source'] == entity_in]
entity_out = ent_mapping.iloc[0]['Target']
entity_out

'sap hana'

In [46]:
def entity_in_entity_out(entity_in):
    """Map an extracted entity to its target name using ``df_ent_map``.

    Parameters:
        entity_in: entity text, compared for equality with the 'Source' column.

    Returns:
        The 'Target' of the first matching row, or an empty string when the
        entity has no mapping.
    """
    ent_mapping = df_ent_map.loc[df_ent_map['Source'] == entity_in]
    if len(ent_mapping) > 0:
        return ent_mapping.iloc[0]['Target']
    # No mapping: return an empty value the caller can test with len()
    # instead of failing on an unassigned local variable.
    return ''

In [47]:
entity_in = 'hana'
entity_in_entity_out(entity_in)

'sap hana'

#### b) Execute Entity Mapping:

In [48]:
# Build the Source -> Target lookup once instead of scanning the whole mapping
# table for every entity. setdefault keeps the first row of a repeated Source,
# which matches taking .iloc[0] of the filtered table.
source_to_target = {}
for source, target in zip(df_ent_map['Source'], df_ent_map['Target']):
    source_to_target.setdefault(source, target)

# One [node id, mapped entities] row per node; entities without a mapping are
# dropped, nodes are kept even when none of their entities could be mapped.
final_id_entities = []

for nodeid, entities in id_pred_entities:
    final_row_ent = [
        source_to_target[entity_in]
        for entity_in in entities
        if entity_in in source_to_target
    ]
    final_id_entities.append([nodeid, final_row_ent])

In [49]:
len(final_id_entities)
final_id_entities

[[2180102, ['sap hana', 'sap ase']],
 [1562662,
  ['sap ea designer',
   'sap powerdesigner',
   'sap data warehouse cloud',
   'sap hana',
   'sap hana',
   'sap hana',
   'sap hana',
   'sap hana',
   'sap cloud platform',
   'sap cloud platform',
   'sap hana',
   'sap predictive analytics',
   'sap hybris']],
 [842808, ['sap marketing', 'sap marketing', 'sap marketing']],
 [2517425, ['sap cloud platform integration', 'sap cloud platform']],
 [1310766,
  ['sap sybase iq', 'sap sybase iq', 'sap sybase iq', 'sap sybase iq']],
 [626553, []],
 [1263543, ['sap hana', 'sap netweaver']],
 [1877764,
  ['sap erp',
   'sap erp',
   'sap erp',
   'sap knowledge warehouse',
   'sap erp',
   'sap hcm',
   'sap erp',
   'sap erp',
   'sap erp',
   'sap erp',
   'sap knowledge warehouse',
   'sap hcm',
   'sap hcm',
   'sap erp',
   'sap hcm',
   'sap erp',
   'sap erp',
   'sap hcm',
   'sap hcm',
   'sap hcm',
   'sap erp']],
 [2596248, ['sap cloud foundry', 'sap cloud foundry']],
 [574195,
  ['

# SAVE FINAL ID-ENTITIES TO FILE:

In [50]:
# NOTE(review): a function with this name and the same body is already defined
# earlier in the notebook (next to the first CSV export); this definition
# replaces it.
def save_df_to_file(filename, df):
    """Write df as a comma-separated file under 'ekg_enrichment/', without the index."""
    file_path_to_a = 'ekg_enrichment/'+filename
    df.to_csv(file_path_to_a, sep=',',index=False)

In [51]:
def save_list_to_csv(filename, listname):
    """Convert ``listname`` to a DataFrame, save it with save_df_to_file and report it.

    Prints 'Data saved to file' and returns None.
    """
    save_df_to_file(filename, pd.DataFrame(listname))
    print('Data saved to file')
    return None

In [52]:
save_list_to_csv('final_id_entities_1_1000_nodes.csv', final_id_entities)

Data saved to file
