In [1]:
# import required libs
import re

import nltk
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from transformers import BertTokenizer, BertModel

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(


In [2]:
# Load the self-built Excel dataset: article summaries collected manually from
# different websites, to be categorized into Healthcare, AI, IoT, Blockchain.
df = pd.read_excel(io="text_summary_datasets_v2.xlsx")

# BERT tokenizer and encoder, both loaded from the same uncased checkpoint.
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path="bert-base-uncased")
bert_model = BertModel.from_pretrained(pretrained_model_name_or_path="bert-base-uncased")



In [3]:
# NLTK resources: fetch each required corpus/model (a no-op when already cached).
for _nltk_package in ("punkt", "stopwords", "wordnet"):
    nltk.download(_nltk_package)

# Shared text-normalization helpers used by the preprocessing cell below.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# text pre-processing function
def preprocess_text(text):
    """Normalize a passage of text before it is embedded with BERT.

    Pipeline: lower-case, strip double quotes, tokenize with the module-level
    BERT ``tokenizer``, merge WordPiece fragments back into whole words,
    lemmatize the words, re-join them into a string, tidy the spacing before
    punctuation, and capitalize the first word of every sentence.

    Args:
        text (str): Raw text, possibly containing several sentences.

    Returns:
        str: The cleaned, lemmatized text.

    Notes:
        Relies on the module-level ``tokenizer``, ``lemmatizer`` and
        ``stop_words`` objects created in earlier cells.
    """
    # Case standardization (capitalization is restored per sentence at the end).
    text = text.lower()

    # Punctuation removal: only double quotes are dropped, because the text
    # consists of multiple sentences and the remaining punctuation is needed.
    text = text.replace('"', '')

    tokens = tokenizer.tokenize(text)

    # Stop words are intentionally NOT removed. Example from the author's test:
    #   original: ['the', 'diagnosis', 'of', 'v', '##kh', 'followed', ...]
    #   removed:  ['diagnosis', 'v', '##kh', 'followed', ...]
    # Dropping them might lead to poorer semantic understanding by BERT.

    # Merge "##" continuation pieces back into whole words BEFORE lemmatizing,
    # so the lemmatizer sees real words rather than sub-word fragments.
    words = []
    for token in tokens:
        if token.startswith("##") and words:
            words[-1] += token[2:]
        else:
            # Also covers a leading "##" token, which would otherwise have no
            # previous word to attach to (previously an IndexError).
            words.append(token)

    # Lemmatization is preferred over stemming: stemming produced fragments such
    # as 'diagnosi', 'revis', 'intern' in the author's comparison.
    # Stop words are passed through unchanged because the noun-default WordNet
    # lemmatizer mangles function words (e.g. 'was' -> 'wa', 'has' -> 'ha').
    words = [word if word in stop_words else lemmatizer.lemmatize(word) for word in words]

    # Re-join into a sentence string and remove the space that the join puts
    # in front of punctuation that is followed by whitespace or end of text.
    text = " ".join(words)
    text = re.sub(r'\s([?.!,\'-](?:\s|$))', r'\1', text)

    # Capitalize the first word of the text and of each following sentence.
    text = re.sub(r"(^|[.!?]\s+)(\w+)", lambda match: match.group(1) + match.group(2).capitalize(), text)

    return text

In [5]:
# BERT
def toBert(text):
    """Embed ``text`` with BERT and return the vector at the first token position.

    BERT is chosen to capture the semantic meaning of the text for later
    classification and clustering; this is more advanced than keyword counting.

    Args:
        text (str): Text to embed. Inputs longer than 512 tokens are truncated
            and shorter ones are padded to 512.

    Returns:
        torch.Tensor: 1-D tensor taken from ``last_hidden_state[0, 0, :]``
        (the first token of the first sequence). It carries no autograd graph.

    Notes:
        Relies on the module-level ``tokenizer`` and ``bert_model``.
    """
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
    # Inference only: disabling autograd avoids building and retaining a
    # computation graph for every call, which saves memory and time.
    with torch.no_grad():
        outputs = bert_model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
    return outputs.last_hidden_state[0, 0, :]

In [6]:
# Column-oriented accumulator: two metadata columns followed by one column
# per embedding dimension (dim_1 .. dim_768), each starting as its own empty list.
data = {"Index": [], "Category": []}
data.update((f"dim_{number}", []) for number in range(1, 769))

In [7]:
# Start from empty columns so that re-running this cell rebuilds the table
# instead of appending a second copy of every row.
for column in data.values():
    column.clear()

for p, row in df.iterrows():
    data["Index"].append(row["Index"])
    data["Category"].append(row["Category"])
    # NOTE(review): str() would turn a missing Summary (NaN) into the literal
    # text "nan", which would then be embedded like a real summary - confirm
    # the dataset has no empty Summary cells.
    text = str(row["Summary"])
    text = preprocess_text(text)

    outputs = toBert(text)

    # Convert the whole vector to Python floats in one call rather than 768
    # separate .item() calls; detach() makes this safe even if the tensor
    # still carries autograd history.
    values = outputs.detach().tolist()
    for it in range(768):
        data[f"dim_{it+1}"].append(values[it])

In [8]:
# Turn the accumulated columns into a DataFrame and write it to Excel
# without the positional row index.
new_df = pd.DataFrame(data=data)
new_df.to_excel(excel_writer="training_data_v2.xlsx", index=False)