In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install bert-embedding

In [4]:
import os, re, io
import pandas as pd
import numpy as np
import requests

from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from sklearn.metrics.pairwise import cosine_similarity
from gensim.parsing.preprocessing import remove_stopwords
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import spacy
nlp = spacy.load('en_core_web_sm')

## Data preparation

In [5]:
QA_df = pd.read_csv("/content/drive/MyDrive/tutorial/FAQ nlp/FAQs.csv")
QA_df.head()

Unnamed: 0,Question,Answer
0,When was Albert Einstein born?,Albert Einstein was born on 14 March 1879.
1,Where was he born?,"He was born in Ulm, Germany."
2,When did he die?,"He died 18 April 1955 in Princeton, New Jersey..."
3,Who were his parents?,His father was Hermann Einstein and his mother...
4,Did he have any sisters and brothers?,He had one sister named Maja.


## Preprocessing Techniques

1. Remove unwanted characters
2. Remove Question number
3. Remove stopwords

In [7]:
## Data Preprocessing
class TextPreprocessor():
    def __init__(self, data_df, column_name=None):
        self.data_df = data_df  
        if not column_name and type(column_name) == str:
            raise Exception("column name is mandatory. Make sure type is string format")
        self.column = column_name
        self.convert_lowercase()    
        self.applied_stopword = False
        self.processed_column_name = f"processed_{self.column}"
        
    def convert_lowercase(self):
        ## fill empty values into empty
        self.data_df.fillna('',inplace=True)
        ## reduce all the columns to lowercase
        self.data_df = self.data_df.apply(lambda column: column.astype(str).str.lower(), axis=0)    

    def remove_question_no(self):
        ## remove question no        
        self.data_df[self.column] = self.data_df[self.column].apply(lambda row: re.sub(r'^\d+[.]',' ', row))    
        
    def remove_symbols(self):
        ## remove unwanted character          
        self.data_df[self.column] = self.data_df[self.column].apply(lambda row: re.sub(r'[^A-Za-z0-9\s]', ' ', row))    

    def remove_stopwords(self):
        ## remove stopwords and create a new column 
        for idx, question in enumerate(self.data_df[self.column]):      
            self.data_df.loc[idx, self.processed_column_name] = remove_stopwords(question)        


    def process(self, perform_stopword = True):
        self.remove_question_no()
        self.remove_symbols()
        if perform_stopword:
            self.remove_stopwords()
        return self.data_df

In [8]:
## pre-process training question data
text_preprocessor = TextPreprocessor(QA_df.copy(), column_name="Question")
processed_QA_df = text_preprocessor.process(perform_stopword=True)
processed_QA_df.head()

Unnamed: 0,Question,Answer,processed_Question
0,when was albert einstein born,albert einstein was born on 14 march 1879.,albert einstein born
1,where was he born,"he was born in ulm, germany.",born
2,when did he die,"he died 18 april 1955 in princeton, new jersey...",die
3,who were his parents,his father was hermann einstein and his mother...,parents
4,did he have any sisters and brothers,he had one sister named maja.,sisters brothers


## Techniques for Question representations

In this section will be discussing on multiple ways of representing FAQ questions.

1. TF-IDF
2. Word Embedding
3. BERT Embedding


### TF_IDF Representation

The first approach we will use for semantic similarity is leveraging Bag of Words (BOW). TF-IDF transforms the text into meaningful numbers. The technique is a widely used feature extraction in NLP applications. TF (Term Frequency) measures the no of times that words appear in a document. IDF (Inverse Document Frequency) measures low value for words that has high frequency across all the documents.

In [16]:
class TF_IDF():
    def __init__(self):
        self.dictionary = None    
        self.model = None
        self.bow_corpus = None

    def create_tf_idf_model(self, data_df, column_name):
        ## create sentence token list
        sentence_token_list = [sentence.split(" ") for sentence in data_df[column_name]]

        ## dataset vocabulary
        self.dictionary = Dictionary(sentence_token_list) 

        ## bow representation of dataset
        self.bow_corpus = [self.dictionary.doc2bow(sentence_tokens) for sentence_tokens in sentence_token_list]

        ## compute TF-IDF score for corpus
        self.model = TfidfModel(self.bow_corpus)

        ## representation of question and respective TF-IDF value
        print(f"First 10 question representation of TF-IDF vector")
        for index, sentence in enumerate(data_df[column_name]):
            if index <= 10:
                print(f"{sentence} {self.model[self.bow_corpus[index]]}")
            else:
                break

    def get_vector_for_test_set(self, test_df, column_name):
        ## store tf-idf vector
        testset_tf_idf_vector = []
        sentence_token_list = [sentence.split(" ") for sentence in test_df[column_name]]
        test_bow_corpus = [self.dictionary.doc2bow(sentence_tokens) for sentence_tokens in sentence_token_list]   
        for test_sentence in test_bow_corpus:
            testset_tf_idf_vector.append(self.model[test_sentence])      

        return testset_tf_idf_vector

    def get_training_QA_vectors(self):
        QA_vectors = []
        for sentence_vector in self.bow_corpus:
            QA_vectors.append(self.model[sentence_vector])      
        return QA_vectors

    def get_train_vocabulary(self):
        vocab = []
        for index in self.dictionary:
            vocab.append(self.dictionary[index])
        return vocab

### Word Embedding

*GloVe* is an unsupervised learning algorithm for obtaining vector representations for words. It trained on the global word-word co-occurrence matrix. I downloaded a pre-trained word vector from Glove for our analysis. The code snippets for generating word embedding representation as below code snippet,

In [9]:
class Embeddings():
    def __init__(self, model_path):
        self.model_path = model_path
        self.model = None
        self.__load_model__()
        
    def __load_model__(self):
        #word_vectors = api.load("glove-wiki-gigaword-100")  
        model_name = 'glove-twitter-25' #'word2vec-google-news-50' #'glove-twitter-25'  
        if not os.path.exists(self.model_path+ model_name):
            print("Downloading model")
            self.model = api.load(model_name)
            self.model.save(self.model_path+ model_name)
        else:
            print("Loading model from Drive")
            self.model = KeyedVectors.load(self.model_path+ model_name)
        
    def get_oov_from_model(self, document_vocabulary):
        ## the below words are not available in our pre-trained model model_name
        print("The below words are not found in our pre-trained model")
        words = []
        for word in set(document_vocabulary):  
            if word not in self.model:
                words.append(word)
        print(words)  

    def get_sentence_embeddings(self, data_df, column_name):
        sentence_embeddings_list = []
        for sentence in data_df[column_name]:      
            sentence_embeddings = np.repeat(0, self.model.vector_size)
            try:
                tokens = sentence.split(" ")
                ## get the word embedding
                for word in tokens:
                    if word in self.model:
                        word_embedding = self.model[word]
                    else:
                        word_embedding = np.repeat(0, self.model.vector_size)          
                    sentence_embeddings = sentence_embeddings + word_embedding
                ## take the average for sentence embeddings
                #sentence_embeddings = sentence_embeddings / len(tokens)
                sentence_embeddings_list.append(sentence_embeddings.reshape(1, -1))
            except Exception as e:
                print(e)
            
        return sentence_embeddings_list

### BERT Embedding

*BERT* is a transformer-based model attempts to use the context of words to get embedding. BERT broke several records in NLP tasks. 


We observe that the relationship of the word “to” to other words in the sentence are important to decode the meaning semantically. Returning information about USA citizens traveling to Brazil is not relevant since we are talking about Brazil citizens traveling to the USA. BERT can handle this well.

In [10]:
from bert_embedding import BertEmbedding

In [11]:
## get bert embeddings
def get_bert_embeddings(sentences):
    bert_embedding = BertEmbedding()
    return bert_embedding(sentences)

### Use above techniques for our analysis

#### TF-IDF Computation

In [17]:
tf_idf = TF_IDF()
tf_idf.create_tf_idf_model(processed_QA_df, "processed_Question")
## get the tf-idf reprentation 
question_QA_vectors = tf_idf.get_training_QA_vectors()

First 10 question representation of TF-IDF vector
albert einstein born [(0, 0.5138715540007341), (1, 0.6869294374080508), (2, 0.5138715540007341)]
born [(1, 1.0)]
die [(3, 1.0)]
parents [(4, 1.0)]
sisters brothers [(5, 0.7071067811865476), (6, 0.7071067811865476)]
marry children [(7, 0.7071067811865476), (8, 0.7071067811865476)]
receive education [(9, 0.819628335022292), (10, 0.5728956208844553)]
albert einstein awarded nobel prize physics [(0, 0.297281782345028), (2, 0.297281782345028), (11, 0.568548224666586), (12, 0.297281782345028), (13, 0.568548224666586), (14, 0.297281782345028)]
albert einstein attend nobel prize award ceremony [(0, 0.25843291327671597), (2, 0.25843291327671597), (12, 0.25843291327671597), (14, 0.25843291327671597), (15, 0.49425017866840054), (16, 0.49425017866840054), (17, 0.49425017866840054)]
receive nobel prize [(10, 0.6869294374080508), (12, 0.5138715540007341), (14, 0.5138715540007341)]


In [18]:
## Get the document vocabulary list from TF-IDF
document_vocabulary = tf_idf.get_train_vocabulary()

#### Embeddings (Glove)

In [12]:
import gensim.downloader as api
from gensim.models import KeyedVectors

In [19]:
## create Embedding object
embedding = Embeddings("")
## look for out of vocabulary FAQ dataset - pretrained model
embedding.get_oov_from_model(document_vocabulary)
## get the sentence embedding for FAQ dataset
question_QA_embeddings = embedding.get_sentence_embeddings(processed_QA_df, "processed_Question")

Loading model from Drive
The below words are not found in our pre-trained model
[]


#### BERT Embeddings

In [20]:
question_QA_bert_embeddings_list = get_bert_embeddings(processed_QA_df["Question"].to_list())

Vocab file is not found. Downloading.
Downloading /root/.mxnet/models/book_corpus_wiki_en_uncased-a6607397.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/vocab/book_corpus_wiki_en_uncased-a6607397.zip...
Downloading /root/.mxnet/models/bert_12_768_12_book_corpus_wiki_en_uncased-75cc780f.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/bert_12_768_12_book_corpus_wiki_en_uncased-75cc780f.zip...


## Evaluate with test query

One of the best techniques to find a similarity score is **Cosine Similarity**. We will use cosine similarity for comparing each representation now.

In [21]:
## helps to retrieve similar question based of input vectors/embeddings for test query
def retrieveSimilarFAQ(train_question_vectors, test_question_vectors, train_QA_df, train_column_name, test_QA_df, test_column_name):
    similar_question_index = []
    for test_index, test_vector in enumerate(test_question_vectors):
        sim, sim_Q_index = -1, -1
        for train_index, train_vector in enumerate(train_question_vectors):
            sim_score = cosine_similarity(train_vector, test_vector)[0][0]
            
            if sim < sim_score:
                sim = sim_score
                sim_Q_index = train_index

        print("######")
        print(f"Query Question: {test_QA_df[test_column_name].iloc[test_index]}") 
        print(f"Retrieved Question: {train_QA_df[train_column_name].iloc[sim_Q_index]}")
        print(f"Score: {sim}")
        print("-----------------------------")

Let's create sample few question for testing purpose.

In [22]:
test_QA_df = pd.read_csv("/content/drive/MyDrive/tutorial/FAQ nlp/FAQs_test.csv")
test_QA_df.head()

Unnamed: 0,Question
0,What is the date of his death?
1,Did Einstein have siblings?
2,Who was his wife?
3,What was Einstein's father's name?
4,At what institutions did he study?


In [23]:
## pre-process testing QA data
text_preprocessor = TextPreprocessor(test_QA_df, column_name="Question")
query_QA_df = text_preprocessor.process(perform_stopword=True)

In [24]:
## TF-IDF vector represetation
query_QA_vectors = tf_idf.get_vector_for_test_set(query_QA_df, "processed_Question")
query_QA_df.head()

Unnamed: 0,Question,processed_Question
0,what is the date of his death,date death
1,did einstein have siblings,einstein siblings
2,who was his wife,wife
3,what was einstein s father s name,einstein s father s
4,at what institutions did he study,institutions study


### Test with Embeddings

In [25]:
## get the sentence embedding for COVID QA query
query_QA_embeddings = embedding.get_sentence_embeddings(query_QA_df, "processed_Question")

retrieveSimilarFAQ(question_QA_embeddings, query_QA_embeddings, processed_QA_df, "Question", query_QA_df, "Question")

######
Query Question: what is the date of his death 
Retrieved Question: did he marry and have children 
Score: 0.8474869727438736
-----------------------------
######
Query Question: did einstein have siblings 
Retrieved Question: did he marry and have children 
Score: 0.8570299123210484
-----------------------------
######
Query Question: who was his wife 
Retrieved Question: did he marry and have children 
Score: 0.8876234649257587
-----------------------------
######
Query Question: what was einstein s father s name 
Retrieved Question: when was albert einstein born 
Score: 0.7792911365065182
-----------------------------
######
Query Question: at what institutions did he study 
Retrieved Question: where did he receive his education 
Score: 0.8314761186414249
-----------------------------


### Test with BERT Embeddings

In [26]:
query_QA_bert_embeddings_list = get_bert_embeddings(test_QA_df["Question"].to_list())

In [27]:
## store QA bert embeddings in list
question_QA_bert_embeddings = []
for embeddings in question_QA_bert_embeddings_list:
    question_QA_bert_embeddings.append(embeddings[1])

## store query string bert embeddings in list
query_QA_bert_embeddings = []
for embeddings in query_QA_bert_embeddings_list:
    query_QA_bert_embeddings.append(embeddings[1])

In [28]:
retrieveSimilarFAQ(question_QA_bert_embeddings, query_QA_bert_embeddings, processed_QA_df, "Question", query_QA_df, "Question")

######
Query Question: what is the date of his death 
Retrieved Question: when was albert einstein born 
Score: 0.7349212169647217
-----------------------------
######
Query Question: did einstein have siblings 
Retrieved Question: did he have any sisters and brothers 
Score: 0.8166143298149109
-----------------------------
######
Query Question: who was his wife 
Retrieved Question: who were his parents 
Score: 0.8101394772529602
-----------------------------
######
Query Question: what was einstein s father s name 
Retrieved Question: where was he born 
Score: 0.6459028720855713
-----------------------------
######
Query Question: at what institutions did he study 
Retrieved Question: where did he receive his education 
Score: 0.4249792993068695
-----------------------------
