In [9]:
import numpy as np 
import pandas as pd 
import os
from sklearn.model_selection import train_test_split
import re                                  # library for regular expression operations
import string                              # for string operations
import nltk
from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import word_tokenize   # module for tokenizing strings
from tqdm import tqdm
import matplotlib.pyplot as plt

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader

In [10]:
# Fetch the 'omw-1.4' NLTK data package.
# NOTE(review): `stopwords.words('english')` and `word_tokenize` are used further
# down and need their own NLTK data; this cell does not download it, so it is
# presumably preinstalled in this environment — confirm before running elsewhere.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [48]:
# Directory (relative to the notebook) that holds the FAQ dataset read below.
DATA_DIR = '../input/faq-ques/'

In [49]:
# Read the raw FAQ table (question/answer pairs) and preview the first rows.
faq_csv_path = os.path.join(DATA_DIR, 'FAQs.csv')
data_df = pd.read_csv(faq_csv_path)
data_df.head()

Unnamed: 0,Question,Answer
0,When was Albert Einstein born?,Albert Einstein was born on 14 March 1879.
1,Where was he born?,"He was born in Ulm, Germany."
2,When did he die?,"He died 18 April 1955 in Princeton, New Jersey..."
3,Who were his parents?,His father was Hermann Einstein and his mother...
4,Did he have any sisters and brothers?,He had one sister named Maja.


In [50]:
# Keep only the text columns. `.copy()` makes this an independent frame, so the
# column assignments performed on `data_df_text` later in the notebook do not
# write into a slice of `data_df` (which raises pandas' SettingWithCopyWarning
# and leaves it ambiguous which frame is actually modified).
data_df_text = data_df[['Question','Answer']].copy()
data_df_text.head()

Unnamed: 0,Question,Answer
0,When was Albert Einstein born?,Albert Einstein was born on 14 March 1879.
1,Where was he born?,"He was born in Ulm, Germany."
2,When did he die?,"He died 18 April 1955 in Princeton, New Jersey..."
3,Who were his parents?,His father was Hermann Einstein and his mother...
4,Did he have any sisters and brothers?,He had one sister named Maja.


# Preprocessing Helper Functions

In [33]:
def remove_punctuation(text):
    """Return ``text`` lower-cased, with every ``string.punctuation`` character dropped.

    Punctuation is deleted outright (not replaced by a space), so words joined
    only by punctuation are fused together.
    """
    kept_chars = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept_chars.append(ch.lower())
    return "".join(kept_chars)
# English stop words from the NLTK corpus, stored as a set; used by clean_words
# to filter tokens.
stopwords_english = set(stopwords.words('english'))
def clean_words(headline):
    """Return the tokens of ``headline`` that are not in ``stopwords_english``.

    ``headline`` is an iterable of tokens; the original order is preserved.
    """
    kept_tokens = []
    for token in headline:
        if token in stopwords_english:
            continue
        kept_tokens.append(token)
    return kept_tokens
# Shared Porter stemmer instance used by words_stems.
stemmer = PorterStemmer()
def words_stems(headline):
    """Return a list with the stem of each token in ``headline``, in order.

    Stemming is delegated to the module-level ``stemmer``.
    """
    return list(map(stemmer.stem, headline))
def tokenize_text(text):
    """Split ``text`` into a sequence of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens
def remove_numbers(text):
    """Replace every character that is not an ASCII letter with a single space.

    Despite the name, this blanks out digits, punctuation, whitespace and
    non-ASCII letters alike; the length of the string is unchanged.
    """
    non_letter = re.compile("[^a-zA-Z]")
    return non_letter.sub(" ", text)

In [51]:
# Turn each raw question into a list of lower-cased, stop-word-free tokens.
# The pipeline reads from the untouched `data_df['Question']` rather than from
# `data_df_text['Question']`, so re-running this cell gives the same result
# instead of pushing already-tokenized lists back through the string helpers.
question_tokens = (
    data_df['Question']
    .apply(remove_punctuation)
    .apply(remove_numbers)
    .apply(tokenize_text)
    .apply(clean_words)
)
# `assign` returns a new frame, so nothing is written into a slice of `data_df`.
data_df_text = data_df_text.assign(Question=question_tokens)
data_df_text.head()

Unnamed: 0,Question,Answer
0,"[albert, einstein, born]",Albert Einstein was born on 14 March 1879.
1,[born],"He was born in Ulm, Germany."
2,[die],"He died 18 April 1955 in Princeton, New Jersey..."
3,[parents],His father was Hermann Einstein and his mother...
4,"[sisters, brothers]",He had one sister named Maja.


In [35]:
# Load the 'glove-wiki-gigaword-100' word vectors through gensim's downloader.
# The resulting object is indexed by word (glove_vectors[word]) further down.
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-100')

# Building corpus for Training

In [36]:
# tagged_data = [TaggedDocument(row['Question'], [i]) for i, row in data_df_text.iterrows()]

# Define Model

In [37]:
# model = Doc2Vec(tagged_data, vector_size=20, window=2, min_count=1, workers=4, epochs = 100)

In [38]:
# model.save("similar-faq.model")


In [39]:
## Load saved doc2vec model
# model= Doc2Vec.load("similar-faq.model")

# Generate embeddings from the pretrained GloVe vectors

In [40]:
def get_embedding(sentence):
    """Look up the word vector of every token in ``sentence``.

    Parameters
    ----------
    sentence : iterable of str
        Tokens to embed.

    Returns
    -------
    list
        One ``glove_vectors[word]`` entry per token that has a vector, in
        input order. Tokens missing from ``glove_vectors`` are skipped, so the
        list may be shorter than ``sentence`` (or empty).
    """
    embeddings = []
    for word in sentence:
        try:
            embeddings.append(glove_vectors[word])
        except KeyError:
            # Out-of-vocabulary token: skip it. Only KeyError is caught so that
            # genuine failures are no longer silently swallowed.
            continue
    return embeddings

In [52]:
# Embed every tokenized question and store the per-word vectors alongside it.
texts = data_df_text["Question"]
embed_list = list(map(get_embedding, texts))

data_df_text["embeddings"] = embed_list

In [53]:
# Preview the frame, which now also carries the `embeddings` column.
data_df_text.head()

Unnamed: 0,Question,Answer,embeddings
0,"[albert, einstein, born]",Albert Einstein was born on 14 March 1879.,"[[0.43446, -0.18111, -0.57913, 0.28164, 0.4187..."
1,[born],"He was born in Ulm, Germany.","[[0.40156, -0.52029, -0.11804, -0.35792, 0.173..."
2,[die],"He died 18 April 1955 in Princeton, New Jersey...","[[-0.27016, 0.76719, -0.25425, -0.41573, -0.44..."
3,[parents],His father was Hermann Einstein and his mother...,"[[0.85943, 0.12182, -0.10011, -0.23964, -0.920..."
4,"[sisters, brothers]",He had one sister named Maja.,"[[0.725, 0.27066, -0.63327, 0.17211, -0.2595, ..."


# Evaluate the model on user-generated text

In [54]:
def preprocess_text(text):
    """Convert a raw string into a list of cleaned tokens.

    Applies, in order: ``remove_punctuation``, ``remove_numbers``,
    ``tokenize_text`` and ``clean_words``. Stemming (``words_stems``) is
    currently not applied.
    """
    letters_only = remove_numbers(remove_punctuation(text))
    return clean_words(tokenize_text(letters_only))
    

In [55]:
def find_similar_questions(ques, top_k=5):
    """Rank the stored FAQ answers by how similar their question is to ``ques``.

    The query is cleaned with ``preprocess_text`` and embedded word by word with
    ``get_embedding``. Each stored question is scored as the mean of all
    pairwise cosine similarities between its word vectors
    (``data_df_text["embeddings"]``) and the query's word vectors.

    Parameters
    ----------
    ques : str
        Raw user question.
    top_k : int, default 5
        Maximum number of results to return.

    Returns
    -------
    list of [score, answer]
        Sorted by descending score (ties are ordered by answer text,
        descending). Stored questions without any word vector are left out.
        The list is empty when no query token has a word vector.
    """
    tokens = preprocess_text(ques)
    print(tokens)

    # The query embedding is the same for every row, so compute it once.
    query_vectors = get_embedding(tokens)
    if len(query_vectors) == 0:
        # No known word in the query: nothing can be scored.
        return []

    score_list = []
    # Rows of `data_df_text` and `data_df` are paired by position.
    for stored_vectors, answer in zip(data_df_text["embeddings"], data_df["Answer"]):
        if len(stored_vectors) == 0:
            # This stored question has no known words; it cannot be compared.
            continue
        score = np.mean(cosine_similarity(stored_vectors, query_vectors))
        score_list.append([score, answer])

    score_list.sort(reverse=True)
    return score_list[:top_k]

In [59]:
# Example query: display the best-scoring stored answers for a free-text question.
find_similar_questions("What is the date of his death?")

['date', 'death']


[[0.4510256,
  'His father was Hermann Einstein and his mother was Pauline Einstein (born Koch).'],
 [0.45052603, 'He died 18 April 1955 in Princeton, New Jersey, USA.'],
 [0.42582434,
  'He was married to Mileva Marić between 1903 and 1919. They had three children, Lieserl (born 1902), Hans Albert (born 1904) and Eduard (born 1910). He married Elsa Löwenthal in 1919 and they lived together until her death in 1936.'],
 [0.4098153, 'He was born in Ulm, Germany. '],
 [0.36222744,
  'He received his main education at the following schools: Catholic elementary school in Munich, Germany (1885-1888)Luitpold Gymnasium in Munich, Germany (1888-1894) Cantonal school in Aarau, Switzerland (1895-1896) Swiss Federal Institute of Technology in Zurich, Switzerland (1896-1900) Ph.D. from Zurich University, Switzerland (1905)']]

* References for doc2vec function usage: https://www.kaggle.com/code/yashtiwari1906/doc2vec-for-search