# Fact Retrieval Bot using TF-IDF
### Steps
- Loading and preprocessing Questions and Answers from dataset
- Setting Stopwords
- Initialising and training TF-IDF vectors
- Testing

## Imports

In [4]:
import pandas as pd                   # To load and process dataset
import numpy as np                    # For matrix operations
from nltk.corpus import stopwords     # Using NLTK to load stopwords
from nltk import wordpunct_tokenize   # Using NLTK to token sentences

from beakerx import *
from sklearn.feature_extraction.text import TfidfVectorizer

# Set pandas' 'display.width' option to 1000 so wide frames are less likely to wrap
pd.set_option('display.width',1000)

## Loading and preprocessing Questions and Answers from dataset
- `hdfc.xlsx` : Collection of 1341 QnA about HDFC. (Scraped from HDFC's FAQ site)
- Dropping stopwords
- Stripping Questions of extra spaces

In [11]:
# Load the scraped FAQ sheet and keep one row per distinct question.
# reset_index(drop=True) renumbers the surviving rows 0..N-1 without keeping the
# old row labels around as an extra 'index' column.
df = (
    pd.read_excel('hdfc.xlsx')
    .drop_duplicates('Question')
    .reset_index(drop=True)
)

In [121]:
# Work with at most the first `limit` question/answer pairs
limit = 1000
reduced = df.loc[:, ['Question', 'Answer']].head(limit)

# Row label -> text lookups; `predict` keys into these with row positions, which
# works because `df` was re-indexed after de-duplication.
qlabels, alabels = (reduced[column].to_dict() for column in ('Question', 'Answer'))

reduced

## Setting stopwords
- Import set of common stopwords from nltk
- Adding domain-related stopword
- Removing question words (To distinguish between intents of questions)

In [14]:
# Domain-specific word to treat as a stopword
plus = {'hdfc'}
# Question words are removed from the stopword set so they stay in the vocabulary
minus = {'what','how','where','when','why'}

# NLTK English stopwords, plus the domain words, minus the question words
stop = (set(stopwords.words('english')) | plus) - minus

## Initialising and training TF-IDF vectors
- Setting stopwords to `stop`
- `tf_vect` : `TfidfVectorizer` object. Can be used to convert strings to tf-idf vectors
- `all_qs_vectors` : Matrix of TF-IDF vectors corresponding to questions in training set

In [21]:
# `stop_words` is documented as 'english', a list, or None; recent scikit-learn
# releases validate the type and reject a set, so hand over a (sorted) list.
tf_vect = TfidfVectorizer(stop_words=sorted(stop),
                          lowercase=True,
                          use_idf=True)

# Learn vocabulary and IDF weights from the questions; one row per question
all_qs_vectors = tf_vect.fit_transform(reduced['Question'])

TableDisplay({"Shape of all_qs_vectors :":(all_qs_vectors.shape),"Number of questions : ":all_qs_vectors.shape[0],"Vocabulary size : ":all_qs_vectors.shape[1]})

In [22]:
# Transforming a sample query ("context") with the fitted TF-IDF vectoriser
context = 'How can I repay my Personal Loan?'
# transform() takes an iterable of documents, hence the one-element list
context_vector = tf_vect.transform([context])
# Dense form of the query vector, indexable as [row, vocabulary_index]
context_matrix = context_vector.todense()

In [28]:
# Displaying TF_IDF results
def tabulate_vector(context):
    """Tabulate the TF-IDF weight of every token in `context`.

    The string is vectorised with the fitted `tf_vect`, then each token is
    looked up (lower-cased) in the vectoriser's vocabulary.

    Parameters
    ----------
    context : str
        Text to inspect.

    Returns
    -------
    TableDisplay
        One row per token with keys "Word", "Vocabulary Index" ("-" when the
        token is not in the vocabulary) and "TF-IDF Value" (0 in that case).
        Returned so that ending a cell with the call renders the table.
    """
    # Vectorise the argument itself instead of reading a global that was
    # computed for one particular query.
    weights = tf_vect.transform([context])
    values = []
    # wordpunct_tokenize is the tokenizer this notebook imports; word_tokenize is
    # not imported in the imports cell.
    for w in wordpunct_tokenize(context.strip()):
        ind = tf_vect.vocabulary_.get(w.lower())
        in_vocab = ind is not None
        values.append({"Word":w,
                       "Vocabulary Index":str(ind) if in_vocab else "-",
                       "TF-IDF Value":weights[0,ind] if in_vocab else 0})
    return TableDisplay(values)


## Predicting closest question
- `predict` has the following arguments
    - `n`       : int  | Number of results (from top)
    - `answers` : bool | Return answers or not
    - `ret_indices`: bool | Return the indices of the top `n` matches instead of the result dictionaries
- Steps for prediction
    - Convert query to tfidf vector
    - Get the dot product of the query vector with each question vector to measure similarity
    - Sort array indices by descending order of array values
    - Return top n results

In [63]:
def predict(query,n=5,answers=False,ret_indices=False):
    """Rank the training questions by TF-IDF dot-product similarity to `query`.

    Parameters
    ----------
    query : str
        Free-text question to match.
    n : int
        Number of top matches to return.
    answers : bool
        If True each result holds the stored answer under "Ans"; otherwise it
        holds the similarity under "Score".
    ret_indices : bool
        If True, return the array of the top-`n` row indices instead of the
        result dictionaries.

    Returns
    -------
    numpy.ndarray or list of dict
        Row indices when `ret_indices` is set, else one dict per match, best
        match first.
    """
    query_vector = tf_vect.transform([query])
    # Use the sparse matrix's own .dot(): np.dot is not sparse-aware and only
    # works on these operands by accident.
    sim = all_qs_vectors.dot(query_vector.T)
    # Flatten to a 1D array holding one score per training question
    arr = sim.toarray().flatten()
    # Indices ordered from highest to lowest score
    matches = arr.argsort(axis=0)[::-1]
    top_n_matches = matches[:n]
    if ret_indices:
        return top_n_matches
    if answers:
        return [{"Question":qlabels[i],"Ans":alabels[i]} for i in top_n_matches]
    return [{"Question":qlabels[i],"Score":arr[i]} for i in top_n_matches]

In [64]:
# Show the top matches (default n=5) with their scores for a sample loan-repayment query
TableDisplay(predict('How do I pay my personal loan ?'))

## Finding closest question by jaccard_distance
- `tokens` is a dictionary mapping a question's index to the set of tokens in that question

In [38]:
# Token set per question: lower-case the text, split it with wordpunct_tokenize,
# and keep only alphanumeric tokens that are not stopwords.
# Author's note: nltk.word_tokenize does not split "PIN/Pattern".
tokens = {
    idx: {
        tok
        for tok in wordpunct_tokenize(question.lower())
        if tok.isalnum() and tok not in stop
    }
    for idx, question in qlabels.items()
}

In [106]:
# Jaccard similarity between two token sets
def get_jaccard_similarity(words,words2):
    """Return `(similarity, overlap)` for two sets.

    `similarity` is |intersection| / |union| as a float and `overlap` is
    |intersection|. Two empty sets have no tokens to compare and score
    `(0.0, 0)` instead of raising ZeroDivisionError.
    """
    inter = words.intersection(words2)
    union = words.union(words2)
    if not union:
        return 0.0,0
    return float(len(inter))/len(union),len(inter)

def pred_jaccard(query,n=5):
    """Rank the stored questions by Jaccard similarity of their token sets to `query`.

    The query is tokenised the same way as the stored questions: lower-cased,
    split with wordpunct_tokenize, keeping alphanumeric non-stopword tokens.

    Parameters
    ----------
    query : str
        Free-text question to match.
    n : int
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Indexed by question index with columns "question", "score" and "inter"
        (number of shared tokens), sorted by "score" descending, first `n` rows.
    """
    words = {x for x in wordpunct_tokenize(query.lower()) if x.isalnum() and x not in stop}
    records = []
    for i in qlabels:
        score, inter = get_jaccard_similarity(words,tokens[i])
        records.append({"question":qlabels[i],"score":score,"inter":inter})
    # Building from records keeps numeric dtypes for "score"/"inter"; naming the
    # columns keeps sort_values working when there are no questions at all.
    table = pd.DataFrame(records,index=list(qlabels),columns=["question","score","inter"])
    return table.sort_values('score',ascending=False)[:n]
    

In [131]:
# Top 10 questions by Jaccard similarity for a sample query
pred_jaccard('How does amortization work ?',10)

In [130]:
# Same query through the TF-IDF ranker, top 10, shown as a DataFrame for comparison
pd.DataFrame(predict('How does amortization work ?',10))

In [123]:
# Count, for every token, how many questions' token sets contain it
all_ = {}
for question_tokens in tokens.values():
    for word in question_tokens:
        all_[word] = all_.get(word, 0) + 1

# The 40 tokens that occur in the fewest questions
pd.Series(all_).sort_values().head(40)

useful         1
alternate      1
tat            1
wants          1
levied         1
penalty        1
attempts       1
failure        1
shall          1
successful     1
website        1
store          1
unblock        1
blocked        1
purchases      1
fraudulent     1
us             1
equipment      1
greeting       1
sc             1
mind           1
things         1
cardments      1
stating        1
exceeds        1
authorized     1
fresh          1
wrongly        1
lock           1
interactive    1
downloaded     1
inactive       1
platforms      1
regenerate     1
note           1
mpin           1
devices        1
show           1
storage        1
months         1
dtype: int64