In [9]:
import pandas as pd
import numpy as np
# Load the FAQ table; the cells below read its 'Question' and 'Answer' columns.
# NOTE(security): unpickling can execute arbitrary code -- only load
# 'hdfc.pkl' if it comes from a trusted source.
df = pd.read_pickle('hdfc.pkl')
# Strip surrounding whitespace *before* de-duplicating, so that questions that
# differ only by leading/trailing spaces are recognised as duplicates.
df['Question'] = df['Question'].str.strip()
df = df.drop_duplicates('Question')
# Renumber the surviving rows from 0; without drop=True the previous row
# labels are kept in an extra 'index' column.
df = df.reset_index()

import nltk
from nltk.corpus import stopwords
# Let pandas use up to 1000 characters per line when printing frames.
pd.options.display.width = 1000
# English stop words from the NLTK corpus, held as a set.
stop = set(stopwords.words('english'))

In [10]:
# Work on a small prefix of the FAQ table: the first `limit` rows, restricted
# to the two text columns.
limit = 100
reduced = df.loc[:, ['Question', 'Answer']].iloc[:limit]

# Row label -> text lookup tables, one per column.
qlabels, alabels = (reduced[col].to_dict() for col in ('Question', 'Answer'))

print(reduced.head())

                                            Question                                             Answer
0  What will be done with the post dated cheques ...  Post Dated Cheques(PDCs)/Security Cheques subm...
1                  How can I repay my Personal Loan?  You pay the loan in equal monthly instalments ...
2  Are there any additional charges for loan repa...  The additional charges (if any) are applicable...
3                                 What is Guarantor?  A Guarantor is a person who guarantees to pay ...
4                                 What is De-pledge?  Removal of a pledge from the security to regai...


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF model on the selected questions.
# stop_words is documented as 'english', a list, or None; `stop` is a set, so
# hand over a (sorted, hence deterministic) list rather than the set itself.
tf_vect = TfidfVectorizer(stop_words=sorted(stop),
                          lowercase=True,
                          use_idf=True)
# One row per question, one column per vocabulary term.
all_qs_vectors = tf_vect.fit_transform(reduced['Question'])
print("Shape of all_qs_vectors : %s" % (all_qs_vectors.shape,))
print("%s : Number of questions" % all_qs_vectors.shape[0])
print("%s : Vocabulary size" % all_qs_vectors.shape[1])

Shape of all_qs_vectors : (100, 178)
100 : Number of questions
178 : Vocabulary size


In [12]:
# Project the query text into the TF-IDF space fitted on the questions.
context = "How can I repay my Personal Loan?"
# transform() takes a collection of documents, hence the one-element list.
vector_context = tf_vect.transform([context])
# Dense copy of the single query row, used for element lookups below.
context_matrix = vector_context.todense()

In [13]:
# Displaying TF_IDF results
from nltk import word_tokenize
# Header row, padded to the same column widths as the data rows.
print("%s %s %s" % ("WORD".ljust(10), "INDEX".ljust(6), "TFIDF_VALUE"))
for w in word_tokenize(context.strip()):
    # Tokens missing from the fitted vocabulary are reported as "NA" / 0.
    ind = tf_vect.vocabulary_.get(w.lower(), "NA")
    if ind == "NA":
        val = 0
    else:
        val = context_matrix[0, ind]
    print("%s %s %s" % (w.ljust(10), str(ind).ljust(6), val))

WORD       INDEX  TFIDF_VALUE
How        NA     0
can        NA     0
I          NA     0
repay      141    0.6328378766551715
my         NA     0
Personal   118    0.714811086464789
Loan       87     0.2975925612941322
?          NA     0


In [14]:
# Comparing context with all questions using dot product.
# Use the sparse matrix's own .dot(): np.dot is not aware of scipy.sparse
# matrices and is not a supported way to multiply them.
result = all_qs_vectors.dot(vector_context.T)
# Flatten the (n_questions x 1) sparse product into a 1D array holding one
# dot product per question.
arr = result.toarray().flatten()
n = 5
# argsort is ascending, so reverse it to put the highest scores first.
matches = arr.argsort(axis=0)[::-1]
top_n_matches = matches[:n]
# NOTE: argsort yields row positions while qlabels is keyed by row label;
# this lookup relies on the frame having a 0-based consecutive index.
for i in top_n_matches:
    print(qlabels[i])

How can I repay my Personal Loan?
Can I repay the Personal loan earlier?
How long can I take to repay my personal loan?
How do I repay my Professionals loan?
How do I repay my Business Loan?
