In [17]:
import numpy as np

In [18]:
from lingualytics.preprocessing import remove_lessthan, remove_punctuation, remove_stopwords
from lingualytics.stopwords import hi_stopwords,en_stopwords
from texthero.preprocessing import remove_digits
import pandas as pd

In [19]:
dataset = r"C:\Users\ilann\kandi\faqs.csv"

In [20]:
df = pd.read_csv(dataset, encoding_errors="ignore")
df

Unnamed: 0,Q,A
0,What is kandi?,kandi (pronounced kandee) is a platform that h...
1,Have feedback or want to know more?,We are a passionate set of application focused...
2,What components does kandi cover?,kandi helps you select software components acr...
3,How do I use kandi?,kandi provides two simplified experiences to h...
4,How do I shortlist components on kandi?,You can use the below filters to shortlist com...
5,How do I implement the components that I have ...,The component listing and detailed insights pa...


In [21]:
#Data preprocessing (removing punctautions and stopwords)

# pd.set_option('display.max_colwidth', None)
df['clean_Q'] = df['Q'].pipe(remove_digits) \
                                    .pipe(remove_punctuation) \
#                                   .pipe(remove_lessthan,length=3) \
#                                    .pipe(remove_stopwords,stopwords=en_stopwords.union(hi_stopwords))
print(df)

                                                   Q  \
0                                     What is kandi?   
1                Have feedback or want to know more?   
2                  What components does kandi cover?   
3                                How do I use kandi?   
4            How do I shortlist components on kandi?   
5  How do I implement the components that I have ...   

                                                   A  \
0  kandi (pronounced kandee) is a platform that h...   
1  We are a passionate set of application focused...   
2  kandi helps you select software components acr...   
3  kandi provides two simplified experiences to h...   
4  You can use the below filters to shortlist com...   
5  The component listing and detailed insights pa...   

                                             clean_Q  
0                                     What is kandi   
1                Have feedback or want to know more   
2                  What components does kandi cov

  return s.str.replace(rf"([{punctuation}])+", " ")


In [22]:
#Compute embedding for sentences/training the model
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

In [23]:
Q_embedded = model.encode(df['clean_Q'], convert_to_tensor=True)



In [24]:
Q_embedded.shape

torch.Size([6, 384])

In [25]:
#get user query
user_query = "tell me about kandi"

In [26]:
#compute embeddings for user query
df_user_query = pd.DataFrame([user_query], columns=["user_query"])
df_user_query

Unnamed: 0,user_query
0,tell me about kandi


In [30]:
#takes input in data frame format
df_user_query['clean_user_Q'] = df_user_query['user_query'].pipe(remove_digits) \
                                    .pipe(remove_punctuation)
df_user_query

  return s.str.replace(rf"([{punctuation}])+", " ")


Unnamed: 0,user_query,clean_user_Q
0,tell me about kandi,tell me about kandi


In [33]:
user_Q_embedded = model.encode(df_user_query['clean_user_Q'], convert_to_tensor=True)
user_Q_embedded.shape

torch.Size([1, 384])

In [44]:
user_Q_embedded

tensor([[ 6.9899e-02,  9.4276e-03, -1.3369e-02,  4.0351e-02,  5.2212e-03,
          3.5141e-02,  8.5067e-02, -9.9379e-03, -3.9313e-02,  5.8849e-02,
          1.0085e-01, -5.3544e-02, -2.2394e-02, -5.1937e-02,  3.0627e-02,
         -4.6265e-02,  3.4853e-02,  2.8061e-02, -7.2037e-02, -2.9808e-02,
         -6.2095e-02, -1.0985e-02, -6.8197e-03, -4.0744e-02, -4.1265e-02,
         -4.2136e-02,  7.0115e-02, -7.3623e-02, -6.7594e-03,  1.1053e-03,
         -5.2180e-02,  1.7913e-02,  3.9321e-02,  5.7513e-02, -2.3647e-02,
          8.5967e-03,  6.0631e-02,  3.5874e-02,  5.3394e-03, -2.3639e-02,
         -2.4127e-02, -7.1573e-02, -2.6738e-03, -5.7618e-02,  4.4879e-02,
         -6.6413e-02, -4.1006e-02, -3.1347e-02, -3.0792e-02,  1.5158e-02,
         -3.4194e-02,  4.8126e-02,  5.7961e-02, -4.6704e-02,  4.0438e-02,
          3.5611e-03, -3.4743e-03, -1.7984e-02,  7.4855e-02, -4.7505e-02,
          3.7020e-02, -6.1809e-03,  5.3995e-02, -4.4603e-02,  3.1403e-02,
         -1.5275e-01, -3.1960e-03,  6.

In [32]:
#compute cosine similarities - user queries vs dataset queries
import torch
cos_fn = torch.nn.CosineSimilarity(dim=1, eps=1e-6)

In [35]:
cos_fn(user_Q_embedded, Q_embedded)

tensor([0.8420, 0.1320, 0.6204, 0.7314, 0.4508, 0.5415])

In [37]:
#found the index which has the most similarity
index = np.argmax(cos_fn(user_Q_embedded, Q_embedded)).item()

In [38]:
df['A'][index]

'kandi (pronounced kandee) is a platform that helps developers pick the right library, package, code samples, APIs, and cloud functions, by analyzing over 430 million knowledge items.'

In [41]:
def predictions(user_query):
    df_user_query = pd.DataFrame([user_query], columns=["user_query"])
    df_user_query['clean_user_Q'] = df_user_query['user_query'].pipe(remove_digits) \
                                    .pipe(remove_punctuation)
    user_Q_embedded = model.encode(df_user_query['clean_user_Q'], convert_to_tensor=True)
    cos_fn = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
    index = np.argmax(cos_fn(user_Q_embedded, Q_embedded)).item()
    return df['A'][index]

In [43]:
while True:
    user_query = input("Ask your query: ")
    if user_query == "exit" or "close":
        break
    else:
        print(predictions(user_query))
        print("~~~~~~~~~~~~~~~~~")

Ask your query: tell me about kandi


  return s.str.replace(rf"([{punctuation}])+", " ")


kandi (pronounced kandee) is a platform that helps developers pick the right library, package, code samples, APIs, and cloud functions, by analyzing over 430 million knowledge items.
~~~~~~~~~~~~~~~~~
Ask your query: tell me about kandi!
kandi (pronounced kandee) is a platform that helps developers pick the right library, package, code samples, APIs, and cloud functions, by analyzing over 430 million knowledge items.
~~~~~~~~~~~~~~~~~
Ask your query: close
We are a passionate set of application focused techies. Wed love to hear from you on your feedback, questions, and any other comments.
Direct Message us on Twitter Message @OpenWeaverInc
You can email us at kandi.support@openweaver.com
Join our Discord community here
~~~~~~~~~~~~~~~~~
Ask your query: exit
