In [31]:
import numpy as np
import pandas as pd
import re
import gradio
import pickle
from nltk.corpus import stopwords
from scipy.spatial.distance import cosine
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import KeyedVectors # For loading saved gensim model
from simpletransformers.question_answering import QuestionAnsweringModel # This is for model

In [19]:
# Root folders for trained artefacts and processed data.
# NOTE(review): these are absolute, machine-specific paths; the notebook only runs
# unchanged on a machine with this exact directory layout.
model_dir_ = "/home/sugam/Work/10-19 NLP/12 Projects/Resume Builder/Output/"
data_dir = "/home/sugam/Work/10-19 NLP/12 Projects/Resume Builder/data/Processed/"

# Derived locations: the BERT output folder and the CSV holding the original contexts.
model = f"{model_dir_}bert/"
context_data = f"{data_dir}context_data.csv"

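#### Chat Interface
- Question -> given as input to the chatbot
- Preprocessing pipeline to clean the question
- Precomputed context embeddings, compared with the question embedding by cosine similarity
- Original context data, from which the best-matching context is retrieved
- Model to predict the answer from the question and the retrieved context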

### PREPROCESSING PIPELINE

In [16]:
class LowerCasing(BaseEstimator,TransformerMixin):
    """Pipeline step that converts a string to lower case."""

    def fit(self, text, y=None):
        """Nothing is learned; returns the step itself."""
        return self

    def transform(self, text):
        """Return ``text`` with every character lower-cased."""
        lowered = text.lower()
        return lowered


class RemovePunctuation(BaseEstimator, TransformerMixin):
    """Pipeline step that strips punctuation from a string.

    Every character listed in ``exclude`` is deleted and every comma is
    replaced by a single space. Characters that are not listed (for example
    ``-`` and ``_``) are left untouched.
    """

    def fit(self,text,y=None):
        """Nothing is learned; returns the step itself."""
        return self

    def transform(self,text):
        """Return ``text`` with the listed punctuation deleted and commas turned into spaces."""
        exclude = '!"#$%&\'()*+./:;<=>?@[\\]^`{|}~'
        # One translation table does the whole job: "," -> " ", and every
        # character of `exclude` is dropped. Parentheses are part of `exclude`,
        # so the separate "(" / ")" substitutions that used to follow could
        # never match anything and have been removed.
        table = str.maketrans(",", " ", exclude)
        return text.translate(table)


class RemoveAccent(BaseEstimator,TransformerMixin):
    """Pipeline step that deletes a fixed set of accented letters from a string.

    The letters are removed outright, not replaced by an unaccented
    equivalent (so the letter simply disappears from the word).
    """

    def fit(self, text, y=None):
        """Nothing is learned; returns the step itself."""
        return self

    def transform(self, text):
        """Return ``text`` without any of the characters in ``accent_letters``."""
        accent_letters = 'éàáñüãèìöäøõîûçôšâ'
        # Mapping a code point to None makes str.translate drop that character.
        drop_table = {ord(letter): None for letter in accent_letters}
        return text.translate(drop_table)


class RemoveStopWords(BaseEstimator,TransformerMixin):
    """Pipeline step that blanks out English stopwords in a string.

    The text is split on whitespace; each token found in NLTK's English
    stopword list is replaced by an empty string and the tokens are joined
    back with single spaces. A removed stopword therefore leaves an extra
    space behind; this is kept on purpose so the output is unchanged.
    """

    def fit(self,text,y=None):
        """Nothing is learned; returns the step itself."""
        return self

    def transform(self,text):
        """Return ``text`` with every stopword token replaced by ``""``."""
        # Build the lookup once per call instead of re-reading the stopword
        # list (and scanning it linearly) for every single token.
        english_stopwords = set(stopwords.words("english"))
        kept = [
            "" if token in english_stopwords else token
            for token in text.split()
        ]
        return " ".join(kept)
    

# Question preprocessing, applied in this order: lower-case, strip punctuation,
# drop accented letters, blank out stopwords. Lower-casing runs first, so the
# later steps only ever see lower-case text.
pipe = Pipeline(steps=[
    ("lower", LowerCasing()),
    ("remove punctuation", RemovePunctuation()),
    ("remove accent", RemoveAccent()),
    ("remove stopwords", RemoveStopWords()),
])

### WORD2VEC CLASS

In [27]:
class Word2Vec():
    """
    Loads saved gensim word vectors and uses them to encode a question.

    The vectors are read with ``KeyedVectors.load``; their dimensionality is
    taken from ``self.wv.vector_size`` (the original note says 200 dimensions).
    """

    def __init__(self, path):
        """Load the saved vectors found at ``path``."""
        self.wv = KeyedVectors.load(path)

    def sent_vec(self, sent):
        """
        Creates a vector from a sentence by averaging its known word vectors.

        Args:
            sent: either a string (split on whitespace into words) or an
                iterable of word tokens.

        Returns:
            A numpy array of length ``self.wv.vector_size`` holding the mean of
            the vectors of all tokens present in the vocabulary, or all zeros
            when no token is known.
        """
        # A plain string used to be iterated character by character, so only
        # single-character vocabulary entries could ever match. Tokenise first.
        # NOTE(review): if the stored context embeddings were produced by
        # passing raw strings to the old version, regenerate them.
        tokens = sent.split() if isinstance(sent, str) else sent

        vector_size = self.wv.vector_size
        wv_res = np.zeros(vector_size)
        ctr = 0
        for w in tokens:
            if w in self.wv:
                ctr += 1
                wv_res += self.wv[w]

        # Divide by the real number of matched tokens. The counter used to
        # start at 1, which divided by n + 1; that only rescaled the vector, so
        # cosine similarities computed from it are unaffected by this fix.
        if ctr:
            wv_res = wv_res / ctr
        return wv_res

  
# Location of the saved gensim vectors, inside the model output folder.
word_2_vec_model_dir = f"{model_dir_}word2vec_model"

In [20]:
class LoadData():
    """Reads the two files the chatbot needs: the pickled context embeddings
    and the CSV with the original context text."""

    def __init__(self,embedding_path = "", context_path = ""):
        # Paths are only stored here; nothing is read until a load_* method is called.
        self.embedding_path = embedding_path
        self.context_path = context_path

    def load_context_embeddings(self):
        """Unpickle the premade embeddings and return them as a DataFrame."""
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # embedding_path at files you created yourself.
        with open(self.embedding_path, "rb") as handle:
            return pd.DataFrame(pickle.load(handle))

    def load_context_data(self):
        """Read the original context data from the CSV and return it as a DataFrame."""
        return pd.read_csv(self.context_path)
        


# Files read by LoadData: the pickled context embeddings and the context CSV.
embedding_path = f"{data_dir}word2vec_encoding.pkl"
context_path = f"{data_dir}context_data.csv"

In [21]:
def find_cosine_similarity(vec, question_vector):
    """
    Find the context whose embedding is most similar to the question.

    Each entry of ``vec["context"]`` is compared with ``question_vector`` using
    ``1 - cosine distance``. The scores are written into ``vec`` as a new
    ``"cosine_similarity"`` column (the frame passed in is modified), and the
    position of the highest score is returned.
    """
    def _similarity(context_vector):
        # scipy's `cosine` is a distance, so similarity is its complement.
        return 1 - cosine(context_vector, question_vector)

    scores = vec["context"].apply(_similarity)
    vec["cosine_similarity"] = scores

    best_position = vec["cosine_similarity"].argmax()
    return best_position
    

In [22]:
def format_data(context_sentence,question):
    """
    Wrap one context and one question in the nested list/dict structure that
    is later handed to the model's ``predict``.

    Returns a one-element list whose entry holds the context and a single
    question with the fixed id ``"0"``.
    """
    qa_entry = {"question": question, "id": "0"}
    sample = {"context": context_sentence, "qas": [qa_entry]}
    return [sample]

In [33]:
class Model():
    """Thin wrapper around the fine-tuned BERT question-answering model."""

    def __init__(self):
        # The checkpoint is read from the "best_model" folder under the
        # module-level `model` path; CUDA is explicitly disabled.
        checkpoint_dir = model+"/best_model/"
        self.model = QuestionAnsweringModel("bert", checkpoint_dir, use_cuda=False)

    def predict(self,to_predict):
        """Run the model on ``to_predict`` and return the first answer of the first result."""
        # The second element of the returned pair is not used.
        answers, _unused = self.model.predict(to_predict)
        first_result = answers[0]
        return first_result["answer"][0]


In [None]:
import random
import gradio as gr
from typing import List
import time

# Heavy objects needed to answer a question. Filled on the first chat message
# and reused afterwards; re-running this cell resets the cache.
_RESPONSE_RESOURCES = {}


def _get_response_resources():
    """Load the word vectors, context embeddings, context data and QA model once and return them."""
    if not _RESPONSE_RESOURCES:
        loader = LoadData(embedding_path=embedding_path, context_path=context_path)
        # Build everything first, then publish in one step, so a failure
        # half-way through never leaves a partially filled cache behind.
        loaded = {
            "word2vec": Word2Vec(word_2_vec_model_dir),
            "context_embeddings": loader.load_context_embeddings(),
            "context_frame": loader.load_context_data(),
            "qa_model": Model(),
        }
        _RESPONSE_RESOURCES.update(loaded)
    return _RESPONSE_RESOURCES


def random_response(message: str, history: List):
    """
    The function handles whatever is to be shown in chatbot

    The word vectors, context embeddings, context data and QA model are loaded
    on the first call only and reused afterwards (they used to be re-read from
    disk for every message), so changes to those files are not picked up until
    this cell is re-run.

    Args:

    message (str) - representing the user's input.
    history (a list of list) representing the conversations up until that point. Each inner list consists of two str representing a pair: [user input, bot response]. Not used here.

    Yields:
    The answer revealed one character at a time, prefixed with "The answer is->",
    or a single fallback message when the model returns an empty answer.

    """
    resources = _get_response_resources()

    processed_question = pipe.fit_transform(message)  # Preprocesses the question
    question_vector = resources["word2vec"].sent_vec(processed_question)  # Create embedding of question

    # find_cosine_similarity adds a column to the frame it receives; hand it a
    # shallow copy so the cached embeddings stay untouched between messages.
    vec = resources["context_embeddings"].copy(deep=False)
    index_max_similarity = find_cosine_similarity(vec, question_vector)  # Position of the most similar context

    # Fetching the context from the original context data (first column)
    context = resources["context_frame"].iloc[index_max_similarity, 0]

    # The original question is used here, not the processed one, because the
    # model is not trained on processed questions.
    to_predict = format_data(context, message)

    answer = resources["qa_model"].predict(to_predict)

    if not answer:
        # Without this the generator would yield nothing and the chat would
        # show no reply at all.
        yield "Sorry, I could not find an answer to that question."
        return answer

    # Reveal the answer progressively to give a typing effect.
    for i in range(len(answer)):
        time.sleep(0.1)
        yield "The answer is->" + answer[:i+1]

    return answer

# Build the chat UI around `random_response`, enable queuing and start the server.
# `share=True` also requests a public link (the recorded output below shows a
# gradio.live URL), so anyone with that link can query the model while it is up.
# Note that `demo` ends up bound to whatever `.launch()` returns, not to the
# ChatInterface object itself.
# NOTE(review): the textbox placeholder and the example prompts talk about yes/no
# and general questions, while the description says "job related questions" -
# confirm which wording is intended.
demo = gr.ChatInterface(random_response,
                        chatbot=gr.Chatbot(height=300),
                        textbox=gr.Textbox(placeholder="Ask me a yes or no question", container=False, scale=7),
                        title="ResuBot",
                        description="Ask me any job related questions",
                        theme="soft",
                        examples=["Hello", "Am I cool?", "Are tomatoes vegetables?"],
                        cache_examples=False,
                        retry_btn="Retry",
                        undo_btn="Delete Previous",
                        clear_btn="Clear").queue().launch(share=True)


Running on local URL:  http://127.0.0.1:7870
Running on public URL: https://c897dbc64552de907c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 266.54it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 13486.51it/s]
Running Prediction: 100%|██████████| 1/1 [00:00<00:00,  1.87it/s]
convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 154.83it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 24528.09it/s]
Running Prediction: 100%|██████████| 1/1 [00:00<00:00,  1.90it/s]
