In [12]:
import os

# Top-level folders for model artefacts and processed data.
# The defaults are the original workstation paths; set RESUME_BUILDER_MODEL_DIR /
# RESUME_BUILDER_DATA_DIR to run the notebook elsewhere without editing this cell.
# os.path.join(path, "") guarantees a trailing separator, which the rest of the
# notebook relies on because it builds file paths by plain string concatenation.
model_dir_ = os.path.join(
    os.environ.get(
        "RESUME_BUILDER_MODEL_DIR",
        "/home/sugam/Work/10-19 NLP/12 Projects/Resume Builder/Output/",
    ),
    "",
)
data_dir = os.path.join(
    os.environ.get(
        "RESUME_BUILDER_DATA_DIR",
        "/home/sugam/Work/10-19 NLP/12 Projects/Resume Builder/data/Processed/",
    ),
    "",
)

# Derived locations: the "bert/" sub-folder of the model directory and the raw context CSV.
model = model_dir_ + "bert/"
context_data = data_dir + "context_data.csv"

In [13]:
import pickle
import re
import string
import unicodedata

import gensim
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from scipy.spatial.distance import cosine
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

## Part-1
- Load the context data
- Preprocess the text: lowercasing -> punctuation removal -> accented-character removal -> stopword removal

In [14]:
##### Load the data
# Read the context CSV and rename the column labelled "0" to "context" in one chained step.
context = pd.read_csv(context_data).rename(columns={"0": "context"})

In [15]:
class LowerCasing(BaseEstimator, TransformerMixin):
    """Pipeline step that lower-cases one string.

    Both methods receive a single string (not a collection of documents),
    which is how the notebook feeds the pipeline.
    """

    def fit(self, text, y=None):
        """Nothing to learn; return the step itself so it can be chained."""
        return self

    def transform(self, text):
        """Return ``text`` converted to lower case."""
        lowered = text.lower()
        return lowered

In [16]:
class RemovePunctuation(BaseEstimator, TransformerMixin):
    """Pipeline step that strips punctuation from one string.

    Every character in ``_DELETED`` is dropped outright and each comma is
    replaced by a single space. Hyphens and underscores are not in the
    deletion set, so they pass through unchanged.
    """

    # Characters removed without replacement.
    # NOTE(review): parentheses are deleted rather than turned into a space, so
    # "python(django)" becomes "pythondjango" - confirm that this is intended.
    _DELETED = '!"#$%&\'()*+./:;<=>?@[\\]^`{|}~'

    # One translation table, built once: "," -> " ", plus the deletions above.
    _TABLE = str.maketrans(",", " ", _DELETED)

    def fit(self, text, y=None):
        """Nothing to learn; return the step itself so it can be chained."""
        return self

    def transform(self, text):
        """Return ``text`` with punctuation removed and commas turned into spaces."""
        return text.translate(self._TABLE)

In [17]:
class RemoveAccent(BaseEstimator,TransformerMixin):
    """Pipeline step that strips accents from one string.

    Accented letters are reduced to their base letter ("résumé" -> "resume")
    instead of being deleted, so the surrounding word stays intact.
    """

    def fit(self,text,y=None):
        """Nothing to learn; return the step itself so it can be chained."""
        return self

    def transform(self,text):
        """Return ``text`` with diacritics removed."""
        # Canonical decomposition splits "é" into "e" + a combining accent;
        # dropping the combining marks keeps only the base letters.
        decomposed = unicodedata.normalize("NFD", text)
        base_only = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
        # Recompose so characters that were decomposed but carry no accent
        # are restored to their usual form.
        text = unicodedata.normalize("NFC", base_only)

        # Letters from the original list that have no decomposition (e.g. "ø")
        # survive the step above; they are still deleted, as before.
        accent_letters = 'éàáñüãèìöäøõîûçôšâ'
        text = text.translate(str.maketrans("","",accent_letters))

        return text

In [18]:
class RemoveStopWords(BaseEstimator,TransformerMixin):
    """Pipeline step that blanks out English stopwords in one string.

    The text is split on whitespace; every token found in NLTK's English
    stopword list is replaced by an empty string and the tokens are re-joined
    with single spaces. A removed word therefore leaves an extra space behind,
    which matches the output this step has always produced.
    """

    # Filled on first use and shared by all instances, so the NLTK list is
    # fetched once instead of once per token.
    _english_stopwords = None

    def fit(self,text,y=None):
        """Nothing to learn; return the step itself so it can be chained."""
        return self

    def transform(self,text):
        """Return ``text`` with stopword tokens replaced by empty strings."""
        cls = type(self)
        if cls._english_stopwords is None:
            # frozenset gives constant-time membership tests.
            cls._english_stopwords = frozenset(stopwords.words("english"))
        stop = cls._english_stopwords

        kept = [token if token not in stop else "" for token in text.split()]
        return " ".join(kept)

In [19]:
# class Tokenize(BaseEstimator, TransformerMixin):
#     def fit(self, text, y = None):
#         return self

#     def transform(self, text):
        

In [20]:
# Text-cleaning pipeline; the steps run in the order they are listed.
pipe = Pipeline(
    steps=[
        ("lower", LowerCasing()),
        ("remove punctuation", RemovePunctuation()),
        ("remove accent", RemoveAccent()),
        ("remove stopwords", RemoveStopWords()),
    ]
)

In [21]:
# Display the pipeline object to confirm its steps and their order.
pipe

In [None]:
#-------------------------> DO NOT RUN 2X <-----------------------------------------------------------------------------------

# Run every row of the "context" column through the cleaning pipeline,
# replacing the column with the cleaned text.
context["context"] = context["context"].map(pipe.fit_transform)

# Persist the cleaned contexts next to the raw data, then drop the frame
# to free up some memory.
context.to_csv(data_dir + "context_processed.csv", index=False)
del context

## Part-2
- Load the preprocessed context data
- Load the 200-dimensional GloVe vectors (`glove-wiki-gigaword-200`) through gensim and vectorize each row
- Vectorize the question
- Compute the cosine similarity between the question vector and each context vector
- Take the context with the maximum similarity as the context for the question.

In [22]:
## Load the saved data
context = pd.read_csv(data_dir+"context_processed.csv")
# A row whose text was emptied by preprocessing is written as an empty CSV
# field and read back as NaN (a float). Replace those with "" so every entry
# is a string for the steps that follow. fillna returns a new Series, so
# context_series does not alias the frame that is deleted below.
context_series = context["context"].fillna("")
del context 

In [23]:
import gensim.downloader as api
from gensim.models import KeyedVectors

# List the names of the models gensim's downloader knows about.
print(list(api.info()["models"]))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [24]:
# -------------------------------> DO NOT RUN 2X <-----------------------------------------
#### Import the gensim model
# Fetch the "glove-wiki-gigaword-200" vectors through gensim's downloader
# (`api` is gensim.downloader) and bind them to `wv`.
wv = api.load("glove-wiki-gigaword-200")

In [25]:
#### Save the model for future use
# Writes the vectors currently bound to `wv` to "glove_model" under model_dir_.
wv.save(model_dir_+"glove_model")

In [26]:
# Reload the vectors from the copy saved under model_dir_, rebinding `wv`.
wv = KeyedVectors.load(model_dir_+"glove_model")

In [28]:
def sent_vec(sent, model=None):
    """Return the mean word vector of a sentence.

    Parameters
    ----------
    sent : str, iterable of str, None or float
        A whitespace-separated sentence, or an already tokenised sequence of
        words. ``None`` and float values (how pandas represents a missing
        cell) are treated as an empty sentence.
    model : optional
        Word-vector lookup supporting ``word in model``, ``model[word]`` and
        ``model.vector_size``. Defaults to the notebook-level ``wv``.

    Returns
    -------
    numpy.ndarray
        Array of length ``model.vector_size`` holding the average of the
        vectors of all in-vocabulary words, or all zeros when no word of the
        sentence is in the vocabulary.
    """
    if model is None:
        model = wv

    # Iterating a string directly would yield single characters, so split it
    # into words first.
    if isinstance(sent, str):
        tokens = sent.split()
    elif sent is None or isinstance(sent, float):
        tokens = []
    else:
        tokens = list(sent)

    total = np.zeros(model.vector_size)
    hits = 0
    for token in tokens:
        if token in model:
            total += model[token]
            hits += 1

    # Divide by the number of words actually found; leave the zero vector
    # untouched when none were, which also avoids dividing by zero.
    return total / hits if hits else total

# Encode every context row with sent_vec, then pickle the resulting Series
# of vectors under model_dir_.
vec = context_series.map(sent_vec)
pd.to_pickle(vec, model_dir_ + "glove_encoding.pkl")

In [None]:
####### This will load premade embeddings
# Read from model_dir_, which is where this notebook writes glove_encoding.pkl;
# looking in data_dir would miss the file produced by the encoding cell.
# pd.read_pickle is the counterpart of the to_pickle call that created it.
# Unpickling executes code from the file, so only load pickles you produced.
vec = pd.read_pickle(model_dir_+"glove_encoding.pkl")

### Convert into dataframe
vec = pd.DataFrame(vec)

In [None]:
# Keep the question exactly as asked in `question`; the payload built further
# down passes this variable on, so it should stay natural-language text.
question = "What is job provided by Inbiopro Solutions Pvt Ltd company with autocad as input?"

# Clean a separate copy with the same pipeline used for the contexts and embed
# that copy; only the vector is needed for the similarity search.
question_processed = pipe.fit_transform(question)
question_vector = sent_vec(question_processed)

In [None]:
# Score each context vector against the question: similarity = 1 - cosine distance.
vec["cosine_similarity"] = vec["context"].map(
    lambda context_vector: 1 - cosine(context_vector, question_vector)
)

# Position of the best-scoring row; left as the last expression so it is displayed.
index_max_similarity = vec["cosine_similarity"].argmax()
index_max_similarity

In [None]:
##### Read the original context data
# Re-read the unprocessed contexts (the same file `context_data` points to)
# so the text handed on is the original text, not the cleaned version.
og_context = pd.read_csv(data_dir+"context_data.csv")

In [None]:
# Fetching the context based on the maximum similarity
# Takes the first column of the row at position index_max_similarity.
# NOTE(review): assumes og_context rows are in the same order as the encoded
# vectors - confirm nothing reorders or drops rows in between.
context_sentence = og_context.iloc[index_max_similarity,0]

In [None]:
# Single-item payload: the selected context plus one question entry with id "0".
to_predict = [
    {"context": context_sentence, "qas": [{"question": question, "id": "0"}]}
]

In [None]:
# Display the assembled payload for inspection.
to_predict