In [1]:
# Root folders for model artefacts and processed data.
# NOTE: these are absolute, machine-specific locations; adjust them when running elsewhere.
model_dir_ = "/home/sugam/Work/10-19 NLP/12 Projects/Resume Builder/Output/"
data_dir =  "/home/sugam/Work/10-19 NLP/12 Projects/Resume Builder/data/Processed/"

# Derived locations: BERT model folder, test-set JSON and raw context CSV.
model = f"{model_dir_}bert/"
test_data = f"{data_dir}test_json.json"
context_data = f"{data_dir}context_data.csv"

In [5]:
import torch
import json
import pandas as pd
import re
import string
from textblob import TextBlob
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

## Part 1: Clean the raw context text

In [168]:
# Load the raw context table and give the text column (stored under the header "0") a readable name.
context = pd.read_csv(context_data).rename(columns={"0":"context"})
context.head()

Unnamed: 0,context
0,"JT_Walkin Data Entry Operator (night Shift),CO..."
1,"JT_Work Based Onhome Based Part Time,CO_find l..."
2,"JT_Pl,sql Developer - SQL,CO_Softtech Career I..."
3,"JT_Manager,ad,partner - Indirect Tax - CA,CO_O..."
4,"JT_JAVA Technical Lead (6-8 yrs) -,CO_Spire Te..."


In [169]:

class LowerCasing(BaseEstimator,TransformerMixin):
    """Pipeline step that lower-cases a single string.

    The step is stateless: ``fit`` learns nothing and returns the
    transformer itself.
    """

    def fit(self, text, y=None):
        """No-op fit; returns ``self`` so the step can be chained."""
        return self

    def transform(self, text):
        """Return ``text`` converted to lower case via ``text.lower()``."""
        lowered = text.lower()
        return lowered

In [170]:
class RemovePunctuation(BaseEstimator, TransformerMixin):
    """Pipeline step that strips punctuation from a single string.

    Every character listed in ``exclude`` is deleted, while commas are
    replaced by a space. Hyphens and underscores are not in the list and
    are therefore kept.
    """

    def fit(self, text, y=None):
        """No-op fit; returns ``self`` so the step can be chained."""
        return self

    def transform(self, text):
        """Return ``text`` with punctuation removed and commas turned into spaces."""
        exclude = '!"#$%&\'()*+./:;<=>?@[\\]^`{|}~'
        # One table does both jobs: "," -> " " and deletion of everything in
        # `exclude`. Parentheses are part of `exclude`, so they need no
        # separate handling.
        table = str.maketrans(",", " ", exclude)
        return text.translate(table)

In [171]:
class RemoveAccent(BaseEstimator,TransformerMixin):
    """Pipeline step that deletes a fixed set of accented letters from a string.

    The listed characters are removed outright; they are not replaced by
    their unaccented counterparts.
    """

    def fit(self, text, y=None):
        """No-op fit; returns ``self`` so the step can be chained."""
        return self

    def transform(self, text):
        """Return ``text`` with every character of ``accent_letters`` dropped."""
        accent_letters = 'éàáñüãèìöäøõîûçôšâ'
        # Mapping a code point to None tells str.translate to delete it.
        drop_table = dict.fromkeys(map(ord, accent_letters))
        return text.translate(drop_table)

In [172]:
class RemoveStopWords(BaseEstimator,TransformerMixin):
    """Pipeline step that blanks out English stop words in a single string.

    The string is split on whitespace; every token found in NLTK's English
    stop-word list is replaced by an empty string and the tokens are joined
    back with single spaces. Because stop words become empty tokens rather
    than being dropped, the result can contain runs of consecutive spaces.
    """

    def fit(self, text, y=None):
        """No-op fit; returns ``self`` so the step can be chained."""
        return self

    def transform(self, text):
        """Return ``text`` with each English stop word replaced by an empty token.

        Parameters
        ----------
        text : str
            Input string. Matching is case-sensitive against the NLTK list.

        Returns
        -------
        str
            Tokens joined by a single space, with stop words blanked.
        """
        # Load the stop-word list once per call and keep it in a set: the
        # original re-read the corpus and scanned a list for every token.
        english_stopwords = set(stopwords.words("english"))
        kept_tokens = [
            "" if token in english_stopwords else token
            for token in text.split()
        ]
        return " ".join(kept_tokens)

In [173]:
# Text-cleaning pipeline; the steps run in the order listed:
# lower-case -> strip punctuation -> drop accented letters -> blank stop words.
pipe = Pipeline(
    steps=[
        ("lower", LowerCasing()),
        ("remove punctuation", RemovePunctuation()),
        ("remove accent", RemoveAccent()),
        ("remove stopwords", RemoveStopWords()),
    ]
)

In [175]:
# Display the pipeline to confirm the configured steps.
pipe

In [177]:
# Run every context string through the cleaning pipeline, one row at a time
# (each transformer operates on a single string, not on the whole column).
context["context"] = context["context"].map(pipe.fit_transform)

In [180]:
# def lower_casing(text):
#     return text.lower()

# def remove_punctuation(text):
#     exclude = '!"#$%&\'()*+./:;<=>?@[\\]^`{|}~'
#     text = text.translate(str.maketrans("","",exclude))
#     text = re.sub(","," ",text)
#     text = re.sub(r"\("," ",text)
#     text = re.sub(r"\)"," ",text)

#     return text


# def remove_accent_letters(text):
#     accent_letters = 'éàáñüãèìöäøõîûçôšâ'
#     text = text.translate(str.maketrans("","",accent_letters))

#     return text

# def remove_stopwords(text):
#     new_text = []
#     for words in text.split():
#         if words not in stopwords.words("english"):
#             new_text.append(words)
#         else:
#             new_text.append("")
#     text = " ".join(new_text)

#     return text


# context["context"]= context["context"].map(lower_casing)
# context["context"]= context["context"].map(remove_punctuation)
# context["context"]= context["context"].map(remove_accent_letters)
# context["context"]= context["context"].map(remove_stopwords)
# context.head()

In [179]:
# Persist the cleaned context table (without the index), then drop the
# DataFrame from the namespace.
context.to_csv(f"{data_dir}context_processed.csv", index=False)
del context

## Part 2: TF-IDF retrieval of the closest context

In [6]:
# Reload the cleaned contexts and keep only the text column as an independent Series.
context = pd.read_csv(f"{data_dir}context_processed.csv")
context_series = context.loc[:, "context"].copy()
del context

In [18]:
class TFIDF(BaseEstimator,TransformerMixin):
    """Pipeline step that turns a collection of documents into a dense TF-IDF table.

    ``fit`` learns the vocabulary and IDF weights from the documents it is
    given and keeps the fitted ``TfidfVectorizer`` on the instance as
    ``vectorizer_``. ``transform`` then projects whatever documents it
    receives into that same feature space, so the vectorizer can be reused
    for new text (for example a question) after fitting.
    """

    def fit(self, text, y=None):
        """Fit a ``TfidfVectorizer`` (default settings) on ``text``.

        Parameters
        ----------
        text : iterable of str
            Documents used to learn the vocabulary and IDF weights.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        TFIDF
            The fitted transformer itself.
        """
        self.vectorizer_ = TfidfVectorizer()
        self.vectorizer_.fit(text)
        return self

    def transform(self, text):
        """Return the TF-IDF representation of ``text`` as a DataFrame.

        Parameters
        ----------
        text : iterable of str
            Documents to vectorise.

        Returns
        -------
        pandas.DataFrame
            One row per document and one column per vocabulary term, named
            via ``get_feature_names_out()``.
        """
        # The original transform worked without a prior fit (it fitted
        # internally), so keep that working by fitting lazily on first use.
        if not hasattr(self, "vectorizer_"):
            self.fit(text)

        # Use the documents that were passed in instead of a notebook global.
        tfidf_matrix = self.vectorizer_.transform(text)
        # NOTE: toarray() densifies the whole matrix, which can be large for
        # a big vocabulary; kept because callers expect a DataFrame.
        tfidf_df = pd.DataFrame(
            tfidf_matrix.toarray(),
            columns=self.vectorizer_.get_feature_names_out(),
        )

        return tfidf_df

In [8]:
# Single-step pipeline that encodes the cleaned contexts as a TF-IDF table.
coding_pipe = Pipeline(steps=[("tfidf", TFIDF())])
tfidf_df = coding_pipe.fit_transform(context_series)

In [11]:
# Cache the TF-IDF table on disk, then drop it from the namespace.
tfidf_df.to_pickle(f"{data_dir}tf-idf.pkl")
del tfidf_df

In [17]:
# Reload the cached TF-IDF table (a pickle written by this notebook; do not
# point this at untrusted files) and preview the first rows.
tfidf_df = pd.read_pickle(f"{data_dir}tf-idf.pkl")
tfidf_df.head()

Unnamed: 0,00,000,00000,0008,000i,002,0021,003,004,0073,...,zos,zs4,zsm,zuha,zuti,zycus,zydus,½cke,½ï,ïƒ
0,0.0,0.168303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.148077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [146]:
question = "What is job provided by Lok Bharti Skilling Solution Pvt Ltd company?"

# No `tfidf_vectorizer` exists at notebook scope, so fit one here on the
# context corpus. With default settings the fit is deterministic, so the
# vocabulary should line up with the columns of `tfidf_df` as long as that
# table was built from this same `context_series` — verify if the cached
# pickle is older than the processed CSV.
tfidf_vectorizer = TfidfVectorizer().fit(context_series)
question_vector = tfidf_vectorizer.transform([question])

In [16]:
# Inspect the TF-IDF vector of the question (requires the cell that defines it to have run).
question_vector

NameError: name 'question_vector' is not defined

In [147]:
# Calculate the cosine similarity between the question and context data.
# TfidfVectorizer L2-normalises rows by default, so the plain dot product
# computed by linear_kernel equals the cosine similarity.
# `tfidf_matrix` is not defined at notebook scope; use the TF-IDF table
# `tfidf_df` (one row per context) instead.
cosine_similarities = linear_kernel(question_vector, tfidf_df.to_numpy()).flatten()

In [148]:
# Inspect the similarity score of the question against every context.
cosine_similarities

array([0.0135628 , 0.        , 0.01349063, ..., 0.        , 0.        ,
       0.02319114])

In [150]:
# Get the position of the context with the highest similarity
closest_context_index = cosine_similarities.argmax()

# Get the closest context. argmax() returns a position, so look it up
# positionally with .iloc; plain [] is label-based and would only agree while
# the Series keeps its default 0..n-1 index.
closest_context = context_series.iloc[closest_context_index]


In [15]:
import random
import gradio as gr

def random_response(message, history):
    """Placeholder chat handler: answer "Yes" or "No" at random.

    Both ``message`` and ``history`` are accepted to satisfy the chat
    callback signature but are not used.
    """
    replies = ["Yes", "No"]
    return random.choice(replies)

# Wire the placeholder handler into a Gradio chat UI and start the local server.
demo = gr.ChatInterface(fn=random_response)
demo.launch()

  from .autonotebook import tqdm as notebook_tqdm


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


