In [14]:
import re
import numpy as np
import pandas as pd
import pickle
import nltk
import re
import gensim
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from sklearn.feature_extraction.text import TfidfVectorizer
import heapq
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
# Fetch the NLTK resources used later in this notebook. When a package is
# already installed the call just logs "already up-to-date".
nltk.download('stopwords')  # stop-word list read via stopwords.words('english')
nltk.download('punkt')      # tokenizer models used by word_tokenize / sent_tokenize
nltk.download('wordnet')    # lexical database behind WordNetLemmatizer
nltk.download('omw-1.4')    # presumably needed alongside wordnet by the lemmatizer -- TODO confirm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [15]:
# Load the resume dataset from the working directory; the preview below shows
# the 'Category' and 'Resume' columns that the rest of the notebook relies on.
meta_df = pd.read_csv('ResumeDataSet.csv')
meta_df.head(3)

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."


In [16]:
# Compare the total row count with the number of rows that actually carry
# resume text (non-null 'Resume').
total_rows = meta_df.shape[0]
has_resume_text = meta_df['Resume'].notnull()
rows_with_text = len(meta_df[has_resume_text])

print('{} Total papers in original df'.format(total_rows))
print('{} Papers with abstract'.format(rows_with_text))

962 Total papers in original df
962 Papers with abstract


In [17]:
def minimal_clean_text(text):
    """Return ``text`` lower-cased, with hyphens and en dashes turned into spaces."""
    lowered = text.lower()
    for dash in ('-', '–'):
        lowered = lowered.replace(dash, ' ')
    return lowered

def preproc_text(text):
    """Turn raw text into a list of lemmatized, stop-word-free tokens.

    Steps: lower-case and convert dashes to spaces (``minimal_clean_text``),
    collapse every whitespace run (including ``\\r\\n`` and tabs) to a single
    space, delete all remaining characters that are not ASCII letters or
    spaces, tokenize, drop English stop words and one-character tokens, and
    lemmatize what is left.

    Parameters
    ----------
    text : str
        The text to process.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Separators must become spaces BEFORE the non-letter strip. Stripping
    # first deletes dashes and line breaks outright, which glues neighbouring
    # words together ("details\r\nmay" -> "detailsmay").
    text = minimal_clean_text(text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^A-Za-z ]+', '', text)

    word_tokens = word_tokenize(text)
    cleaned_text = [
        lemmatizer.lemmatize(token)
        for token in word_tokens
        if token not in stop_words and len(token) > 1
    ]
    return cleaned_text

In [18]:
# Discard rows without resume text, then normalise what remains
# (lower-case, dashes -> spaces) in a single chained expression.
meta_df = (
    meta_df
    .dropna(subset=['Resume'])
    .assign(Resume=lambda frame: frame['Resume'].apply(minimal_clean_text))
)
meta_df.head(3)

Unnamed: 0,Category,Resume
0,Data Science,skills * programming languages: python (pandas...
1,Data Science,education details \r\nmay 2013 to may 2017 b.e...
2,Data Science,"areas of interest deep learning, control syste..."


In [19]:
# Phrases to screen for; they are matched as plain substrings of 'Resume'.
keywords = ['deep learning']

# One boolean flag column per keyword, named after the keyword with spaces
# replaced by underscores (e.g. 'deep learning' -> 'deep_learning').
keyword_columns = []
for keyword in keywords:
    column_name = keyword.replace(' ', '_')
    meta_df[column_name] = meta_df['Resume'].apply(lambda resume_text, kw=keyword: kw in resume_text)
    keyword_columns.append(column_name)

# Keep every resume that matches ANY keyword. A boolean row mask is used rather
# than a hand-assembled query() string, which is a syntax error when a keyword
# is not a valid identifier after the underscore swap (e.g. 'c++', 'node.js')
# or when the keyword list is empty.
matches_any_keyword = meta_df[keyword_columns].any(axis=1)
resume_screening_df = meta_df.loc[matches_any_keyword].reset_index(drop=True)
resume_screening_df['tokenized_abstract'] = resume_screening_df['Resume'].apply(preproc_text)

print('{:.2f}% of the abstracts contain the deep learning related phrases'.format(resume_screening_df.shape[0]/meta_df.shape[0]*100))
print('{} papers in total'.format(resume_screening_df.shape[0]))

2.08% of the abstracts contain the deep learning related phrases
20 papers in total


In [20]:
# Plain-text preview of the first ten screened resumes, including the keyword
# flag column and the token list added in the screening step.
print(resume_screening_df.head(10))

       Category                                             Resume  \
0  Data Science  skills * programming languages: python (pandas...   
1  Data Science  areas of interest deep learning, control syste...   
2  Data Science  skills â¢ r â¢ python â¢ sap hana â¢ table...   
3  Data Science  education details \r\n b.tech   rayat and bahr...   
4  Data Science  personal skills â¢ ability to quickly grasp t...   
5  Data Science  skills * programming languages: python (pandas...   
6  Data Science  areas of interest deep learning, control syste...   
7  Data Science  skills â¢ r â¢ python â¢ sap hana â¢ table...   
8  Data Science  education details \r\n b.tech   rayat and bahr...   
9  Data Science  personal skills â¢ ability to quickly grasp t...   

   deep_learning                                 tokenized_abstract  
0           True  [skill, programming, language, python, panda, ...  
1           True  [area, interest, deep, learning, control, syst...  
2           True  [

In [21]:
#class for preprocessing and creating word embedding
class Preprocessing:
    """Sentence-level cleaning and TF-IDF vectorisation of a single document.

    The constructor splits the document into sentences; ``doall`` cleans them
    and fits the TF-IDF vectoriser; ``TFIDF_Q`` then projects a query string
    into the same fitted vocabulary.

    Attributes
    ----------
    tokens : list of str
        Sentences produced by ``nltk.sent_tokenize`` from the input text.
    tfidfvectoriser : TfidfVectorizer
        Vectoriser shared by ``TFIDF`` (fit + transform) and ``TFIDF_Q``
        (transform only).
    """

    #constructor
    def __init__(self,txt):
        """Split ``txt`` into sentences and create an unfitted vectoriser."""
        # Only contact the NLTK download server when the punkt models are
        # actually missing; downloading unconditionally costs a network
        # round-trip and log output on every instantiation.
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')  #punkt is nltk tokenizer
        # breaking text to sentences
        self.tokens = nltk.sent_tokenize(txt)
        self.tfidfvectoriser=TfidfVectorizer()

    # Data Cleaning
    def clean_sentence(self, sentence, stopwords=False):
        """Normalise one sentence.

        Lower-cases, strips leading/trailing whitespace, then deletes every
        character that is not ``a-z``, ``0-9`` or whitespace. Interior
        whitespace (including newlines) is kept as is. When ``stopwords`` is
        truthy the result is additionally passed through gensim's
        ``remove_stopwords``.
        """
        sentence = sentence.lower().strip()
        sentence = re.sub(r'[^a-z0-9\s]', '', sentence)
        if stopwords:
            sentence = remove_stopwords(sentence)
        return sentence

    # store cleaned sentences to cleaned_sentences
    def get_cleaned_sentences(self,tokens, stopwords=False):
        """Apply ``clean_sentence`` to each item of ``tokens``; return a list in the same order."""
        return [self.clean_sentence(line, stopwords) for line in tokens]

    #do all the cleaning
    def cleanall(self):
        """Clean ``self.tokens`` twice.

        Returns
        -------
        list
            ``[cleaned_sentences, cleaned_sentences_with_stopwords]`` -- the
            first with stop words removed, the second with them kept.
        """
        cleaned_sentences = self.get_cleaned_sentences(self.tokens, stopwords=True)
        cleaned_sentences_with_stopwords = self.get_cleaned_sentences(self.tokens, stopwords=False)
        return [cleaned_sentences,cleaned_sentences_with_stopwords]

    # TF-IDF Vectorizer
    def TFIDF(self,cleaned_sentences):
        """Fit the vectoriser on ``cleaned_sentences`` and return their TF-IDF vectors."""
        self.tfidfvectoriser.fit(cleaned_sentences)
        tfidf_vectors=self.tfidfvectoriser.transform(cleaned_sentences)
        return tfidf_vectors

    #tfidf for question
    def TFIDF_Q(self,question_to_be_cleaned):
        """Vectorise a single query string with the already-fitted vectoriser.

        ``TFIDF`` (or ``doall``) must have been called first, since this only
        calls ``transform``.
        """
        tfidf_vectors=self.tfidfvectoriser.transform([question_to_be_cleaned])
        return tfidf_vectors

    # main call function
    def doall(self):
        """Run cleaning and TF-IDF fitting.

        Returns
        -------
        list
            ``[cleaned_sentences, cleaned_sentences_with_stopwords, tfidf]``
            where ``tfidf`` is computed from the stop-word-free sentences.
        """
        cleaned_sentences, cleaned_sentences_with_stopwords = self.cleanall()
        tfidf = self.TFIDF(cleaned_sentences)
        return [cleaned_sentences,cleaned_sentences_with_stopwords,tfidf]

In [22]:
class AnswerMe:
    """Similarity / distance measures between a query vector and a sentence vector.

    Both vectors are expected to be 2-D NumPy arrays (``Euclidean`` reads
    ``shape[0]`` and ``shape[1]``).
    """

    #cosine similarity
    def Cosine(self, question_vector, sentence_vector):
        """Cosine similarity of the two vectors.

        Returns ``dot(q, s.T) / (||q|| * ||s||)``. If either vector has zero
        norm the similarity is defined as 0 (same shape as the dot product)
        instead of dividing by zero and yielding NaN, which cannot be ordered
        reliably by the caller.
        """
        dot_product = np.dot(question_vector, sentence_vector.T)
        denominator = np.linalg.norm(question_vector) * np.linalg.norm(sentence_vector)
        if denominator == 0:
            # e.g. a query that shares no term with the fitted vocabulary
            return np.zeros_like(dot_product, dtype=float)
        return dot_product/denominator

    #Euclidean distance
    def Euclidean(self, question_vector, sentence_vector):
        """Euclidean (Frobenius) norm of the difference of the two vectors.

        The operand with fewer rows (``len``) is reshaped with ``np.resize``
        to the other's 2-D shape before subtracting; for equally shaped inputs
        that is a no-op. The inputs are not modified.
        """
        vec1, vec2 = question_vector, sentence_vector
        if len(vec1)<len(vec2):
            vec1, vec2 = vec2, vec1
        vec2 = np.resize(vec2,(vec1.shape[0],vec1.shape[1]))
        return np.linalg.norm(vec1-vec2)

    # main call function
    def answer(self, question_vector, sentence_vector, method):
        """Dispatch on ``method``: ``1`` -> Euclidean distance, anything else -> cosine similarity."""
        if method==1:
            return self.Euclidean(question_vector,sentence_vector)
        return self.Cosine(question_vector,sentence_vector)

In [23]:
def RetrieveAnswer(question_embedding, tfidf_vectors,method=1):
    """Score every sentence vector against the question and return a heap.

    Parameters
    ----------
    question_embedding
        Vector for the question; must provide ``.toarray()``.
    tfidf_vectors
        Iterable of sentence vectors; each item must provide ``.toarray()``.
    method : int, default 1
        Passed to ``AnswerMe.answer``: ``1`` scores with Euclidean distance,
        any other value with cosine similarity.

    Returns
    -------
    list of tuple
        A ``heapq`` min-heap of ``(key, index)`` pairs. For ``method == 1``
        the key is the distance; otherwise it is the negated similarity, so
        in both cases the best match has the smallest key. Only element 0 is
        guaranteed to be the minimum -- consume it with ``heapq.heappop``.
    """
    scorer = AnswerMe()
    # The question vector does not change per sentence, so densify it once.
    question_array = question_embedding.toarray()

    similarity_heap = []
    for index, embedding in enumerate(tfidf_vectors):
        similarity = scorer.answer(question_array, embedding.toarray(), method).mean()
        key = similarity if method==1 else -similarity
        heapq.heappush(similarity_heap, (key, index))

    return similarity_heap

In [24]:
# Free-text query that will be scored against each sentence of the resume.
user_question = "know deep learning and nlp"
# Scoring method handed to RetrieveAnswer: 1 selects Euclidean distance,
# any other value selects cosine similarity.
method = 1

In [25]:
# Search within the first resume. `.iloc[0]` picks the first row by position;
# a plain `[0]` is a label lookup and raises KeyError if the row labelled 0
# was removed by the earlier dropna (the index is not reset there).
txt = meta_df['Resume'].iloc[0]
preprocess = Preprocessing(txt)
cleaned_sentences,cleaned_sentences_with_stopwords,tfidf_vectors = preprocess.doall()

# Clean the question the same way as the sentences the vectoriser was fitted on,
# then project it into that fitted TF-IDF space.
question = preprocess.clean_sentence(user_question, stopwords=True)
question_embedding = preprocess.TFIDF_Q(question)

similarity_heap = RetrieveAnswer(question_embedding , tfidf_vectors ,method)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [26]:
print("Question: ", user_question)

# Number of best-matching sentences to print.
number_of_sentences_to_print = 2

# similarity_heap is a heapq min-heap: only element 0 is guaranteed to be the
# smallest, and list.pop(0) does not restore that property, so any entry after
# the first could come out of rank order. heappop returns entries strictly in
# ascending key order.
while number_of_sentences_to_print>0 and len(similarity_heap)>0:
    x = heapq.heappop(similarity_heap)
    print(cleaned_sentences_with_stopwords[x[1]])
    number_of_sentences_to_print-=1

Question:  know deep learning and nlp
 machine learning regression svm nave bayes knn random forest decision trees boosting techniques cluster analysis word embedding sentiment analysis natural language processing dimensionality reduction topic modelling lda nmf pca  neural nets
 others regular expression html css angular 6 logstash kafka python flask git docker computer vision   open cv and understanding of deep learningeducation details 

data science assurance associate 

data science assurance associate   ernst  young llp
skill details 
javascript  exprience   24 months
jquery  exprience   24 months
python  exprience   24 monthscompany details 
company   ernst  young llp
description   fraud investigations and dispute services   assurance
technology assisted review
tar technology assisted review assists in accelerating the review process and run analytics and generate reports
