In [1]:
import nltk
import docx
import numpy as np
import pandas as pd
import PyPDF2
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

#variables to hold the list of terms on corpus and document names
corpus = []
index_arr = []

In [2]:
#Helper function to read document from disk and return the text
def read_docx_document(filepath):
    doc = docx.Document(filepath)
    #Merged paragraphs into a single text
    text = ""
    for i in doc.paragraphs:
        text += " "+ i.text
    return text

In [3]:
def read_pdf_document(file_path):
    pdfFile = open(file_path, 'rb')
    pdfReader = PyPDF2.PdfFileReader(pdfFile)
    text = ""
    for i in range(pdfReader.numPages):
        page = pdfReader.getPage(i)
        text = text + page.extractText()
    
    return text

In [4]:
text = read_pdf_document("Resume/Luminance Fund_Saleem Lalani_Research Associate.pdf")

In [5]:
#Tokenizez the words in the text
def tokenize_words(text):
    return nltk.word_tokenize(text)

In [6]:
#Remove stopwords
def remove_stop_words(word_tokens):
    stop_words = set(stopwords.words('english')) 
    filtered_words = [w for w in word_tokens if not w in stop_words] 
    filtered_words = [] 
  
    for w in word_tokens: 
        if w not in stop_words: 
            filtered_words.append(w) 
    return filtered_words

In [7]:
#Checking for alpha numeric words
def check_alphanumberic_words(tokens):
    return [word for word in tokens if word.isalpha()]

In [8]:
#Stemming words
def apply_stemmer(words):
    porter = PorterStemmer()
    return [porter.stem(word) for word in words]

In [9]:
def plot_frequency_distribution(stemmed_words):
    english_stop_words= stopwords.words('english')
    clean_tokens = stemmed_words[:]
    for token in stemmed_words:
        if token in english_stop_words:
            clean_tokens.remove(token)

    freq = nltk.FreqDist(clean_tokens)
    freq.plot(20, cumulative=False)

In [10]:
def add_documents(stemmed, document_name):
    index_arr.append(document_name)
    corpus.append(" ".join(stemmed))

In [11]:
def preprocess_document(doc_name, doc_type):
    file_path = "Resume/" + doc_name
    if (doc_type == "docx"):
        text = read_docx_document(file_path)
    if (doc_type == "pdf"):
        text = read_pdf_document(file_path)
    tokens = tokenize_words(text)
    words = check_alphanumberic_words(tokens)
    words = remove_stop_words(words)
    stemmed = apply_stemmer(words)
    add_documents(stemmed, doc_name)
    return stemmed
    

In [12]:
#Read document
#terms = preprocess_document('1Amy.docx')
#Plotting frequency distribution of the words in the text
#plot_frequency_distribution(terms)

In [13]:
#Read all other documents 
import os

dir = "Resume"
for filename in os.listdir(dir):
    if filename.endswith(".docx"):
        terms = preprocess_document(filename, "docx")
    if filename.endswith(".pdf"):
        terms = preprocess_document(filename, "pdf")




In [15]:
#Checking the output based on input of the text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the Document vector space Matrix
vectorizer = TfidfVectorizer()

#Sample query
text = read_docx_document("Job Description/CDL - EVP Head of Asset Mgt.docx")
tokens = tokenize_words(text)
words = check_alphanumberic_words(tokens)
words = remove_stop_words(words)
stemmed = apply_stemmer(words)
add_documents(stemmed, "Query")

X = vectorizer.fit_transform(corpus)
print(X.shape)
doc_term_matrix = X.todense()
tf_idf_data = pd.DataFrame(doc_term_matrix, 
                 columns=vectorizer.get_feature_names(), 
                index=index_arr)


(20, 2153)


In [16]:
#Vector space matrix
tf_idf_data

Unnamed: 0,ab,abil,abl,abreast,abu,academ,academi,acca,accept,access,...,yusen,zana,zhong,zhongtai,zoolog,òmasó,ﬁalfrﬂ,ﬁcpaaustraliaﬂ,ﬁpwcﬂ,ﬁsmeﬂ
180517_Vasanthi Kasinathan.docx,0.0,0.0,0.0,0.0,0.0,0.0,0.037075,0.037075,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1Amy.docx,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bank of East Asia_Tracey Teo_Compliance Head.docx,0.017794,0.0,0.0,0.013607,0.0,0.0,0.0,0.017794,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
eFinancialCareers_TT - CV.docx,0.0,0.01536,0.0,0.01421,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.042279,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Lion Global_Ivy Choo_Investment Compliance Assistant Manager.docx,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LT CV 201608.docx,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.048665,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Luminance Fund_Saleem Lalani_Research Associate.pdf,0.0,0.0,0.0,0.0,0.022414,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Phillip Capital_Loh Pei Shang_Compliance Manager.docx,0.0,0.0,0.0,0.0,0.0,0.040192,0.0,0.0,0.044542,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PWC_Olivia Peter_Regulatory Manager.pdf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.110063,0.110063,0.110063,0.220127
PWC_Penny Lim_Risk AM.pdf,0.031639,0.0,0.0,0.0,0.0,0.028549,0.0,0.0,0.0,0.031639,...,0.0,0.0,0.0,0.0,0.035994,0.0,0.0,0.0,0.0,0.0


In [17]:
# Compute Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity
print(cosine_similarity(tf_idf_data, tf_idf_data))


[[1.         0.22781665 0.27656424 0.49049022 0.18226125 0.45815957
  0.17580971 0.31026364 0.02570945 0.26563687 0.26387358 0.31428464
  0.34994099 0.20240821 0.22084852 0.26018465 0.         0.
  0.27980362 0.26973153]
 [0.22781665 1.         0.13738848 0.22092328 0.18629205 0.22439908
  0.09651579 0.17526032 0.01455058 0.15715015 0.21251374 0.17656558
  0.24216271 0.1435531  0.15257266 0.15593303 0.         0.
  0.13769547 0.08714978]
 [0.27656424 0.13738848 1.         0.3463108  0.34691438 0.28182326
  0.2441229  0.36161075 0.0183576  0.3175193  0.37699591 0.52508267
  0.25114772 0.2928571  0.39251182 0.27769922 0.         0.
  0.43600127 0.22818025]
 [0.49049022 0.22092328 0.3463108  1.         0.26525064 0.49976291
  0.31592567 0.34895746 0.03524549 0.33464383 0.29268229 0.43778084
  0.3590289  0.27513557 0.31944058 0.25961518 0.         0.
  0.34181434 0.25741865]
 [0.18226125 0.18629205 0.34691438 0.26525064 1.         0.26780604
  0.16900065 0.36838013 0.02129057 0.22452846 0.

In [24]:
#cluster the documents using kmenas
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns; sns.set() 
from sklearn.datasets.samples_generator import make_blobs

kmean=KMeans(n_clusters=3)
kmean.fit(tf_idf_data)
kmean.labels_


array([0, 0, 1, 0, 2, 0, 1, 2, 2, 2, 1, 1, 0, 1, 1, 1, 2, 2, 1, 0])