## Importing Libraries

In [1]:
import io
import os
import pandas as pd

#NLP Libraries

import nltk
from nltk.corpus import stopwords
#from nltk.tag.stanford import StanfordNERTagger

#Document Reader Libraries

from subprocess import Popen, PIPE
from docx import Document
import pdfminer
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO

from docx import Document
from docx.document import Document as _Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph

#Tf-IDf libraries

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#fuzzy logic

import gensim
from fuzzywuzzy import fuzz



## Convert PDF to Text - Function

In [2]:
#Function to convert pdf to text
def convert_pdf_to_txt(path):
    """Extract the text of a PDF file with pdfminer.

    Parameters
    ----------
    path : str
        Location of the PDF file on disk.

    Returns
    -------
    str
        Everything the ``TextConverter`` wrote while the pages were processed.

    Notes
    -----
    The file handle, the converter device and the in-memory buffer are all
    released even when pdfminer raises part-way through the document.
    """
    rsrcmgr = PDFResourceManager()
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                # Empty page set + maxpages=0: no page restriction is requested
                # (presumably pdfminer's "all pages" convention -- confirm in its docs).
                pages = PDFPage.get_pages(
                    fp,
                    set(),
                    maxpages=0,
                    password="",
                    caching=True,
                    check_extractable=True,
                )
                for page in pages:
                    interpreter.process_page(page)
        finally:
            device.close()
        # Read the buffer only after the device is closed, as the original did.
        return retstr.getvalue()


## Read All Docx Blocks

In [3]:
#Read All Docx
def iter_block_items(parent):
    """
    Yield the paragraph and table children of *parent* in document order.

    *parent* must be either a Document object or a table _Cell; anything
    else raises ValueError as soon as iteration starts. Each direct child
    element that is a paragraph is wrapped in Paragraph, each one that is a
    table is wrapped in Table, and every other kind of child is skipped.
    """
    if isinstance(parent, _Document):
        container = parent.element.body
    else:
        if not isinstance(parent, _Cell):
            raise ValueError("something's not right")
        container = parent._tc

    # XML element class -> python-docx wrapper class, checked in this order.
    wrappers = ((CT_P, Paragraph), (CT_Tbl, Table))
    for node in container.iterchildren():
        for xml_type, wrapper in wrappers:
            if isinstance(node, xml_type):
                yield wrapper(node, parent)
                break

## Convert Document to Text including PDF - Function

In [4]:
#Function to convert doc formats (like .doc, .docx, .odt) to text
def _run_text_converter(cmd):
    """Run an external converter command and return its stdout decoded as ASCII."""
    process = Popen(cmd, stdout=PIPE)
    stdout, _ = process.communicate()
    # Bytes that are not ASCII are dropped rather than raising.
    return stdout.decode('ascii', 'ignore')


def document_to_text(filename, file_path, antiword_cmd='C:/antiword/antiword'):
    """Convert a .doc, .docx, .odt or .pdf document to plain text.

    Parameters
    ----------
    filename : str
        Name used only to pick the converter from its extension
        (matched case-insensitively).
    file_path : str
        Path of the file that is actually read.
    antiword_cmd : str, optional
        Executable used for legacy ``.doc`` files.

    Returns
    -------
    str or None
        The extracted text, or ``None`` when the extension is not one of the
        supported formats.
    """
    lowered_name = filename.lower()

    if lowered_name.endswith(".doc"):
        return _run_text_converter([antiword_cmd, file_path])

    if lowered_name.endswith(".docx"):
        lines = []
        # iter_block_items already hands back each top-level paragraph/table
        # in document order, so use the block itself instead of re-indexing
        # doc.paragraphs / doc.tables on every step.
        for block in iter_block_items(Document(file_path)):
            if isinstance(block, Table):
                for row in block.rows:
                    for cell in row.cells:
                        lines.extend(para.text for para in cell.paragraphs)
            elif isinstance(block, Paragraph):
                lines.append(block.text)
        return '\n'.join(lines)

    if lowered_name.endswith(".odt"):
        return _run_text_converter(['odt2txt', file_path])

    if lowered_name.endswith(".pdf"):
        return convert_pdf_to_txt(file_path)

    # Unsupported extension: callers in this notebook rely on getting None back.
    return None

## Data Cleaner

In [5]:
import pandas as pd
import re

punctuations_list = [
    ',', '?', ':', ';', '!', ')', '(', '\'', '"',
    '*', '/', '<', '>', '{', '}', '[', ']',
]


def cleanStr(mystring):
    """Lower-case and normalise a piece of text for token-based comparison.

    The input is converted with ``str()`` first, so any object is accepted.
    Steps, in order:
      1. lower-case the text;
      2. replace each run of non-ASCII characters with a single space;
      3. put a space on both sides of every mark in ``punctuations_list``;
      4. turn each pair of dots into a dot followed by a space;
      5. collapse all whitespace runs to single spaces and trim the ends.
    """
    lowered = str(mystring).lower()
    ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', lowered)

    # One-pass equivalent of replacing every mark with " <mark> ".
    padding = {ord(mark): " " + mark + " " for mark in punctuations_list}
    padded = ascii_only.translate(padding)

    dedotted = padded.replace("..", ". ")
    return ' '.join(dedotted.split())

## Read Sample CVs & JD for Sno.1 JD

In [36]:
# Read the sample cv1 -JD1
Cv1text=document_to_text('CURRENT_052017_NISHTHASHRIVASTAVA_Resume_c162f248-0fd9-4c0c-97b0-c22782d9cba1_NISHTHASHRIVASTAVA2_1.docx','D:/D drive/DataAnalytics_StudyMaterial/IIMB-BDA/ProjectZ/Iteration2/IIM/CURRENT_052017_NISHTHASHRIVASTAVA_Resume_c162f248-0fd9-4c0c-97b0-c22782d9cba1_NISHTHASHRIVASTAVA2_1.docx')

In [37]:
# Read Sample cv2 - JD1
Cv2text=document_to_text('CURRENT_052017_ASHISHJAIN_Resume_02667350-11d5-471d-b3fd-a0c3d8c0d3e7_ASHISHJAIN2_6.docx','D:/D drive/DataAnalytics_StudyMaterial/IIMB-BDA/ProjectZ/Iteration2/IIM/CURRENT_052017_ASHISHJAIN_Resume_02667350-11d5-471d-b3fd-a0c3d8c0d3e7_ASHISHJAIN2_6.docx')
#text2=document_to_text('test1.doc','D:/D drive/DataAnalytics_StudyMaterial/IIMB-BDA/ProjectZ/Iteration2/Sno1/test1.doc')

In [38]:
#Read sample JD1
JdText=document_to_text('Jd-Sno6.docx','D:/D drive/DataAnalytics_StudyMaterial/IIMB-BDA/ProjectZ/Iteration2/Sno6/Jd-Sno6.docx')
#text3=document_to_text('test3.pdf', 'D:/D drive/DataAnalytics_StudyMaterial/IIMB-BDA/ProjectZ/Iteration2/Sno1/test3.pdf')

In [41]:
JdText

'Role : Application Developer Role Description : Design, develop, and configure software systems to meet market and/or client requirements either end-to-end from analysis, design, implementation, quality assurance (including testing), to delivery and maintenance of the software product or system or for a specific phase of the lifecycle. Apply knowledge of technologies, applications, methodologies, processes and tools to support a client, project or entity. \nMust have Skills : Teradata BI'

In [27]:
cv1=Cv1text.split()
cv2=Cv2text.split()
JD=JdText.split()

In [17]:
documentsJD1=(JdText,Cv1text,Cv2text)

In [42]:
# Bigram-only TF-IDF over (JD, CV1, CV2). Alternatives tried earlier:
# default unigrams, and bigrams with stop_words='english'.
# ngram_range is passed as a tuple, the form scikit-learn documents;
# newer releases reject a list during parameter validation.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 2))
tfidf_matrix1 = tfidf_vectorizer.fit_transform(documentsJD1)

In [43]:
print (tfidf_matrix1.shape)

(3, 1058)


In [44]:
cosine_similarity(tfidf_matrix1[0:1], tfidf_matrix1)

array([[ 1.        ,  0.        ,  0.00302468]])

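From the cosine similarity, the CV of the 1st candidate (Shalini) is closer to the JD than that of the 2nd candidate (Usha),
and this is corroborated by the data, where Shalini was selected and Usha was rejected.
(Note: the output displayed above appears to come from a later run with different CVs loaded, since the cell execution counts are out of order, so it does not show the scores this remark refers to.)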

## Read Sample CVs & JD for Sno.5 JD

In [4]:
# Read the sample cv1 -JD5
Cv1text=document_to_text('CURRENT_052017_LaliteshUpadhyaya_Resume_9db2850b-a607-4a84-899d-1b0f9a0a8459_LaliteshUpadhyaya4_6.docx','D:/D drive/DataAnalytics_StudyMaterial/IIMB-BDA/ProjectZ/Iteration2/Sno5/CURRENT_052017_LaliteshUpadhyaya_Resume_9db2850b-a607-4a84-899d-1b0f9a0a8459_LaliteshUpadhyaya4_6.docx')

# Read Sample cv2 - JD5
Cv2text=document_to_text('CURRENT_032017_HIRALPARMAR_Resume_3e8d6f3d-2b01-4603-8a09-12a840772afc_hiral2_0.pdf','D:/D drive/DataAnalytics_StudyMaterial/IIMB-BDA/ProjectZ/Iteration2/Sno5/CURRENT_032017_HIRALPARMAR_Resume_3e8d6f3d-2b01-4603-8a09-12a840772afc_hiral2_0.pdf')

#Read sample JD5
JdText=document_to_text('Jd-Sno5.docx','D:/D drive/DataAnalytics_StudyMaterial/IIMB-BDA/ProjectZ/Iteration2/Sno5/Jd-Sno5.docx')



PackageNotFoundError: Package not found at 'D:/D drive/DataAnalytics_StudyMaterial/IIMB-BDA/ProjectZ/Iteration2/Sno5/CURRENT_052017_LaliteshUpadhyaya_Resume_9db2850b-a607-4a84-899d-1b0f9a0a8459_LaliteshUpadhyaya4_6.docx'

In [56]:
documentsJD5=(JdText,Cv1text,Cv2text)

In [58]:
tfidf_matrix5 = tfidf_vectorizer.fit_transform(documentsJD5)

In [59]:
print (tfidf_matrix5.shape)

(3, 573)


In [61]:
cosine_similarity(tfidf_matrix5[0:1], tfidf_matrix5)

array([[ 1.        ,  0.2278712 ,  0.18104144]])

From the cosine similarity, the CV of the 1st candidate (Lalitesh) is closer to the JD than that of the 2nd candidate (Hiral).
However, in the data Lalitesh was rejected because he demanded a higher CTC, not because he lacked the relevant skill set.
The rejection reason for this sample therefore lies outside the JD-CV similarity ranking.

In [160]:
# Read the sample Accept cv1 -JD
Cv1text=document_to_text('CURRENT_042017_SethuramKishore Ch_Resume_514570f0-3c55-4382-96f1-65f9a73bf119_Sethuram9_0.doc','D:/D drive/DataAnalytics_StudyMaterial/IIMB-BDA/ProjectZ/Iteration2/IIM/CURRENT_042017_SethuramKishore Ch_Resume_514570f0-3c55-4382-96f1-65f9a73bf119_Sethuram9_0.doc')
Cv2text=document_to_text('CURRENT_032017_DishantPatel_Resume_f88c74e3-8545-4156-a708-8d093885288e_DishantPatel6_0.docx','D:/D drive/DataAnalytics_StudyMaterial/IIMB-BDA/ProjectZ/Iteration2/IIM/CURRENT_032017_DishantPatel_Resume_f88c74e3-8545-4156-a708-8d093885288e_DishantPatel6_0.docx')
Cv3text=document_to_text('CURRENT_032017_umashankar_Resume_77f65cee-4bce-4d83-9103-cc47c019eb96_umashankar11_0 (1).doc','D:/D drive/DataAnalytics_StudyMaterial/IIMB-BDA/ProjectZ/Iteration2/IIM/CURRENT_032017_umashankar_Resume_77f65cee-4bce-4d83-9103-cc47c019eb96_umashankar11_0 (1).doc')

# Read Sample Rejectcv2 - JD
#Cv3text=document_to_text('CURRENT_052017_HarikrishnaPalla_Resume_0b50617e-97f1-4b22-94c1-0fa51548daa2_harikrishnapalla5_0.doc','D:/D drive/DataAnalytics_StudyMaterial/IIMB-BDA/ProjectZ/Iteration2/IIM/CURRENT_052017_HarikrishnaPalla_Resume_0b50617e-97f1-4b22-94c1-0fa51548daa2_harikrishnapalla5_0.doc')
Cv4text=document_to_text('Ganesh[5_0].doc','D:/D drive/DataAnalytics_StudyMaterial/IIMB-BDA/ProjectZ/Iteration2/IIM/Ganesh[5_0].doc')

#Cv5text=document_to_text('CURRENT_052017_RagunathanAnnadurai_Resume_3a1aca2d-751e-45b8-8975-69a310116353_RagunathanAnnadurai3_6.docx','D:/D drive/DataAnalytics_StudyMaterial/IIMB-BDA/ProjectZ/Iteration2/IIM/CURRENT_052017_RagunathanAnnadurai_Resume_3a1aca2d-751e-45b8-8975-69a310116353_RagunathanAnnadurai3_6.docx')
#Read sample JD5
JdText=document_to_text('JD.docx','D:/D drive/DataAnalytics_StudyMaterial/IIMB-BDA/ProjectZ/Iteration2/TestingGrounds/JD.docx')



In [63]:
#documentsJD=(JdText,Cv1text,Cv2text,Cv3text,)
documentsJD=(JdText,Cv1text,Cv2text,Cv3text,Cv4text)#,Cv5text)


In [69]:
# Bigram-only TF-IDF over the JD and the candidate CVs. Alternatives tried
# earlier: default unigrams, and bigrams with stop_words='english'.
# ngram_range is passed as a tuple, the form scikit-learn documents;
# newer releases reject a list during parameter validation.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 2))
tfidf_matrix5 = tfidf_vectorizer.fit_transform(documentsJD)

In [70]:
print (tfidf_matrix5.shape)

(5, 3395)


In [71]:
cosine_similarity(tfidf_matrix5[0:1], tfidf_matrix5)

array([[ 1.        ,  0.026105  ,  0.00608045,  0.02179648,  0.02056105]])

In [22]:
fuzz.ratio(JdText,Cv3text)

1

In [17]:
fuzz.WRatio(JdText,Cv4text)

86

In [23]:
len(set(str(JdText).lower().split()).intersection(set(str(Cv1text).lower().split())))

25

## Word2VecModels

In [10]:
model = gensim.models.KeyedVectors.load_word2vec_format('D:/D drive/DataAnalytics_StudyMaterial/IIMB-BDA/ProjectZ/Iteration2/TestingGrounds/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [6]:
#model2=gensim.models.Word2Vec.load("D:/D drive/DataAnalytics_StudyMaterial/IIMB-BDA/ProjectZ/Iteration2/res2vecModITDomain4CBOW.w2v")
model2=gensim.models.Word2Vec.load("D:/D drive/DataAnalytics_StudyMaterial/IIMB-BDA/ProjectZ/Iteration2/res2vecModITDomain4SG.w2v")

In [7]:
os.chdir('D:/D drive/DataAnalytics_StudyMaterial/IIMB-BDA/ProjectZ/Iteration2')

In [7]:
model3=gensim.models.Word2Vec.load("res2vecModCntntWriter2CBOW.w2v")

In [8]:
def wmd(s1, s2, model=None):
    """Word Mover's Distance between two texts after stop-word removal.

    Parameters
    ----------
    s1, s2 : object
        Texts to compare. Each is passed through ``str()``, lower-cased and
        split on whitespace; English stop words (NLTK list) are removed.
    model : optional
        Object exposing ``wmdistance(tokens1, tokens2)``. Defaults to the
        notebook-level ``model2`` so existing two-argument calls are unchanged.

    Returns
    -------
    Whatever ``model.wmdistance`` returns for the two token lists. In this
    notebook that is a float, and ``inf`` was observed for input whose words
    the model apparently does not know.
    """
    if model is None:
        model = model2
    # A set gives constant-time membership tests; the NLTK call returns a list.
    stop_words = set(stopwords.words('english'))
    tokens1 = [w for w in str(s1).lower().split() if w not in stop_words]
    tokens2 = [w for w in str(s2).lower().split() if w not in stop_words]
    return model.wmdistance(tokens1, tokens2)

In [10]:
from nltk.corpus import stopwords

In [9]:
import pandas as pd
import numpy as np
import glob

In [11]:
os.getcwd()

'D:\\D drive\\DataAnalytics_StudyMaterial\\IIMB-BDA\\ProjectZ\\Iteration2\\Aujas'

In [48]:
os.chdir('D:\\D drive\\DataAnalytics_StudyMaterial\\IIMB-BDA\\ProjectZ\\Iteration2')

In [57]:
JDTxt=cleanStr(document_to_text('Job Description.docx',os.path.join('./TestingGrounds/','JD.docx')))

In [50]:
resume_filenames = sorted(glob.glob(os.path.join('./IIM/',"*.*")))

In [51]:
resume_filenames

['./IIM\\Abhi Kumar.docx',
 './IIM\\Al Gorithm.pdf',
 './IIM\\AmitBats[6_0].doc',
 './IIM\\BalakrishnaReddyS[4_6].docx',
 './IIM\\Bharath-Paturi-CV.pdf',
 './IIM\\CURRENT_022017_Santosh kumartoleti_Resume_f0fb3c53-c308-4f86-b335-b05ebd2cccc9_SantoshkumarToleti2_6.docx',
 './IIM\\CURRENT_032017_BhaktiSubhash Naik_Resume_cf61cd93-ff05-4fc4-85ec-42301b5456f5_BhaktiSubhashNaik4_9.doc',
 './IIM\\CURRENT_032017_Daljeet Singh DuggalDUGGAL_Resume_adb59f9d-a2e4-46b8-abdc-375b61107dc5_DaljeetSinghDuggal6_0.docx',
 './IIM\\CURRENT_032017_DishantPatel_Resume_f88c74e3-8545-4156-a708-8d093885288e_DishantPatel6_0.docx',
 './IIM\\CURRENT_032017_HIRALPARMAR_Resume_3e8d6f3d-2b01-4603-8a09-12a840772afc_hiral2_0.pdf',
 './IIM\\CURRENT_032017_SHALINISHALINI K_Resume_e49f1c7b-d151-44b9-9afe-b774a6852495_SHALINIK2_7.docx',
 './IIM\\CURRENT_032017_SandeeV S_Resume_2cd5e682-f1f0-4d95-9483-57a25f6965c0_SandeeVS4_3.doc',
 './IIM\\CURRENT_032017_SnehlataKumari_Resume_2224bb2b-a3f9-4083-9d5d-5b0a158be3fc_SnehlataK

In [52]:
ResFileArr=np.asarray(resume_filenames)

In [58]:
%%time
# Score every resume against the cleaned JD text; the printed counter is a
# simple progress indicator. The path doubles as the "filename" argument,
# which document_to_text only inspects for its extension.
WMDScore = []
for dfRow, resume_path in enumerate(resume_filenames):
    print(dfRow)
    Cvtxt = cleanStr(document_to_text(resume_path, resume_path))
    WMDScore.append(wmd(JDTxt, Cvtxt))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
Wall time: 4min 45s


In [59]:
WMDScoreArr=np.asarray(WMDScore)

In [60]:
CVJD = pd.DataFrame()
CVJD['ResumeFile']=ResFileArr
CVJD['WMDScore']=WMDScoreArr

In [61]:
# DataFrame.sort was deprecated and later removed from pandas;
# sort_values is the replacement and gives the same ascending order here.
CVJD.sort_values('WMDScore').to_csv('TestAllCV-TeraDataJDScore.csv')

  if __name__ == '__main__':


In [11]:
JDCV_df = pd.read_csv('ResumeSelRejFileData.tsv',sep='\t')

In [12]:
JDCV_df

Unnamed: 0,JD,ResumeSelectedFile,ResumeRejected File
0,"Design, build and configure applications.Must ...",CURRENT_032017_SHALINISHALINI K_Resume_e49f1c7...,CURRENT_052017_usharani_Resume_651d8412-2f0a-4...
1,Role : Application Developer Role Description ...,CURRENT_032017_Daljeet Singh DuggalDUGGAL_Resu...,File_Not_Found
2,Role : Application Developer Role.Description ...,File_Not_Found,File_Not_Found
3,Role : Application Developer Role Description ...,CURRENT_042017_SAGARDEWAN_Resume_1161c5a0-f1f9...,CURRENT_042017_KARANVIRSINGH_Resume_24452b3d-e...
4,Role : Application Developer Role Description ...,CURRENT_032017_HIRALPARMAR_Resume_3e8d6f3d-2b0...,CURRENT_052017_LaliteshUpadhyaya_Resume_9db285...
5,Role : Application Developer Role Description ...,CURRENT_052017_NISHTHASHRIVASTAVA_Resume_c162f...,CURRENT_052017_ASHISHJAIN_Resume_02667350-11d5...
6,Role : Role Description :Role : Role Descripti...,CURRENT_052017_SaranSaran Kumar_Resume_3fa6123...,CURRENT_052017_DineshReddy_Resume_553992ad-15c...
7,Role: Application Developer.Role Description: ...,CURRENT_052017_SaradhiPoluru_Resume_96f4e1db-a...,CURRENT_052017_Shaikh ShadmaWarsi_Resume_af9ac...
8,Role: Application Developer. Role Description:...,CURRENT_052017_VIVEK KUMAR REDDYMUDUGANTI_Resu...,SWETA[2_11].pdf
9,Role: Application Developer.Role Description: ...,CURRENT_052017_SAIKUMARKADA_Resume_a80bc2c6-71...,RajashekharaiahM[4_0].docx


In [15]:
%%time
# For every JD row, compute the WMD between the JD text (column 0) and the
# selected (column 1) / rejected (column 2) resume found under ./IIM/.
# Positional .iloc replaces .ix, which has been removed from pandas, and
# np.nan replaces the np.NaN alias removed in NumPy 2.0.
scoreSel = []
scoreRej = []
for dfRow in range(len(JDCV_df)):
    print(dfRow)
    if dfRow == 6:
        # Row 6 is skipped on purpose and recorded as missing
        # (presumably because its JD text is malformed -- TODO confirm).
        scoreSel.append(np.nan)
        scoreRej.append(np.nan)
    else:
        JdText = JDCV_df.iloc[dfRow, 0]
        selected_file = JDCV_df.iloc[dfRow, 1]
        rejected_file = JDCV_df.iloc[dfRow, 2]
        CvSeltxt = cleanStr(document_to_text(selected_file, os.path.join('./IIM/', selected_file)))
        CvRejtxt = cleanStr(document_to_text(rejected_file, os.path.join('./IIM/', rejected_file)))
        scoreSel.append(wmd(JdText, CvSeltxt))
        scoreRej.append(wmd(JdText, CvRejtxt))
    
    

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
Wall time: 32.8 s


In [179]:
# Score a single JD row by hand and append the result to the running lists.
# NOTE(review): this appends to scoreSel/scoreRej, so re-running it (or running
# it after the full loop) makes the lists longer than JDCV_df -- confirm intent.
# Positional .iloc replaces .ix, which has been removed from pandas.
dfRow = 22
JdText = JDCV_df.iloc[dfRow, 0]
selected_file = JDCV_df.iloc[dfRow, 1]
rejected_file = JDCV_df.iloc[dfRow, 2]
CvSeltxt = cleanStr(document_to_text(selected_file, os.path.join('./IIM/', selected_file)))
CvRejtxt = cleanStr(document_to_text(rejected_file, os.path.join('./IIM/', rejected_file)))
scoreSel.append(wmd(JdText, CvSeltxt))
scoreRej.append(wmd(JdText, CvRejtxt))

In [14]:
scoreSel.append(np.NaN)
scoreRej.append(np.NaN)

In [16]:
ScrSelArr=np.asarray(scoreSel)
ScrRejArr=np.asanyarray(scoreRej)

In [17]:
JDCV_df['ScoreSel']=ScrSelArr
JDCV_df['ScoreRej']=ScrRejArr

In [18]:
JDCV_df.to_csv("Word2VecITDomainScore_SG1.csv")

In [207]:
JdText='Role : Tester  Role Description : Provide expertise in the planning, constructing and execution of test scripts. Apply business and functional knowledge including testing standards, guidelines, and testing methodology to meet the teams overall test objectives. Ensure all testing results are easily accessible and understandable. Track defects to closure and keep defect repository up-to-date.  Must have Skills : TOSCA Testsuite  Good to Have Skills :  Test Automation  Job Description Additional Comments :'

In [208]:
Cv1text=cleanStr(document_to_text('CURRENT_122016_ShikhaSelot_Resume_834060f4-f4c0-498b-9b20-f1da5911bb52_ShikhaSelot5_0.docx',os.path.join('./IIM/','CURRENT_122016_ShikhaSelot_Resume_834060f4-f4c0-498b-9b20-f1da5911bb52_ShikhaSelot5_0.docx')))

In [209]:
wmd(JdText,Cv1text)

0.20460103679273195

In [145]:
wmd("I love India","Indu")

inf

In [163]:
wmd(JdText,Cv3text)

0.7996079282793358

In [164]:
wmd(JdText,Cv4text)

0.8738800347081267

In [123]:
wmd(JdText,Cv5text)

1.8100193399270326

In [89]:
wmd("", "")

2.943863868713379

In [90]:
import glob

In [92]:
import codecs