In [1]:
import io
import os
import pandas as pd
import re
import numpy as np
import glob
from math import *

#NLP Libraries

import nltk
from nltk.corpus import stopwords

#Document Conversion Libraries
from subprocess import Popen, PIPE
import pdfminer
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO

from docx import Document
from docx.document import Document as _Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph

#Tf-IDf libraries

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#fuzzy logic
import gensim
from fuzzywuzzy import fuzz



## Convert PDF to Text

In [2]:
#Function to convert pdf to text
def convert_pdf_to_txt(path):
    """Extract the text of a PDF file with pdfminer.

    Parameters
    ----------
    path : str
        Location of the PDF file; it is opened in binary mode.

    Returns
    -------
    str
        Everything the ``TextConverter`` wrote while the pages yielded by
        ``PDFPage.get_pages`` were processed.

    Notes
    -----
    The file handle and the converter are released even when a page fails
    to parse; the exception itself still propagates to the caller.
    """
    rsrcmgr = PDFResourceManager()
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                # Same arguments as before: empty page-number set, maxpages=0,
                # empty password, caching on, extractability check on.
                pages = PDFPage.get_pages(
                    fp,
                    set(),
                    maxpages=0,
                    password="",
                    caching=True,
                    check_extractable=True,
                )
                for page in pages:
                    interpreter.process_page(page)
        finally:
            device.close()
        # Read the buffer only after the device is closed, as the original did.
        return retstr.getvalue()

## Iterate over all .docx blocks (paragraphs and tables)

In [3]:
#Read All Docx
def iter_block_items(parent):
    """
    Yield the paragraph and table children of *parent* in document order.

    *parent* must be either a Document object or a _Cell object (a cell can
    itself hold paragraphs and tables). Every yielded value is a Paragraph
    or a Table wrapping the corresponding child element; children of any
    other kind are skipped. Because this is a generator, the ValueError for
    an unsupported *parent* is raised when iteration starts.
    """
    # Pick the XML element whose direct children will be walked.
    if isinstance(parent, _Cell):
        container = parent._tc
    elif isinstance(parent, _Document):
        container = parent.element.body
    else:
        raise ValueError("something's not right")

    for element in container.iterchildren():
        if isinstance(element, CT_Tbl):
            yield Table(element, parent)
        elif isinstance(element, CT_P):
            yield Paragraph(element, parent)

## Convert Documents (including PDFs) to Text

In [4]:
# Function to convert document formats (.doc, .docx, .odt, .pdf) to text
def document_to_text(filename, file_path):
    """Convert a .doc, .docx, .odt or .pdf document to plain text.

    Parameters
    ----------
    filename : str
        Name used only to detect the format from its extension. The
        comparison is case-insensitive, so ``.PDF`` or ``.DOCX`` are accepted.
    file_path : str
        Location of the file that is actually read.

    Returns
    -------
    str or None
        The extracted text, or ``None`` when the extension is not one of the
        four supported formats.

    Notes
    -----
    * ``.doc`` runs the external ``C:/antiword/antiword`` executable and
      ``.odt`` runs ``odt2txt``; their stdout is decoded as ASCII and
      undecodable bytes are dropped.
    * ``.docx`` text is gathered from the top-level paragraphs and tables in
      document order; table text is taken cell by cell, row by row.
    * ``.pdf`` is delegated to ``convert_pdf_to_txt``.
    """
    extension_source = filename.lower()

    if extension_source.endswith(".doc"):
        return _run_text_converter(['C:/antiword/antiword', file_path])

    if extension_source.endswith(".docx"):
        lines = []
        # Use each yielded block directly instead of re-indexing
        # doc.tables / doc.paragraphs with hand-maintained counters.
        for block in iter_block_items(Document(file_path)):
            if isinstance(block, Table):
                for row in block.rows:
                    for cell in row.cells:
                        lines.extend(para.text for para in cell.paragraphs)
            elif isinstance(block, Paragraph):
                lines.append(block.text)
        return '\n'.join(lines)

    if extension_source.endswith(".odt"):
        return _run_text_converter(['odt2txt', file_path])

    if extension_source.endswith(".pdf"):
        return convert_pdf_to_txt(file_path)

    # Unsupported format: keep the historical behaviour of returning None.
    return None


def _run_text_converter(cmd):
    """Run an external converter command and return its stdout as ASCII text."""
    process = Popen(cmd, stdout=PIPE)
    stdout, _ = process.communicate()
    return stdout.decode('ascii', 'ignore')

## Data Cleaner

In [5]:
punctuations_list = [',', '?', ':', ';', '!', ')', '(', '\'', '"', '*', '/', '<', '>', '{', '}', '[', ']']


def cleanStr(mystring):
    """Normalise free text into a lower-case, single-spaced string.

    The input is passed through ``str()`` and lower-cased, every run of
    non-ASCII characters becomes one space, each character listed in
    ``punctuations_list`` is surrounded by spaces so it stands alone as a
    token, each pair of consecutive dots becomes a dot plus a space, and
    finally all whitespace runs are collapsed and the ends are trimmed.
    """
    # Lower-case first, then blank out anything outside the ASCII range.
    text = re.sub(r'[^\x00-\x7F]+', ' ', str(mystring).lower())

    # Pad every listed punctuation mark with one space on each side.
    padding_table = str.maketrans({mark: " " + mark + " " for mark in punctuations_list})
    text = text.translate(padding_table)

    # Two adjacent dots -> one dot followed by a space.
    text = text.replace("..", ". ")

    # Collapse whitespace runs and strip leading/trailing whitespace.
    return ' '.join(text.split())


## Load Word2Vec Trained Model

In [73]:
# Load the pre-trained Word2Vec model that wmd() reads from the global `model2`.
# Judging by the file names, model2 is the skip-gram ("SG") variant and the
# commented-out model1 is the CBOW variant — TODO confirm.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
#model1=gensim.models.Word2Vec.load("D:/D drive/DataAnalytics_StudyMaterial/IIMB-BDA/ProjectZ/Iteration2/res2vecModITDomain4CBOW.w2v")
model2=gensim.models.Word2Vec.load("D:/D drive/DataAnalytics_StudyMaterial/IIMB-BDA/ProjectZ/Iteration2/res2vecModITDomain4SG.w2v")

In [7]:
# Switch the working directory to the data folder; the scoring cell walks '.'
# and therefore resolves every JD/CV path relative to this location.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
os.chdir("D:/D drive/DataAnalytics_StudyMaterial/IIMB-BDA/ProjectZ/Iteration2/Aujas")

## Define Word Mover's Distance (WMD)

In [72]:
def wmd(s1, s2, model=None):
    """Word Mover's Distance between two texts after stop-word removal.

    Parameters
    ----------
    s1, s2 : object
        Texts to compare. Each is converted with ``str()``, lower-cased and
        split on whitespace; English stop words (NLTK list) are then dropped.
    model : object, optional
        Any object exposing ``wmdistance(tokens_a, tokens_b)``. When omitted,
        the module-level ``model2`` is used, which matches the previous
        behaviour; pass another loaded model to score with it instead.

    Returns
    -------
    The value returned by ``model.wmdistance`` for the two token lists.
    """
    if model is None:
        model = model2

    # A set gives constant-time membership tests; the NLTK call returns a list.
    stop_words = set(stopwords.words('english'))

    tokens_1 = [word for word in str(s1).lower().split() if word not in stop_words]
    tokens_2 = [word for word in str(s2).lower().split() if word not in stop_words]
    return model.wmdistance(tokens_1, tokens_2)

## Read the job descriptions (JDs) and CVs, then score each CV against its JD

In [74]:
%%time
# Parallel accumulators: one entry per scored CV.
JD = []
CVFilePath = []
CVScore = []
profile = []

# The position of a directory in the os.walk sequence decides its role:
# position 0 is the working directory itself and is skipped, odd positions
# are read as job-description folders, even positions as CV folders.
# NOTE(review): this presumably relies on every profile folder holding a
# "Job Description.docx" plus exactly one sub-folder of CVs — confirm layout.
itr = 0
for dirname, dirnames, filenames in os.walk('.'):
    if itr > 0:
        if itr % 2 == 1:
            # Job-description folder: remember its text for the CV folder that follows.
            JDText = document_to_text("Job Description.docx", os.path.join(dirname, "Job Description.docx"))
        else:
            # CV folder: score every file that has an extension against the last JD read.
            for cv_path in glob.glob(os.path.join(dirname, "*.*")):
                JD.append(JDText)
                CVFilePath.append(cv_path)
                # Text between the first and the last backslash of the path.
                profile.append(re.findall(r"\\(.*)\\", cv_path))
                # The path doubles as the "filename" used for format detection.
                Cvtxt = cleanStr(document_to_text(cv_path, cv_path))
                CVScore.append(wmd(JDText, Cvtxt))
    itr += 1
    

Wall time: 13.7 s


In [75]:
# Turn the four parallel result lists into NumPy arrays in one step.
JDArr, CVFilePathArr, CVScoreArr, profileArr = (
    np.asarray(values) for values in (JD, CVFilePath, CVScore, profile)
)

In [76]:
# Two independent, initially empty result tables: one per Word2Vec variant.
finalCBOW, finalSG = pd.DataFrame(), pd.DataFrame()

In [70]:
# Attach the result arrays to finalCBOW as columns, in display order.
for column_name, column_values in (
    ('JD', JDArr),
    ('Profile', profileArr),
    ('CVFilePath', CVFilePathArr),
    ('CVScore', CVScoreArr),
):
    finalCBOW[column_name] = column_values


In [77]:
# Attach the result arrays to finalSG as columns, in display order.
for column_name, column_values in (
    ('JD', JDArr),
    ('Profile', profileArr),
    ('CVFilePath', CVFilePathArr),
    ('CVScore', CVScoreArr),
):
    finalSG[column_name] = column_values

In [83]:
# Show the CBOW scores grouped by profile, lowest distance first.
# DataFrame.sort() is deprecated and absent from newer pandas releases;
# sort_values() is its replacement and likewise returns a sorted copy.
finalCBOW.sort_values(['Profile','CVScore'])

  if __name__ == '__main__':


Unnamed: 0,JD,Profile,CVFilePath,CVScore
2,Job Description\n\nStrong programming skills i...,Python\Resumes,.\Python\Resumes\Santosh Kumar.pdf,0.449134
0,Job Description\n\nStrong programming skills i...,Python\Resumes,.\Python\Resumes\Aayush Tandon.pdf,0.470372
3,Job Description\n\nStrong programming skills i...,Python\Resumes,.\Python\Resumes\Sumit Sharma.pdf,0.550927
4,Job Description\n\nStrong programming skills i...,Python\Resumes,.\Python\Resumes\Susheel George Chandy.pdf,0.560237
1,Job Description\n\nStrong programming skills i...,Python\Resumes,.\Python\Resumes\Anand Vyas.pdf,0.636559
5,Job Description:\n\nNeed to have familiarity w...,Qradar\Resumes,.\Qradar\Resumes\Aayush Jain.doc,0.10252
8,Job Description:\n\nNeed to have familiarity w...,Qradar\Resumes,.\Qradar\Resumes\RAHUL SHARMA.doc,0.105573
6,Job Description:\n\nNeed to have familiarity w...,Qradar\Resumes,.\Qradar\Resumes\Ankur Joshi.doc,0.128946
7,Job Description:\n\nNeed to have familiarity w...,Qradar\Resumes,.\Qradar\Resumes\Gaurav Singh Chauhan.doc,0.16393
13,Job Description\n\nDesired Profile:\n\nFamilia...,SOC L1\Resumes,.\SOC L1\Resumes\Saranshnagaich.pdf,0.163275


In [84]:
# Show the skip-gram scores grouped by profile, lowest distance first.
# DataFrame.sort() is deprecated and absent from newer pandas releases;
# sort_values() is its replacement and likewise returns a sorted copy.
finalSG.sort_values(['Profile','CVScore'])

  if __name__ == '__main__':


Unnamed: 0,JD,Profile,CVFilePath,CVScore
2,Job Description\n\nStrong programming skills i...,Python\Resumes,.\Python\Resumes\Santosh Kumar.pdf,1.068819
4,Job Description\n\nStrong programming skills i...,Python\Resumes,.\Python\Resumes\Susheel George Chandy.pdf,1.173408
0,Job Description\n\nStrong programming skills i...,Python\Resumes,.\Python\Resumes\Aayush Tandon.pdf,1.1814
1,Job Description\n\nStrong programming skills i...,Python\Resumes,.\Python\Resumes\Anand Vyas.pdf,1.201991
3,Job Description\n\nStrong programming skills i...,Python\Resumes,.\Python\Resumes\Sumit Sharma.pdf,1.214277
7,Job Description:\n\nNeed to have familiarity w...,Qradar\Resumes,.\Qradar\Resumes\Gaurav Singh Chauhan.doc,0.608879
5,Job Description:\n\nNeed to have familiarity w...,Qradar\Resumes,.\Qradar\Resumes\Aayush Jain.doc,0.632315
6,Job Description:\n\nNeed to have familiarity w...,Qradar\Resumes,.\Qradar\Resumes\Ankur Joshi.doc,0.6861
8,Job Description:\n\nNeed to have familiarity w...,Qradar\Resumes,.\Qradar\Resumes\RAHUL SHARMA.doc,0.689415
10,Job Description\n\nDesired Profile:\n\nFamilia...,SOC L1\Resumes,.\SOC L1\Resumes\Naveen Sunori.docx,0.536846


In [64]:
# Scratch check of the profile-extraction regex on the first collected CV path.
mystring = CVFilePath[0]

# Greedy capture: everything between the first and the last backslash.
match1 = re.findall(r"\\(.*)\\", mystring)

print(match1)
print(mystring)

['Python\\Resumes']
.\Python\Resumes\Aayush Tandon.pdf


In [138]:
# Scratch check: capture the text enclosed by [' and '] inside a longer string.
stri = "foobar['InfoNeeded'],]"
bracket_pattern = re.compile(r"^.*\['(.*)'\].*$")
match = bracket_pattern.match(stri)
print(match.group(1))

InfoNeeded
