In [1]:
# -*- coding: utf-8 -*-
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
from docx import Document
from RAKE import rake
import unicodedata
import csv
import pandas as pd
import itertools
import numpy as np
from Tkinter import *

In [2]:
# Single Porter stemmer instance shared by stem_tokens().
stemmer = nltk.stem.porter.PorterStemmer()
# Translation table: every ASCII punctuation code point maps to None, which
# tells translate() to delete that character (see normalize()).
remove_punctuation_map = {ord(symbol): None for symbol in string.punctuation}

In [3]:
def convertFormat(text):
    """Return ``text`` encoded as ASCII, dropping anything that has no ASCII form.

    The text is first NFKD-normalized, then encoded with the ``'ignore'``
    error handler so characters that are still non-ASCII are silently removed.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore')
    return ascii_only

def stem_tokens(tokens):
    """Return a list with the stem of every entry in ``tokens``, in order.

    Stemming is delegated to the module-level ``stemmer``.
    """
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed

def normalize(text):
    """Convert raw text into a list of stemmed, lower-cased word tokens.

    Steps, in order: lower-case the text, delete punctuation through
    ``remove_punctuation_map``, tokenize with ``nltk.word_tokenize`` and stem
    the tokens with ``stem_tokens``.
    """
    lowered = text.lower()
    without_punctuation = lowered.translate(remove_punctuation_map)
    words = nltk.word_tokenize(without_punctuation)
    return stem_tokens(words)

def cosine_sim(text1, text2):
    """Return the TF-IDF similarity between two texts, or 0 on any failure.

    Both texts are vectorized together with a ``TfidfVectorizer`` that uses
    ``normalize`` as tokenizer and the built-in English stop-word list; the
    result is the off-diagonal entry of the product of the two document
    vectors with their transpose.
    NOTE(review): this equals cosine similarity only if the vectorizer
    L2-normalizes its rows (presumably its default) -- confirm.

    Any exception raised while vectorizing (for example a non-text input) is
    swallowed and reported as a similarity of 0.
    """
    try:
        tfidf_model = TfidfVectorizer(tokenizer=normalize, stop_words='english')
        doc_vectors = tfidf_model.fit_transform([text1, text2])
        pairwise = (doc_vectors * doc_vectors.T).A
        return pairwise[0, 1]
    except Exception:
        # Best effort: callers treat an unscorable pair as "no similarity".
        return 0

def getText(filename):
    """Return the text of a .docx file, one paragraph per line.

    The file is opened with ``docx.Document`` and the ``text`` of each entry
    in ``doc.paragraphs`` is joined with newline characters.
    """
    document = Document(filename)
    return '\n'.join([paragraph.text for paragraph in document.paragraphs])

def joinElemenets(items):
    """Join the strings in ``items`` into one string separated by single spaces."""
    separator = ' '
    return separator.join(items)

def getKeywords(text):
    """Run RAKE keyword extraction over ``text`` and return whatever it yields.

    The extractor is built from the stop-word file ``RAKE/SmartStoplist.txt``
    and the tuning arguments ``3, 3, 1``.
    NOTE(review): presumably these are minimum characters per word, maximum
    words per phrase and minimum keyword frequency -- confirm against the
    bundled RAKE module.
    """
    extractor = rake.Rake("RAKE/SmartStoplist.txt", 3, 3, 1)
    keywords = extractor.run(text)
    return keywords

def printProfile(profile):
    """Print the main fields of one candidate profile, one field per line.

    Args:
        profile: mapping indexed by column name (a dict or one row of the
            profiles table).

    The location and previous-title fields are read under the column names the
    profiles table actually uses (``'Current Location'``, ``'Previous title'``).
    The earlier spellings ``'Location'`` and ``'Previous Title'`` are still
    accepted as a fallback.  The last field is followed by a blank line.

    Raises:
        KeyError: if a field is missing under every accepted name.
    """
    def lookup(*names):
        # Return the value stored under the first of ``names`` found in profile.
        for name in names:
            if name in profile:
                return profile[name]
        raise KeyError(names[0])

    # Single-argument print(...) behaves the same as a statement and as a function.
    print(profile['Name'])
    print(profile['Title'])
    print(lookup('Current Location', 'Location'))
    print(profile['Experience'])
    print(profile['Pay rate'])
    print(profile['Last Updated'])
    print(profile['Relocate'])
    # Reproduces the original ``print value, "\n"`` output: value, space, blank line.
    print('%s \n' % (lookup('Previous title', 'Previous Title'),))
    
def getCosineSim(PDlist, JD):
    """Score every profile text against one job description.

    Args:
        PDlist: iterable of profile texts.
        JD: job-description text compared against each profile.

    Returns:
        numpy array holding ``cosine_sim(profile, JD)`` for each entry of
        ``PDlist``, in the same order (an empty array for an empty input).
    """
    # A list comprehension always hands np.array() a concrete list; a lazy
    # ``map`` object would instead be wrapped as a single 0-d object.
    return np.array([cosine_sim(profile_text, JD) for profile_text in PDlist])

def getTopNProfiles(n, similarities=None):
    """Return the positions of the ``n`` highest similarity scores, best first.

    Args:
        n: number of positions wanted; values <= 0 give an empty result.
        similarities: optional array-like of scores.  When omitted the
            module-level ``sims`` array is used, as in earlier versions.

    Returns:
        numpy integer array with at most ``n`` positions into the score array,
        ordered from the highest score to the lowest.
    """
    scores = sims if similarities is None else np.asarray(similarities)
    order = scores.argsort()
    if n <= 0:
        # ``order[-0:]`` would slice the *whole* array, so return nothing explicitly.
        return order[:0]
    return order[-n:][::-1]

## Load profiles

In [4]:
# Load the candidate profiles and replace missing values in the two optional
# columns with placeholder text.
PD = pd.read_csv('data/DataMiner-PD-filled.csv')
PD.loc[PD["Pay rate"].isnull(), 'Pay rate'] = "negotiable | negotiable"
PD.loc[PD["Previous title"].isnull(), 'Previous title'] = "N/A"

In [5]:
# One space-joined text per profile row, in row order.
PDlist = [joinElemenets(row) for row in PD.values.tolist()]

## For .docx files

In [9]:
# Load the sample job description and reduce it to plain ASCII text.
text = convertFormat(getText("data/SampleJD.docx"))
# Replace the raw text with the entries returned by getKeywords().
text = getKeywords(text)
# Keep only the first field of each entry (presumably the keyword phrase of a
# (phrase, score) pair -- confirm against RAKE).  Iterating directly gives an
# empty string when no keywords were found, where ``zip(*text)[0]`` raised
# IndexError, and does not depend on zip() returning a list.
JD = ' '.join(entry[0] for entry in text)

In [10]:
# Score every profile against the job description, then keep the positions of
# the ten best matches (getTopNProfiles reads the global ``sims`` set here).
sims = getCosineSim(PDlist=PDlist, JD=JD)
topCandidates = getTopNProfiles(n=10)

In [11]:
# ``topCandidates`` holds row positions (argsort output), so select by position.
# ``.ix`` is deprecated and no longer exists in pandas 1.0+.
PD.iloc[topCandidates, :]

Unnamed: 0,URL,Name,Title,Current Location,Experience,Pay rate,Last Updated,Relocate,Previous title
379,https://employer.dice.com/ows/detailFacade.htm...,kaushik Reddy,Desired: Sr. Tableau BI Developer/ Sr. Data An...,"Chicago, IL",8 years exp.,"$110,000/yr | $55/hr",12/29/2016,Willing to relocate,Previous Title: Sr. Tableau BI Developer/ Sr. ...
270,https://employer.dice.com/ows/detailFacade.htm...,Charles Gamkong,Desired: Spotfire Tableau Data Scientist Busin...,"Dallas, TX",10 years exp.,negotiable | negotiable,1/19/2017,Not willing to relocate,Previous Title: Sr Business Intelligence / Ent...
2075,https://employer.dice.com/ows/detailFacade.htm...,Srikanth Tabris,Desired: Sr. Tableau Developer/Data Analyst,"North San Jose, CA",8 years exp.,negotiable | negotiable,12/23/2016,Willing to relocate,Previous Title: Tableau/ SAS Developer
230,https://employer.dice.com/ows/detailFacade.htm...,Giedre Mickeviciute,"Desired: Software Development Manager, IT proj...","Concord, MA",17 years exp.,negotiable | negotiable,1/9/2017,Not willing to relocate,Previous Title: Founder and Business Developme...
1973,https://employer.dice.com/ows/detailFacade.htm...,Vishwa Mannem,Desired: Sr. SQL Server Database / BI Developer,"Sacramento, CA",9 years exp.,negotiable | negotiable,1/20/2017,Willing to relocate,Previous Title: SQL SERVER / ORACLE DEVELOPER
257,https://employer.dice.com/ows/detailFacade.htm...,NAVEEN KUMAR POOJARI,Desired: Oracle / ETL Developer ; Systems Anal...,"Atlanta, GA",9 years exp.,negotiable | negotiable,1/20/2017,Willing to relocate,Previous Title: Oracle /ETL(Informatica) Devel...
1949,https://employer.dice.com/ows/detailFacade.htm...,RENJITH PEEDIACKAL,Desired: Business Data Analyst,"Wilmington, DE",10 years exp.,"$90,000/yr | $45/hr",1/17/2017,Willing to relocate,Previous Title: Data Analysis Consultant
1215,https://employer.dice.com/ows/detailFacade.htm...,SHILPI SRIVASTAVA,"Desired: Project Manager, Software Engineer, S...","Katy, TX",10 years exp.,"$24,000/yr | $12/hr",1/3/2017,Willing to relocate,
1326,https://employer.dice.com/ows/detailFacade.htm...,Anumita Pal,Desired: Sr. Tableau Developer /Sr. Data Analyst,"Seattle, WA",9 years exp.,negotiable | negotiable,1/20/2017,Willing to relocate,Previous Title: MSBI Analyst
1769,https://employer.dice.com/ows/detailFacade.htm...,Ram V,"Desired: Director,Project Manager ,Delivery Ma...","Norwood, NJ",20+ years exp.,"$137,800/yr | $65/hr",1/12/2017,Willing to relocate,Previous Title: Customer Support Engineer


## UI application

In [23]:
# Best-matching profile; positional lookup because topCandidates are row
# positions (``.ix`` is deprecated and no longer exists in pandas 1.0+).
PD.iloc[topCandidates[0]]

URL                 https://employer.dice.com/ows/detailFacade.htm...
Name                                              Giedre Mickeviciute
Title               Desired: Software Development Manager, IT proj...
Current Location                                          Concord, MA
Experience                                              17 years exp.
Pay rate                                      negotiable | negotiable
Last Updated                                                 1/9/2017
Relocate                                      Not willing to relocate
Previous title      Previous Title: Founder and Business Developme...
Name: 230, dtype: object

In [117]:
def process():
    """Handle a click on 'Search': rank profiles against the typed job description.

    Reads the query from the ``txt`` entry variable, scores every profile in
    ``PDlist`` against it and adds labels for the best match to ``app``.
    Does nothing when the query is blank or there is no profile to rank.
    """
    # ``txt`` is a Tk StringVar; the typed text has to be fetched with .get().
    # Passing the variable object itself made every score fall back to 0.
    query = txt.get()
    if not query.strip():
        return

    # Rank with the scores computed for *this* query.  getTopNProfiles() reads
    # the module-level ``sims``, which this local assignment does not update,
    # so the ordering is computed here directly.
    sims = getCosineSim(PDlist, query)
    topCandidates = sims.argsort()[-5:][::-1]
    if len(topCandidates) == 0:
        return
    # argsort yields row positions, hence positional lookup (``.ix`` is deprecated).
    p0 = PD.iloc[topCandidates[0]]

    f = ("Calibri", 10)
    r = 4

    # NOTE(review): the three texts below are layout placeholders.  An earlier
    # draft rendered p0["Name"], the part of p0["Title"] after ':' and
    # p0["Current Location"] in these rows -- restore once the layout is settled.
    Label(app, text="A", width=20, height=1).grid(row=r+1, column=0, sticky=SW)
    Label(app, text="AAA", width=20, height=1).grid(row=r+2, column=0, sticky=W)
    Label(app, text="AAAAA", width=40, height=1).grid(row=r+3, column=0, sticky=W)
    Label(app, text='{0}'.format(p0["URL"]), width=len(p0["URL"]), height=1, font=f).grid(row=r+6, column=1, columnspan=2)
    
# --- Main window -----------------------------------------------------------
root = Tk()
root.title('Profile Recommender')
try:
    root.wm_state('zoomed')
except TclError:
    # 'zoomed' is not a valid wm state on X11; there maximizing is exposed as
    # a window attribute instead.
    root.attributes('-zoomed', True)

# Container frame; all widgets below are laid out on its grid.
app = Frame(root)
app.pack()

data = None

# --- Search row: caption, entry field and button -----------------------------
# Font tuples are (family, size, style...).  The bold captions previously put
# "-weight bold" inside the family name, which is not a font family.
Label(app, text=' ', font=("Calibri", 16)).grid(row=1, column=0, sticky=NW)
Label(app, text='Job Description          ', font=("Calibri", 16, "bold")).grid(row=2, column=0, sticky=NW)
Label(app, text='', width=7, font=("Calibri", 16)).grid(row=2, column=3, sticky=NW)

# Holds the text typed into the entry; process() reads it when Search is clicked.
txt = StringVar()
Entry(app, textvariable=txt, width=60, font=("Calibri light", 16)).grid(row=2, column=1, sticky=NE)

Button(app, text='Search', command=process, width=10, font=("Calibri", 14, "bold")).grid(row=2, column=4, sticky=SE)

# Blocks until the window is closed.
root.mainloop()