In [3]:
%load_ext autoreload
%autoreload 2

import os,sys,inspect
import re
from bs4 import BeautifulSoup
from spacy import displacy
import pandas as pd

currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
data_dir = os.path.join(parentdir, 'data')
job_alpha = os.path.join(parentdir, 'src')

print(job_alpha)
if job_alpha not in sys.path:
    sys.path.append(job_alpha)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
d:\dev\py\job-alpha\src


In [4]:
details = os.path.join(data_dir, 'sample-details.htm')
with open(details, 'r') as myfile:
    details_data=myfile.read().replace('\n', '')
soup = BeautifulSoup(details_data, 'html.parser')
descr = soup.find('div', attrs={'data-automation': 'jobDescription'})
job_details = descr.get_text()
job_details = re.sub(r'\s+', ' ', job_details).strip().lower()
job_details

'we are looking for full-time and casual sales consultant the applicants must be: experienced in telecommunications sales,customer service and motivated to meet or exceed ongoing store sales and customer expectations. confident in siebel and mnc systems (will be preferred)! self-motivated,team player and have an ability to learn quickly. confident with excellent communication skills, listening skills and able to build great rapport with customers. able to deliver high level of pre and post sales advocacy (nps) strong in organisational and task management skills with attention to detail. professional at all times with excellent personal presentation and people skills. able to work under pressure and deliver daily results. mature with positive and "can do "attitude and a high level of initiative very reliable, punctual, trustworthy and hardworking with good work ethics able to attend day starts, sales and product training sessions as required. able to work weekends and extra hours as and

In [9]:
import spacy

nlp = spacy.load('en')
doc = nlp(job_details)

#docs = [npl(sen) for sen in job_details.split('.')]

# Find named entities, phrases and concepts  
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
    
print('-----')
for i, token in enumerate(doc):
    print(token.text, token.pos_, token.dep_, token.is_stop)

daily 715 720 DATE
day 894 897 DATE
and extra hours 977 992 TIME
-----
we PRON nsubj True
are VERB aux True
looking VERB ROOT False
for ADP prep True
full ADJ amod True
- PUNCT punct False
time NOUN nmod False
and CCONJ cc True
casual ADJ amod False
sales NOUN compound False
consultant NOUN pobj False
the DET det True
applicants NOUN nsubj False
must VERB aux True
be VERB conj True
: PUNCT punct False
experienced VERB acomp False
in ADP prep True
telecommunications NOUN compound False
sales NOUN pobj False
, PUNCT punct False
customer NOUN compound False
service NOUN conj False
and CCONJ cc True
motivated VERB conj False
to PART aux True
meet VERB xcomp False
or CCONJ cc True
exceed VERB conj False
ongoing ADJ amod False
store NOUN compound False
sales NOUN dobj False
and CCONJ cc True
customer NOUN compound False
expectations NOUN conj False
. PUNCT punct False
confident ADJ ROOT False
in ADP prep True
siebel NOUN pobj False
and CCONJ cc True
mnc NOUN conj False
systems NOUN conj Fals

In [21]:
key_words = []
phrase = ''
is_comb = False
for i, token in enumerate(doc):
    if token.pos_ in ['NOUN', 'ADJ', 'CCONJ']:
        is_comb = True
        phrase += token.text + ' '
        #print(token.text, token.pos_, token.dep_, token.is_stop)
    else:
        if is_comb:
            key_words.append(phrase)
            phrase = ''
        is_comb = False

print(','.join(key_words))

full ,time and casual sales consultant ,applicants ,telecommunications sales ,customer service and ,or ,ongoing store sales and customer expectations ,confident ,siebel and mnc systems ,self ,motivated ,team player and ,ability ,confident ,excellent communication skills ,skills and able ,great rapport ,customers ,able ,high level ,pre and ,sales advocacy ,strong ,organisational and task management skills ,attention ,detail ,professional ,times ,excellent personal presentation and people skills ,able ,pressure and ,daily results ,positive and ,and ,high level ,initiative ,reliable ,punctual ,trustworthy and ,good work ethics able ,day starts ,sales and product training sessions ,able ,weekends and extra hours ,and ,business ,your perfect opportunity ,our ambitious team ,telstra south yarra ,email ,attractive salary and ,generous commission available ,right candidate 


In [28]:
noisy_pos_tags = ['PROP', 'PUNCT', 'ADP']
min_token_length = 2

def isNoise(token):     
    is_noise = False
    if token.pos_ in noisy_pos_tags:
        is_noise = True 
    elif token.is_stop == True:
        is_noise = True
    elif len(token.string) <= min_token_length:
        is_noise = True
    return is_noise

def cleanup(token, lower = True):
    if lower:
       token = token.lower()
    return token.strip()



from collections import Counter
cleaned_list = [cleanup(word.string) for word in doc if not isNoise(word)]
Counter(cleaned_list).most_common(10)

[('sales', 5),
 ('able', 5),
 ('skills', 4),
 ('work', 3),
 ('customer', 2),
 ('motivated', 2),
 ('confident', 2),
 ('team', 2),
 ('excellent', 2),
 ('deliver', 2)]

In [36]:
# distill into noun chunks
# print(', '.join([chunk.text for chunk in doc.noun_chunks]))
print('-----')

# extract object based noun chunks
objs = [chunk.text for chunk in doc.noun_chunks if chunk.root.dep_ == 'pobj']
print(', '.join(objs))

print('-----')
# conjunctions with a root which is an object
roots = [chunk.root.text for chunk in doc.noun_chunks if chunk.root.dep_ == 'pobj']
conj = [chunk for chunk in doc.noun_chunks if chunk.root.dep_ == 'conj']
conj_add = ['%s %s' % (chunk.root.head.text, chunk.text) for chunk in conj if chunk.root.head.text in roots]
print(', '.join(conj_add))

print('-----')
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)

-----
full-time and casual sales consultant, telecommunications sales, siebel, excellent communication skills, customers, pre and post sales advocacy, organisational and task management skills, attention, detail, all times, excellent personal presentation, pressure, initiative, good work ethics, telstra south yarra, email, ryan.vanhemert@southyarratelstrastore.com.au very attractive salary, the right candidate
-----
sales customer service, sales customer expectations, siebel mnc, siebel systems, skills listening skills, sales product training sessions, salary very generous commission
-----
we we nsubj looking
full-time and casual sales consultant consultant pobj for
the applicants applicants nsubj be
telecommunications sales sales pobj in
customer service service conj sales
ongoing store sales sales dobj exceed
customer expectations expectations conj sales
siebel siebel pobj in
mnc mnc conj siebel
systems systems conj siebel
self-motivated,team player player ROOT player
an ability abil

In [11]:
for token in doc:
    if token.pos_ is not 'PUNCT':# and not token.is_stop:
        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
              token.shape_, token.is_alpha, token.is_stop)

au au DET DT ROOT xx True False
very very ADV RB advmod xxxx True True
attractive attractive ADJ JJ amod xxxx True False
salary salary NOUN NN pobj xxxx True False
and and CCONJ CC cc xxx True True
very very ADV RB advmod xxxx True True
generous generous ADJ JJ amod xxxx True False
commission commission NOUN NN conj xxxx True False
available available ADJ JJ amod xxxx True False
for for ADP IN prep xxx True True
the the DET DT det xxx True True
right right ADJ JJ amod xxxx True False
candidate candidate NOUN NN pobj xxxx True False


In [7]:

displacy.render(doc, style='dep', jupyter=True, options={'distance': 140})

In [8]:
displacy.render(doc, style='ent', jupyter=True)

In [9]:
import explacy
explacy.print_parse_info(nlp, 'The salad was surprisingly tasty.')

Dep tree Token        Dep type Lemma        Part of Sp
──────── ──────────── ──────── ──────────── ──────────
   ┌─►   The          det      the          DET       
┌─►└──   salad        nsubj    salad        NOUN      
└┬┬┬──   was          ROOT     be           VERB      
 ││└─►   surprisingly advmod   surprisingly ADV       
 │└──►   tasty        acomp    tasty        ADJ       
 └───►   .            punct    .            PUNCT     


In [10]:

for sen in job_details.split('.'):
    sen = sen.strip()
    doc = nlp(sen)
    try:
        explacy.print_parse_info(nlp, sen)
    except:
        pass
    displacy.render(doc, style='dep', jupyter=True, options={'distance': 140})

Dep tree                          Token              Dep type Lemma              Part of Sp
───────────────────────────────── ────────────────── ──────── ────────────────── ──────────
                             ┌──► we                 nsubj    -PRON-             PRON      
                             │┌─► are                aux      be                 VERB      
┌───────────────────┬────────┴┴── looking            ROOT     look               VERB      
│                   └─►┌───────── for                prep     for                ADP       
│                      │     ┌──► full               amod     full               ADJ       
│                      │     │┌─► -                  punct    -                  PUNCT     
│                      │  ┌─►└┼── time               nmod     time               NOUN      
│                      │  │   └─► and                cc       and                CCONJ     
│                      │  │  ┌──► casual             amod     casual            

Dep tree               Token         Dep type Lemma         Part of Sp
────────────────────── ───────────── ──────── ───────────── ──────────
┌───────────────────── confident     ROOT     confident     ADJ       
└─►┌────────────────── with          prep     with          ADP       
   │              ┌──► excellent     amod     excellent     ADJ       
   │              │┌─► communication compound communication NOUN      
   └─►┌───────────┴┼── skills        pobj     skill         NOUN      
      │            └─► ,             punct    ,             PUNCT     
      │            ┌─► listening     amod     listen        VERB      
      └─►┌─────────┼── skills        conj     skill         NOUN      
         │         └─► and           cc       and           CCONJ     
         └─►┌───────── able          conj     able          ADJ       
            │      ┌─► to            aux      to            PART      
            └─►┌┬──┴── build         xcomp    build         VERB      
      

Dep tree     Token    Dep type Lemma    Part of Sp
──────────── ──────── ──────── ──────── ──────────
┌─────────── able     ROOT     able     ADJ       
│        ┌─► to       aux      to       PART      
└─►┌─┬┬──┴── work     xcomp    work     VERB      
   │ │└─►┌── under    prep     under    ADP       
   │ │   └─► pressure pobj     pressure NOUN      
   │ └─────► and      cc       and      CCONJ     
   └─►┌───── deliver  conj     deliver  VERB      
      │  ┌─► daily    amod     daily    ADJ       
      └─►└── results  dobj     result   NOUN      


Dep tree                    Token       Dep type Lemma       Part of Sp
─────────────────────────── ─────────── ──────── ─────────── ──────────
┌───────────────────┬┬───── mature      ROOT     mature      VERB      
│                   │└─►┌── with        prep     with        ADP       
│                   │   └─► positive    pobj     positive    ADJ       
│                   └─────► and         cc       and         CCONJ     
│                      ┌──► "           punct    "           PUNCT     
│                      │┌─► can         aux      can         VERB      
└─►┌┬┬─────────┬───────┴┼── do          conj     do          VERB      
   │││         │        └─► "           punct    "           PUNCT     
   │││         └─►┌─────┬── attitude    dobj     attitude    VERB      
   │││            │     └─► and         cc       and         CCONJ     
   │││            │    ┌──► a           det      a           DET       
   │││            │    │┌─► high        amod     high        ADJ

Dep tree                    Token       Dep type Lemma       Part of Sp
─────────────────────────── ─────────── ──────── ─────────── ──────────
┌────────────────────────── able        ROOT     able        ADJ       
│                       ┌─► to          aux      to          PART      
└─►┌────────────┬┬┬─────┴── work        xcomp    work        VERB      
   │            ││└─►┌──┬── weekends    dobj     weekend     NOUN      
   │            ││   │  └─► and         cc       and         CCONJ     
   │            ││   │  ┌─► extra       amod     extra       ADJ       
   │            ││   └─►└── hours       conj     hour        NOUN      
   │            │└────────► as          prep     as          ADP       
   │            └─────────► and         cc       and         CCONJ     
   │                    ┌─► when        advmod   when        ADV       
   └─►┌───────────┬─────┴── required    conj     require     VERB      
      │           │     ┌─► to          aux      to          PAR

Dep tree Token Dep type Lemma Part of Sp
──────── ───── ──────── ───── ──────────
┌┬┬───── apply ROOT     apply VERB      
││└─►┌── by    prep     by    ADP       
││   └─► email pobj     email NOUN      
│└──►┌── to    prep     to    ADP       
│    └─► :     punct    :     PUNCT     
└──────► ryan  conj     ryan  ADJ       


Dep tree Token                            Dep type Lemma                            Part of Sp
──────── ──────────────────────────────── ──────── ──────────────────────────────── ──────────
         vanhemert@southyarratelstrastore ROOT     vanhemert@southyarratelstrastore ADJ       


Dep tree Token Dep type Lemma Part of Sp
──────── ───── ──────── ───── ──────────
         com   ROOT     com   NOUN      


Dep tree             Token      Dep type Lemma      Part of Sp
──────────────────── ────────── ──────── ────────── ──────────
┌┬────────────────── au         ROOT     au         DET       
││               ┌─► very       advmod   very       ADV       
││            ┌─►└── attractive amod     attractive ADJ       
│└─►┌─────────┴──┬── salary     pobj     salary     NOUN      
│   │            └─► and        cc       and        CCONJ     
│   │            ┌─► very       advmod   very       ADV       
│   │         ┌─►└── generous   amod     generous   ADJ       
│   └─►┌──────┴───── commission conj     commission NOUN      
│      └─►┌───────── available  amod     available  ADJ       
│         └─►┌────── for        prep     for        ADP       
│            │  ┌──► the        det      the        DET       
│            │  │┌─► right      amod     right      ADJ       
│            └─►└┴── candidate  pobj     candidate  NOUN      
└──────────────────► !          punct    !          PUN

a $4.5 billion project, a new team, accordance, the mine plan
-----

