In [1]:
# !pip install flashtext

In [2]:
# !python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_lg

In [3]:
import pandas as pd
from IPython.display import HTML, display

import spacy
from spacy.matcher import Matcher
from spacy.util import filter_spans
from spacy.tokens import Span
from spacy import displacy
import re

In [4]:
df = pd.read_csv('data_science_jobs.csv')
df.drop(columns=['Unnamed: 0', 'Job Board', 'Location'], axis=1,  inplace=True)

In [5]:
df.isna().sum()

Job Title          0
Company Name       0
Job Description    0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 881 entries, 0 to 880
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Job Title        881 non-null    object
 1   Company Name     881 non-null    object
 2   Job Description  881 non-null    object
dtypes: object(3)
memory usage: 20.8+ KB


In [7]:
description_list = list(df['Job Description'])

In [8]:
spacy_model = spacy.load("en_core_web_sm")

# Finding sentences that contain the keyword

In [9]:
def highlight_terms(keyword, texts):
    for doc in spacy_model.pipe(texts):
        for sentence in set([word.sent for word in doc if word.lower_ in keyword]):
            text = sentence.text.strip()
            markup = re.sub(fr'(?i)\b({"|".join(keyword)})\b', r'<strong>\1</strong>', text)
            display(HTML(markup))

In [10]:
highlight_terms(['experience'], description_list[:10])

In [11]:
highlight_terms(['proficient'], description_list[:10])

# POS

In [12]:
matcher = Matcher(spacy_model.vocab)
pattern = [{'POS': 'NOUN', 'OP': '+'}, {'LOWER': 'experience'}]
matcher.add('experience_noun', [pattern])

pattern = [{'LOWER': 'experience'}, {'POS': 'ADP'}, {'POS': {'IN': ('DET', 'NOUN', 'PROPN')}, 'OP': '+'}]
matcher.add('experience_adp', [pattern])

In [90]:
def show_extraction(examples, *extractors):
    seen = set()
    for doc in spacy_model.pipe(examples):
        doc.ents = filter_spans([Span(doc, start, end, label) for extractor in extractors for label, start, end in extractor(doc)])
        for tok in doc:
            if tok.lower_ == 'experience':
                sentence = tok.sent
                if sentence.text in seen:
                    continue
                seen.update([sentence.text])
                if not sentence.ents:
                    doc.ents = list(doc.ents) + [Span(doc, tok.i, tok.i+1, 'MISSING')]
                displacy.render(sentence, style='ent', options = {'colors': {'MISSING': 'pink',
                                                                            'EXPERIENCE': 'lightgreen'}})
                

def get_extractions(documents, *extractors): 
    for idx, desc in enumerate(spacy_model.pipe(documents, batch_size=100, disable=['ner'])): 
        for ent in [Span(desc, start, end, label) for extractor in extractors for label, start, end in extractor(desc)]:
            sent = ent.root.sent
            yield ent.text, idx, ent.start, ent.end, ent.label_, sent.start, sent.end
                            
def get_left_span(word, label='', include=True):
    offset = 1 if include else 0
    idx = word.i
    while idx > word.left_edge.i:
        if word.doc[idx - 1].pos_ in ('NOUN', 'PROPN', 'ADJ', 'X'):
            idx -= 1
        else:
            break
    return label, idx, word.i+offset


def get_conjugations(word):
    new = [word]
    while new:
        word = new.pop()
        yield word
        for child in word.children:
            if child.dep_ == 'conj':
                new.append(child)
                             
def extract_verb_maybeadj_noun_experience(doc, label='EXPERIENCE'):
    for word in doc:
        if word.lower_ in EXP_TERMS:
            for child in word.rights:
                if child.dep_ == 'acl':
                    for word_child in child.children:
                        if word_child.dep_ == 'prep':
                            for word_child2 in word_child.children:
                                if word_child2.dep_ == 'pobj':
                                    for c in get_conjugations(word_child2):
                                        yield get_left_span(c, 'EXPERIENCE')
                        elif word_child.dep_ == 'dobj':
                            for c in get_conjugations(word_child):
                                yield get_left_span(c, 'EXPERIENCE')

def extract_noun_phrase_experience(doc):
    for np in doc.noun_chunks:
        if np[-1].lower_ == 'experience':
            if len(np) > 1:
                yield 'EXPERIENCE', np[0].i, np[-1].i
                                                
def extract_verb_adj_noun(doc, label): 
    for word in doc: 
    

In [91]:
show_extraction(s2, extract_verb_maybeadj_noun_experience)

In [15]:
experience_qualifiers = ['previous', 'prior', 'following', 'recent', 'the above', 'past',
                         
                         'proven', 'demonstrable', 'demonstrated', 'relevant', 'significant', 'practical',
                         'essential', 'equivalent', 'desirable', 'required', 'considerable', 'similar',
                         'working', 'specific', 'qualified', 'direct', 'hands on', 'handson', 'hands-on'
                         
                         'strong', 'solid', 'good', 'substantial', 'excellent', 'the right', 'valuable', 'invaluable',
                         
                         'some', 'any', 'none', 'much', 'extensive', 'no', 'more',
                         'your', 'their', 'great',
                         'years', 'months',
                        ]

stopwords = ['a', 'an', '*', '**', '.', 'this', 'the', ':', 'Skills']

experience_qualifier_pattern = rf'\b(?:{"|".join(experience_qualifiers)})\b'

experience_qualifier_pattern

'\\b(?:previous|prior|following|recent|the above|past|proven|demonstrable|demonstrated|relevant|significant|practical|essential|equivalent|desirable|required|considerable|similar|working|specific|qualified|direct|hands on|handson|hands-onstrong|solid|good|substantial|excellent|the right|valuable|invaluable|some|any|none|much|extensive|no|more|your|their|great|years|months)\\b'

In [49]:
list(get_extractions(s2[0], matcher))

[]

In [54]:
EXP_TERMS = ['experience']

In [41]:
s2= ["Once you join the Global Quality team, you will realize you are part of a family of global, professional and motivated team members working to ensure a well rounded customer experience via data driven decisions.. About The Role - You Will. Understand data management infrastructure and pipeline. Understand agile software development process. Join the building and deployment of analytic infrastructure in Linux environment. Join the development of reporting and analytic tools as well as work with data from Hadoop clusters. Will have the opportunity to present work to a wider audience and hone soft skills as opposed to only technical skills. Be involved in the design decision-making process as well as data strategy formulation and bring to the table fresh perspective and ideas. About You. Willing to learn – technical as well as soft skills. Passionate about teamwork and able to deliver in a fast-paced environment. Your Experience Includes. Web development experience. Angular framework experience will be an advantage. Proficient in C, C++ and/or Python . Proficient in SQL/Postgresql. Experience working with Linux environments and dockers will be an advantage. Been a part of projects following Agile software development process"]

In [92]:
show_extraction(s2, extract_noun_phrase_experience)
print('---')
show_extraction(s2, extract_verb_maybeadj_noun_experience)

---


In [37]:
s = ['Web development experience. Angular framework experience will be an advantage. Proficient in C, C++ and/or Python. Proficient in SQL/Postgresql. Experience working with Linux environments and dockers will be an advantage. Been a part of projects following Agile software development process']

In [38]:
show_extraction(s, extract_noun_phrase_experience)
print('---')
show_extraction(s, extract_verb_maybeadj_noun_experience)

---
