In [1]:
#Credit to Edward Ross - https://skeptric.com/notebooks/Parsing%20Experience%20from%20Adzuna%20Job%20Ads.html
import re
import pandas as pd
import spacy
from spacy.util import filter_spans
from spacy.tokens import Span
from spacy.matcher import Matcher

In [2]:
from spacy import displacy
from IPython.display import HTML, display


In [11]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [12]:
df = pd.read_csv('data_science_jobs.csv')

In [13]:
df = df.drop(df.columns[0], axis = 1) 

In [14]:
import regex as re
df['Job Description'] = df['Job Description'].apply(lambda x: re.sub(r'[!;,\s]', ' ', x))
df.sample(50, random_state = 5)

Unnamed: 0,Job Title,Company Name,Job Description,Location,Job Board
852,HEAD OF ANALYTICS & INSIGHTS/DATA SCIENTIST,Saje Natural Wellness,For over 25 years we’ve been helping our commu...,Vancouver,Indeed
207,Machine Learning Engineer,Aviva,About the job Individually we are people but ...,Markham,Indeed
627,"Data Engineer / Power BI Developer, Omnia AI",Deloitte\n,Job Type: Permanent Primary Location: Vancouve...,Vancouver,Glassdoor
671,Junior Business Intelligence Developer - Data ...,CAPRiser Group,Our subsidiary is in PropTech and real estate ...,Toronto,Indeed
661,"Data Scientist Technical Lead (Montreal), Inta...",Intact\n,Who needs insurance? Everybody. That keeps us ...,Montreal,Glassdoor
489,Bio informaticien/Bioinformatics scientist,GenAIz\n,Avant le SRAS-CoV-2 un vaccin qui était dével...,Montreal,Glassdoor
434,Data Scientist,Awakedata,RESPONSIBILITIES: Develop state-of-the-art com...,Burnaby,Glassdoor
448,Scientifique de données / Data Scientist,BusPatrol\n,Qui sommes-nous : BusPatrouille est une entrep...,Montreal,Glassdoor
634,Data Management Specialist - Immunotoxicology,Charles River Laboratories\n,For 70 years Charles River employees have wor...,Senneville,Glassdoor
748,Data Scientist,F8th,We're transforming the Cyber Security industry...,Midtown Toronto,Glassdoor


In [15]:
ads = list(df['Job Description'])

In [16]:
len(ads)

881

In [17]:
def highlight_terms(terms, texts):
    for doc in nlp.pipe(texts):
        for sentence in set([tok.sent for tok in doc if tok.lower_ in terms]):
            text = sentence.text.strip()
            markup = re.sub(fr'(?i)\b({"|".join(terms)})\b', r'<strong>\1</strong>', text)
            display(HTML(markup))
            print('-----')

In [19]:
matcher = Matcher(nlp.vocab)
pattern = [{'POS': 'NOUN', 'OP': '+'}, {'LOWER': 'experience'}]
matcher.add('experience_noun', [pattern])

pattern = [{'LOWER': 'experience'}, {'POS': 'ADP'}, {'POS': {'IN': ('DET', 'NOUN', 'PROPN')}, 'OP': '+'}]
matcher.add('experience_adp', [pattern])

In [20]:
doc = nlp(ads[0])
matcher(doc)

[(12285600890577657150, 117, 119),
 (1417798585642285709, 239, 242),
 (1417798585642285709, 239, 243),
 (12285600890577657150, 265, 267)]

In [21]:
def show_extraction(examples, *extractors):
    seen = set()
    for doc in nlp.pipe(examples):
        doc.ents = filter_spans([Span(doc, start, end, label) for extractor in extractors for label, start, end in extractor(doc)])
        for tok in doc:
            if tok.lower_ == 'experience':
                sentence = tok.sent
                if sentence.text in seen:
                    continue
                seen.update([sentence.text])
                if not sentence.ents:
                    doc.ents = list(doc.ents) + [Span(doc, tok.i, tok.i+1, 'MISSING')]
                displacy.render(sentence, style='ent', options = {'colors': {'MISSING': 'pink',
                                                                            'EXPERIENCE': 'lightgreen'}})
                

In [22]:
show_extraction(ads[:5], matcher)


In [23]:
def get_extractions(examples, *extractors):
    # Could use context instead of enumerate
    for idx, doc in enumerate(nlp.pipe(examples, batch_size=100, disable=['ner'])):
        for ent in filter_spans([Span(doc, start, end, label) for extractor in extractors for label, start, end in extractor(doc)]):
            sent = ent.root.sent
            yield ent.text, idx, ent.start, ent.end, ent.label_, sent.start, sent.end

In [24]:
list(get_extractions(ads[:3], matcher))


[('user experience', 0, 117, 119, 'experience_noun', 92, 121),
 ('experience in the application', 0, 239, 243, 'experience_adp', 224, 304),
 ('techniques Experience', 0, 265, 267, 'experience_noun', 224, 304),
 ('data science experience', 1, 143, 146, 'experience_noun', 134, 169),
 ('software development experience Experience',
  1,
  157,
  161,
  'experience_noun',
  134,
  169),
 ('Experience with time', 1, 169, 172, 'experience_adp', 169, 256),
 ('environment Experience', 1, 197, 199, 'experience_noun', 169, 256),
 ('fitness experience', 2, 73, 75, 'experience_noun', 62, 93),
 ('experience in ETL', 2, 399, 402, 'experience_adp', 384, 561),
 ('Experience with Informatica IICS', 2, 445, 449, 'experience_adp', 384, 561),
 ('Experience with message queues', 2, 455, 459, 'experience_adp', 384, 561),
 ('experience in a DevOps environment Experience',
  2,
  468,
  474,
  'experience_adp',
  384,
  561)]

In [25]:
def extract_df(*extractors, n_max=None, **kwargs):
    if n_max is None:
        n_max = len(df)
    ent_df = pd.DataFrame(list(get_extractions(df[:n_max]['Job Description'], *extractors)),
                          columns=['text', 'docidx', 'start', 'end', 'label', 'sent_start', 'sent_end'])
    return ent_df.merge(df, how='left', left_on='docidx', right_index=True)

In [26]:
%time 
ent_df = extract_df(matcher, n_max=1000)
ent_df.head()

CPU times: total: 0 ns
Wall time: 0 ns


Unnamed: 0,text,docidx,start,end,label,sent_start,sent_end,Job Title,Company Name,Job Description,Location,Job Board
0,user experience,0,117,119,experience_noun,92,121,Data Scientist,Metricsflow Inc,At Metricsflow we work to shape the future of...,St. John's,Glassdoor
1,experience in the application,0,239,243,experience_adp,224,304,Data Scientist,Metricsflow Inc,At Metricsflow we work to shape the future of...,St. John's,Glassdoor
2,techniques Experience,0,265,267,experience_noun,224,304,Data Scientist,Metricsflow Inc,At Metricsflow we work to shape the future of...,St. John's,Glassdoor
3,data science experience,1,143,146,experience_noun,134,169,Data Scientist - 05/12/21,Acerta Analytics Solutions Inc,Acerta’s machine learning platforms leverage a...,Kitchener,Indeed
4,software development experience Experience,1,157,161,experience_noun,134,169,Data Scientist - 05/12/21,Acerta Analytics Solutions Inc,Acerta’s machine learning platforms leverage a...,Kitchener,Indeed


In [27]:
def aggregate_df(df, col=['text']):
    return (df
            .groupby(col)
            .agg(n_company=('Company Name', 'nunique'))
            .reset_index()
            .sort_values(['n_company'], ascending=False)
        )

In [28]:
aggregate_df(ent_df).head(10)


Unnamed: 0,text,n_company
1393,work experience,47
1406,years experience,39
1202,industry experience,34
599,customer experience,29
796,experience in a,27
1383,user experience,23
874,experience in data science,20
323,Experience with Python,19
1066,experience with Python,15
934,experience in the,13


In [29]:
def showent(docidx, start, end, label, sent_start, sent_end, **kwargs):
    # We don't need to parse it, so just make_doc
    doc = nlp.make_doc(ads[docidx])
    doc.ents = [Span(doc, start, end, label)]
    sent = doc[sent_start:sent_end]
    displacy.render(sent, style='ent')
    
def showent_df(df):
    for idx, row in df.iterrows():
        showent(**row)

In [30]:
showent_df(ent_df.query('text == "experience in a"').head())


In [31]:
def extract_noun_phrase_experience(doc):
    for np in doc.noun_chunks:
        if np[-1].lower_ == 'experience':
            if len(np) > 1:
                yield 'EXPERIENCE', np[0].i, np[-1].i

In [32]:
show_extraction(ads[:5], extract_noun_phrase_experience)


In [33]:
%time
ent_df = extract_df(extract_noun_phrase_experience, n_max=50000)


CPU times: total: 0 ns
Wall time: 0 ns


In [34]:
aggregate_df(ent_df).head(50)


Unnamed: 0,text,n_company
1014,industry,28
984,hands-on,28
1186,relevant,24
935,equivalent,24
1386,work,18
1141,professional,16
244,Demonstrated,16
431,Prior,15
276,Extensive,15
1392,your,14


In [35]:
experience_qualifiers = ['previous', 'prior', 'following', 'recent', 'the above', 'past',
                         
                         'proven', 'demonstrable', 'demonstrated', 'relevant', 'significant', 'practical',
                         'essential', 'equivalent', 'desirable', 'required', 'considerable', 'similar',
                         'working', 'specific', 'qualified', 'direct', 'hands on', 'handson', 'hands-on'
                         
                         'strong', 'solid', 'good', 'substantial', 'excellent', 'the right', 'valuable', 'invaluable',
                         
                         'some', 'any', 'none', 'much', 'extensive', 'no', 'more',
                         'your', 'their', 'great',
                         'years', 'months',
                        ]

stopwords = ['a', 'an', '*', '**', '.', 'this', 'the', ':', 'Skills']

experience_qualifier_pattern = rf'\b(?:{"|".join(experience_qualifiers)})\b'

experience_qualifier_pattern

'\\b(?:previous|prior|following|recent|the above|past|proven|demonstrable|demonstrated|relevant|significant|practical|essential|equivalent|desirable|required|considerable|similar|working|specific|qualified|direct|hands on|handson|hands-onstrong|solid|good|substantial|excellent|the right|valuable|invaluable|some|any|none|much|extensive|no|more|your|their|great|years|months)\\b'

In [36]:
aggregate_df(ent_df[(~ent_df.text.str.lower().str.contains(experience_qualifier_pattern)) & # Not a qualifier
                     ~ent_df.text.isin(stopwords)]).head(50)

Unnamed: 0,text,n_company
725,hands-on,28
753,industry,28
1026,work,18
858,professional,16
384,Strong,14
206,Hands-on,13
351,SQL,11
467,a plus,10
880,related,10
621,customer,9


In [37]:
def extract_adp_experience(doc, label='EXPERIENCE'):
    for tok in doc:
        if tok.lower_ == 'experience':
            for child in tok.rights:
                if child.dep_ == 'prep':
                    for obj in child.children:
                        if obj.dep_ == 'pobj':
                            yield label, obj.left_edge.i, obj.i+1

In [38]:
show_extraction(ads[10:15], extract_adp_experience)

In [39]:
def extract_adp_experience_2(doc):
    for np in doc.noun_chunks:
        start_tok = np[0].i
        if start_tok >= 2 and doc[start_tok - 2].lower_ == 'experience' and doc[start_tok - 1].pos_ == 'ADP':
            yield 'EXPERIENCE', start_tok, start_tok + len(np)


In [40]:
show_extraction(ads[10:15], extract_adp_experience_2)

In [41]:
%time 
ent_adp_df = extract_df(extract_adp_experience, n_max=50)


CPU times: total: 0 ns
Wall time: 0 ns


In [42]:
%time 
ent_adp_df = extract_df(extract_adp_experience_2, n_max=50)


CPU times: total: 0 ns
Wall time: 0 ns


In [43]:
%time 
ent_adp_df = extract_df(extract_adp_experience, n_max=50000)


CPU times: total: 0 ns
Wall time: 0 ns


In [44]:
aggregate_df(ent_adp_df).head(50)


Unnamed: 0,text,n_company
334,Python,31
375,SQL,19
816,data science,15
760,data,13
640,big data tools,13
273,Machine Learning,11
839,data visualization,10
1192,relational databases,9
844,data visualization tools,9
339,Python R,8


In [45]:
showent_df(ent_adp_df.query("text=='C'").head(5))


In [46]:
showent_df(ent_adp_df.query("text=='R'").head(5))


In [47]:
def highlight_text_context(terms, texts, n_before=1, n_after=2):
    context = []
    for doc in nlp.pipe(texts):
        sentences = list(doc.sents)
        idxs = [i for i, sent in enumerate(sentences) if any(term in sent.text.lower() for term in terms)]
        
        for idx in idxs:
            before = ''.join(sent.text for sent in sentences[max(idx-n_before, 0):idx])
            after = ''.join(sent.text for sent in sentences[idx+1:min(idx+n_before+1, len(sentences))])
            text = sentences[idx].text
            markup = re.sub(fr'(?i)\b({"|".join(terms)})\b', r'<strong>\1</strong>',
                                 f'<span style="color:blue">{text}</span>')
            display(HTML(before + markup + after))

In [48]:
terms = ['experience']

for _, q in ent_adp_df.query("text=='a'").head(7).iterrows():
    doc = nlp(q.FullDescription)
    if q.sent_start > 0:
        prev_sent = doc[q.sent_start - 1].sent.text
    else:
        prev_sent = ''
    
    if q.sent_end < len(doc):
        next_sent = doc[q.sent_end].sent.text
    else:
        next_sent = ''
        
    text = doc[q.sent_start:q.sent_end].text
    markup = re.sub(fr'(?i)\b({"|".join(terms)})\b', r'<strong>\1</strong>',
                     f'<span style="color:blue">{text}</span>')
    display(HTML(prev_sent + markup + next_sent))

In [49]:
def get_left_span(tok, label='', include=True):
    offset = 1 if include else 0
    idx = tok.i
    while idx > tok.left_edge.i:
        if tok.doc[idx - 1].pos_ in ('NOUN', 'PROPN', 'ADJ', 'X'):
            idx -= 1
        else:
            break
    return label, idx, tok.i+offset


In [50]:
def get_conjugations(tok):
    new = [tok]
    while new:
        tok = new.pop()
        yield tok
        for child in tok.children:
            if child.dep_ == 'conj':
                new.append(child)

In [51]:
#old
EXP_TERMS = ['experience']
def extract_adp_conj_experience(doc, label='EXPERIENCE'):
    for tok in doc:
        if tok.lower_ in EXP_TERMS:
            for child in tok.rights:
                if child.dep_ == 'prep':
                    for obj in child.children:
                        if obj.dep_ == 'pobj':
                            for conj in get_conjugations(obj):
                                yield get_left_span(conj, label)

In [52]:
show_extraction(ads[10:15], extract_adp_conj_experience)


In [53]:
show_extraction(ads[:10], extract_adp_conj_experience)


In [54]:
def extract_verb_maybeadj_noun_experience(doc, label='EXPERIENCE'):
    for tok in doc:
        if tok.lower_ in EXP_TERMS:
            for child in tok.rights:
                if child.dep_ == 'acl':
                    for gc in child.children:
                        if gc.dep_ == 'prep':
                            for ggc in gc.children:
                                if ggc.dep_ == 'pobj':
                                    for c in get_conjugations(ggc):
                                        yield get_left_span(c, 'EXPERIENCE')
                        elif gc.dep_ == 'dobj':
                            for c in get_conjugations(gc):
                                yield get_left_span(c, 'EXPERIENCE')


In [55]:
show_extraction(ads[5:10], extract_verb_maybeadj_noun_experience)


In [56]:
extract_exps = [extract_adp_conj_experience,]


In [57]:
n_ads = len(df)

In [58]:
%%time
df_ents = extract_df(*extract_exps, n_max=n_ads)

CPU times: total: 32.7 s
Wall time: 32.8 s


In [59]:
df_ents

Unnamed: 0,text,docidx,start,end,label,sent_start,sent_end,Job Title,Company Name,Job Description,Location,Job Board
0,application,0,242,243,EXPERIENCE,224,304,Data Scientist,Metricsflow Inc,At Metricsflow we work to shape the future of...,St. John's,Glassdoor
1,tensorflow,1,164,165,EXPERIENCE,134,169,Data Scientist - 05/12/21,Acerta Analytics Solutions Inc,Acerta’s machine learning platforms leverage a...,Kitchener,Indeed
2,sklearn,1,167,168,EXPERIENCE,134,169,Data Scientist - 05/12/21,Acerta Analytics Solutions Inc,Acerta’s machine learning platforms leverage a...,Kitchener,Indeed
3,operations data,1,175,177,EXPERIENCE,169,256,Data Scientist - 05/12/21,Acerta Analytics Solutions Inc,Acerta’s machine learning platforms leverage a...,Kitchener,Indeed
4,data modeling,2,403,405,EXPERIENCE,384,561,Data Engineer,GoodLife Fitness\n,Position Description DATA ENGINEER **This ...,Midtown Toronto,Glassdoor
...,...,...,...,...,...,...,...,...,...,...,...,...
2839,variety,879,396,397,EXPERIENCE,365,448,Data Engineer,Cyient\n,Cyient is a global engineering and technology ...,Montreal,Glassdoor
2840,Data Engineer,880,441,443,EXPERIENCE,432,467,Principal Data Engineer,ELEMENTS Global Services,Elements Global Services is an award-winning H...,Toronto,Indeed
2841,NoSQL database technologies,880,523,526,EXPERIENCE,515,532,Principal Data Engineer,ELEMENTS Global Services,Elements Global Services is an award-winning H...,Toronto,Indeed
2842,modern programming languages,880,538,541,EXPERIENCE,532,559,Principal Data Engineer,ELEMENTS Global Services,Elements Global Services is an award-winning H...,Toronto,Indeed


In [60]:
showent_df(df_ents[:2])


In [61]:
df_ent_agg = aggregate_df(df_ents)
df_ent_agg.head(10)

Unnamed: 0,text,n_company
425,Python,42
462,SQL,28
1075,machine learning,22
818,data science,22
768,data,20
385,NoSQL,18
1506,tools,18
945,experience,17
878,development,16
937,etc,14


In [62]:
from flashtext import KeywordProcessor


In [63]:
keyword_processor = KeywordProcessor(case_sensitive=True)


In [64]:
#selecting most popular
skills = df_ent_agg.query('n_company >= 3').text
len(skills)

161

In [65]:
for skill in skills:
    keyword_processor.add_keyword(skill)

In [66]:
from collections import Counter


In [67]:
skills = list(
(df_ent_agg
 .query('n_company >= 3')
).text
)
len(skills)

161

In [68]:
n_max=1000
for a,b,c in zip(skills[:n_max:3],skills[1:n_max:3],skills[2:n_max:3]):
     print('{:<35}{:<35}{:<}'.format(a,b,c))

Python                             SQL                                machine learning
data science                       data                               NoSQL
tools                              experience                         development
etc                                Machine Learning                   R
big data tools                     Experience                         technologies
data visualization                 big data technologies              relational databases
.                                  years                              data analysis
programming languages              Data Scientist                     software development
Data Engineer                      data visualization tools           customers
building                           relational SQL                     Tableau
analysis                           implementation                     cloud
processing systems                 object function scripting languagesdata analytics
frameworks     

In [69]:
def get_extractions_2(examples, *extractors):
    # Could use context instead of enumerate
    doc = nlp(examples, disable=['ner'])
    for ent in filter_spans([Span(doc, start, end, label) for extractor in extractors for label, start, end in extractor(doc)]):
        sent = ent.root.sent
        yield ent.text



In [70]:
def list_skills(examples, *extractors):
    return list(get_extractions_2(examples, *extractors))

In [82]:
print(list_skills(df['Job Description'][10], extract_adp_conj_experience))

['BigQuery Experience']


In [72]:
#for debugging visually
print(show_extraction(df['Job Description'][10]), extract_adp_conj_experience)

None <function extract_adp_conj_experience at 0x000001EDD1AE2CB0>


In [73]:
%%time
#create a new column in df with the fxn that works
df['skills'] = df['Job Description'].apply(lambda x: list_skills(x, extract_adp_conj_experience)) 


CPU times: total: 41.2 s
Wall time: 41.2 s


In [74]:
df.head()

Unnamed: 0,Job Title,Company Name,Job Description,Location,Job Board,skills
0,Data Scientist,Metricsflow Inc,At Metricsflow we work to shape the future of...,St. John's,Glassdoor,[application]
1,Data Scientist - 05/12/21,Acerta Analytics Solutions Inc,Acerta’s machine learning platforms leverage a...,Kitchener,Indeed,"[tensorflow, sklearn, operations data]"
2,Data Engineer,GoodLife Fitness\n,Position Description DATA ENGINEER **This ...,Midtown Toronto,Glassdoor,"[data modeling, Cloud platforms, Informatica I..."
3,Safety Aggregate Reporting & Analytics Coordin...,IQVIA,Job Overview Manage the administrative support...,Kirkland,Indeed,[]
4,Senior Data Engineer,Samsung Electronics\n,Position Summary The SmartThings Big Data tea...,Vancouver,Glassdoor,"[big data engineering, large scale, high avail..."


In [75]:
df.sample(50, random_state = 5)

Unnamed: 0,Job Title,Company Name,Job Description,Location,Job Board,skills
852,HEAD OF ANALYTICS & INSIGHTS/DATA SCIENTIST,Saje Natural Wellness,For over 25 years we’ve been helping our commu...,Vancouver,Indeed,"[market research, customer strategy Experience..."
207,Machine Learning Engineer,Aviva,About the job Individually we are people but ...,Markham,Indeed,"[microservice architecture, pandas, numpy]"
627,"Data Engineer / Power BI Developer, Omnia AI",Deloitte\n,Job Type: Permanent Primary Location: Vancouve...,Vancouver,Glassdoor,[high proficiency]
671,Junior Business Intelligence Developer - Data ...,CAPRiser Group,Our subsidiary is in PropTech and real estate ...,Toronto,Indeed,"[mining, CSS, JavaScript, one, more, non-relat..."
661,"Data Scientist Technical Lead (Montreal), Inta...",Intact\n,Who needs insurance? Everybody. That keeps us ...,Montreal,Glassdoor,"[field, source data mining frameworks]"
489,Bio informaticien/Bioinformatics scientist,GenAIz\n,Avant le SRAS-CoV-2 un vaccin qui était dével...,Montreal,Glassdoor,"[multidisciplinary bioinformatics projects, KE..."
434,Data Scientist,Awakedata,RESPONSIBILITIES: Develop state-of-the-art com...,Burnaby,Glassdoor,"[big data techniques, SQL]"
448,Scientifique de données / Data Scientist,BusPatrol\n,Qui sommes-nous : BusPatrouille est une entrep...,Montreal,Glassdoor,[]
634,Data Management Specialist - Immunotoxicology,Charles River Laboratories\n,For 70 years Charles River employees have wor...,Senneville,Glassdoor,[]
748,Data Scientist,F8th,We're transforming the Cyber Security industry...,Midtown Toronto,Glassdoor,"[statistical package, software development, ma..."


In [77]:
with open('raw_skills.txt', 'w') as f:
    for skill in skills:
        print(skill, file=f)

In [78]:
df.to_csv('rule_based_extracted_skills.csv', index=False)

In [84]:
print(list_skills(df['Job Description'][10], extract_adp_conj_experience))

[]


In [87]:
[extract_adp_conj_experience(s)]
temp = extract_df(*extract_exps)

In [88]:
temp

Unnamed: 0,text,docidx,start,end,label,sent_start,sent_end,Job Title,Company Name,Job Description,Location,Job Board,skills
0,application,0,242,243,EXPERIENCE,224,304,Data Scientist,Metricsflow Inc,At Metricsflow we work to shape the future of...,St. John's,Glassdoor,[application]
1,tensorflow,1,164,165,EXPERIENCE,134,169,Data Scientist - 05/12/21,Acerta Analytics Solutions Inc,Acerta’s machine learning platforms leverage a...,Kitchener,Indeed,"[tensorflow, sklearn, operations data]"
2,sklearn,1,167,168,EXPERIENCE,134,169,Data Scientist - 05/12/21,Acerta Analytics Solutions Inc,Acerta’s machine learning platforms leverage a...,Kitchener,Indeed,"[tensorflow, sklearn, operations data]"
3,operations data,1,175,177,EXPERIENCE,169,256,Data Scientist - 05/12/21,Acerta Analytics Solutions Inc,Acerta’s machine learning platforms leverage a...,Kitchener,Indeed,"[tensorflow, sklearn, operations data]"
4,data modeling,2,403,405,EXPERIENCE,384,561,Data Engineer,GoodLife Fitness\n,Position Description DATA ENGINEER **This ...,Midtown Toronto,Glassdoor,"[data modeling, Cloud platforms, Informatica I..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2839,variety,879,396,397,EXPERIENCE,365,448,Data Engineer,Cyient\n,Cyient is a global engineering and technology ...,Montreal,Glassdoor,"[area, SQL, solutions, variety]"
2840,Data Engineer,880,441,443,EXPERIENCE,432,467,Principal Data Engineer,ELEMENTS Global Services,Elements Global Services is an award-winning H...,Toronto,Indeed,"[Data Engineer, NoSQL database technologies, m..."
2841,NoSQL database technologies,880,523,526,EXPERIENCE,515,532,Principal Data Engineer,ELEMENTS Global Services,Elements Global Services is an award-winning H...,Toronto,Indeed,"[Data Engineer, NoSQL database technologies, m..."
2842,modern programming languages,880,538,541,EXPERIENCE,532,559,Principal Data Engineer,ELEMENTS Global Services,Elements Global Services is an award-winning H...,Toronto,Indeed,"[Data Engineer, NoSQL database technologies, m..."


In [91]:
df.iloc[1]

Job Title                                  Data Scientist - 05/12/21
Company Name                          Acerta Analytics Solutions Inc
Job Description    Acerta’s machine learning platforms leverage a...
Location                                                   Kitchener
Job Board                                                     Indeed
skills                        [tensorflow, sklearn, operations data]
Name: 1, dtype: object

In [95]:
print('Job Description: ', df.iloc[1]['Job Description'])
print()
print('Job Skills: ', df.iloc[1]['skills'])

Job Description:  Acerta’s machine learning platforms leverage automotive assembly and vehicle data to detect the earliest indicators of future product failures. We help automakers optimize quality  safety  and reliability throughout the entire product life cycle  from the assembly line to the finish line.  As an integral part of the data science team at Acerta  you will productize  deploy  maintain  and monitor machine learning models running behind Acerta’s LinePulse and AutoPulse products. Our LinePulse SaaS platform enables automakers to identify anomalies in production data for enhanced testing  accelerated root cause analysis  and improved manufacturing output. Acerta’s AutoPulse SaaS platform enables predictive maintenance of connected and autonomous vehicles based on production  maintenance  and on-road data.  Requirements: Minimum of 3 - 5 years of data science experience using python Strong ML and statistics background 2+ years of software development experience Experience wi

In [97]:
show_extraction(ads[1], extract_adp_conj_experience)

In [117]:
show_extraction(ads[1:2], extract_adp_experience_2)

In [116]:
ads[1] = 'Experience with statistical analysis, numerical programming, or machine learning in Python'