In [2]:
import json
import random
import logging
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from sklearn.metrics import accuracy_score
import spacy
import pandas as pd

In [3]:
# Read the job-post DataFrame from a pickle file located relative to the
# notebook's working directory.
# NOTE(review): unpickling executes arbitrary code from the file, so only load
# pickles produced by a trusted pipeline.
df = pd.read_pickle(r'Data Scientist_cleaned.pkl')

In [4]:
# Show one job description (index 1 of `checked_description_checked`) as a sanity check.
print(df.checked_description_checked[1])

  /  ,  Writing software to clean and investigate large, messy data sets of numerical and textual data,  Integrating with internal and external data sources and APIs to help uncover new trends and improve analysis,  Developing highly scalable data pipelines, tools, and products to enable the analyst community to fully leverage the power of AWS,  Streamlining the deployment and maintenance of machine learning models by facilitating efficient model updates and creating robust operational monitoring.,  Investigating the impact of new technologies on the future of digital banking and the financial world of tomorrow,  Curious: You ask why, you explore, you’re not afraid to blurt out your disruptive idea. You probably know Python, Scala, or Java and are constantly exploring new open source tools, and hitting up stack overflow on a regular basis.,  A Wrangler: You know how to programmatically extract data from a database and an API, bring it through a transformation or two, and convert into a

# Train model using JSON files

In [5]:
# Creates NER training data in Spacy format
def convert_to_spacy(jsonfile):
    """Convert a JSON-lines annotation export into spaCy NER training tuples.

    Every non-blank line of ``jsonfile`` is parsed as one JSON object holding a
    ``content`` string and an ``annotation`` list. Each annotation contributes
    one ``(start, end, label)`` tuple per label, using the first entry of its
    ``points`` list.

    Args:
        jsonfile: Path to the JSON-lines annotation file.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples,
        or ``None`` when the file cannot be read or parsed (the error is
        logged via ``logging.exception``).
    """
    try:
        training_data = []
        # Decode explicitly as UTF-8 so character offsets do not depend on the
        # platform's default encoding.
        with open(jsonfile, 'r', encoding='utf-8') as f:
            for line in f:
                # Blank lines (e.g. a trailing newline) carry no record and
                # would otherwise make json.loads fail for the whole file.
                if not line.strip():
                    continue

                data = json.loads(line)
                text = data['content']
                entities = []
                # A null "annotation" value is treated as "no entities".
                for annotation in data['annotation'] or []:
                    # only a single point in text annotation.
                    point = annotation['points'][0]
                    labels = annotation['label']
                    # handle both list of labels or a single label.
                    if not isinstance(labels, list):
                        labels = [labels]

                    for label in labels:
                        # source indices are inclusive [start, end]; spaCy expects
                        # a half-open [start, end) span, hence the + 1.
                        entities.append((point['start'], point['end'] + 1, label))

                training_data.append((text, {"entities": entities}))

        return training_data
    except Exception as e:
        logging.exception("Unable to process \n" + "error = " + str(e))
        return None

In [6]:
# Train Spacy NER using traindata.json
def train_NER_spacy(blank_model=False, load_model='model', jsonfile='traindata.json',
                create_model_name='skill', model_output=True, n_iter=5,
                output_dir='/home/xinda/insight/model'):
    """Train (or continue training) a spaCy NER component on annotated JSON data.

    Uses spaCy v2-style training calls (``create_pipe``, ``begin_training``,
    ``nlp.update`` with raw texts and annotation dicts).

    Args:
        blank_model: If true, start from ``spacy.blank('en')``; otherwise load
            ``load_model`` with ``spacy.load``.
        load_model: Name or path of the pipeline loaded when ``blank_model``
            is false.
        jsonfile: Annotation file handed to ``convert_to_spacy``.
        create_model_name: Value stored in ``nlp.meta["name"]`` before saving.
        model_output: If true, write the trained pipeline to ``output_dir``.
        n_iter: Number of passes over the training data.
        output_dir: Directory the pipeline is saved to. Defaults to the path
            that used to be hard-coded, so existing calls behave as before.

    Raises:
        ValueError: If ``convert_to_spacy`` returned ``None``.
    """
    train_data = convert_to_spacy(jsonfile)
    if train_data is None:
        # convert_to_spacy logs the underlying error and returns None on
        # failure; stop here instead of failing later with an opaque TypeError.
        raise ValueError("Could not load NER training data from " + repr(jsonfile))

    if blank_model:
        nlp = spacy.blank('en')  # create blank Language class
    else:
        nlp = spacy.load(load_model)  # load pretrained spaCy model

    # Add an entity recognizer if the pipeline has none; otherwise reuse the
    # existing one so new labels can be added to it.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Register every label that occurs in the training data.
    for _, annotations in train_data:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    if blank_model:
        optimizer = nlp.begin_training()
    else:
        # Existing weights are kept; only an optimizer for the NER pipe is created.
        optimizer = ner.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            print("Statring iteration " + str(itn))
            random.shuffle(train_data)
            losses = {}

            # One example per update call (batch size 1).
            for content, annotations in train_data:
                nlp.update(
                    [content],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)

    if model_output:
        nlp.meta["name"] = create_model_name
        nlp.to_disk(output_dir)

In [7]:
# Continue training from the pretrained 'en_core_web_lg' pipeline for 50 passes
# over traindata.json and save the result (model_output=True).
train_NER_spacy(blank_model=False, jsonfile='traindata.json',load_model='en_core_web_lg',
                create_model_name='skill', model_output=True, n_iter=50)

Statring iteration 0
{'ner': 22305.35207753426}
Statring iteration 1
{'ner': 21384.00663086406}
Statring iteration 2
{'ner': 20984.322833751477}
Statring iteration 3
{'ner': 20964.974197072504}
Statring iteration 4
{'ner': 20826.967922359858}
Statring iteration 5
{'ner': 20578.623209020618}
Statring iteration 6
{'ner': 20578.617991266645}
Statring iteration 7
{'ner': 20396.451974142783}
Statring iteration 8
{'ner': 20245.001947662367}
Statring iteration 9
{'ner': 20283.82602963936}
Statring iteration 10
{'ner': 19954.38236785022}
Statring iteration 11
{'ner': 20182.373412026365}
Statring iteration 12
{'ner': 20130.43283461003}
Statring iteration 13
{'ner': 20038.731644841973}
Statring iteration 14
{'ner': 19722.733611092517}
Statring iteration 15
{'ner': 19971.964474265686}
Statring iteration 16
{'ner': 19889.780089317082}
Statring iteration 17
{'ner': 19955.850394361474}
Statring iteration 18
{'ner': 19667.292747981483}
Statring iteration 19
{'ner': 19510.42697816837}
Statring iterati

In [8]:
# Load the trained NER model from the directory that train_NER_spacy saves to.
nlp = spacy.load('/home/xinda/insight/model')

In [12]:
# Rebind `nlp` to the stock 'en_core_web_lg' pipeline.
# NOTE(review): run in file order, this replaces the custom-trained pipeline
# loaded in the previous cell; the execution counts are also out of order.
# Confirm which pipeline the cells below are meant to use.
nlp = spacy.load('en_core_web_lg')


In [10]:
# visualization
from spacy import displacy

In [13]:
# Run the current `nlp` pipeline on one job description and render its
# entities inline in the notebook.
doc = nlp(df.checked_description_checked[1])
displacy.render(doc, style="ent",jupyter=True)

In [7]:
# Look up spaCy's description of the ORG label, which is used below to pick out skills.
spacy.explain("ORG")

'Companies, agencies, institutions, etc.'

In [14]:
# Keep the text of every non-empty entity that the pipeline labelled ORG.
skillset = [
    entity.text
    for entity in doc.ents
    if entity.label_ == 'ORG' and len(entity.text) >= 1
]

In [15]:
# Display the extracted ORG entity texts.
skillset

['  ',
 'API',
 'Matplotlib, d3',
 'Tableau',
 'Stickler',
 'MacBook Pro',
 'PyData',
 'AWS',
 'KDD',
 'CICD',
 'GitHub',
 'SQL',
 'S3',
 'Lambda',
 'RDS',
 'Git Workflows',
 'TensorFlow',
 'SQL']

In [10]:
# Module-level pipeline that extract_skillset reads as a global.
nlp = spacy.load('en_core_web_lg')
def extract_skillset(content):
    """Return the texts of ORG entities in ``content`` that are at least 3 characters long.

    ``content`` is processed with the module-level ``nlp`` pipeline; entities
    are returned in the order the pipeline reports them.
    """
    found = []
    for entity in nlp(content).ents:
        if entity.label_ != 'ORG':
            continue
        if len(entity.text) < 3:
            continue
        found.append(entity.text)
    return found

In [11]:
def clean_text(text):
    """Lower-case, strip punctuation, drop English stop words and lemmatize ``text``.

    Args:
        text: A single string. The punctuation filter iterates over ``text``
            item by item, so passing a list of strings would glue the items
            together without separators; join such lists with spaces first.

    Returns:
        The remaining lemmatized tokens joined by single spaces (an empty
        string when nothing is left).
    """
    # Only the modules actually used are imported here; the previous unused
    # pandas/numpy/gensim imports could make this function fail for no reason.
    import re
    import string

    import nltk

    lemmatizer = nltk.WordNetLemmatizer()
    # A set makes the per-token stop-word test O(1).
    stopwords = set(nltk.corpus.stopwords.words('english'))

    text = "".join([word.lower() for word in text if word not in string.punctuation])
    # Raw string: '\W' in a normal string literal is an invalid escape sequence.
    tokens = re.split(r'\W+', text)
    # re.split yields '' at the edges when the text starts or ends with a
    # separator; skip those so the result has no stray leading/trailing spaces.
    tokens = [lemmatizer.lemmatize(tok) for tok in tokens if tok and tok not in stopwords]
    return ' '.join(tokens)

In [12]:
# extract_skillset returns a list of entity strings, whereas clean_text expects
# one string (it filters punctuation character by character). Join the skills
# with spaces first; otherwise adjacent skills are fused into a single token
# (e.g. ['API', 'Tableau'] -> 'apitableau').
df['skillset'] = df.checked_description_checked.apply(extract_skillset)
df['skillset'] = df['skillset'].apply(lambda skills: clean_text(' '.join(skills)))

In [13]:
# Persist the DataFrame, including the new `skillset` column, next to the input file.
df.to_pickle(r'Data Scientist_processed.pkl')