In this analysis, we explore a dataset of Google job ads. Each ad was returned by one of two search strings: `machine learning engineer` or `data scientist`.

In [66]:
import os
import io
import re
import yaml

import pandas as pd
import numpy as np

from stemming import porter2

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_validate
from sklearn.compose import ColumnTransformer

import matplotlib.pyplot as plt
import seaborn as sns

from google.cloud import storage

In [2]:
# The project configuration lives one directory above the notebook's working directory.
config_path = os.path.abspath(os.path.join(os.pardir, 'config.yaml'))
with open(config_path) as config_file:
    # safe_load parses with yaml.SafeLoader, i.e. plain YAML types only.
    config = yaml.safe_load(config_file)

In [3]:
%matplotlib inline

In [4]:
# Apply the style sheet first, then override the figure resolution on top of it.
plt.style.use('fivethirtyeight')
plt.rcParams.update({'figure.dpi': 180})

In [5]:
# Locate the CSV export in Google Cloud Storage using the names from the config file.
gcs_project = config['project']
gcs_bucket = config['bucket']
gcs_path = config['dest_file']

client = storage.Client(gcs_project)
bucket = client.get_bucket(gcs_bucket)
blob = bucket.blob(gcs_path)

# Download the blob into memory and parse it straight from the in-memory buffer.
raw_bytes = blob.download_as_string()
file_content = io.BytesIO(raw_bytes)
ads = pd.read_csv(file_content)



In [6]:
# Summarise the loaded frame: columns, non-null counts and dtypes.
ads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 7 columns):
id                  541 non-null object
job_descr           539 non-null object
location            536 non-null object
minimum_qual        539 non-null object
preferred_qual      539 non-null object
responsibilities    539 non-null object
title               541 non-null object
dtypes: object(7)
memory usage: 29.7+ KB


First, let us try simple string operations to derive additional information from the job title, such as seniority and function.

In [7]:
# Titles look like "<role>, <department>, <area>, ...". Split them into exactly four
# parts so the assignment below cannot fail on a column-count mismatch:
#   * n=3 caps the number of splits, so any extra commas stay inside 'other';
#   * reindex pads with NaN columns when no title has that many parts.
# Splitting on the comma together with its surrounding whitespace (and stripping the
# title first) keeps stray spaces out of the derived fields.
title_parts = (ads.title.str.strip()
               .str.split(r'\s*,\s*', n=3, expand=True)
               .reindex(columns=range(4)))
ads[['role', 'department', 'area', 'other']] = title_parts

In [8]:
seniority_re = r'.*(intern|senior|lead|staff|principal|manager|head).*'
function_re = r'.*(engineer|scientist|science|consultant|architect|advocate|developer|analyst|specialist).*'

# The leading `.*` is greedy, so when a role contains several keywords the last one is
# the one captured. Roles without any seniority keyword get the default 'L3-L4' label
# (lower-cased along with the extracted keywords).
seniority_match = ads.role.str.extract(pat=seniority_re, flags=re.I)[0]
ads['seniority'] = seniority_match.fillna('L3-L4').str.lower()

# Roles without a function keyword stay NaN; 'science' is folded into 'scientist'.
function_match = ads.role.str.extract(pat=function_re, flags=re.I)[0]
ads['function'] = function_match.str.lower().str.replace('science', 'scientist', flags=re.I)

In [9]:
# Preview the original title next to the fields derived from it.
ads[['title', 'role', 'department', 'area', 'seniority', 'function']].head()

Unnamed: 0,title,role,department,area,seniority,function
0,"Cloud AI Engineer, Professional Services",Cloud AI Engineer,Professional Services,,l3-l4,engineer
1,"Cloud AI Engineer, Professional Services",Cloud AI Engineer,Professional Services,,l3-l4,engineer
2,"Conversational AI Engineer, Google Cloud Profe...",Conversational AI Engineer,Google Cloud Professional Services,,l3-l4,engineer
3,"Cloud AI Engineer, Professional Services",Cloud AI Engineer,Professional Services,,l3-l4,engineer
4,"Data Scientist, Engineering",Data Scientist,Engineering,,l3-l4,scientist


It would be interesting to see which terms in the qualifications and responsibilities are most relevant to each function. This could give us insight into the keywords that one should emphasize in a resume.

In [10]:
# Number of ads per extracted function (rows where no function was extracted are not counted).
ads.function.value_counts()

engineer      365
architect      69
developer      18
scientist      15
analyst        11
consultant     10
specialist      3
advocate        2
Name: function, dtype: int64

In [43]:
# Keep only ads that have a job description and belong to one of the three modelled functions.
modelled_functions = ['engineer', 'scientist', 'architect']
has_description = ads['job_descr'].notna()
in_scope = ads['function'].isin(modelled_functions)
to_model = ads[has_description & in_scope].copy()

In [44]:
# Encode the function names as integer class labels; label_enc.classes_ maps them back.
label_enc = LabelEncoder()
function_names = to_model['function'].astype('str')
to_model['function_enc'] = label_enc.fit_transform(function_names)

In [45]:
# Features: the raw job description text. Target: the encoded function label.
X, y = to_model['job_descr'], to_model['function_enc']

In [46]:
# n-gram counts -> tf-idf weighting -> multinomial naive Bayes.
# Terms must occur in at least 5% and at most 95% of the documents to be kept.
vectorizer_params = dict(stop_words='english',
                         ngram_range=(1, 10),
                         min_df=0.05,
                         max_df=0.95,
                         strip_accents='unicode')

text_pipe = Pipeline(steps=[
    ('count_vectorizer', CountVectorizer(**vectorizer_params)),
    ('tf_idf_transf', TfidfTransformer()),
    ('naive_bayes_clf', MultinomialNB()),
])

In [47]:
%time
scores = cross_validate(text_pipe, X, y, cv=3, scoring='f1_weighted', return_train_score=True)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 7.87 µs


In [59]:
# Mean and standard deviation across the folds for every entry returned by cross_validate.
for metric, values in scores.items():
    print(f'{metric}, mean: {values.mean():.2f}, std: {values.std():.2f}')

fit_time, mean: 0.68, std: 0.08
score_time, mean: 0.18, std: 0.00
test_score, mean: 0.72, std: 0.02
train_score, mean: 0.73, std: 0.02


In [48]:
# Refit on all rows of X so the fitted vectorizer and classifier can be inspected afterwards.
text_pipe.fit(X, y)

Pipeline(memory=None,
         steps=[('count_vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.95,
                                 max_features=None, min_df=0.05,
                                 ngram_range=(1, 10), preprocessor=None,
                                 stop_words='english', strip_accents='unicode',
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tf_idf_transf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('naive_bayes_clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [64]:
# One row per vocabulary term, one column per class. feature_log_prob_ is the
# log-probability of a term given the class, so a high value means the term is
# frequent within that class; it is not necessarily distinctive for it.
vectorizer = text_pipe['count_vectorizer']

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when available and fall back to the old method on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

importances = pd.DataFrame(text_pipe['naive_bayes_clf'].feature_log_prob_.transpose(),
                           columns=label_enc.classes_,
                           index=feature_names)

In [65]:
# For each class, list the ten terms with the highest log-probability.
separator = '-' * 80
for label in label_enc.classes_:
    top_terms = importances[label].sort_values(ascending=False).head(10)
    print(label, separator, top_terms, '\n', separator, sep='\n')

architect
--------------------------------------------------------------------------------
cloud          -6.240556
customers      -6.441641
google cloud   -6.586060
hardware       -7.224296
solutions      -7.295929
business       -7.314180
enterprise     -7.337779
help           -7.393645
team           -7.400823
platform       -7.416114
Name: architect, dtype: float64


--------------------------------------------------------------------------------
engineer
--------------------------------------------------------------------------------
cloud          -6.382416
engineers      -6.513930
customers      -6.527060
team           -6.698174
software       -6.735478
google cloud   -6.827812
information    -6.857268
work           -6.867722
design         -6.907004
scale          -6.954883
Name: engineer, dtype: float64


--------------------------------------------------------------------------------
scientist
--------------------------------------------------------------------------------

In [101]:
# Shared vectorizer settings: terms must occur in 5%-95% of the documents to be kept.
kwargs = dict(stop_words='english',
              ngram_range=(1, 10),
              min_df=0.05,
              max_df=0.95,
              strip_accents='unicode')

text_cols = ['job_descr', 'minimum_qual', 'preferred_qual', 'responsibilities']

# Each text column gets its own TfidfVectorizer (named 'tfidf_<column>'); the
# ColumnTransformer combines their outputs into a single feature matrix.
per_column_tfidf = [(f'tfidf_{col_name}', TfidfVectorizer(**kwargs), col_name)
                    for col_name in text_cols]
column_trans = ColumnTransformer(per_column_tfidf)

multi_col_pipe = Pipeline(steps=[
    ('multi_col_tfidf', column_trans),
    ('naive_bayes_clf', MultinomialNB()),
])

In [102]:
# Fit on the four text columns of the modelling subset; y is the encoded function label.
multi_col_pipe.fit(to_model[text_cols], y)

Pipeline(memory=None,
         steps=[('multi_col_tfidf',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('tfidf_job_descr',
                                                  TfidfVectorizer(analyzer='word',
                                                                  binary=False,
                                                                  decode_error='strict',
                                                                  dtype=<class 'numpy.float64'>,
                                                                  encoding='utf-8',
                                                                  input='content',
                                                                  lowercase=True,
                                                                  max_df=0.95,
            

In [108]:
%time
scores = cross_validate(multi_col_pipe, to_model[text_cols], y, cv=3, scoring='f1_weighted', return_train_score=True)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 5.01 µs


In [109]:
# Mean and standard deviation across the folds for every entry returned by cross_validate.
for metric, values in scores.items():
    print(f'{metric}, mean: {values.mean():.2f}, std: {values.std():.2f}')

fit_time, mean: 1.23, std: 0.04
score_time, mean: 0.35, std: 0.02
test_score, mean: 0.79, std: 0.05
train_score, mean: 0.78, std: 0.02


In [116]:
# One row per (column, term) feature, one column per class. feature_log_prob_ is the
# log-probability of a feature given the class, so a high value means the feature is
# frequent within that class; it is not necessarily distinctive for it.
column_tfidf = multi_col_pipe['multi_col_tfidf']

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when available and fall back to the old method on older releases.
if hasattr(column_tfidf, 'get_feature_names_out'):
    feature_names_mcp = column_tfidf.get_feature_names_out()
else:
    feature_names_mcp = column_tfidf.get_feature_names()

importances_mcp = pd.DataFrame(multi_col_pipe['naive_bayes_clf'].feature_log_prob_.transpose(),
                               columns=label_enc.classes_,
                               index=feature_names_mcp)

In [117]:
# For each class, list the ten (column, term) features with the highest log-probability.
separator = '-' * 80
for label in label_enc.classes_:
    top_terms = importances_mcp[label].sort_values(ascending=False).head(10)
    print(label, separator, top_terms, '\n', separator, sep='\n')

architect
--------------------------------------------------------------------------------
tfidf_minimum_qual__cloud             -6.627677
tfidf_job_descr__cloud                -6.640148
tfidf_preferred_qual__experience      -6.720834
tfidf_preferred_qual__cloud           -6.787615
tfidf_job_descr__customers            -6.841233
tfidf_minimum_qual__computing         -6.873032
tfidf_responsibilities__power         -6.880187
tfidf_job_descr__google cloud         -6.985652
tfidf_minimum_qual__cloud computing   -6.988558
tfidf_responsibilities__customers     -7.010322
Name: architect, dtype: float64


--------------------------------------------------------------------------------
engineer
--------------------------------------------------------------------------------
tfidf_preferred_qual__experience   -5.892254
tfidf_responsibilities__design     -6.627540
tfidf_minimum_qual__development    -6.773619
tfidf_minimum_qual__degree         -6.795694
tfidf_responsibilities__systems    -6.844199