In this analysis, we will explore a dataset of Google job ads. Each ad was returned for one of two search strings: `machine learning engineer` or `data scientist`.

In [45]:
import os
import io
import re
import yaml

import pandas as pd
import numpy as np

from stemming import porter2

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.compose import ColumnTransformer

import matplotlib.pyplot as plt
import seaborn as sns

from google.cloud import storage

In [2]:
# Locate config.yaml one directory above the current working directory
# and parse it with PyYAML's safe loader.
parent_dir = os.path.join(os.getcwd(), os.pardir)
config_path = os.path.abspath(os.path.join(parent_dir, 'config.yaml'))

with open(config_path) as config_file:
    config = yaml.safe_load(config_file)

In [3]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

In [4]:
# Plot appearance: FiveThirtyEight style sheet, rendered at 180 dpi.
plt.style.use('fivethirtyeight')
plt.rcParams.update({'figure.dpi': 180})

In [5]:
# Download the ads CSV from Cloud Storage; project, bucket and object name
# all come from the loaded config.
client = storage.Client(project=config['project'])
bucket = client.get_bucket(config['bucket'])
blob = bucket.blob(config['dest_file'])

# Keep the payload in memory and hand pandas a file-like wrapper around it.
raw_csv = blob.download_as_string()
file_content = io.BytesIO(raw_csv)
ads = pd.read_csv(file_content)



In [25]:
# Overview of the loaded frame: column names, non-null counts and dtypes.
ads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 567 entries, 0 to 566
Data columns (total 13 columns):
id                  567 non-null object
job_descr           567 non-null object
location            567 non-null object
minimum_qual        567 non-null object
preferred_qual      567 non-null object
responsibilities    567 non-null object
title               567 non-null object
role                567 non-null object
department          498 non-null object
area                201 non-null object
other               18 non-null object
seniority           567 non-null object
function            515 non-null object
dtypes: object(13)
memory usage: 57.7+ KB


First, let us try using simple string operations to derive additional information from the job title, such as seniority and function.

In [26]:
# Replace every missing value with the placeholder string 'unknown'.
ads = ads.fillna(value='unknown')

In [27]:
# Break the comma-separated title into up to four fields:
# "<role>, <department>, <area>, <other>".
title_fields = ['role', 'department', 'area', 'other']

title_parts = (
    ads.title
    # Cap the number of splits so a title with extra commas cannot produce
    # more columns than there are target fields (the surplus stays in 'other').
    .str.split(',', n=len(title_fields) - 1, expand=True)
    # Drop the blank that follows each comma (e.g. ' Professional Services').
    .apply(lambda part: part.str.strip())
    # Pad with empty columns when no title has all four parts, so the
    # multi-column assignment below never fails on a length mismatch.
    .reindex(columns=range(len(title_fields)))
)

ads[title_fields] = title_parts

In [28]:
# Keyword patterns applied to the role (first part of the title), case-insensitively.
# NOTE(review): the leading greedy `.*` makes the capture group return the LAST
# keyword found in the role, and keywords match as substrings (no word
# boundaries), so e.g. 'Internal' would count as 'intern'.
seniority_re = r'.*(intern|senior|lead|staff|principal|manager|head).*'
function_re = r'.*(engineer|scientist|science|consultant|architect|advocate|developer|analyst|specialist).*'

# Seniority: roles without any seniority keyword fall back to 'L3-L4'
# (stored lower-cased, i.e. 'l3-l4').
seniority_match = ads.role.str.extract(pat=seniority_re, flags=re.I)
ads['seniority'] = seniority_match.fillna('L3-L4')[0].str.lower()

# Function: lower-case the captured keyword and fold 'science' into 'scientist';
# roles without a function keyword stay missing.
function_match = ads.role.str.extract(pat=function_re, flags=re.I)[0]
ads['function'] = function_match.str.lower().str.replace('science', 'scientist', flags=re.I)

In [29]:
# Preview the original title next to the fields derived from it.
ads[['title', 'role', 'department', 'area', 'seniority', 'function']].head()

Unnamed: 0,title,role,department,area,seniority,function
0,"Cloud AI Engineer, Professional Services",Cloud AI Engineer,Professional Services,,l3-l4,engineer
1,"Cloud AI Engineer, Professional Services",Cloud AI Engineer,Professional Services,,l3-l4,engineer
2,"Conversational AI Engineer, Google Cloud Profe...",Conversational AI Engineer,Google Cloud Professional Services,,l3-l4,engineer
3,"Cloud AI Engineer, Professional Services",Cloud AI Engineer,Professional Services,,l3-l4,engineer
4,"Data Scientist, Engineering",Data Scientist,Engineering,,l3-l4,scientist


It could be interesting to see which terms from the qualifications and responsibilities are most relevant to each function. This could give us an insight into the keywords that one should emphasize in their resume.

In [30]:
# Number of ads per extracted seniority level ('l3-l4' is the fallback for roles
# without a seniority keyword).
ads.seniority.value_counts()

l3-l4        377
staff         58
senior        39
intern        35
lead          27
manager       22
principal      8
head           1
Name: seniority, dtype: int64

In [31]:
# Number of ads per location.
ads.location.value_counts()

Mountain View          93
Sunnyvale              53
Bengaluru              36
Xindian District       23
Warsaw                 21
                       ..
Oslo                    1
Minneapolis             1
Zhubei, Zhubei City     1
Stockholm               1
Addison                 1
Name: location, Length: 68, dtype: int64

In [32]:
# Number of ads per extracted job function; value_counts() leaves out roles
# where no function keyword was found (missing values).
ads.function.value_counts()

engineer      380
architect      74
developer      18
scientist      16
consultant     11
analyst        11
specialist      3
advocate        2
Name: function, dtype: int64

In [33]:
# Keep only the ads whose extracted function is one of the classes to model,
# copied so that later column additions do not touch `ads`.
modelled_functions = ['engineer', 'scientist', 'architect']
is_modelled = ads['function'].isin(modelled_functions)
to_model = ads.loc[is_modelled].copy()

In [34]:
# Encode the function names as integer class ids; the fitted encoder is kept
# so the ids can be mapped back to names later.
label_enc = LabelEncoder()
function_labels = to_model['function'].astype('str')
to_model['function_enc'] = label_enc.fit_transform(function_labels)

In [197]:
# Settings shared by every TF-IDF vectorizer: English stop words removed,
# 2- to 5-word n-grams, accents stripped via unicode normalisation.
kwargs = dict(stop_words='english',
              ngram_range=(2, 5),
              strip_accents='unicode')

# Free-text columns that feed the classifier.
text_cols = ['job_descr', 'minimum_qual', 'preferred_qual', 'responsibilities']

# One independent vectorizer per text column, named 'tfidf_<column>'.
tfidf_transformers = [(f'tfidf_{col}', TfidfVectorizer(**kwargs), col)
                      for col in text_cols]
column_trans = ColumnTransformer(transformers=tfidf_transformers)

# Vectorize the text columns, then classify with multinomial Naive Bayes.
multi_col_pipe = Pipeline(steps=[
    ('multi_col_tfidf', column_trans),
    ('naive_bayes_clf', MultinomialNB()),
])

In [198]:
# param_grid = dict()
# 
# margins = 0.02
# 
# for c in text_cols:
#     param_grid[f'multi_col_tfidf__tfidf_{c}__min_df'] = list(np.arange(0.0, margins, 0.01))
#     param_grid[f'multi_col_tfidf__tfidf_{c}__max_df'] = list(np.arange(1.0 - margins, 1.0, 0.01))

In [200]:
# grid_search = GridSearchCV(multi_col_pipe, param_grid, cv=3, return_train_score=True)

In [201]:
# %timeit
# grid_res = grid_search.fit(to_model.drop('function_enc', axis=1), to_model.function_enc)

In [202]:
# best_model = grid_res.best_estimator_

In [203]:
# best_model['multi_col_tfidf'].transformers

In [204]:
# grid_res.cv_results_

In [205]:
# Features: every column except the encoded target. Target: the encoded function.
X = to_model.drop(columns='function_enc')
y = to_model['function_enc']

In [206]:
# Fit the TF-IDF + Naive Bayes pipeline on all of X / y (no hold-out at this point).
model = multi_col_pipe.fit(X, y)

In [207]:
%%timeit
scores = cross_validate(model, X, y, cv=3, scoring='f1_weighted', return_train_score=True)

3.31 s ± 230 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [208]:
# Mean and standard deviation across folds for every entry returned by cross_validate.
for metric, fold_values in scores.items():
    print(f'{metric}, mean: {fold_values.mean():.2f}, std: {fold_values.std():.2f}')

fit_time, mean: 0.45, std: 0.02
score_time, mean: 0.15, std: 0.00
test_score, mean: 0.88, std: 0.02
train_score, mean: 0.99, std: 0.01


In [209]:
# Per-class log-probabilities of every n-gram feature: one row per feature
# (named '<transformer>__<term>'), one column per function label.
tfidf_step = model['multi_col_tfidf']

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); prefer the new method and fall back for old versions.
if hasattr(tfidf_step, 'get_feature_names_out'):
    feature_names = tfidf_step.get_feature_names_out()
else:
    feature_names = tfidf_step.get_feature_names()

importances_mcp = pd.DataFrame(model['naive_bayes_clf'].feature_log_prob_.transpose(),
                               columns=label_enc.classes_,
                               index=feature_names)

In [210]:
# Splits a feature name into group 1 = 'tfidf_<column>__' prefix (separator
# included) and group 2 = the n-gram itself.
prefix_alternatives = '|'.join(f'tfidf_{col}__' for col in text_cols)
col_prefix = re.compile(f'({prefix_alternatives})(.*)')

In [211]:
# Collect, for every function label, the top_n features with the highest
# log-probability, split into source column prefix and term.
top_n = 10

# Gather plain records and build the frame once at the end: DataFrame.append
# was removed in pandas 2.0, and growing a frame inside a loop copies it on
# every iteration.
top_term_records = []
for label in label_enc.classes_:
    top_terms = importances_mcp[label].sort_values(ascending=False).head(top_n)
    for feature_term, imp in top_terms.items():
        prefix_match = col_prefix.match(feature_term)  # matched once, reused for both groups
        top_term_records.append({
            'feature': prefix_match[1],
            'term': prefix_match[2],
            # NOTE(review): 'impoortance' (sic) is misspelled; the column name is
            # kept unchanged in case other cells refer to it.
            'impoortance': imp,
            'label': label,
        })

imp_terms = pd.DataFrame(top_term_records)

In [214]:
# Top terms for the 'scientist' label, tabulated against the text column
# they were extracted from.
scientist_mask = imp_terms['label'] == 'scientist'
to_crosstab = imp_terms.loc[scientist_mask]

pd.crosstab(index=to_crosstab['term'], columns=to_crosstab['feature'])

feature,tfidf_minimum_qual__
term,Unnamed: 1_level_1
degree statistics,1
degree statistics computer,1
degree statistics computer science,1
experience data,1
machine learning,1
operations research,1
quantitative discipline,1
statistical software,1
statistics computer,1
statistics computer science,1
