In this analysis, we will explore a dataset of Google job ads. These ads correspond to either the search string `machine learning engineer` or `data scientist`.

In [1]:
import os
import io
import re
import yaml

import pandas as pd
import numpy as np

from stemming import porter2

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_validate
from sklearn.compose import ColumnTransformer

import matplotlib.pyplot as plt
import seaborn as sns

from google.cloud import storage

In [2]:
# config.yaml is expected one directory above the notebook's working directory.
config_path = os.path.abspath(
    os.path.join(os.getcwd(), '..', 'config.yaml')
)

# safe_load parses with yaml.SafeLoader, i.e. plain YAML types only.
with open(config_path, mode='r') as config_file:
    config = yaml.safe_load(config_file)

In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
# Global plotting defaults: FiveThirtyEight look, rendered at 180 dpi.
plt.style.use('fivethirtyeight')
plt.rcParams.update({'figure.dpi': 180})

In [5]:
# Pull the CSV export from Cloud Storage; project, bucket and object name all
# come from config.yaml.
client = storage.Client(project=config['project'])
bucket = client.get_bucket(config['bucket'])
blob = bucket.blob(blob_name=config['dest_file'])

# Wrap the downloaded bytes in a file-like buffer so pandas can parse them
# without writing a temporary file.
file_content = io.BytesIO(blob.download_as_string())
ads = pd.read_csv(file_content)



In [6]:
# Column dtypes and non-null counts of the loaded ads.
ads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 556 entries, 0 to 555
Data columns (total 7 columns):
id                  556 non-null object
job_descr           554 non-null object
location            551 non-null object
minimum_qual        554 non-null object
preferred_qual      554 non-null object
responsibilities    554 non-null object
title               556 non-null object
dtypes: object(7)
memory usage: 30.5+ KB


First, let us try using simple string operations to derive additional information from the job title, such as seniority, function, etc.

In [7]:
# Replace every missing value with the literal 'unknown' so the text columns
# contain no NaN.
ads = ads.fillna('unknown')

In [8]:
# Titles look like "<role>, <department>, <area>, <other>".
# n=3 caps the split at four pieces (extra commas stay inside the last piece)
# and the reindex pads up to four columns, so the assignment works for any
# number of commas instead of only when the widest title has exactly four parts.
# NOTE(review): pieces after the first keep the space that follows each comma;
# strip them if these columns are later used for grouping.
title_parts = (
    ads.title.str.split(',', n=3, expand=True)
    .reindex(columns=range(4))
)
ads[['role', 'department', 'area', 'other']] = title_parts

In [9]:
# Keyword alternations matched case-insensitively against the role.
# The leading `.*` is greedy, so when a role contains several keywords the
# last one is the one captured.
seniority_re = r'.*(intern|senior|lead|staff|principal|manager|head).*'
function_re = r'.*(engineer|scientist|science|consultant|architect|advocate|developer|analyst|specialist).*'

# Roles without a seniority keyword get the placeholder 'L3-L4', which the
# trailing lower() turns into 'l3-l4'.
ads['seniority'] = (
    ads.role.str.extract(pat=seniority_re, flags=re.I, expand=False)
    .fillna('L3-L4')
    .str.lower()
)

# 'science' (e.g. "Data Science ...") is folded into the 'scientist' label.
ads['function'] = (
    ads.role.str.extract(pat=function_re, flags=re.I, expand=False)
    .str.lower()
    .str.replace('science', 'scientist', flags=re.I)
)

In [10]:
# Spot-check the derived columns next to the original title.
ads[['title', 'role', 'department', 'area', 'seniority', 'function']].head()

Unnamed: 0,title,role,department,area,seniority,function
0,"Cloud AI Engineer, Professional Services",Cloud AI Engineer,Professional Services,,l3-l4,engineer
1,"Cloud AI Engineer, Professional Services",Cloud AI Engineer,Professional Services,,l3-l4,engineer
2,"Conversational AI Engineer, Google Cloud Profe...",Conversational AI Engineer,Google Cloud Professional Services,,l3-l4,engineer
3,"Cloud AI Engineer, Professional Services",Cloud AI Engineer,Professional Services,,l3-l4,engineer
4,"Data Scientist, Engineering",Data Scientist,Engineering,,l3-l4,scientist


It would be interesting to see which terms from the qualifications and responsibilities are most relevant to each function. This could give us an insight into the keywords that one should emphasize in their resume.

In [11]:
# How many ads fall under each extracted function label.
ads.function.value_counts()

engineer      375
architect      70
developer      18
scientist      16
consultant     11
analyst        11
specialist      3
advocate        2
Name: function, dtype: int64

In [12]:
# Restrict modelling to three function labels; .copy() so columns added to
# `to_model` later do not write back into `ads`.
to_model = ads.loc[
    ads['function'].isin(['engineer', 'scientist', 'architect'])
].copy()

In [13]:
# Integer-encode the function label; label_enc.classes_ keeps the mapping back
# to the original strings.
label_enc = LabelEncoder()
to_model['function_enc'] = label_enc.fit_transform(
    to_model['function'].astype('str')
)

In [14]:
# Vectoriser settings shared by every text column.
kwargs = dict(
    stop_words='english',
    ngram_range=(1, 10),
    min_df=0.05,
    max_df=0.95,
    strip_accents='unicode',
)

text_cols = ['job_descr', 'minimum_qual', 'preferred_qual', 'responsibilities']

# One TF-IDF vectoriser per text column; the transformer name carries the
# column name so each feature can be traced back to the field it came from.
column_trans = ColumnTransformer(
    transformers=[(f'tfidf_{c}', TfidfVectorizer(**kwargs), c) for c in text_cols]
)

# TF-IDF features from all columns feed a multinomial naive Bayes classifier.
multi_col_pipe = Pipeline(
    steps=[
        ('multi_col_tfidf', column_trans),
        ('naive_bayes_clf', MultinomialNB()),
    ]
)

In [16]:
# Fit on the whole modelling subset; the fitted steps are what the feature
# inspection further down reads from.
multi_col_pipe.fit(to_model[text_cols], to_model.function_enc)

Pipeline(memory=None,
         steps=[('multi_col_tfidf',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('tfidf_job_descr',
                                                  TfidfVectorizer(analyzer='word',
                                                                  binary=False,
                                                                  decode_error='strict',
                                                                  dtype=<class 'numpy.float64'>,
                                                                  encoding='utf-8',
                                                                  input='content',
                                                                  lowercase=True,
                                                                  max_df=0.95,
            

In [19]:
%time
scores = cross_validate(multi_col_pipe, to_model[text_cols], to_model.function_enc, cv=3, scoring='f1_weighted', return_train_score=True)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.25 µs


In [20]:
# Mean and spread across folds for every array returned by cross_validate.
for metric, values in scores.items():
    print(f'{metric}, mean: {values.mean():.2f}, std: {values.std():.2f}')

fit_time, mean: 1.16, std: 0.02
score_time, mean: 0.30, std: 0.01
test_score, mean: 0.79, std: 0.05
train_score, mean: 0.79, std: 0.02


In [21]:
# Table of the classifier's per-class feature log-probabilities:
# one row per TF-IDF feature, one column per function label.
tfidf_step = multi_col_pipe['multi_col_tfidf']

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when available and fall back on older releases.
if hasattr(tfidf_step, 'get_feature_names_out'):
    feature_names = tfidf_step.get_feature_names_out()
else:
    feature_names = tfidf_step.get_feature_names()

importances_mcp = pd.DataFrame(multi_col_pipe['naive_bayes_clf'].feature_log_prob_.transpose(),
                               columns=label_enc.classes_,
                               index=feature_names)

In [22]:
# For each function label, list the ten features with the highest
# log-probability under that class.
rule = '-' * 80
for label in label_enc.classes_:
    top_terms = importances_mcp[label].sort_values(ascending=False).head(10)
    print(label)
    print(rule)
    print(top_terms)
    print('\n')
    print(rule)

architect
--------------------------------------------------------------------------------
tfidf_minimum_qual__cloud             -6.564817
tfidf_job_descr__cloud                -6.597862
tfidf_preferred_qual__experience      -6.680751
tfidf_preferred_qual__cloud           -6.745307
tfidf_minimum_qual__computing         -6.795373
tfidf_job_descr__customers            -6.806507
tfidf_responsibilities__power         -6.859997
tfidf_minimum_qual__cloud computing   -6.923272
tfidf_job_descr__google cloud         -6.939427
tfidf_responsibilities__customers     -6.956900
Name: architect, dtype: float64


--------------------------------------------------------------------------------
engineer
--------------------------------------------------------------------------------
tfidf_preferred_qual__experience   -5.866356
tfidf_responsibilities__design     -6.608423
tfidf_minimum_qual__development    -6.760669
tfidf_minimum_qual__degree         -6.776300
tfidf_minimum_qual__software       -6.824406