In this analysis, we will explore a dataset of Google job ads. Each ad was returned by one of two search strings: `machine learning engineer` or `data scientist`.

In [1]:
import os
import io
import re
import yaml

import pandas as pd
import numpy as np

from stemming import porter2

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
import seaborn as sns

from google.cloud import storage

In [2]:
# The project configuration lives one directory above the notebook's working
# directory; resolve it to an absolute path before opening.
config_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'config.yaml'))

# safe_load is PyYAML's shorthand for load(..., Loader=SafeLoader): it only
# builds plain Python objects and never executes arbitrary tags.
with open(config_path, 'r') as config_file:
    config = yaml.safe_load(config_file)

In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
# Global plotting defaults: FiveThirtyEight theme at a higher figure resolution.
plt.style.use('fivethirtyeight')
plt.rcParams.update({'figure.dpi': 180})

In [5]:
# Locate the ads CSV in Google Cloud Storage; project, bucket and object name
# all come from the YAML configuration.
client = storage.Client(project=config['project'])
bucket = client.get_bucket(config['bucket'])
blob = bucket.blob(config['dest_file'])

# Download the object into an in-memory buffer and let pandas parse it as CSV.
# NOTE(review): recent google-cloud-storage releases document
# download_as_string as a deprecated alias of download_as_bytes -- confirm the
# installed version before switching.
file_content = io.BytesIO(blob.download_as_string())
ads = pd.read_csv(file_content)



In [6]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
ads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 532 entries, 0 to 531
Data columns (total 7 columns):
id                  532 non-null object
job_descr           530 non-null object
location            528 non-null object
minimum_qual        530 non-null object
preferred_qual      530 non-null object
responsibilities    530 non-null object
title               532 non-null object
dtypes: object(7)
memory usage: 29.2+ KB


First, let us try simple string operations to derive additional information from the job title, such as seniority and function.

In [7]:
# Titles are comma-separated: "<role>, <department>, <area>, <other>".
title_cols = ['role', 'department', 'area', 'other']

# Split on commas while swallowing the whitespace around them, so that
# "Cloud AI Engineer, Professional Services" yields 'Professional Services'
# rather than ' Professional Services'.
# n caps the number of splits so a title with extra commas keeps its remainder
# in the last column instead of producing a fifth column; reindex pads with NaN
# when no title has that many parts. Either way exactly len(title_cols) columns
# are produced and the assignment below cannot fail on a column-count mismatch.
title_parts = (ads.title
               .str.strip()
               .str.split(r'\s*,\s*', n=len(title_cols) - 1, expand=True)
               .reindex(columns=range(len(title_cols))))

ads[title_cols] = title_parts

In [8]:
# Keyword patterns matched case-insensitively against the role text.
# Caveats: the keywords match as substrings (no word boundaries), and because
# of the leading greedy `.*` the LAST keyword occurring in a role is the one
# that gets captured.
seniority_re = r'.*(intern|senior|lead|staff|principal|manager|head).*'
function_re = r'.*(engineer|scientist|science|consultant|architect|advocate|developer|analyst|specialist).*'

# expand=False returns the single capture group directly as a Series.
seniority_kw = ads.role.str.extract(pat=seniority_re, flags=re.I, expand=False)
# Roles without a seniority keyword get the default label (lower-cased along
# with everything else, so it ends up as 'l3-l4').
ads['seniority'] = seniority_kw.fillna('L3-L4').str.lower()

function_kw = ads.role.str.extract(pat=function_re, flags=re.I, expand=False)
# Fold the 'science' spelling into the 'scientist' function.
ads['function'] = function_kw.str.lower().str.replace('science', 'scientist', flags=re.I)

In [9]:
# Preview the original title next to the fields derived from it.
ads.loc[:, ['title', 'role', 'department', 'area', 'seniority', 'function']].head()

Unnamed: 0,title,role,department,area,seniority,function
0,"Cloud AI Engineer, Professional Services",Cloud AI Engineer,Professional Services,,l3-l4,engineer
1,"Cloud AI Engineer, Professional Services",Cloud AI Engineer,Professional Services,,l3-l4,engineer
2,"Conversational AI Engineer, Google Cloud Profe...",Conversational AI Engineer,Google Cloud Professional Services,,l3-l4,engineer
3,"Cloud AI Engineer, Professional Services",Cloud AI Engineer,Professional Services,,l3-l4,engineer
4,"Data Scientist, Engineering",Data Scientist,Engineering,,l3-l4,scientist


An interesting next step is to find the terms in the qualifications and responsibilities that are most relevant to each function. This could give us insight into the keywords that one should emphasize in their resume.

In [10]:
# Number of ads per extracted job function (ads with no match are excluded by
# value_counts' default handling of missing values).
ads['function'].value_counts()

engineer      364
architect      69
developer      18
scientist      15
consultant     10
analyst        10
specialist      2
advocate        2
Name: function, dtype: int64

In [11]:
# Encode the job function as integer class ids for the classifier.
# Casting to str also stringifies missing functions, so they form a class of
# their own instead of breaking the encoder.
function_labels = ads['function'].astype('str')

label_enc = LabelEncoder()
label_enc.fit(function_labels)
ads['function_enc'] = label_enc.transform(function_labels)

In [12]:
# Only ads that actually have a minimum-qualifications text can be modelled.
to_model = ads.dropna(subset=['minimum_qual'])

# Features: raw qualification text; target: encoded job function.
X = to_model['minimum_qual']
y = to_model['function_enc']

In [13]:
# Text classification pipeline: n-gram counts -> TF-IDF weighting ->
# multinomial Naive Bayes.
text_pipe = Pipeline(steps=[
    ('count_vectorizer', CountVectorizer(
        strip_accents='unicode',
        stop_words='english',
        ngram_range=(1, 10),  # unigrams up to 10-grams
        min_df=0.05,          # floats are document-frequency proportions
        max_df=0.95,
    )),
    ('tf_idf_transf', TfidfTransformer()),
    ('naive_bayes_clf', MultinomialNB()),
])

In [14]:
# Fit the whole pipeline (vectorizer, TF-IDF transformer, Naive Bayes) on the
# minimum-qualification texts X against the encoded job functions y.
text_pipe.fit(X, y)

Pipeline(memory=None,
         steps=[('count_vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.95,
                                 max_features=None, min_df=0.05,
                                 ngram_range=(1, 10), preprocessor=None,
                                 stop_words='english', strip_accents='unicode',
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tf_idf_transf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('naive_bayes_clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [15]:
# Per-class log-probability of every n-gram feature, as a (feature x function)
# frame used below to rank the most characteristic terms of each function.
vectorizer = text_pipe.named_steps['count_vectorizer']
nb_clf = text_pipe.named_steps['naive_bayes_clf']

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); support both so the cell runs on old and new
# versions alike.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# feature_log_prob_ has one row per class the classifier actually saw during
# fit (nb_clf.classes_). Mapping those encoded ids back through the label
# encoder keeps the column labels aligned even if some function has no rows in
# the training subset; when every class is present this equals
# label_enc.classes_.
importances = pd.DataFrame(nb_clf.feature_log_prob_.transpose(),
                           columns=label_enc.inverse_transform(nb_clf.classes_),
                           index=feature_names)

In [16]:
# Print, for each job function, the ten n-grams with the highest
# log-probability under that class.
separator = '-' * 80
for function_name in importances.columns:
    top_terms = importances[function_name].sort_values(ascending=False).head(10)
    print(function_name)
    print(separator)
    print(top_terms)
    print('\n')
    print(separator)

advocate
--------------------------------------------------------------------------------
technologies           -5.310366
cloud                  -5.384257
applications           -5.470887
experience technical   -5.478780
developing             -5.531538
technical              -5.533766
technology             -5.558950
years experience       -5.570774
systems                -5.576884
machine learning       -5.577526
Name: advocate, dtype: float64


--------------------------------------------------------------------------------
analyst
--------------------------------------------------------------------------------
sql                                                -4.606693
data                                               -4.629347
analysis                                           -5.032129
statistics                                         -5.104996
equivalent practical experience years experience   -5.137152
practical experience years experience              -5.139561
experience 