In [1]:
# https://towardsdatascience.com/how-to-use-nlp-in-python-a-practical-step-by-step-example-bd82ca2d2e1e

In [2]:
# Preparation: Scraping the Data

In [3]:
'''We scrape the job postings for “data scientists” from Indeed for 8 different cities. Upon scraping, we download the data into separate files for each of the cities.'''

'We scrape the job postings for “data scientists” from Indeed for 8 different cities. Upon scraping, we download the data into separate files for each of the cities.'

In [4]:
'''The 8 cities included in this analysis are Boston, Chicago, Los Angeles, Montreal, New York, San Francisco, Toronto, and Vancouver. The variables are job_title, company, location, and job_description.'''

'The 8 cities included in this analysis are Boston, Chicago, Los Angeles, Montreal, New York, San Francisco, Toronto, and Vancouver. The variables are job_title, company, location, and job_description.'

In [49]:
# https://gist.github.com/liannewriting/a08e549f186067837856494513250ff1

In [None]:
from time import sleep
from selenium import webdriver
from selenium.common.exceptions import ElementNotVisibleException
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
import random

# define scraping function
def scrape_indeed(search, loc, limit=50, canada=False,
                  driver_path='/Users/justin/Downloads/chromedriver'):
    """Scrape Indeed job postings for one query and location.

    Opens a Chrome session, walks through the result pages, clicks every job
    card and reads the title, company, location and description from the
    preview pane.

    Parameters
    ----------
    search : str
        Query text. Spaces are replaced by '+'; anything else must already be
        URL-encoded by the caller.
    loc : str
        Location, already URL-encoded (e.g. 'Toronto%2C+ON').
    limit : int, default 50
        Results requested per page; also the paging step.
    canada : bool, default False
        Use indeed.ca instead of indeed.com.
    driver_path : str
        Path to the chromedriver executable. Defaults to the path that was
        previously hard-coded in this function.

    Returns
    -------
    pandas.DataFrame
        Columns: job_title, company, location, job_description. One row per
        posting that could be read; postings that still fail after five
        attempts are skipped.

    Notes
    -----
    NOTE(review): the ``find_element_by_*`` helpers used below were removed in
    Selenium 4.3, and the element ids / class names reflect Indeed's markup at
    the time this was written. Confirm both before re-running.
    """
    # Imported locally: the notebook only imports math in a later cell, so a
    # fresh "Restart & Run All" would otherwise hit a NameError here.
    import math

    # search_term is the keyword/designation to be searched
    search_term = search.replace(' ', '+')

    domain = 'https://www.indeed.ca' if canada else 'https://www.indeed.com'
    # Everything except the paging offset, which is appended per page.
    base_url = '{}/jobs?q={}&l={}&limit={}&radius=25&start='.format(domain, search_term, loc, limit)

    columns = ['job_title', 'company', 'location', 'job_description']
    # Collect plain dicts and build the frame once at the end.
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
    records = []

    # Start the browser and load the first result page
    browser = webdriver.Chrome(driver_path)
    try:
        start = 0
        browser.get(base_url + str(start))

        # Get the number of results; this determines how many pages to visit.
        # The total is taken from the text after 'of ' up to the next space.
        num_results = browser.find_element_by_id('searchCountPages').text
        ind0 = num_results.find('of ') + 3
        ind1 = num_results.find(' ', ind0)
        # Drop thousands separators so a count such as '1,234' still parses.
        num_results = int(num_results[ind0:ind1].replace(',', ''))
        pages = math.ceil(num_results / limit)

        # Loop through the pages
        for j in range(pages):

            job_elements = browser.find_elements_by_xpath("//div[@class='jobsearch-SerpJobCard unifiedRow row result clickcard']")

            # Loop through the individual job posts
            for i, job_element in enumerate(job_elements):

                # Click on the job post to open its preview pane
                job_element.click()

                # Sleep for minimum 3 seconds because we don't want to create unnecessary load on Indeed's servers
                sleep(3 + random.randint(0, 3))

                # Selenium may read the pane before it has finished loading, or the
                # posting may be gone. These occurrences are rare, but they should
                # not crash the scraper, so try up to 5 times before skipping.
                done = False
                for _attempt in range(5):
                    try:
                        title = browser.find_element_by_id('vjs-jobtitle').text
                        company = browser.find_element_by_id('vjs-cn').text
                        company = company.replace('- ', '')

                        location = browser.find_element_by_id('vjs-loc').text
                        description = browser.find_element_by_id('vjs-desc').text
                        done = True
                        break
                    except NoSuchElementException:
                        print('Unable to fetch data. Retrying.....')
                        # Give the pane a moment to load; an immediate retry
                        # would see exactly the same page state.
                        sleep(1)

                if not done:
                    continue

                # For debugging purposes, log the job post that was scraped
                print('Completed Post {} of Page {} - {}'.format(i+1, j+1, title))

                records.append({'job_title': title,
                                'company': company,
                                'location': location,
                                'job_description': description})

            # A short page means there are no further results; after the last
            # expected page there is nothing left to load either.
            if len(job_elements) < limit or j + 1 >= pages:
                break

            # Move on to the next page
            start += limit
            browser.get(base_url + str(start))
            print('Moving on to page ' + str(j+2))
            sleep(2)

            # A popover may appear after navigating to the next page; close it if present.
            # Best effort: any failure here is treated as "no popup", but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            try:
                browser.find_element_by_id('popover-x').click()
            except Exception:
                print('No Newsletter Popup Found')
    finally:
        # Always end the driver session, even when scraping fails part-way.
        browser.quit()

    return pd.DataFrame(records, columns=columns)

# Download data, using Toronto as the example city.
# Both values are already URL-encoded: the location is "Toronto, ON" and the
# query is title:(machine learning).
loc = 'Toronto%2C+ON'
q = 'title%3A%28machine+learning%29'

# 50 results per page on the Canadian site; the frame is pickled for the loading step.
df0 = scrape_indeed(search=q, loc=loc, limit=50, canada=True) # Jan 25
df0.to_pickle('data_scientist_toronto.pkl')

In [9]:
from collections import Counter
import nltk
import string
from nltk.tokenize import word_tokenize
import math
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go

In [7]:
# Step 1: Loading and Cleaning the Data

In [None]:
# First, we load and combine the data files of the 8 cities into Python.

In [None]:
# Load the per-city pickles written by the scraping step and stack them into
# one frame, tagging every row with the city it came from.
cities = ['boston', 'chicago', 'la', 'montreal', 'ny', 'sf', 'toronto', 'vancouver']

df_list = []
for city in cities:
    df_list.append(pd.read_pickle('data_scientist_{}.pkl'.format(city)).assign(city=city))

df = pd.concat(df_list).reset_index(drop=True)

# make the city names nicer: expand the abbreviated file-name codes.
for short_name, full_name in [('la', 'los angeles'), ('ny', 'new york'), ('sf', 'san francisco')]:
    msk = df['city'] == short_name
    df.loc[msk, 'city'] = full_name

In [10]:
# We remove duplicate rows/job postings with the same job_title, job_description, and city features.

In [None]:
# Postings with the same description, city and title are treated as one job;
# the first occurrence is kept. The shape is printed before and after so the
# number of dropped rows is visible.
print(df.shape)
df = df[~df.duplicated(subset=['job_description', 'city', 'job_title'], keep='first')]
print(df.shape)

In [11]:
# Step #2: Forming the Lists of Keywords

In [12]:
'''Before searching in the job descriptions, we need lists of keywords that represent the tools/skills/degrees.
For this analysis, we use a simple approach to forming the lists. The lists are based on our judgment and the content of the job postings. You may use more advanced approaches if the task is more complicated than this.'''

'Before searching in the job descriptions, we need lists of keywords that represent the tools/skills/degrees.\nFor this analysis, we use a simple approach to forming the lists. The lists are based on our judgment and the content of the job postings. You may use more advanced approaches if the task is more complicated than this.'

In [13]:
# https://monkeylearn.com/keyword-extraction/

In [14]:
'''For the list of keywords of tools, we initially come up with a list based on our knowledge of data science. We know that the popular tools for data scientists include Python, R, Hadoop, Spark, and more. We have a decent knowledge of the field. So this initial list is good to have covered many tools mentioned in the job postings.'''

'For the list of keywords of tools, we initially come up with a list based on our knowledge of data science. We know that the popular tools for data scientists include Python, R, Hadoop, Spark, and more. We have a decent knowledge of the field. So this initial list is good to have covered many tools mentioned in the job postings.'

In [15]:
'''Then we look at random job postings and add tools that are not on the list yet. Often these new keywords remind us to add other related tools as well.
After this process, we have a keyword list that covers most of the tools mentioned in the job postings.'''

'Then we look at random job postings and add tools that are not on the list yet. Often these new keywords remind us to add other related tools as well.\nAfter this process, we have a keyword list that covers most of the tools mentioned in the job postings.'

In [16]:
'''Next, we separate the keywords into a single-word list and a multi-word list. We need to match these two lists of keywords to the job description in different ways.
With simple string matches, the multi-word keyword is often unique and easy to identify in the job description.'''

'Next, we separate the keywords into a single-word list and a multi-word list. We need to match these two lists of keywords to the job description in different ways.\nWith simple string matches, the multi-word keyword is often unique and easy to identify in the job description.'

In [17]:
'''The single-word keyword, such as “c” is referring to C programming language in our article. But “c” is also a common letter that is used in many words including “can”, “clustering”. We need to process them further (through tokenization) to match only when there is a single letter “c” in the job descriptions.'''

'The single-word keyword, such as “c” is referring to C programming language in our article. But “c” is also a common letter that is used in many words including “can”, “clustering”. We need to process them further (through tokenization) to match only when there is a single letter “c” in the job descriptions.'

In [18]:
# Below are our lists of keywords for tools coded in Python.

In [None]:
# Single-word tool keywords, collected by reading example postings and from
# existing domain knowledge. They are stemmed later and matched against the
# stemmed token set of each job description (exact token match).
# Kept as a list, so repeated entries ('scala', 'shiny', 'plotly') are harmless
# once the list is turned into a set/dict of stems.
# Spelling variants such as 'h20', 'postgressql' and 'tensor' are listed on
# purpose; the tools summary cell folds them into their canonical names.
tool_keywords1 = ['python', 'pytorch', 'sql', 'mxnet', 'mlflow', 'einstein', 'theano', 'pyspark', 'solr', 'mahout', 
 'cassandra', 'aws', 'powerpoint', 'spark', 'pig', 'sas', 'java', 'nosql', 'docker', 'salesforce', 'scala', 'r',
 'c', 'c++', 'net', 'tableau', 'pandas', 'scikitlearn', 'sklearn', 'matlab', 'scala', 'keras', 'tensorflow', 'clojure',
 'caffe', 'scipy', 'numpy', 'matplotlib', 'vba', 'spss', 'linux', 'azure', 'cloud', 'gcp', 'mongodb', 'mysql', 'oracle', 
 'redshift', 'snowflake', 'kafka', 'javascript', 'qlik', 'jupyter', 'perl', 'bigquery', 'unix', 'react',
 'scikit', 'powerbi', 's3', 'ec2', 'lambda', 'ssrs', 'kubernetes', 'hana', 'spacy', 'tf', 'django', 'sagemaker',
 'seaborn', 'mllib', 'github', 'git', 'elasticsearch', 'splunk', 'airflow', 'looker', 'rapidminer', 'birt', 'pentaho', 
 'jquery', 'nodejs', 'd3', 'plotly', 'bokeh', 'xgboost', 'rstudio', 'shiny', 'dash', 'h20', 'h2o', 'hadoop', 'mapreduce', 
 'hive', 'cognos', 'angular', 'nltk', 'flask', 'node', 'firebase', 'bigtable', 'rust', 'php', 'cntk', 'lightgbm', 
 'kubeflow', 'rpython', 'unixlinux', 'postgressql', 'postgresql', 'postgres', 'hbase', 'dask', 'ruby', 'julia', 'tensor',
# R packages; adding them does not seem to change the result.
 'dplyr','ggplot2','esquisse','bioconductor','shiny','lubridate','knitr','mlr','quanteda','dt','rcrawler','caret','rmarkdown',
 'leaflet','janitor','ggvis','plotly','rcharts','rbokeh','broom','stringr','magrittr','slidify','rvest',
 'rmysql','rsqlite','prophet','glmnet','text2vec','snowballc','quantmod','rstan','swirl','datasciencer']


# Multi-word tool keywords. These are not tokenized or stemmed; the matching
# step looks for them as substrings of the lowercased job description.
tool_keywords2 = set(['amazon web services', 'google cloud', 'sql server'])

In [19]:
# We get lists of keywords for skills by following a similar process as tools.
# Single-word skill keywords: stemmed later and matched token-by-token against
# the stemmed job description.
# NOTE(review): 'c#' relies on the tokenizer keeping "C#" as a single token — confirm.
skill_keywords1 = set(['statistics', 'cleansing', 'chatbot', 'cleaning', 'blockchain', 'causality', 'correlation', 'bandit', 'anomaly', 'kpi',
 'dashboard', 'geospatial', 'ocr', 'econometrics', 'pca', 'gis', 'svm', 'svd', 'tuning', 'hyperparameter', 'hypothesis',
 'salesforcecom', 'segmentation', 'biostatistics', 'unsupervised', 'supervised', 'exploratory',
 'recommender', 'recommendations', 'research', 'sequencing', 'probability', 'reinforcement', 'graph', 'bioinformatics',
 'chi', 'knn', 'outlier', 'etl', 'normalization', 'classification', 'optimizing', 'prediction', 'forecasting',
 'clustering', 'cluster', 'optimization', 'visualization', 'nlp', 'c#',
 'regression', 'logistic', 'nn', 'cnn', 'glm',
 'rnn', 'lstm', 'gbm', 'boosting', 'recurrent', 'convolutional', 'bayesian',
 'bayes'])


# Multi-word skill keywords, matched as substrings of the lowercased job
# description. Variants such as 'ab testing' / 'a/b testing' and
# 'machine vision' / 'computer vision' are merged in the skills summary cell.
skill_keywords2 = set(['random forest', 'natural language processing', 'machine learning', 'decision tree', 'deep learning', 'experimental design',
 'time series', 'nearest neighbors', 'neural network', 'support vector machine', 'computer vision', 'machine vision', 'dimensionality reduction', 
 'text analytics', 'power bi', 'a/b testing', 'ab testing', 'chat bot', 'data mining'])

In [20]:
# For education level, we use a different procedure.

In [21]:
'''Because we are looking for the minimum required education level, we need a numeric value to rank the education degree. For example, we use 1 to represent “bachelor” or “undergraduate”, 2 to represent “master” or “graduate”, and so on.
In this way, we have a ranking of degrees by numbers from 1 to 4. The higher the number, the higher the education level.'''

'Because we are looking for the minimum required education level, we need a numeric value to rank the education degree. For example, we use 1 to represent “bachelor” or “undergraduate”, 2 to represent “master” or “graduate”, and so on.\nIn this way, we have a ranking of degrees by numbers from 1 to 4. The higher the number, the higher the education level.'

In [None]:
# Single-token degree keywords mapped to an education rank
# (1 = bachelor, 2 = master, 2.5 = MBA, 3 = PhD/doctorate, 4 = postdoc).
# The keys are stemmed later and matched against the stemmed description tokens.
degree_dict = {'bs': 1, 'bachelor': 1, 'undergraduate': 1, 
               'master': 2, 'graduate': 2, 'mba': 2.5, 
               'phd': 3, 'ph.d': 3, 'ba': 1, 'ma': 2,
               'postdoctoral': 4, 'postdoc': 4, 'doctorate': 3}


# Degree phrases and punctuated abbreviations mapped to the same ranks. These
# are matched as substrings of the lowercased description, so they are not stemmed.
# The last key used to be 'bachelor\s/': an invalid escape that produced a
# literal backslash and could never match. It is now "bachelor's/".
degree_dict2 = {'advanced degree': 2, 'ms or': 2, 'ms degree': 2, '4 year degree': 1, 'bs/': 1, 'ba/': 1,
                '4-year degree': 1, 'b.s.': 1, 'm.s.': 2, 'm.s': 2, 'b.s': 1, 'phd/': 3, 'ph.d.': 3, 'ms/': 2,
                'm.s/': 2, 'm.s./': 2, 'msc/': 2, 'master/': 2, 'master\'s/': 2, 'bachelor\'s/': 1}
degree_keywords2 = set(degree_dict2.keys())

In [22]:
# Step #3: Streamlining the Job Descriptions using NLP Techniques

In [23]:
'''In this step, we streamline the job description text. We make the text easier to understand by computer programs; and hence more efficient to match the text with the lists of keywords.
The job_description feature in our dataset looks like this.'''

'In this step, we streamline the job description text. We make the text easier to understand by computer programs; and hence more efficient to match the text with the lists of keywords.\nThe job_description feature in our dataset looks like this.'

In [None]:
# Display the raw description text of one posting (positional row 12) as an example.
df['job_description'].iloc[12]

In [24]:
# Tokenizing the Job Descriptions

In [25]:
'''Tokenization is a process of parsing the text string into different sections (tokens). It is necessary since the computer programs understand the tokenized text better.
We must explicitly split the job description text string into different tokens (words) with delimiters such as space (“ ”). We use the word_tokenize function to handle this task.'''

'Tokenization is a process of parsing the text string into different sections (tokens). It is necessary since the computer programs understand the tokenized text better.\nWe must explicitly split the job description text string into different tokens (words) with delimiters such as space (“ ”). We use the word_tokenize function to handle this task.'

In [26]:
# https://www.nltk.org/api/nltk.tokenize.html

In [None]:
# Tokenize the same example posting (positional row 12) to show the resulting token list.
word_tokenize(df['job_description'].iloc[12])

In [27]:
'''After this process, the job description text string is partitioned into tokens (words) as below. The computer can read and process these tokens easier.
For instance, the single-word keyword “c” can only match with tokens (words) “c”, rather than with other words “can” or “clustering”.'''

'After this process, the job description text string is partitioned into tokens (words) as below. The computer can read and process these tokens easier.\nFor instance, the single-word keyword “c” can only match with tokens (words) “c”, rather than with other words “can” or “clustering”.'

In [28]:
# Parts of Speech (POS) Tagging the Job Descriptions

In [29]:
'''The job descriptions are often long. We want to keep the words that are informative for our analysis while filtering out others. We use POS tagging to achieve this.
The POS tagging is an NLP method of labeling whether a word is a noun, adjective, verb, etc. Wikipedia explains it well:
POS tagging is the process of marking up a word in a text (corpus) as corresponding to a particular part of speech, based on both its definition and its context — i.e., its relationship with adjacent and related words in a phrase, sentence, or paragraph. A simplified form of this is commonly taught to school-age children, in the identification of words as nouns, verbs, adjectives, adverbs, etc.
Thanks to the NLTK, we can use this tagger with Python.
Applying this technique on the lists of keywords, we can find tags related to our analysis.'''

'The job descriptions are often long. We want to keep the words that are informative for our analysis while filtering out others. We use POS tagging to achieve this.\nThe POS tagging is an NLP method of labeling whether a word is a noun, adjective, verb, etc. Wikipedia explains it well:\nPOS tagging is the process of marking up a word in a text (corpus) as corresponding to a particular part of speech, based on both its definition and its context — i.e., its relationship with adjacent and related words in a phrase, sentence, or paragraph. A simplified form of this is commonly taught to school-age children, in the identification of words as nouns, verbs, adjectives, adverbs, etc.\nThanks to the NLTK, we can use this tagger with Python.\nApplying this technique on the lists of keywords, we can find tags related to our analysis.'

In [None]:
# Below, we POS tag the list of keywords for tools as a demonstration.
from nltk import pos_tag
from nltk.stem import PorterStemmer

# The keyword list is passed as if it were one token sequence; the output is a
# list of (keyword, tag) pairs.
pos_tag(tool_keywords1)

In [30]:
'''Different combinations of letters represent the tags. For instance, NN stands for nouns and singular words such as “python”, JJ stands for adjective words such as “big”. The full list of representations is here.'''

'Different combinations of letters represent the tags. For instance, NN stands for nouns and singular words such as “python”, JJ stands for adjective words such as “big”. The full list of representations is here.'

In [31]:
# https://pythonprogramming.net/natural-language-toolkit-nltk-part-speech-tagging/

In [32]:
# As we can see, the tagger is not perfect. For example, “sql” is tagged as “JJ” (adjective). But it is still good enough to help us filter for useful words.

In [33]:
'''We use this list of tags of all the keywords as a filter for the job descriptions. We keep only the words from the job descriptions that have these same tags of keywords. For example, we would keep the words from job descriptions with tags “NN” and “JJ”. By doing this, we filter out the words from the job descriptions such as “the”, “then” that are not informative for our analysis.
At this stage, we have streamlined job descriptions that are tokenized and shortened.'''

'We use this list of tags of all the keywords as a filter for the job descriptions. We keep only the words from the job descriptions that have these same tags of keywords. For example, we would keep the words from job descriptions with tags “NN” and “JJ”. By doing this, we filter out the words from the job descriptions such as “the”, “then” that are not informative for our analysis.\nAt this stage, we have streamlined job descriptions that are tokenized and shortened.'

In [34]:
# Step #4: Final Processing of the Keywords and the Job Descriptions

In [35]:
# In this step, we process both the lists of keywords and the job descriptions further.

In [36]:
# Stemming the Words

In [37]:
'''Word stemming is the process of reducing inflected (or sometimes derived) words to their word stem, base, or root form — generally a written word form.
The stemming process allows computer programs to identify the words of the same stem despite their different look. In this way, we can match words as long as they have the same stem. For instance, the words “models”, “modeling” both have the same stem of “model”.
We stem both the lists of keywords and the streamlined job descriptions.'''

'Word stemming is the process of reducing inflected (or sometimes derived) words to their word stem, base, or root form — generally a written word form.\nThe stemming process allows computer programs to identify the words of the same stem despite their different look. In this way, we can match words as long as they have the same stem. For instance, the words “models”, “modeling” both have the same stem of “model”.\nWe stem both the lists of keywords and the streamlined job descriptions.'

In [38]:
# Lowercasing the Words

In [39]:
'''Lastly, we standardize all the words by lowercasing them. We only lowercase the job descriptions since the lists of keywords are built in lowercase.
As mentioned in the previous sections, the Python code used in the previous procedures is below.'''

'Lastly, we standardize all the words by lowercasing them. We only lowercase the job descriptions since the lists of keywords are built in lowercase.\nAs mentioned in the previous sections, the Python code used in the previous procedures is below.'

In [None]:
from nltk import pos_tag
from nltk.stem import PorterStemmer

# Shared stemmer, used for both the job descriptions and the keyword lists.
ps = PorterStemmer()


def prepare_job_desc(desc):
    """Reduce a job description to the set of its informative, stemmed tokens.

    The text is tokenized and POS-tagged; only tokens whose tag is in the
    allow-list (past-tense/participle verbs, adjectives, cardinal numbers and
    nouns) are kept. Each kept token is stemmed and lowercased.

    Returns a set, so word order and repetitions are discarded.
    """
    include_tags = ['VBN', 'VBD', 'JJ', 'JJS', 'JJR', 'CD', 'NN', 'NNS', 'NNP', 'NNPS']

    stems = set()
    for token, tag in pos_tag(word_tokenize(desc)):
        if tag not in include_tags:
            continue
        stems.add(ps.stem(token).lower())
    return stems

# Pre-compute the stemmed token set of every posting.
df['job_description_word_set'] = df['job_description'].map(prepare_job_desc)

# Process the keywords. The job descriptions are stemmed, so the keywords must
# be stemmed as well. Each *_dict maps a stem back to an original keyword and is
# used to restore readable names after matching; each *_set is the set of stems
# (i.e. the keys of the matching dict) used for the set intersection.
tool_keywords1_dict = {ps.stem(keyword): keyword for keyword in tool_keywords1}
tool_keywords1_set = set(tool_keywords1_dict)

skill_keywords1_dict = {ps.stem(keyword): keyword for keyword in skill_keywords1}
skill_keywords1_set = set(skill_keywords1_dict)

degree_keywords1_dict = {ps.stem(keyword): keyword for keyword in degree_dict.keys()}
degree_keywords1_set = set(degree_keywords1_dict)

In [None]:
# Now only the words (tokens) in the job descriptions that are related to our analysis remain.
# Display the stemmed token set of one posting (positional row 10) as an example.
df['job_description_word_set'].iloc[10]

In [40]:
# Step #5: Matching the Keywords and the Job Descriptions

In [41]:
# To see if a job description mentions specific keywords, we match the lists of keywords and the final streamlined job descriptions.

In [42]:
# Tools/Skills

In [43]:
'''As you may recall, we built two types of keyword lists — the single-word list and the multi-word list. For the single-word keywords, we match each keyword with the job description by the set intersection function. For the multi-word keywords, we check whether they are sub-strings of the job descriptions.'''

'As you may recall, we built two types of keyword lists — the single-word list and the multi-word list. For the single-word keywords, we match each keyword with the job description by the set intersection function. For the multi-word keywords, we check whether they are sub-strings of the job descriptions.'

In [44]:
# Education

In [45]:
'''For the education level, we use the same method as tools/skills to match keywords. Yet, we only keep track of the minimum level.
For example, when the keywords “bachelor” and “master” both exist in a job description, the bachelor’s degree is the minimum education required for this job.'''

'For the education level, we use the same method as tools/skills to match keywords. Yet, we only keep track of the minimum level.\nFor example, when the keywords “bachelor” and “master” both exist in a job description, the bachelor’s degree is the minimum education required for this job.'

In [None]:
# Flat lists with one entry per (posting, matched keyword); they are counted in
# the summary cells. degree_list has exactly one entry per posting.
tool_list = []
skill_list = []
degree_list = []

msk = df['city'] != '' # just in case you want to filter the data.

# Apply the filter once. Re-evaluating df[msk] for every row (twice per
# iteration) made this loop quadratic in the number of postings.
df_selected = df[msk]
num_postings = len(df_selected.index)

# Sentinel rank meaning "no degree keyword found"; the degree summary treats
# any value above 500 as 'not specified'.
NO_DEGREE_FOUND = 999

for job_desc_raw, job_desc_set in zip(df_selected['job_description'],
                                      df_selected['job_description_word_set']):
    # Lowercase once; the multi-word keywords are matched as substrings of this text.
    job_desc = job_desc_raw.lower()

    # check if the keywords are in the job description. Look for exact match by token.
    tool_words = tool_keywords1_set.intersection(job_desc_set)
    skill_words = skill_keywords1_set.intersection(job_desc_set)
    degree_words = degree_keywords1_set.intersection(job_desc_set)

    # check if longer keywords (more than one word) are in the job description. Match by substring.
    tool_phrases = [phrase for phrase in tool_keywords2 if phrase in job_desc]
    skill_phrases = [phrase for phrase in skill_keywords2 if phrase in job_desc]

    # search for the minimum education: the lowest rank among all matched
    # degree tokens (mapped stem -> keyword -> rank) and degree phrases.
    levels = [degree_dict[degree_keywords1_dict[word]] for word in degree_words]
    levels += [degree_dict2[phrase] for phrase in degree_keywords2 if phrase in job_desc]
    min_education_level = min(levels, default=NO_DEGREE_FOUND)

    # If none of the keywords were found, but the word degree is present, then assume it's a bachelors level.
    if min_education_level > 500 and 'degree' in job_desc:
        min_education_level = 1

    # Multi-word matches go in first, then the placeholder for postings without
    # any match, then the single-word matches.
    tool_list += tool_phrases
    if not tool_words and not tool_phrases:
        tool_list.append('nothing specified')
    tool_list += list(tool_words)

    skill_list += skill_phrases
    if not skill_words and not skill_phrases:
        skill_list.append('nothing specified')
    skill_list += list(skill_words)

    degree_list.append(min_education_level)

In [46]:
# Step 6: Visualizing the Results

In [47]:
'''We summarize the results with bar charts.
For each particular keyword of tools/skills/education levels, we count the number of job descriptions that match them. We calculate their percentage among all the job descriptions as well.
For the lists of tools and skills, we are only presenting the top 50 most popular ones. For the education level, we summarize them according to the minimum level required.'''

'We summarize the results with bar charts.\nFor each particular keyword of tools/skills/education levels, we count the number of job descriptions that match them. We calculate their percentage among all the job descriptions as well.\nFor the lists of tools and skills, we are only presenting the top 50 most popular ones. For the education level, we summarize them according to the minimum level required.'

In [None]:
# Top Tools In-Demand

In [None]:
# create the list of tools: one row per (posting, matched tool).
df_tool = pd.DataFrame(data={'cnt': tool_list})
# Single-word matches are stems; map them back to the original keywords.
df_tool = df_tool.replace(tool_keywords1_dict)

# group some of the categories together (spelling variants and abbreviations).
# NOTE(review): a posting that matches both a variant and its canonical name
# (e.g. 'aws' and 'amazon web services') contributes two rows after grouping,
# so that tool's share can be overstated.
tool_groups = {
    'h20': 'h2o',
    'aws': 'amazon web services',
    'gcp': 'google cloud',
    'github': 'git',
    'postgressql': 'postgres',
    'tensor': 'tensorflow',
}
df_tool['cnt'] = df_tool['cnt'].replace(tool_groups)

# Name the columns explicitly instead of renaming 'index': the column names
# produced by value_counts().reset_index() changed in pandas 2.0, which left
# the result without a 'tool' column. This yields 'tool' (name) and 'cnt'
# (number of matches) on both old and new pandas.
df_tool_top50 = (
    df_tool['cnt']
    .value_counts()
    .rename_axis('tool')
    .reset_index(name='cnt')
    .iloc[:50]
)

In [None]:
# visualize the tools: one bar per tool, height = share of postings mentioning it.
layout = {
    'title': 'Tools For Data Scientists',
    'yaxis': {
        'title': '% of job postings',
        'tickformat': ',.0%',
    },
}

# Counts are divided by the number of postings so the percent tick format applies.
fig = go.Figure(
    data=[go.Bar(x=df_tool_top50['tool'], y=df_tool_top50['cnt'] / num_postings)],
    layout=layout,
)

iplot(fig)

In [None]:
# Top Skills In-Demand

In [None]:
# create the list of skills/knowledge: one row per (posting, matched skill).
df_skills = pd.DataFrame(data={'cnt': skill_list})
# Single-word matches are stems; map them back to the original keywords.
df_skills = df_skills.replace(skill_keywords1_dict)

# group some of the categories together (abbreviations and spelling variants).
skill_groups = {
    'nlp': 'natural language processing',
    'convolutional': 'convolutional neural network',
    'cnn': 'convolutional neural network',
    'recurrent': 'recurrent neural network',
    'rnn': 'recurrent neural network',
    'knn': 'nearest neighbors',
    'svm': 'support vector machine',
    'machine vision': 'computer vision',
    'ab testing': 'a/b testing',
}
df_skills['cnt'] = df_skills['cnt'].replace(skill_groups)

# Name the columns explicitly instead of renaming 'index': the column names
# produced by value_counts().reset_index() changed in pandas 2.0, which left
# the result without a 'skill' column. This yields 'skill' (name) and 'cnt'
# (number of matches) on both old and new pandas.
df_skills_top50 = (
    df_skills['cnt']
    .value_counts()
    .rename_axis('skill')
    .reset_index(name='cnt')
    .iloc[:50]
)

In [None]:
# visualize the skills: one bar per skill, height = share of postings mentioning it.
layout = {
    'title': 'Skills For Data Scientists',
    'yaxis': {
        'title': '% of job postings',
        'tickformat': ',.0%',
    },
}

# Counts are divided by the number of postings so the percent tick format applies.
fig = go.Figure(
    data=[go.Bar(x=df_skills_top50['skill'], y=df_skills_top50['cnt'] / num_postings)],
    layout=layout,
)

iplot(fig)

In [48]:
# Minimum Education Required

In [None]:
# create the list of degree: one row per posting, 'cnt' holds the minimum
# education rank found for that posting.
df_degrees = pd.DataFrame(data={'cnt': degree_list})
df_degrees['degree_type'] = ''

# Translate the numeric rank into a readable label.
degree_labels = {
    1: 'bachelors',
    2: 'masters',
    2.5: 'mba',
    3: 'phd',
    4: 'postdoc',
}
for level, label in degree_labels.items():
    msk = df_degrees['cnt'] == level
    df_degrees.loc[msk, 'degree_type'] = label

# Ranks above 500 are the "nothing found" sentinel written by the matching step.
msk = df_degrees['cnt'] > 500
df_degrees.loc[msk, 'degree_type'] = 'not specified'


# Name the columns explicitly instead of renaming 'index': the column names
# produced by value_counts().reset_index() changed in pandas 2.0. The result
# has 'degree' (the label) and 'degree_type' (the number of postings); the
# second name is kept because the plotting cell reads the counts from it.
df_degree_cnt = (
    df_degrees['degree_type']
    .value_counts()
    .rename_axis('degree')
    .reset_index(name='degree_type')
    .iloc[:50]
)


In [None]:
# visualize the degrees: one bar per minimum education level.
layout = {
    'title': 'Minimum Education For Data Scientists',
    'yaxis': {
        'title': '% of job postings',
        'tickformat': ',.0%',
    },
}

# The 'degree_type' column is divided by the number of postings to give the
# share plotted on the y axis; 'degree' supplies the bar labels.
fig = go.Figure(
    data=[go.Bar(x=df_degree_cnt['degree'], y=df_degree_cnt['degree_type'] / num_postings)],
    layout=layout,
)

iplot(fig)