In [2]:
import pandas as pd
import pickle
pd.set_option('display.max_colwidth', 500)
import numpy as np
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

from collections import Counter

## Process ILO occupations

- Add missing categories
- Add major categories

In [56]:
# Read the ILO occupation index, give its three columns fixed names, keep the
# codes as strings and drop rows that are exact duplicates.
all_occupations = pd.read_excel('../../job classifications/index08-draft-ISCO.xlsx')
all_occupations.columns = ['code', 'old_code', 'title']
all_occupations = (
    all_occupations
    .assign(code=all_occupations['code'].astype('str'))
    .drop_duplicates()
)

In [53]:
# Unprocessed copy of the ILO index, for inspection only. It gets its own name so
# that a top-to-bottom run does not overwrite the cleaned `all_occupations` frame
# (renamed columns, string codes, de-duplicated) that the following cells rely on.
all_occupations_raw = pd.read_excel('../../job classifications/index08-draft-ISCO.xlsx')

In [8]:
# Number of rows in the occupation index.
all_occupations.shape[0]

7011

In [7]:
# Show every index entry whose title contains "Machine" (case-sensitive).
all_occupations.loc[all_occupations['title'].str.contains('Machine')]

Unnamed: 0,code,old_code,title


In [254]:
# Add occupations that are missing from the ILO index. For now they all receive
# the same default coding for data science.
extra_occupations = pd.DataFrame({
    "title": [
        'Scientist, data', 'Engineer, data', 'Engineer, machine learning',
        'Analyst, data', 'Architect, data', 'Analyst, business',
    ]
})

# `code` is stored as text in `all_occupations` (it is cast with astype('str')
# when the index is loaded), so the default code must be a string as well;
# an int here would be skipped by `.str.startswith(...)` filters on the column.
extra_occupations['code'] = '2529'
extra_occupations['old_code'] = 2139

# ignore_index avoids repeated index labels in the combined frame, and
# drop_duplicates keeps the cell idempotent when it is executed more than once.
all_occupations = (
    pd.concat([all_occupations, extra_occupations], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [255]:
# The major category of a title is the text before its first comma
# (e.g. "Scientist" for "Scientist, data"). The unique categories are later
# added to the list of potential matches.
major_categories = (
    all_occupations['title']
    .map(lambda title: title.partition(',')[0])
    .unique()
)

## Process quotacom occupations

- Job title cleaning
- Correct spelling mistakes

In [None]:
# Read the raw occupations and discard rows that have no occupation value.
df = pd.read_csv('data/processed/raw_occupations.csv').dropna(subset=['occupation'])

In [261]:
# Clean occupation titles (i.e. lowercase, remove stop words)
def process_occupation(occupation):
    """Normalise a raw job title so it can be compared with reference titles.

    The title is lower-cased, ASCII punctuation and digits are removed, the
    stand-alone abbreviations "ml" and "ai" are expanded, the text is tokenised
    with NLTK's ``word_tokenize`` and English stop words plus a few seniority
    words are dropped.

    Args:
        occupation: Raw job title (str).

    Returns:
        The remaining tokens joined by single spaces.
    """
    text = occupation.lower()

    # One translation table removes punctuation and digits in a single pass.
    strip_chars = str.maketrans("", "", string.punctuation + string.digits)
    text = text.translate(strip_chars)

    # Expand the abbreviations on word boundaries. A plain
    # replace(' ml ', 'machine learning') would swallow the surrounding spaces
    # ("senior ml engineer" -> "seniormachine learningengineer") and would never
    # match an abbreviation at the very start or end of the title.
    text = re.sub(r"\bml\b", "machine learning", text)
    text = re.sub(r"\bai\b", "artificial intelligence", text)

    tokens = word_tokenize(text)

    # Seniority words are treated as stop words. "it" is deliberately kept
    # (presumably because it stands for information technology in job titles).
    stop_words = set(stopwords.words('english')) | {'senior', 'sr', 'jr', 'principal'}
    stop_words.discard('it')
    kept_tokens = [token for token in tokens if token not in stop_words]

    return " ".join(kept_tokens)

In [262]:
# Cleaned version of every raw occupation title.
df['occupation2'] = df['occupation'].map(process_occupation)

In [281]:
## Correct spelling mistakes in the job titles. Use spellchecker but also manual input.

# from spellchecker import SpellChecker
# spell = SpellChecker()
# test = " ".join(df.occupation2.values).split()
# test = list(set(test))
# misspelled = list(spell.unknown(test))

# corrected = []
# for m in misspelled:
#     c = spell.correction(m)
#     if c == m: 
#         print(m, c)
#         c = input()
#     corrected.append(c)

# spelling_dict = {}
# for idx, m in enumerate(misspelled):
#     c = corrected[idx]
#     if c == "":
#         c = m
#     spelling_dict[m] = c
    
# pickle.dump(spelling_dict, open( "spelling_dict.p", "wb" ) )

# Load the saved misspelling -> correction table.
# NOTE: unpickling can execute arbitrary code; only load a spelling_dict.p that
# was produced by this notebook.
# The context manager closes the file handle, which a bare open() never does.
with open("spelling_dict.p", "rb") as spelling_file:
    spelling_dict = pickle.load(spelling_file)

In [283]:
def correct_spelling(word):
    """Apply the corrections from the global ``spelling_dict`` to a job title.

    Every key of ``spelling_dict`` that occurs in ``word`` as a whole word
    (delimited by regex word boundaries) is replaced by its mapped value.
    Corrections are applied one after another in dictionary order.

    Args:
        word: Cleaned job title; it may contain several space-separated words.

    Returns:
        The title with all known misspellings replaced.
    """
    for key, value in spelling_dict.items():
        # Skip empty keys (they would match at every word boundary), identity
        # mappings, and keys that cannot match at all (cheap substring test).
        if not key or key == value or key not in word:
            continue
        # re.escape makes the key match literally even if it contains regex
        # metacharacters; the function replacement inserts the value verbatim
        # instead of interpreting backslashes as group references.
        pattern = r"\b%s\b" % re.escape(key)
        word = re.sub(pattern, lambda _match, _value=value: _value, word)

    return word

In [391]:
# Spell-corrected version of every cleaned occupation title.
df['occupation3'] = df['occupation2'].map(correct_spelling)

## Prepare for similarity matching

In [256]:
# Create word vectors with ILO categories to use as reference

# The spaCy pipeline `nlp` is only loaded in a later cell of this notebook.
# Load it here when it is not available yet, so this cell also runs on a
# fresh kernel instead of raising NameError.
if 'nlp' not in globals():
    import spacy
    nlp = spacy.load("en_core_web_lg")

all_base_titles = list(major_categories) + list(all_occupations.title.values)
print('creating base vectors... ', end = '')
base_vectors = [nlp(process_occupation(x)) for x in all_base_titles]
print('done!')

# Use a context manager so the pickle file is flushed and closed right away.
with open("base_vectors.p", "wb") as vectors_file:
    pickle.dump(base_vectors, vectors_file)

In [392]:
# Create word vectors with job titles to be matched.
# Each distinct corrected title is run through `nlp` once and stored under
# the title itself.
vector_dict = {occupation: nlp(occupation) for occupation in df.occupation3.unique()}

# Use a context manager so the pickle file is flushed and closed right away.
with open("vector_dict.p", "wb") as vectors_file:
    pickle.dump(vector_dict, vectors_file)

In [393]:
# Split the title -> vector mapping into two aligned lists and put them
# side by side in a frame (one row per distinct title).
raw_title = list(vector_dict.keys())
tokens = list(vector_dict.values())

raw_occupations_df = pd.DataFrame({'raw': raw_title, 'token': tokens})

In [221]:
# https://spacy.io/usage/vectors-similarity
# Load pretrained word vectors
# spaCy is missing from the notebook's import cell; import it here so that
# spacy.load does not raise NameError on a fresh kernel.
import spacy

nlp = spacy.load("en_core_web_lg")

def get_most_similar_title(raw_occupations, base_vectors, base_occupations):
    """Find, for every raw title, the reference title with the highest similarity.

    Args:
        raw_occupations: Frame with a ``raw`` column (title text) and a ``token``
            column whose values provide ``.similarity(other)``.
        base_vectors: Reference vectors that every token is compared against.
        base_occupations: Reference titles, aligned by position with
            ``base_vectors``.

    Returns:
        DataFrame with the columns ``occupation3`` (raw title), ``title`` (best
        matching reference title; the first one wins on ties) and ``score``
        (its similarity).

    A progress counter is printed every 1000 processed titles.
    """
    best_titles = []
    best_scores = []
    for position, raw_vector in enumerate(raw_occupations.token.values):
        if position % 1000 == 0:
            print(position)
        scores = [raw_vector.similarity(candidate) for candidate in base_vectors]
        # Position of the first maximal score.
        best_index = max(range(len(scores)), key=scores.__getitem__)
        best_scores.append(scores[best_index])
        best_titles.append(base_occupations[best_index])

    return pd.DataFrame({
        "occupation3": raw_occupations.raw.values,
        "title": best_titles,
        "score": best_scores,
    })


In [None]:
# Match every distinct job title to its closest reference title, then attach
# the ILO codes of that title (left join: unmatched titles keep NaN codes).
matched_df = (
    get_most_similar_title(raw_occupations_df, base_vectors, all_base_titles)
    .merge(all_occupations, how='left', on='title')
)

In [435]:
# Attach the match result to every profile via its corrected title.
df = pd.merge(df, matched_df, how='left', on='occupation3')

In [None]:
#df.to_csv('classification_v1.csv', index=False)

In [436]:
# Preview the first five classified profiles.
df.head(n=5)

Unnamed: 0,id,occupation,linkedin,type,country,gender,companyName,occupation2,occupation3,title,score,code,old_code
0,id_3979,Senior Java Developer,,candidate,United Kingdom,male,Truecaller,java developer,java developer,Developer,0.813993,,
1,id_3980,Software Engineer,,candidate,United Kingdom,male,"Netsize, a Gemalto company",software engineer,software engineer,"Engineer, software",1.0,2512.0,2131.0
2,id_3981,Senior Software Engineer,,candidate,United Kingdom,male,Itiviti AB,software engineer,software engineer,"Engineer, software",1.0,2512.0,2131.0
3,id_3982,Software Developer,,candidate,United Kingdom,male,Klarna,software developer,software developer,"Developer, software",1.0,2512.0,2131.0
4,id_3984,Software Developer,,candidate,United Kingdom,male,HiQ,software developer,software developer,"Developer, software",1.0,2512.0,2131.0


In [468]:
# Keep the profiles whose code starts with 25 or 133, or whose matched title is
# one of the selected major categories.
# The code column is converted to text first: unmatched rows carry NaN and the
# column may be numeric after the merge, in which case `.str.startswith` would
# return NaN or raise. The recall check below uses str(x) for the same purpose.
code_text = df.code.astype(str)
df_tech = df[
    code_text.str.startswith('25')
    | code_text.str.startswith('133')
    | df.title.isin(selected_major_categories)
]

In [471]:
# Restrict the tech profiles to those that have a linkedin url.
profiles2scrape = df_tech.dropna(subset=['linkedin'])

In [488]:
#profiles2scrape.to_csv('profiles2scrape_v1.csv', index=False)

In [469]:
# Number of tech profiles that have a linkedin url.
int(df_tech['linkedin'].notna().sum())

22373

In [31]:
# Precision -- look at IT profiles
# Print a reproducible sample of 1000 rows as "raw occupation | matched title".
val_sample = val.sample(n=1000, random_state=3)
for occupation, title in zip(val_sample['occupation'], val_sample['title']):
    print(occupation, "|", title)
    print("-----")

Vice President of North America | President
-----
Chief Technology Officer | Officer, chief technology
-----
Head of Information Technology | Manager, information technology 
-----
Director | Director
-----
President | President
-----
Chief Technology Officer | Officer, chief technology
-----
Global Chief Information Officer (CIO) | Officer, chief information
-----
Vice President and Global Head | President
-----
Chief Technology Officer | Officer, chief technology
-----
Director Business Unit SAP | Administrator, SAP: business analysis
-----
Head of Data Services | Coordinator, information services: managing computer system
-----
Chief Technology Officer | Officer, chief technology
-----
Chief Technology Officer | Officer, chief technology
-----
Chief Technology Officer of Digital IT | Officer, chief technology
-----
Chief Information Officer | Officer, chief information
-----
CIO | CIO
-----
Chief Innovation Officer | Officer, chief technology
-----
Head of Department, Driver Assista

CIO | CIO
-----
Chief Information Officer | Officer, chief information
-----
Managing Data Science Consultant | Consultant, information technology: managing system
-----
Head of Cereal Bar and Chief Technology Officer | Officer, chief technology
-----
Vice President of Engineering | President
-----
Global Chief Information Officer | Officer, chief information
-----
Chief Technology Officer | Officer, chief technology
-----
Vice President | President
-----
Vice President and General Manager | President
-----
Chief Information Officer | Officer, chief information
-----
CIO and Vice President | President
-----
Chief Data Officer | Officer, chief information
-----
EVP & Chief Information Officer | Officer, chief information
-----
Senior Data Warehouse Developer | Developer, database
-----
President | President
-----
Chief Technology Officer | Officer, chief technology
-----
Chief Information Officer | Officer, chief information
-----
Chief Technology Officer | Officer, chief technology
---

Chief Innovation Officer | Officer, chief technology
-----
Sr. Director  | Director
-----
Chief Technical Officer | Officer, chief technology
-----
Chief Information Officer (CIO) | Officer, chief information
-----
Chief Innovation and Information Officer | Officer, chief information
-----
President | President
-----
Chief Product Officer | Officer, chief information
-----
Senior Vice President of Engineering | President
-----
Regional Vice President | President
-----
Chief Technology Officer | Officer, chief technology
-----
President | President
-----
Chief Data Officer | Officer, chief information
-----
Solution Architect | Architect, solutions: business
-----
President and CEO | President
-----
President of the IT and Systems Business Unit | Officer, computer systems: managing system
-----
Ingineer in Business Intelligence | Engineer, business process: information technology
-----
Chief Technology Officer | Officer, chief technology
-----
Chief Technology Officer | Officer, chief t

In [47]:
# Recall -- look at IT profiles
rec = pd.read_csv('classification_v1.csv')

# A profile is flagged as IT when its code, read as text, starts with 25 or 133 ...
rec['IT'] = rec['code'].map(lambda code: str(code).startswith(('25', '133')))

# ... or, for the rows not flagged yet, when the matched title is one of the
# selected major categories.
not_flagged = ~rec['IT']
rec.loc[not_flagged, 'IT'] = rec.loc[not_flagged, 'title'].map(
    lambda title: title in selected_major_categories
)

# Print a reproducible sample of 1000 rows for manual inspection.
rec_sample = rec.sample(n=1000, random_state=3)
for is_it, profile_id, occupation, title in zip(
    rec_sample['IT'], rec_sample['id'], rec_sample['occupation'], rec_sample['title']
):
    print(is_it, profile_id, occupation, "|", title)
    print("-----")

False id_10783 Chief Executive Officer / President Director | Officer, chief executive
-----
False id_60575 CEO | CEO
-----
False id_85488 Co Founder and Shareholder | Treasurer, company
-----
False id_75517 Managing Director | Director, managing
-----
True id_83947 Senior Vice President, Chief Digital and Technolog | Officer, chief technology
-----
True id_31287 Vice President | President
-----
False id_34076 CEO | CEO
-----
True id_21640 Head of Analytics | Head
-----
True id_9501 Java Developer | Developer
-----
False id_85959 Chief Marketing Officer | Officer, chief executive
-----
True id_20169 Data Scientist - Freelancer | Scientist, data
-----
True id_52393 Acting Chief Technology Officer | Officer, chief technology
-----
False id_82932 Managing Director | Director, managing
-----
False id_87788 CTO and Co-Founder | President, company
-----
False id_44087 Senior Data Science Manager | Manager, data 
-----
False id_63222 CEO | CEO
-----
False id_115245 Vice President of Informati

False id_103828 Chief Executive Officer and President | Officer, chief executive
-----
False id_31000 Chief Executive Officer | Officer, chief executive
-----
True id_62022 VP - Analytics & Data Science | Scientist, data
-----
True id_92690 Chief Data Officer | Officer, chief information
-----
True id_106107 Global Chief Information Security Officer | Officer, chief information
-----
False id_63384 Founding Partner and COO | Partner
-----
False id_3265 Senior Technical Trainer | Trainer
-----
True id_68280 Visiting Scientist, Machine Learning for Fraud Detection, AML/KYC | Engineer, machine learning
-----
True id_59835 Chief Information Officer | Officer, chief information
-----
True id_44748 Director | Director
-----
True id_7694 Senior Developer | Developer
-----
False id_109663 IT Vice President and Home Care and P&G Profession | Manager, care: nursing home
-----
False id_87135 Co-Founder | Director, company
-----
True id_79739 Chief of Product | Officer, chief technology
-----
Fals

False id_116690 Vice President of Strategy, Marketing and Governme | President, enterprise
-----
False id_55173 Chief Executive Officer | Officer, chief executive
-----
True id_110479 Chief Technology Officer | Officer, chief technology
-----
False id_113868 Senior Vice President of Global Operations | President, enterprise
-----
False id_33206 Co-Founder | Director, company
-----
False id_21863 Group CTO | Agent, group insurance
-----
False id_10632 General Manager, International Business | Manager, general: business services
-----
True id_37018 Chief Experience Officer Europe | Officer, chief technology
-----
False id_112925 Director of Financial Services | Director, corporate services
-----
False id_100345 Chairman of the Advisory Board | Member, board
-----
True id_47037 Data Science and Artificial Intelligence Leader | Scientist, data
-----
False id_60649 CEO | CEO
-----
True id_52025 Chief Digital Officer | Officer, chief technology
-----
False id_116183 Country Chief Operating O

-----
True id_7722 Expert IT Developer | Developer, website
-----
False id_63644 Group Managing Director | Director, managing
-----
True id_13680 Senior Data Scientist  | Scientist, data
-----
False id_55356 Vice President of Global Sales | President, company
-----
False id_47259 Director of Software Engineering & IT | Assistant, computer: engineering (software support)
-----
True id_58604 CIO & CTO | CIO
-----
False id_53568 Managing Director - Benelux & Nordics | Director, managing
-----
True id_93961 Senior Vice President for the Americas | President
-----
False id_89098 Senior Director of Sales and Marketing Data Scienc | Consultant, sales: information technology
-----
False id_115617 Chief Scientific Officer | Officer, chief clinical
-----
True id_19263 Senior Consultant | Technology - Analytics & Infor | Consultant, business: information technology
-----
False id_56070 Senior Manager | Manager, lodging-house
-----
False id_101031 Chairman and Chief Executive Officer | Officer, ch