In [1]:
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from gensim.models import Word2Vec, Doc2Vec, TfidfModel
from gensim.models.phrases import Phraser, Phrases

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt

In [2]:
stopword_set = stopwords.words('english')

In [3]:
stopword_set = stopword_set + ["need", "goal", "include", "looking", "seeking"]

In [4]:
df = pd.read_csv("Raw_Data/jobs.csv")
df.head()

Unnamed: 0,company,description,job_cat,loc,location,position,position_low
0,Loftium,About the role\nMachine learning is core to ou...,machine learning,"Seattle, WA","Seattle, Washington, United States",Machine Learning Engineer,
1,Zume Inc.,Who We Are\n\nZume is on a quest to be the mos...,machine learning,"Seattle, WA","Seattle, WA, US",Machine Learning Engineer - Platform,
2,"TRC Staffing Services, Inc.",The goal is to lead the processes from infrast...,machine learning,"Seattle, WA","Seattle, WA, US",Machine Learning Engineer,
3,bidco,We are looking for a Machine Learning Engineer...,machine learning,"Seattle, WA","Seattle, WA, US",Machine Learning Engineer,
4,Compass /,Engineering\n\nMachine Learning Engineer\n\nSe...,machine learning,"Seattle, WA","Seattle, WA, US",Machine Learning Engineer,


In [18]:
descriptions = df["description"]

In [19]:
def cleaner(text, stopwords=stopwords.words('english'), symbols = "?&()*%$#@.!:;^"):
    text = text.replace("\n", " ").replace("*", "").strip().strip("\n").replace("-", "_")
    for symbol in symbols:
        text = text.replace(symbol, "")
    text = text.lower()
    new_text = []
    for word in text.split(" "):
        if word in stopwords or word == ' ' or word == '':
            continue
        else:
            new_text.append(word)
    return new_text

In [20]:
descriptions = [cleaner(d, stopwords=stopword_set) for d in descriptions]

In [21]:
df['cleaned_descriptions'] = descriptions

In [10]:
bigram = Phrases(sentences = descriptions, min_count=1, threshold=1)

In [11]:
phrased_descriptions = [bigram[d] for d in descriptions]

In [12]:
df['phrased_descriptions'] = phrased_descriptions

In [22]:
df.head()

Unnamed: 0,company,description,job_cat,loc,location,position,position_low,phrased_descriptions,cleaned_descriptions
0,Loftium,About the role\nMachine learning is core to ou...,machine learning,"Seattle, WA","Seattle, Washington, United States",Machine Learning Engineer,,"[role, machine_learning, core_business,, drivi...","[role, machine, learning, core, business,, dri..."
1,Zume Inc.,Who We Are\n\nZume is on a quest to be the mos...,machine learning,"Seattle, WA","Seattle, WA, US",Machine Learning Engineer - Platform,,"[zume, quest_powerful, source, health_well_bei...","[zume, quest, powerful, source, health, well_b..."
2,"TRC Staffing Services, Inc.",The goal is to lead the processes from infrast...,machine learning,"Seattle, WA","Seattle, WA, US",Machine Learning Engineer,,"[lead, processes, infrastructure, config, data...","[lead, processes, infrastructure, config, data..."
3,bidco,We are looking for a Machine Learning Engineer...,machine learning,"Seattle, WA","Seattle, WA, US",Machine Learning Engineer,,"[machine_learning, engineer_help, us_create, a...","[machine, learning, engineer, help, us, create..."
4,Compass /,Engineering\n\nMachine Learning Engineer\n\nSe...,machine learning,"Seattle, WA","Seattle, WA, US",Machine Learning Engineer,,"[engineering, machine_learning, engineer_seatt...","[engineering, machine, learning, engineer, sea..."


In [14]:
df.description[0]

"About the role\nMachine learning is core to our business, driving everything from our long-term and short-term rental income prediction algorithms to our revenue optimization algorithms and our properties acquisition pipeline. We're looking for a talented, experienced ML engineer to help us scale quickly, smartly and nimbly. This role offers a unique opportunity to join an amazing team and grow your career as quickly as Loftium scales.This role reports to our Head of Machine Learning.\n\nResponsibilities:\nBuild machine learning products that help drive our business\nMake thoughtful architectural decisions with limited information translating business requirements into machine learning products\nCommunicate decision rationales, formulating project plans and delivering results\nAutomating model comparison and feature selection to optimize performance\nShipping code to our staging and production environments multiple times a day\nBuilding data pipelines that allow us to improve our mach

In [23]:
df.cleaned_descriptions[0]

['role',
 'machine',
 'learning',
 'core',
 'business,',
 'driving',
 'everything',
 'long_term',
 'short_term',
 'rental',
 'income',
 'prediction',
 'algorithms',
 'revenue',
 'optimization',
 'algorithms',
 'properties',
 'acquisition',
 'pipeline',
 "we're",
 'talented,',
 'experienced',
 'ml',
 'engineer',
 'help',
 'us',
 'scale',
 'quickly,',
 'smartly',
 'nimbly',
 'role',
 'offers',
 'unique',
 'opportunity',
 'join',
 'amazing',
 'team',
 'grow',
 'career',
 'quickly',
 'loftium',
 'scalesthis',
 'role',
 'reports',
 'head',
 'machine',
 'learning',
 'responsibilities',
 'build',
 'machine',
 'learning',
 'products',
 'help',
 'drive',
 'business',
 'make',
 'thoughtful',
 'architectural',
 'decisions',
 'limited',
 'information',
 'translating',
 'business',
 'requirements',
 'machine',
 'learning',
 'products',
 'communicate',
 'decision',
 'rationales,',
 'formulating',
 'project',
 'plans',
 'delivering',
 'results',
 'automating',
 'model',
 'comparison',
 'feature',
 's

In [106]:
df.phrased_descriptions[0]

['role',
 'machine_learning',
 'core_business,',
 'driving',
 'everything',
 'long_term_short_term',
 'rental',
 'income',
 'prediction_algorithms',
 'revenue_optimization',
 'algorithms',
 'properties',
 'acquisition',
 'pipeline',
 "we're",
 'talented,',
 'experienced_ml',
 'engineer_help',
 'us_scale',
 'quickly,',
 'smartly',
 'nimbly',
 'role_offers',
 'unique_opportunity',
 'join_amazing',
 'team',
 'grow_career',
 'quickly',
 'loftium',
 'scalesthis',
 'role_reports',
 'head',
 'machine_learning',
 'responsibilities_build',
 'machine_learning',
 'products_help',
 'drive_business',
 'make',
 'thoughtful',
 'architectural_decisions',
 'limited_information',
 'translating_business',
 'requirements',
 'machine_learning',
 'products',
 'communicate',
 'decision',
 'rationales,',
 'formulating',
 'project_plans',
 'delivering_results',
 'automating',
 'model',
 'comparison',
 'feature_selection',
 'optimize_performance',
 'shipping_code',
 'staging',
 'production_environments',
 'mult

In [25]:
df["cleaned_phrased_descriptions"] = [' '.join(d) for d in df.phrased_descriptions]

In [26]:
df.head()

Unnamed: 0,company,description,job_cat,loc,location,position,position_low,phrased_descriptions,cleaned_descriptions,cleaned_phrased_descriptions
0,Loftium,About the role\nMachine learning is core to ou...,machine learning,"Seattle, WA","Seattle, Washington, United States",Machine Learning Engineer,,"[role, machine_learning, core_business,, drivi...","[role, machine, learning, core, business,, dri...","role machine_learning core_business, driving e..."
1,Zume Inc.,Who We Are\n\nZume is on a quest to be the mos...,machine learning,"Seattle, WA","Seattle, WA, US",Machine Learning Engineer - Platform,,"[zume, quest_powerful, source, health_well_bei...","[zume, quest, powerful, source, health, well_b...",zume quest_powerful source health_well_being w...
2,"TRC Staffing Services, Inc.",The goal is to lead the processes from infrast...,machine learning,"Seattle, WA","Seattle, WA, US",Machine Learning Engineer,,"[lead, processes, infrastructure, config, data...","[lead, processes, infrastructure, config, data...",lead processes infrastructure config data_coll...
3,bidco,We are looking for a Machine Learning Engineer...,machine learning,"Seattle, WA","Seattle, WA, US",Machine Learning Engineer,,"[machine_learning, engineer_help, us_create, a...","[machine, learning, engineer, help, us, create...",machine_learning engineer_help us_create artif...
4,Compass /,Engineering\n\nMachine Learning Engineer\n\nSe...,machine learning,"Seattle, WA","Seattle, WA, US",Machine Learning Engineer,,"[engineering, machine_learning, engineer_seatt...","[engineering, machine, learning, engineer, sea...",engineering machine_learning engineer_seattle ...


In [27]:
df.cleaned_phrased_descriptions[0]

"role machine_learning core_business, driving everything long_term_short_term rental income prediction_algorithms revenue_optimization algorithms properties acquisition pipeline we're talented, experienced_ml engineer_help us_scale quickly, smartly nimbly role_offers unique_opportunity join_amazing team grow_career quickly loftium scalesthis role_reports head machine_learning responsibilities_build machine_learning products_help drive_business make thoughtful architectural_decisions limited_information translating_business requirements machine_learning products communicate decision rationales, formulating project_plans delivering_results automating model comparison feature_selection optimize_performance shipping_code staging production_environments multiple_times day building data_pipelines allow_us improve machine_learning products_services design_deploy efficient data services stores we're_5+ years_experience leveraging_machine learning solve_complex business_problems you’re adaptabl

In [30]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [31]:
X = df.cleaned_phrased_descriptions
y = df.job_cat

In [32]:
tfidf = TfidfVectorizer()

In [33]:
X_t = tfidf.fit_transform(X)

In [37]:
X_t.toarray()

(3635, 104903)

In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [45]:
xtrain, xtest, ytrain, ytest = train_test_split(X_t.toarray(), y, test_size=0.20)

In [46]:
clf = RandomForestClassifier()

In [47]:
clf.fit(xtrain, ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [68]:
clf.score(xtrain, ytrain)

0.9948418156808804

In [48]:
clf.score(xtest, ytest)

0.8101788170563962

In [54]:
top_10 = clf.feature_importances_.argsort()[-10:]

In [60]:
for index in top_10:
    print(tfidf.get_feature_names()[index])

architecture
data_scientists
data_pipeline
data_engineering
data
machine_learning
data_analyst
data_engineer
data_science
data_scientist


In [67]:
sorted(clf.feature_importances_)[-10:]

[0.0037029815635264097,
 0.004449839448428987,
 0.004729146704267233,
 0.005840377811517096,
 0.006961058315575879,
 0.010142766632406599,
 0.011154746617437388,
 0.015451252193096354,
 0.017390792162151517,
 0.02240457788219398]