Author: Laiya Lubben (llubben@umich.edu)
<br/>Last updated: Aug 15, 2021

# **Initialization - Import / Install libraries**

In [1]:
# Here is a list of libraries we need to conduct the analysis:
import pandas as pd                                           # Loading files into pandas dataframe
import numpy as np                                            # To use numpy aggregation functions
from tqdm.auto import tqdm                                    # track loop time
import warnings                                               # Suppress all warnings
warnings.filterwarnings('ignore')

import pymysql
import re
import json
import model_shared_utilities as msu                          # helper functions 
import topic_model_utilities as tm                            # helper functions
# --------------------------------------------------------------------------------------------------
import matplotlib.pyplot as plt                               # Generate visualization 
import matplotlib.cm as cm
import altair as alt                                          
import seaborn as sns
# --------------------------------------------------------------------------------------------------
from sklearn.preprocessing import StandardScaler              # Transforming dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import sent_tokenize
# --------------------------------------------------------------------------------------------------
from sklearn.ensemble import RandomForestClassifier           # Model training
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# --------------------------------------------------------------------------------------------------
from numpy import concatenate
from sklearn.semi_supervised import LabelSpreading, LabelPropagation
# --------------------------------------------------------------------------------------------------
import nltk                                                   # Downloading necessary packagings from nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\laiya\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\laiya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\laiya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\laiya\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\laiya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\laiya\AppData\Roaming\nltk_data...
[nltk

True

In [2]:
# This is a magic function to generate the graph within the notebook 
%matplotlib inline

# use this to set random_state to reproduce the same result
RANDOM_SEED = 14


# **Helper Functions**

Functions in the ***model_shared_utilities*** module relevant to this notebook: 

* <font color="blue">get_labels(domain_df, company_df, job_df, dropna=True):</font>
  * merge the three dataframes in order to get some labels (domain) for the job dataset based on companies' names

# **Load / Join Datasets**

In [3]:
# read the secret keys from json file 
# Note the config file need to be filled with your own credentials before running this notebook 
# or ask the owners of this repository for the filled config file
with open('config.json', 'r') as f:
    secret = json.load(f)
    

In [4]:
# connect to the database
connection = pymysql.connect(host=secret['host'],
                              user=secret['user'],
                              password=secret['password'],
                              database=secret['database'],
                              port=secret['port'],
                              charset=secret['charset'],
                              cursorclass=pymysql.cursors.DictCursor)

cursor = connection.cursor()

cursor.execute("SELECT * FROM jd;")
table = cursor.fetchall()

cursor.execute("SELECT * FROM companies;")
table2 = cursor.fetchall()

connection.close()

In [5]:
# Now let's put the tables into pandas dataframe 
job_df = pd.DataFrame(table)
job_df = job_df.dropna(subset=['job_description'])
print("Job dataset has", job_df.shape[0], "rows of data")

company_df = pd.DataFrame(table2)
print("Company dataset has", company_df.shape[0], "rows of data")

Job dataset has 18087 rows of data
Company dataset has 1024219 rows of data


In [6]:
# let's load the list of domains we have defined/condensed from the industry column from company_df  
filename = 'domain_list.xlsx'
domain_df = pd.read_excel(filename)

# extracting the domains for the job postings
final_df = msu.get_labels(domain_df, company_df, job_df, dropna=True)

There are 10399 matches


In [7]:
# creating instance of labelencoder
# this is to transform the categorical label(domain) into integers
labelencoder = LabelEncoder()
final_df['label'] = labelencoder.fit_transform(final_df['job_domain'])


# **Model Training**

We are able to find domain labels for some of the data in the job dataset by merging the job dataset with the [company dataset](https://www.kaggle.com/peopledatalabssf/free-7-million-company-dataset) based on companies' names. In this notebook, we want to try semi-supervised learning methods - [label spreading](https://scikit-learn.org/stable/modules/generated/sklearn.semi_supervised.LabelSpreading.html) and [label propagation](https://scikit-learn.org/stable/modules/generated/sklearn.semi_supervised.LabelPropagation.html). We will first train a Random Forest Classifier as a basedline to comparing the performance with the semi-learning algorithms.

In [8]:
# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=10000,              # only top n by freq
                                   lowercase=True,                  # turn all to lowercase
                                   token_pattern=r"[A-Za-z]{4,15}", # words with 4 to 15 characters
                                   ngram_range=(1,2),               # include 2-word phrases
                                   min_df=25,                       # note: absolute count of doc
                                   max_df=0.75,                     # note: % of docs
                                   stop_words='english')            # default English stopword

# transform the cleaned job descriptions
X = tfidf_vectorizer.fit_transform(final_df["cleaned_jd"])  
y = final_df.label

# split the labeled data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_SEED, stratify=y)

In [9]:
# using a random forest classifier as the baseline
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)
pred_y = model.predict(X_test)
score = accuracy_score(y_test, pred_y)

# summarize score
print('Random Forest Classifier Accuracy: %.3f' % (score*100))

Random Forest Classifier Accuracy: 70.577


In [10]:
# evaluate label spreading on the semi-supervised learning dataset
# split train into labeled and unlabeled
X_train_lab, X_test_unlab, y_train_lab, y_test_unlab = train_test_split(X_train, y_train, random_state=RANDOM_SEED, stratify=y_train)

# recombine the training input
X_train_mixed = concatenate((X_train_lab.toarray(), X_test_unlab.toarray()))

# reassign the y_test_unlab labels as "-1" for unlabeled data
nolabel = [-1 for _ in range(len(y_test_unlab))]

# recombine training dataset labels
y_train_mixed = concatenate((y_train_lab, nolabel))

# define model
model = LabelSpreading()
model.fit(X_train_mixed, y_train_mixed)

# make predictions on hold out test set
pred_y = model.predict(X_test)
score = accuracy_score(y_test, pred_y)

# summarize score
print('Label Spreading Accuracy: %.3f' % (score*100))


Label Spreading Accuracy: 64.000


In [11]:
# evaluate label propagation on the semi-supervised learning dataset
# define model
model = LabelPropagation()
model.fit(X_train_mixed, y_train_mixed)

# make predictions on hold out test set
pred_y = model.predict(X_test)
score = accuracy_score(y_test, pred_y)

# summarize score
print('Label Propagation Accuracy: %.3f' % (score*100))


Label Propagation Accuracy: 63.231


In [12]:
# It is seem as though both the label spreading and label propagation generate low accuracy in comparing to the default
# random forest classifer, we will just use supervised learning method to predict the domains