# Allow Drive Access

In [None]:
#Mount Google Drive
# The data and model folders used below live under '/content/drive/My Drive/'.
from google.colab import drive
# force_remount=True asks Colab for a fresh mount rather than reusing an existing one.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Install Dependencies

In [None]:
%%capture
# %%capture (which must stay on the first line of the cell) hides the pip output.
# NOTE(review): only gensim is pinned to an exact version; fasttext, adjustText,
# umap-learn and hdbscan install whatever is current, so a later re-run may pick up
# different releases. Consider pinning them as well for reproducibility.
!pip install --upgrade fasttext
!pip install gensim==4.2.0
!pip install adjustText
!pip install umap-learn
!pip install hdbscan

# Imports and Declaring Constants

In [None]:
#Imports
import fasttext
import pandas as pd
import fasttext
import fasttext.util
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import CountVectorizer
from adjustText import adjust_text
from multiprocessing import Pool
from scipy.spatial.distance import cosine
import umap.umap_ as umap
import hdbscan
from adjustText import adjust_text
import numpy as np
import matplotlib.pyplot as plt

In [None]:
#Set path and file names
# Data folder: the input CSV is read from here and every intermediate file is written here.
path = '/content/drive/My Drive/skill_bias_jobs/data/'
# Model folder: the trained FastText binary is saved to and loaded from here.
mpath = '/content/drive/My Drive/skill_bias_jobs/model/'

#Input file name:
# Job-ads CSV; the notebook reads its job_title, description and key_skills columns.
file = 'skills_jd.csv'

#Created file names:
# Preprocessed job-ad text, one ad per line; used as the FastText training corpus.
textfile = 'jd.txt'
# Skill table with a 'frequency' column (the skill name is stored as the unnamed index).
skill_file = 'skill_count.csv'
# Skill table extended with the cosine-similarity columns.
skill_association_file = 'skill_bias_words.csv'
# Frequent skills with their UMAP coordinates and HDBSCAN cluster labels.
clustering_file = 'skill_cluster_hdbscan.csv'
# FastText model trained on `textfile`.
modelfile = 'jobs_fasttext.bin'

# Training FastText

## Loading and Preparing Data

In [None]:
# Load the raw job ads. This cell writes two artefacts to the data folder:
#   1. the FastText training corpus (one preprocessed job ad per line), and
#   2. a skill table giving, for each skill string, the number of rows that contain it.
df = pd.read_csv(path + file)

# Assign the cleaned columns back instead of calling fillna(..., inplace=True) on
# `df.job_title` / `df.description`: the in-place form operates on a column selection
# and is not guaranteed to update `df` (it is a no-op under pandas Copy-on-Write), in
# which case missing values would later be written to the corpus as the text "nan".
df['job_title'] = df['job_title'].fillna('').str.strip()
df['description'] = df['description'].fillna('').str.strip()

# Title and description are joined into a single text per job ad.
job_text = df['job_title'].astype(str) + ' ' + df['description'].astype(str)

#Further preprocessing
#Fasttext requires text file; saving as text file
# Each ad is tokenised with gensim's simple_preprocess and written as one
# space-separated line. The encoding is explicit so the corpus does not depend on the
# platform's default encoding.
with open(path + textfile, 'w', encoding='utf-8') as f:
    for doc in job_text:
        tokens = simple_preprocess(str(doc), deacc=False, min_len=1, max_len=30)
        f.write(' '.join(tokens) + '\n')

#Generating a skills dictionary
skills_raw = df['key_skills'].fillna('').str.strip()

#Further preprocessing for skills
# NOTE(review): simple_preprocess appears to keep only word tokens, so any commas in
# key_skills would already be gone before `tokenizer` splits on ',' below, and each row
# would then count as one skill string. Confirm this matches the input format and the
# intended behaviour; the logic is deliberately left unchanged here.
skills_clean = [
    ' '.join(simple_preprocess(str(doc), deacc=False, min_len=2, max_len=30))
    for doc in skills_raw
]

#Skills dictionary with frequency
def tokenizer(s):
    """Split a preprocessed skills string into individual skills on commas."""
    return s.split(',')

word_vectorizer = CountVectorizer(
    max_features=100000,
    # Was min_df=0. An integer min_df must be >= 1 in recent scikit-learn releases, and
    # since every vocabulary term occurs in at least one document the result is the same.
    min_df=1,
    max_df=1.0,
    analyzer='word',
    ngram_range=(1, 1),
    tokenizer=tokenizer,
    lowercase=False,
    binary=True,
)
sparse_matrix = word_vectorizer.fit_transform(skills_clean)

# With binary=True each cell is 0/1, so the column sums are document frequencies.
# A single sparse column-sum replaces the Python-level sum() over rows.
frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()

# The index is intentionally left unnamed: it is written as the first CSV column and
# read back by the later cells as 'Unnamed: 0'.
df = pd.DataFrame(frequencies, index=word_vectorizer.get_feature_names_out(), columns=['frequency'])

df.to_csv(path + skill_file)

del df

## Training and saving the domain-specific model

In [None]:
# Train unsupervised FastText embeddings on the job-ad corpus and save the model
# into the model folder so later cells can reload it.
training_options = dict(minn=3, maxn=6, dim=300, epoch=10, lr=0.05)
corpus_path = path + textfile
model = fasttext.train_unsupervised(corpus_path, **training_options)
model.save_model(mpath + modelfile)

# Retrieving Embeddings

## Loading Skills and Fasttext model

In [None]:
#Load skill dictionary and domain specific Fasttext
df = pd.read_csv(path + skill_file)

#Dropping empty skills
# The skill names were saved as an unnamed index and come back as 'Unnamed: 0'.
# Rows whose name is missing or empty are removed with a boolean mask rather than with
# fillna(..., inplace=True) on a column selection, which is not guaranteed to update
# `df` (it is a no-op under pandas Copy-on-Write). The .copy() makes the filtered
# frame independent, so the column assignments in later cells act on a real frame.
skill_names = df['Unnamed: 0'].fillna('')
df = df[skill_names != ""].copy()

model = fasttext.load_model(mpath + modelfile)



In [None]:
# To download the pre-trained English model (note: this is quite heavy)
# if_exists='ignore' presumably skips the download when the file is already present
# (TODO confirm against the fasttext.util documentation).
fasttext.util.download_model('en', if_exists='ignore')
# The file is loaded by relative name, i.e. from the current working directory.
# `model_pre` is the model read by cosine_similarity_female2 / cosine_similarity_male2.
model_pre = fasttext.load_model('cc.en.300.bin')



## Obtaining Cosine similarity with gender attribute words

In [None]:
#Cosine similarities with the words female and male only using domain specific embeddings
def cosine_similarity_female(skill):
  """Return the cosine similarity between `skill` and the word 'female'.

  Both vectors are looked up in the global domain-specific `model`. scipy's
  `cosine` returns a distance, so the similarity is 1 minus that distance.
  """
  skill_vec = model[skill]
  female_vec = model['female']
  return 1 - cosine(skill_vec, female_vec)

def cosine_similarity_male(skill):
  """Return the cosine similarity between `skill` and the word 'male'.

  Both vectors are looked up in the global domain-specific `model`. scipy's
  `cosine` returns a distance, so the similarity is 1 minus that distance.
  """
  skill_vec = model[skill]
  male_vec = model['male']
  return 1 - cosine(skill_vec, male_vec)

#Average cosine similarities with all gender attribute words using domain specific embeddings
def cosine_similarity_female_avg(skill):
  """Return the mean cosine similarity between `skill` and the female attribute words.

  All vectors come from the global domain-specific `model`. The result is
  1 minus the average cosine distance over the nine attribute words.
  """
  female_words = ('female', 'females', 'woman', 'women', 'girl', 'girls',
                  'lady', 'ladies', 'feminine')
  # Look the skill vector up once instead of once per attribute word.
  skill_vec = model[skill]
  # Distances are accumulated in the same left-to-right order as before, and the
  # divisor follows the word list instead of being a separate hard-coded 9.
  total_distance = sum(cosine(skill_vec, model[word]) for word in female_words)
  return 1 - total_distance / len(female_words)

def cosine_similarity_male_avg(skill):
  """Return the mean cosine similarity between `skill` and the male attribute words.

  All vectors come from the global domain-specific `model`. The result is
  1 minus the average cosine distance over the eleven attribute words.
  """
  male_words = ('male', 'males', 'man', 'men', 'boy', 'boys', 'gent', 'gents',
                'guy', 'guys', 'masculine')
  # Look the skill vector up once instead of once per attribute word.
  skill_vec = model[skill]
  # Distances are accumulated in the same left-to-right order as before, and the
  # divisor follows the word list instead of being a separate hard-coded 11.
  total_distance = sum(cosine(skill_vec, model[word]) for word in male_words)
  return 1 - total_distance / len(male_words)

#Cosine similarities using the pre-trained model
def cosine_similarity_female2(skill):
  """Return the cosine similarity between `skill` and 'female' in the pre-trained model.

  Both vectors are looked up in the global `model_pre`. scipy's `cosine`
  returns a distance, so the similarity is 1 minus that distance.
  """
  skill_vec = model_pre[skill]
  female_vec = model_pre['female']
  return 1 - cosine(skill_vec, female_vec)

def cosine_similarity_male2(skill):
  """Return the cosine similarity between `skill` and 'male' in the pre-trained model.

  Both vectors are looked up in the global `model_pre`. scipy's `cosine`
  returns a distance, so the similarity is 1 minus that distance.
  """
  skill_vec = model_pre[skill]
  male_vec = model_pre['male']
  return 1 - cosine(skill_vec, male_vec)

In [None]:
# Output column -> similarity function. Every function is mapped over all skill names
# and the result is stored in the column named by the key. Insertion order fixes the
# column order in the saved CSV.
similarity_columns = {
    'cosine_female': cosine_similarity_female,          #Similarity with Female
    'cosine_male': cosine_similarity_male,              #Similarity with Male
    'cosine_female_avg': cosine_similarity_female_avg,  #Female, all gender attribute words
    'cosine_male_avg': cosine_similarity_male_avg,      #Male, all gender attribute words
    'cosine_female_pre': cosine_similarity_female2,     #Female, pre-trained model
    'cosine_male_pre': cosine_similarity_male2,         #Male, pre-trained model
}

# One context-managed pool replaces six hand-managed Pool(2) instances: the worker
# processes are cleaned up even if one of the map calls raises. The functions and
# models must already be defined when the pool is created, which holds because they
# are defined in earlier cells.
with Pool(2) as p:
    for column, similarity_fn in similarity_columns.items():
        df[column] = p.map(similarity_fn, df['Unnamed: 0'])

df.to_csv(path + skill_association_file, index=False)

del df

## Skill Clustering

In [None]:
#Loading the file with skill associations (load Fasttext again if required)
model = fasttext.load_model(mpath + modelfile)

df = pd.read_csv(path + skill_association_file)
# Drop rows whose skill name is missing or empty. A boolean mask is used instead of
# fillna(..., inplace=True) on a column selection, which is not guaranteed to update
# `df` (it is a no-op under pandas Copy-on-Write).
df = df[df['Unnamed: 0'].fillna('') != ""].copy()

# Sign of (similarity to 'female' - similarity to 'male'): +1, -1, or 0 when equal.
df['female_bias'] = np.sign(df['cosine_female'] - df['cosine_male'])

#Restricting the sample to skills that occur in more than 10 job ads
# .copy() so the columns added in the following cells are set on an independent frame.
df = df[df['frequency'] > 10].copy()



In [None]:
#Obtaining embeddings
def embeddings(skill):
  """Return the vector that the global `model` holds for `skill`."""
  vector = model[skill]
  return vector

# Look up every skill's embedding in a 4-process pool. The context manager cleans the
# workers up even if the map call raises.
with Pool(4) as p:
    df['embeddings'] = p.map(embeddings, df['Unnamed: 0'])

# Stack the per-skill vectors into an (n_skills, dim) matrix. np.stack takes the
# dimensionality from the vectors themselves, so it no longer has to be kept in sync
# by hand with the `dim` used when training the model (previously a hard-coded 300).
x = np.stack(df['embeddings'].tolist())

In [None]:
#Using UMAP for reducing dimensions; read here: https://umap-learn.readthedocs.io/en/latest/ (note: n_neighbor 20->200 doesn't significantly change visualization)
# The embedding matrix `x` is projected to two components; random_state is fixed at 42.
reducer = umap.UMAP(
    n_neighbors=20,
    random_state=42,
    n_components=2,
    min_dist=0,
    metric='cosine',
)
x_umap = reducer.fit_transform(x)
df['umap0'], df['umap1'] = x_umap[:, 0], x_umap[:, 1]

#Using HDBSCAN for clustering; read here: https://hdbscan.readthedocs.io/en/latest/
# Clustering runs on the 2-D UMAP coordinates, not on the original embeddings.
clusterer = hdbscan.HDBSCAN(
    metric='euclidean',
    min_cluster_size=20,
    min_samples=1,
    cluster_selection_epsilon=.2,
    cluster_selection_method='eom',
)
clusterer.fit(x_umap)
df['cluster'] = clusterer.labels_
df['cluster_prob'] = clusterer.probabilities_

# Persist the skills with their coordinates and cluster assignments.
df.to_csv(path + clustering_file, index=False)

# Visualizations

In [None]:
#Loading and cleaning data
df = pd.read_csv(path + clustering_file)

#Choosing only top n words by frequency and coloring based on cluster
topn = 4

#Removing noise, skills that can't be classified, and keeping only top 4 in each category
excluded_skills = [
    'high school pass',
    'axis',
    'district',
    'bengal',
    'hs',
    'basic computer',        #repeated
    'good communication',    #repeated
    'communication skills',  #repeated
]
# Rows labelled -1 (the noise filtered out here) and the hand-picked skills are dropped.
dk = df[(df['cluster'] != -1) & ~df['Unnamed: 0'].isin(excluded_skills)]

# Keep the `topn` most frequent skills of every cluster. Sorting and taking the head of
# each group selects the same rows as groupby.apply(nlargest) without running apply on a
# frame that contains the grouping column (deprecated in recent pandas). The original
# row index is kept instead of a (cluster, row) MultiIndex; the cells below only use
# columns and positional values.
dk = (
    dk.sort_values(['cluster', 'frequency'], ascending=[True, False])
      .groupby('cluster')
      .head(topn)
)

# #Removing clusters having names or educational degree
# NOTE(review): cluster ids 6 and 18 come from one particular clustering run; re-check
# them whenever the clustering cell is re-run.
dk = dk[~dk['cluster'].isin([6, 18])].copy()

#Computing gender bias and normalization to display colors
dk['female_bias'] = dk['cosine_female'] - dk['cosine_male']
# Named bias_max / bias_min so the builtins max() and min() are no longer shadowed
# for the rest of the kernel session.
bias_max = dk['female_bias'].max()
bias_min = dk['female_bias'].min()

# Negative values are divided by the magnitude of the most negative value and positive
# values by the largest value, so each side of zero is scaled separately for the
# diverging colour map.
dk["female_bias"] = np.where(dk["female_bias"]<0, -dk["female_bias"]/bias_min, dk["female_bias"])
dk["female_bias"] = np.where(dk["female_bias"]>0, dk["female_bias"]/bias_max, dk["female_bias"])

## Skill Cluster Visualization

In [None]:
#Skills map (This is used to produce Figure 2 in the paper)
fig, ax = plt.subplots(figsize=(40,40))

# Marker size follows the skill frequency; colour encodes the cluster label.
ax.scatter(dk['umap0'], dk['umap1'], s=dk['frequency']/2, c=dk['cluster'], cmap='Spectral')

# Hide both axes: only the relative positions of the points are shown.
for axis in (ax.xaxis, ax.yaxis):
    axis.set_visible(False)

# Label every point with its skill name, then pass the labels to adjust_text.
texts = [
    ax.annotate(label, (x_pos, y_pos), size=28)
    for label, x_pos, y_pos in zip(dk['Unnamed: 0'], dk['umap0'].values, dk['umap1'].values)
]

adjust_text(texts)

## Gender Association Visualization

In [None]:
#Skill map with gender associations (This is used to produce Figure 3 in the paper)
fig, ax = plt.subplots(figsize=(40,40))

# Marker size follows the skill frequency; colour encodes the normalised female_bias
# on the blue-white-red map.
ax.scatter(dk['umap0'], dk['umap1'], s=dk['frequency']/2, c=dk['female_bias'], cmap='bwr')

# Hide both axes: only the relative positions of the points are shown.
for axis in (ax.xaxis, ax.yaxis):
    axis.set_visible(False)

# Label every point with its skill name, then pass the labels to adjust_text.
texts = [
    ax.annotate(label, (x_pos, y_pos), size=28)
    for label, x_pos, y_pos in zip(dk['Unnamed: 0'], dk['umap0'].values, dk['umap1'].values)
]

adjust_text(texts)