#Allow Drive Access

In [None]:
# Mount Google Drive under /content/drive; the data and model paths set later
# in this notebook point inside that mount. force_remount=True is passed so
# re-running the cell mounts again instead of reusing a previous mount.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


#Install Dependencies

In [None]:
%%capture
!pip install fasttext==0.9.2
!pip install gensim==4.2.0

#Imports and Declaring Constants

In [None]:
#Imports
import pandas as pd
import fasttext
import fasttext.util
from multiprocessing import Pool
import numpy as np
from gensim.utils import simple_preprocess
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS

In [None]:
#Set paths and file names
# Folder (inside the mounted Drive) that holds the input and output CSV files
path = "/content/drive/My Drive/skill_bias_jobs/data/"
# Folder (inside the mounted Drive) that holds the fastText model binary
mpath = "/content/drive/My Drive/skill_bias_jobs/model/"
# Input postings CSV, read from `path` before embedding
file = 'skills_jd.csv'
# fastText model binary, loaded from `mpath`
mfile = 'jobs_fasttext.bin'
# Output: inertia per candidate cluster count (elbow search)
topic_metrics_file = 'topic_number_metrics.csv'
# Output of clustering (postings + cluster id); re-read by the labeling step
occupation_file = 'jt_occupation_fasttext.csv'
# Output: top-scoring title terms per cluster
occupation_label_file = 'occupation_labels_fasttext.csv'

#Occupation Classification

In [None]:
# Function to get sentence vectors
def embeddings(title):
  """Return the fastText sentence vector for one piece of text.

  Relies on the module-level ``model`` having been loaded before the first
  call (worker processes see it because the pool is created afterwards).

  Args:
    title: Text to embed; the caller passes job title + description.

  Returns:
    Whatever ``model.get_sentence_vector`` returns for the flattened text.
  """
  # fastText's Python binding rejects text containing a line break with
  # ValueError ("predict processes one line at a time"), and descriptions are
  # only stripped at their ends, so flatten embedded newlines to spaces.
  single_line = title.replace("\n", " ")
  return model.get_sentence_vector(single_line)

#Loading data
df = pd.read_csv(path+file, encoding='ISO-8859-1', sep=',')

# Trim the two text columns and turn missing values into empty strings.
# The result is assigned back instead of calling fillna(inplace=True) on an
# attribute-access Series, which is not guaranteed to update the frame when
# pandas copy-on-write is active.
for text_col in ('job_title', 'description'):
    df[text_col] = df[text_col].str.strip().fillna('')
df = df.drop(columns=['key_skills'])

# Drop rows that are exact duplicates across all remaining columns
df = df.drop_duplicates()

#Loading Model
# Must be loaded before the pool is created: embeddings() reads this global.
model = fasttext.load_model(mpath + mfile)

# Embed "title description" for every posting using two worker processes.
# try/finally makes sure the workers are shut down even if map() raises.
p = Pool(2)
try:
    df['embeddings'] = p.map(embeddings, df['job_title']+ " " + df['description'])
finally:
    p.close()
    p.join()

# Stack the per-row vectors into one (n_rows, vector_dim) matrix; the vector
# width is taken from the data instead of being hard-coded to 300.
x = np.vstack(df['embeddings'].tolist())

# The vectors now live in `x`; keep the frame free of the array-valued column
df = df.drop(columns=['embeddings'])



## Finding cluster number based on elbow rule

In [None]:
# Fit k-means for k = 10, 20, ..., 990 and record the inertia of each fit.
inertias = []
total_clusters = []
for k in range(10,1000,10):
    # inertia_ is available right after fit(); the previous extra predict()
    # call produced a value that was never used, so it has been dropped.
    kmeans = KMeans(n_clusters = k, random_state=42)
    kmeans.fit(x)
    inertias.append(kmeans.inertia_)
    total_clusters.append(k)

# Persist the curve so the elbow can be inspected without re-fitting
dj = pd.DataFrame({'total_clusters':total_clusters,'inertias':inertias})
dj.to_csv(path+topic_metrics_file, index=False)

#Plotting inertia against number of clusters
import matplotlib.pyplot as plt
# Draw on an explicit Axes object rather than through the pyplot state machine
fig, ax = plt.subplots()
ax.plot(total_clusters, inertias, 'bx-')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Inertia')
plt.show()

## Clustering

In [None]:
#Kmeans: elbow rule suggests around 300 occupations
kmeans = KMeans(random_state=42, n_clusters=300)
kmeans.fit(x)

# Assign every posting to its cluster. The column is called 'cluster_knn'
# although the ids come from k-means; later cells group by this exact name.
cluster_ids = kmeans.predict(x)
df['cluster_knn'] = cluster_ids

# Save postings together with their cluster id for the labeling step
df.to_csv(path + occupation_file, index=False)

# Occupation Labeling based on Job Titles
Note: The top-scoring title terms computed below summarise what each occupation cluster mainly contains.

In [None]:
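#Setting stopwords: common English function words plus noise terms found in job titles (city, company and personal names, months and weekdays, salary/hiring jargon and frequent misspellings)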
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him',
              'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who',
              'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did',
              'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
              'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
              'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only',
              'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y',
              'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma',
              'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't",
              'wouldn', "wouldn't", 'cum', 'urgent', 'requirement', 'boy', 'required', 'job', 'jobs', 'iii', 'iv', 'etc', 'th', 'salary', 'chennai', 'women', 'wanted',
              'ahmedabad', 'bangalore', 'walkin', 'ctc', 'walking', 'walk', 'earn', 'delhi', 'new', 'role', 'homes', 'working', 'two', 'way', 'interview', 'pm', 'person',
              'saturday', 'sunday', 'feb', 'hiring', 'need', 'ii', 'female', 'winter', 'pre',
              'final', 'candidates', 'candidate', 'west', 'get', 'years', 'year', 'lpa', 'big', 'per', 'month', 'coimbatore',
              'black', 'white', 'indiranagar', 'australian', 'inside', 'apply', 'available', 'amazon', 'pvt', 'limited', 'looking', 'patna', 'world',
              'one', 'male', 'multiple', 'basis', 'immediate', 'opputunity', 'females', 'woman', 'girls', 'girl', 'ladies', 'lady', 'males', 'man', 'men', 'guy', 'guys', 'boys',
              'gents', 'gent', 'good', 'opportunity', 'letter', 'upto', 'non', 'mahindra', 'us',
              'shortlisted', 'ambattur', 'opening', 'infosys', 'openings', 'accenture', 'results', 'waiting', 'gross', 'malaysia',
              'cv', 'resume', 'drive', 'position', 'offer', 'sal', 'profile', 'contact', 'spot', 'mega', 'firm',
              'short', 'lacs', 'listed', 'ltd', 'leading', 'rina', 'india', 'others', 'nikitha', 'excellent', 'noida', 'also', 'co',
              'indianmoney', 'based', 'mounika', 'syed', 'deepika', 'hire', 'udhyog', 'bharat',
              'invites', 'technologies', 'august', 'consultancy', 'huge', 'best', 'location', 'uk', 'tuesday',
              'kind', 'attention', 'face', 'level', 'lac', 'package', 'convergys', 'hyderabad', 'reputed', 'mumbai', 'ricago', 'walkout', 'bizknowmics', 'free',
              'congratulations', 'currently', 'vacancy', 'hc', 'ntpc', 'honda', 'samsung', 'siel', 'step', 'thursday', 'june', 'koramangala', 'interviews',
              'selected', 'sector', 'concentrix', 'well', 'leo', 'include', 'eligible', 'industry', 'invite', 'october', 'cube', 'thane', 'cal', 'rally', 'st',
              'private', 'hp', 'rakesh', 'kolkata', 'open', 'dell', 'july', 'hdfc', 'indirapuram', 'april', 'tesco', 'sun', 'september', 'malakpet', 'place', 'permit',
              'usa', 'companies', 'registered', 'sunita', 'dec', 'try', 'nehru', 'startups', 'organisation', 'oct', 'urgently', 'rivera',
              'cryoviva', 'bigbasket', 'opportunities', 'december', 'sat', 'de', 'appointment', 'reminder', 'servicenow', 'murali',
              'invitation', 'small', 'includes', 'gurgaon', 'may', 'make', 'chaithra', 'requirements', 'next', 'icici', 'joining', 'opeinig',
              'cnx', 'headquartered', 'colorado', 'hurry', 'monday', 'summer', 'varsha', 'jan', 'swiggy', 'nterview', 'hana', 'passion', 'anyone', 'bold', 'kora',
              'joinees', 'batch', 'lakhs', 'nov', 'sub', 'ahmadabad', 'nagar', 'dreamgains', 'january', 'nj', 'south', 'kondapur', 'kotak',
              'genpact', 'yrs', 'verma', 'ashish', 'ranchi', 'locality', 'start', 'birla', 'aditya', 'jamnagar', 'require', 'week',
              'omega', 'shubhalaxmi', 'nirman', 'vinay', 'geekay', 'wns', 'parel', 'join', 'flipkart', 'range', 'plus', 'exl', 'adeeba', 'friday', 'limit',
              'jana', 'date', 'shortlist', 'national', 'kolkatta', 'rajkot', 'future', 'hari', 'tcs', 'keerthi', 'east', 'pue', 'exceutive', 'park', 'haryana', 'loc',
              'mangalore', 'super', 'thywill', 'jaipur', 'paytm', 'citi', 'quota', 'gachibowli', 'grab', 'details', 'description', 'guarantee', 'become',
              'jayanagar', 'opportinities', 'ghaziabad', 'rounds', 'indore', 'like', 'left', 'basheera', 'hyd', 'mid', 'solidworks', 'swathi', 'paid',
              'hring', 'nandith', 'nagpur', 'mar', 'ariba', 'away', 'tomorrow', 'hirng', 'pan', 'hiya', 'oriented', 'grow', 'excellence', 'congratulation', 'indiaranagar',
              'ways', 'ten', 'wait', 'ample', 'jp', 'lucknow', 'sandhya', 'yr', 'sooner', 'bujji', 'exciting', 'avaya', 'ludhiana', 'sept', 'banca',
              'sukanya', 'nischal', 'gentle', 'chandigarh', 'bengal', 'harsh', 'shivangi', 'gujarat', 'going', 'months', 'wonders', 'jenifer', 'vijayanagar', 'asha', 'using',
              'go', 'various', 'ltmd', 'surat', 'days', 'tivoli', 'aug', 'sametime', 'great', 'want', 'naaz', 'gurugram', 'north', 'assam', 'november', 'jyoti',
              'persons', 'addl', 'ranjangaon', 'sep', 'hired', 'jdedwards', 'yalamanchili', 'ltocas', 'sandstone', 'growe', 'locations', 'kalyani', 'pavithra', 'followed',
              'wipro', 'sbi', 'chhattisgarh', 'know', 'oppurtunity', 'saket', 'opporunities', 'mukesh', 'shruthi', 'prefered', 'preferred', 'later', 'getting', 'opportunityfor',
              'stay', 'carefully', 'asia', 'jagatpura', 'reetu', 'walkins', 'opportunies', 'jai', 'allahabad', 'farha', 'please', 'near', 'goregaon',
              'sablaa', 'interested', 'third', 'ncr', 'zoya', 'salem', 'edwards', 'australia', 'padma', 'xi', 'joiner', 'joiners', 'kovaipudur', 'monthly',
              'balrampur', 'aakash', 'smriti', 'madurai', 'uae', 'freshersfemale', 'weekdays', 'nasik', 'given', 'purnima', 'poornima', 'mens', 'wednesday', 'oppurtunties',
              'rajani', 'madhapur', 'mohan', 'pali', 'genuine', 'aditi', 'met', 'raj', 'mishresh', 'lkh', 'lk', 'bhilwara', 'arnold', 'shorltisted', 'outs',
              'chandivali', 'vietnam', 'womens', 'greet', 'badarpur', 'yearas', 'vizag', 'lahari', 'soon', 'vaccancy', 'rich', 'lower', 'fair', 'dubai', 'bandhan','axis', 'graduate', 'andheri','pune',
              'jamshedpur','married','preet','marriott','navi','shomik','dasgupta','day','shift','notice','work','flexible','selection']

In [None]:
#Cleaning title
df = pd.read_csv(path + occupation_file)

#Convert to lower case
# Titles that were saved as empty strings are read back as NaN, so fill them
# first; assignment is used instead of fillna(inplace=True) on an
# attribute-access Series, which may not update the frame.
df['job_title'] = df['job_title'].fillna('').str.lower()

#Keep only alphabets and numbers
# regex=True must be explicit: since pandas 2.0 str.replace treats the pattern
# as a literal string by default, which would leave the titles unchanged.
df['job_title'] = df['job_title'].str.replace('[^0-9a-zA-Z]+', ' ', regex=True)

#Removing leading and trailing spaces
# Strip every string cell once and leave non-string cells untouched. This is
# a per-column map in place of DataFrame.applymap (deprecated in recent
# pandas), and the original ran the identical pass twice.
df = df.apply(lambda col: col.map(lambda cell: cell.strip() if isinstance(cell, str) else cell))

#Removing Extra Spaces
df['job_title'] = df['job_title'].replace(r'\s+', ' ', regex=True)

# Stop word removal
# A set gives constant-time membership tests against the long stop-word list.
stop_word_set = set(stop_words)
df['job_title'] = [
    ' '.join(
        word
        for word in simple_preprocess(str(doc), deacc=False, min_len=2, max_len=30)
        if word not in stop_word_set
    )
    for doc in df['job_title']
]

# The stray indented repeat of the character filter that used to end this cell
# was removed: it raised IndentationError and duplicated the step above.


##Getting phrases from titles using bigram pasting

In [None]:
#Training phrase model
# One token list per cleaned title (titles are already space-separated)
sentences = [title.split(" ") for title in df['job_title'].to_list()]

# NOTE(review): threshold=-1 with npmi scoring looks like the most permissive
# setting for merging bigrams - confirm against the gensim documentation.
phrase_model = Phrases(
    sentences,
    connector_words=ENGLISH_CONNECTOR_WORDS,
    scoring = 'npmi',
    threshold=-1,
)

# The token lists are only needed for training
del sentences

#Transforming job titles
def bigram_pasting(sentence):
  """Apply the trained phrase model to one space-separated title.

  Splits ``sentence`` on single spaces, passes the tokens through the
  module-level ``phrase_model`` and joins the result with single spaces.
  """
  tokens = sentence.split(" ")
  merged_tokens = phrase_model[tokens]
  return " ".join(merged_tokens)

# Rewrite every title with the phrase model using two worker processes.
# try/finally makes sure the workers are shut down even if map() raises.
p = Pool(2)
try:
    df['job_title'] = p.map(bigram_pasting, df['job_title'])
finally:
    p.close()
    p.join()

##Obtaining TF-IDF-based labels

In [None]:
#Joining all documents belonging to a given topic
# One row per cluster: all of its titles concatenated with single spaces
titles_by_cluster = df.groupby(['cluster_knn'], as_index = False)
docs_per_topic = titles_by_cluster.agg({'job_title': ' '.join})

#Obtaining TF-IDF scores
def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """Score vocabulary terms per cluster from one joined document per cluster.

    Args:
        documents: Sequence of strings, one per cluster.
        m: Numerator of the inverse-frequency term; the caller in this
            notebook passes the number of individual postings.
        ngram_range: Forwarded unchanged to ``CountVectorizer``.

    Returns:
        tuple: ``(tf_idf, count)`` where ``tf_idf`` has one row per vocabulary
        term and one column per document, and ``count`` is the fitted
        ``CountVectorizer``.
    """
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english").fit(documents)
    t = count.transform(documents).toarray()

    # Number of counted terms in each document
    w = t.sum(axis=1)

    # Term frequency per document. A document with no vocabulary terms has
    # w == 0; dividing would yield NaN for its whole column, so those entries
    # are left at 0 instead.
    tf = np.divide(t.T, w, out=np.zeros(t.T.shape, dtype=float), where=w != 0)

    # Total occurrences of each term across all documents
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count

# Score terms per cluster; m is the number of individual postings in df
tf_idf, count = c_tf_idf(documents=docs_per_topic['job_title'].values, m=len(df))

#Extracting top 20 words for each occupation
def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    """Return the ``n`` highest-scoring terms for every cluster.

    Args:
        tf_idf: 2-D array with one row per vocabulary term and one column per
            cluster, in the same order as ``docs_per_topic``.
        count: Fitted vectorizer exposing ``get_feature_names_out()`` (or the
            older ``get_feature_names()``).
        docs_per_topic: Object with a ``cluster_knn`` attribute listing the
            cluster label of each column of ``tf_idf``.
        n: Number of terms to keep per cluster.

    Returns:
        dict: cluster label -> list of ``(term, score)`` tuples, best first.
    """
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when present and fall back for older releases.
    if hasattr(count, "get_feature_names_out"):
        words = [str(word) for word in count.get_feature_names_out()]
    else:
        words = list(count.get_feature_names())

    labels = list(docs_per_topic.cluster_knn)
    scores_by_topic = tf_idf.T

    # argsort is ascending, so the last n positions are the best terms
    indices = scores_by_topic.argsort()[:, -n:]

    # Plain str/float values keep the stringified tuples written to CSV
    # independent of how NumPy prints its scalar types.
    top_n_words = {
        label: [(words[j], float(scores_by_topic[i][j])) for j in indices[i][::-1]]
        for i, label in enumerate(labels)
    }
    return top_n_words

def extract_topic_sizes(df):
    """Count how many rows fall into each cluster.

    The previous body selected a truncated column name (``.ti``) and renamed a
    non-existent ``"text"`` column, so the ``"Size"`` column it sorted by was
    never created. Row counts are now taken directly from the groupby.

    Args:
        df: DataFrame with a ``cluster_knn`` column.

    Returns:
        DataFrame with columns ``Topic`` (cluster id) and ``Size`` (number of
        rows), sorted by ``Size`` in descending order.
    """
    topic_sizes = (df.groupby('cluster_knn')
                     .size()
                     .reset_index(name='Size')
                     .rename(columns={'cluster_knn': 'Topic'})
                     .sort_values('Size', ascending=False))
    return topic_sizes

# Top 20 scored terms per cluster, keyed by cluster id
top_n_words = extract_top_n_words_per_topic(
    tf_idf=tf_idf,
    count=count,
    docs_per_topic=docs_per_topic,
    n=20,
)

# One column per cluster, one row per rank; written with the default index
dj = pd.DataFrame(data=top_n_words)
dj.to_csv(path + occupation_label_file)