# Business Problem:

Our main business objective is to understand the dynamics of the labour market of Armenia, using online job-portal posts as a proxy.

# Import necessary packages

In [None]:
import numpy as np
import pandas as pd

#for charts
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from wordcloud import WordCloud      #need to install wordcloud package

In [None]:
#for text processing
import string
import re
import nltk
from textblob import TextBlob

In [None]:
#for tokenization
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#for feature selection
from sklearn import decomposition

#for model building
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report

### Import data

In [None]:
jobs = pd.read_csv('/kaggle/input/jobposts/data job posts.csv')

In [None]:
jobs.head().T

In [None]:
jobs.shape

In [None]:
jobs.info()

In [None]:
#lowercase the column names
jobs.columns = jobs.columns.str.lower()

In [None]:
jobs.columns

# Data Cleaning

In [None]:
#removing duplicate jobposts based on title and post
jobs = jobs.drop_duplicates(['jobpost', 'title'])

In [None]:
jobs.shape

In [None]:
#removing records with null title
#jobs = jobs[jobs.title.notna()]
#jobs.shape

# Types of in-demand jobs in Armenia

**1. Preprocessing the text data**

In [None]:
jobs['title'] = jobs['title'].astype('str')

In [None]:
string.punctuation

In [None]:
#UDF to do basic cleaning of a text column (used here on the job titles)
def clean_data(text):
    """Normalise a free-text field so it can be tokenised.

    Steps, in order:
      1. lowercase the text;
      2. replace every ``<alphanumerics><punctuation><alphanumerics>`` run
         (e.g. ``full-time``, ``09:30``) with a single space -- the whole run
         is dropped, not only the punctuation character;
      3. delete all digits and any remaining punctuation characters;
      4. collapse whitespace runs to one space and strip both ends.

    Whitespace is normalised last because steps 2 and 3 can themselves create
    leading, trailing or doubled spaces.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        Cleaned, lowercase text without digits or punctuation.
    """
    #special_chars = re.compile('[@!#$%^&*()<>?/\|}{~:;]')
    #text = re.sub(special_chars,'', text)
    special_char_reg = '([a-zA-Z0-9]+)' + '[!"#$%&\'()*+,-./:;<=>?@\\^_`{|}~]' + '([a-zA-Z0-9]+)'
    text = text.lower()  # convert all the text into lowercase
    text = re.sub(special_char_reg, ' ', text)  # drop punctuation-joined tokens
    text = re.sub(r'\d+', '', text) #remove digits
    text = ''.join(c for c in text if c not in string.punctuation)   #remove special symbols
    text = re.sub(r'\s+', ' ', text) #remove all line formattings
    return text.strip()  #remove starting and trailing whitespaces

In [None]:
a = 'Ful8l-ti9me Community Connections f09:053yy'
special_char_reg = '([a-zA-Z0-9]+)' + '[!"#$%&\'()*+,-./:;<=>?@\\^_`{|}~]' + '([a-zA-Z0-9]+)'
re.sub(special_char_reg, ' ', a).strip()

In [None]:
jobs.title.head(6)

In [None]:
title_df = jobs.title.apply(lambda x : clean_data(x))
title_df.head()

In [None]:
from nltk import WordNetLemmatizer
#nltk.download('punkt')

In [None]:
#nltk.download('wordnet')

In [None]:
def lemma(text):
    """Lemmatise every token of ``text`` and join the lemmas with spaces.

    The text is split with ``nltk.word_tokenize`` and each token is passed to
    ``WordNetLemmatizer.lemmatize`` with its default arguments.

    Parameters
    ----------
    text : str
        Text to lemmatise.

    Returns
    -------
    str
        Space-separated lemmas, in the original token order.
    """
    lemmatizer = WordNetLemmatizer()  # built once per call, not once per token
    word_list = nltk.word_tokenize(text) #tokenize before lemmatization
    return ' '.join(lemmatizer.lemmatize(word) for word in word_list)

In [None]:
# Define the sentence to be lemmatized
sentence = "public bats outreach and strengthening of a growth"
sentence = "The striped bats are hanging on their feet for best"
# Tokenize: Split the sentence into words
word_list = nltk.word_tokenize(sentence)
print(word_list)
#> ['The', 'striped', 'bats', 'are', 'hanging', 'on', 'their', 'feet', 'for', 'best']

# Lemmatize list of words and join
lemmatized_output = ' '.join([WordNetLemmatizer().lemmatize(w) for w in word_list])
print(lemmatized_output)

In [None]:
##Lematization
import spacy
#neccesary to download the english model using "python -m spacy download en"
nlp = spacy.load('en_core_web_sm')
# Parse the sentence using the loaded 'en' model object `nlp`
doc = nlp(sentence)

# Extract the lemma for each token and join
" ".join([token.lemma_ for token in doc])

In [None]:
title_df_1 = title_df.apply(lambda x : lemma(x))

In [None]:
title_df_1.head()

In [None]:
#Stop words removal
stop = nltk.corpus.stopwords.words('english')
#stop.extend(['armenian', 'armenia', 'job', 'title', 'position', 'location', 'responsibilities', 'application',
#                  'procedures', 'deadline', 'required','qualifications', 'renumeration', 'salary', 'date', 'company', 'llc'])

In [None]:
title_df_1 = title_df_1.apply(lambda x : ' '.join(x for x in x.split() if x not in stop))

In [None]:
title_df_1.head()

Now, to understand the most in-demand jobs in Armenia, we can create an n-gram **document-term matrix (DTM)** on the job titles (the cell below uses unigrams) and treat the most frequently occurring tokens as the most in-demand jobs.

In [None]:
#Tokenization using count vectorizer
count_vect = CountVectorizer(ngram_range=(1,1))
token = count_vect.fit_transform(title_df_1)

In [None]:
token

In [None]:
print(count_vect.get_feature_names())

In [None]:
print('Total number of tokens/words in all the job titles - ', len(count_vect.get_feature_names()))

In [None]:
temp_df =  pd.DataFrame(token.toarray(), columns=count_vect.get_feature_names())
temp_df.tail()

In [None]:
#count the accurence of each token in entire corpus
count_df = temp_df.sum(axis=0)   # column-wise total = corpus frequency of each token

In [None]:
count_df = pd.DataFrame(count_df).reset_index()

In [None]:
count_df.columns = ['Word', 'Count']

In [None]:
top_jobs = count_df.sort_values(by= 'Count', ascending=False)

In [None]:
top_jobs[:10]

In [None]:
# plot the WordCloud image to show top 50 type of demanding jobs in armenia
# The cloud is built from the token counts so that font size reflects how
# often a token occurs; joining the 50 words into one string would present
# every token exactly once and discard the counts.
top_50_counts = {word: int(cnt) for word, cnt in zip(top_jobs['Word'][:50], top_jobs['Count'][:50])}
wordcloud = WordCloud(width = 1000, height = 500).generate_from_frequencies(top_50_counts)
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()

# Job Nature changing over time

We can use dispersion plot to see how jobs change over time. To do this we need to get the important topics out of the jobpost and then plot their dispersion over time.

In [None]:
jobs['jobpost'] = jobs['jobpost'].astype('str')

In [None]:
#UDF to do basic cleaning of a text column (used here on the full job posts)
def clean_data(text):
    """Normalise a free-text field so it can be tokenised.

    Steps, in order:
      1. lowercase the text;
      2. replace every ``<alphanumerics><punctuation><alphanumerics>`` run
         (e.g. ``full-time``, ``09:30``) with a single space -- the whole run
         is dropped, not only the punctuation character;
      3. delete all digits and any remaining punctuation characters;
      4. collapse whitespace runs to one space and strip both ends.

    Whitespace is normalised last because steps 2 and 3 can themselves create
    leading, trailing or doubled spaces.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        Cleaned, lowercase text without digits or punctuation.
    """
    #special_chars = re.compile('[@!#$%^&*()<>?/\|}{~:;]')
    #text = re.sub(special_chars,'', text)
    special_char_reg = '([a-zA-Z0-9]+)' + '[!"#$%&\'()*+,-./:;<=>?@\\^_`{|}~]' + '([a-zA-Z0-9]+)'
    text = text.lower()  # convert all the text into lowercase
    text = re.sub(special_char_reg, ' ', text)  # drop punctuation-joined tokens
    text = re.sub(r'\d+', '', text) #remove digits
    text = ''.join(c for c in text if c not in string.punctuation)   #remove special symbols
    text = re.sub(r'\s+', ' ', text) #remove all line formattings
    return text.strip()  #remove starting and trailing whitespaces

In [None]:
jobs.jobpost.head()

In [None]:
jobpost_df = jobs.jobpost.apply(lambda x : clean_data(x))

In [None]:
jobpost_df.head(10)

In [None]:
##Lematization
import spacy
#neccesary to download the english model using "python -m spacy download en"
#nlp = spacy.load('en_core_web_sm')
lemmatized_out = []
count = 0
#for jobpost in jobpost_df:
#    doc = nlp(jobpost)
#    x = " ".join(word.lemma_ for word in doc)
#    print(count)
#    count += 1
#    lemmatized_out.append(x)

In [None]:
#lemmatized_out[0]

In [None]:
#Stop words removal
stop = nltk.corpus.stopwords.words('english')
stop.extend(['armenian', 'armenia', 'job', 'title', 'position', 'location', 'responsibility', 'application',
             'procedure', 'deadline', 'requirement','qualification', 'renumeration', 'salary', 'date', 'company', 'llc',
             'person', 'employement', 'post', 'follow', 'resume', 'open', 'about', 'announcement', 'link', 'website',
             'organization', 'duration'])

In [None]:
#jobpost_df_0 = pd.Series(lemmatized_out)

In [None]:
jobpost_df.head(10)

In [None]:
# Membership is tested once per token of every post; a set makes each test
# a hash lookup instead of a scan of the whole stop-word list.
stop_set = set(stop)
jobpost_df_1 = jobpost_df.apply(lambda x : ' '.join(word for word in x.split() if word not in stop_set))

In [None]:
jobpost_df_1.head(10)

In [None]:
#Now we will create tokens out of this processed data

tfidf_vect = TfidfVectorizer(ngram_range=(1,1), min_df=0.05, max_df=0.95)
tfidf_vect

In [None]:
token_jobpost = tfidf_vect.fit_transform(jobpost_df_1)

In [None]:
vocab = tfidf_vect.get_feature_names()
#print(vocab)

In [None]:
token_jobpost

In [None]:
len(tfidf_vect.get_feature_names())

In [None]:
token_df = pd.DataFrame(token_jobpost.toarray(), columns=tfidf_vect.get_feature_names())
token_df.head()

In [None]:
#Apply LDA technique to understand important job nature and profiles

lda = decomposition.LatentDirichletAllocation(n_components = 5, learning_method = 'online', max_iter = 50, random_state = 3)
lda.fit_transform(token_jobpost)
topics = lda.components_

In [None]:
topics

In [None]:
# summarise every LDA topic by its n_top_words highest-weighted vocabulary terms
n_top_words = 10
vocab_arr = np.array(vocab)
topic_summaries = []
for topic_dist in topics:
    ranked = np.argsort(topic_dist)            # term indices, ascending by weight
    top_idx = ranked[:-(n_top_words+1):-1]     # the heaviest terms, heaviest first
    topic_summaries.append(' '.join(vocab_arr[top_idx]))

topic_summaries

In [None]:
#vocab = []
#def fn_token(post):
#    list_temp = nltk.word_tokenize(post)
#    vocab.extend(list_temp)

#jobpost_df_1.apply(lambda x : fn_token(x))

In [None]:
#full_vocab = []
#for word in  vocab:
#        if word not in full_vocab:
#            full_vocab.append(word)

In [None]:
#print(full_vocab)

In [None]:
topic_words_tokens = []
for topic in topic_summaries:
    word_token = nltk.word_tokenize(topic)
    topic_words_tokens.extend(word_token)
print(topic_words_tokens)


In [None]:
#use lexical dispersion plot to see the topics use over time
#Start pylab inline mode, so figures will appear in the notebook
#%pylab inline

#from nltk.draw.dispersion import dispersion_plot

#dispersion_plot(vocab, topic_words_tokens[:10])

In [None]:
plot_df = pd.concat([jobpost_df_1, jobs.year], axis = 1)
#plot_df = jobpost_df_2.apply(lambda x : fn() )

In [None]:
topic_words_tokens[:10]

In [None]:
('topic', '2018')

In [None]:
nt = [(topic, year)  for year in plot_df.year  for topic in topic_words_tokens[0:9] ]

In [None]:
cfd = nltk.ConditionalFreqDist(nt)

In [None]:
#conditional frequency distribution plot to see the use of topics over time
# Each post is paired with its OWN year. Looping over the year column and the
# jobpost column in two nested loops would pair every post with every row's
# year, which both distorts the counts and is quadratic in the number of posts.
top_topic_words = topic_words_tokens[:10]
cfd = nltk.ConditionalFreqDist(
    (target, year)
    for post, year in zip(plot_df.jobpost, plot_df.year)
    for w in nltk.word_tokenize(post)
    for target in top_topic_words
    if w.lower().startswith(target))
cfd.plot()

#    for w in jobpost_df_1.words(year)

# Desired characteristics and  Skill-sets
To understand this, we can build clusters from the job description column of the data.

In [None]:
# Rows with a job description. .copy() makes x an independent frame: later
# cells assign columns to it (jobdescription, cluster_3 ... cluster_9), which
# on a plain slice of `jobs` raises SettingWithCopyWarning.
x = jobs[jobs.jobdescription.notna()].copy()

In [None]:
jobs.shape

In [None]:
x.shape

In [None]:
x.head()

#### Text pre-processing

In [None]:
x['jobdescription'] = x['jobdescription'].astype('str')

In [None]:
desc_df = x.jobdescription.apply(lambda x : clean_data(x))
desc_df.head()

In [None]:
#LEmmatization
desc_df_1 = desc_df.apply(lambda x : lemma(x))

In [None]:
#lemmatized_out[0:6]

In [None]:
#desc_df_0 = pd.Series(lemmatized_out)

In [None]:
desc_df_1.head()

In [None]:
#stop word removal
desc_df_1 = desc_df_1.apply(lambda x : ' '.join(x for x in x.split() if x not in stop))

In [None]:
desc_df_1.head()

In [None]:
#Tokenization
tfidf_vect = TfidfVectorizer(ngram_range=(1,1), min_df = 0.05, max_df=0.95, stop_words='english')
x_tdm = tfidf_vect.fit_transform(desc_df_1)
#print(x_tdm)

In [None]:
df_clust = pd.DataFrame(x_tdm.toarray(), columns=tfidf_vect.get_feature_names())

In [None]:
df_clust.head()

### Clustering

#### Dimension Reduction

In [None]:
from sklearn.cluster import KMeans
from sklearn import metrics

In [None]:

model = KMeans(n_clusters=5, 
               init='k-means++', 
               max_iter=100, n_init=1,random_state=5)
kmeans = model.fit(x_tdm)

In [None]:
# fit one k-means model per candidate cluster count, k = 3 .. 9
km_3, km_4, km_5, km_6, km_7, km_8, km_9 = (
    KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state=5).fit(x_tdm)
    for k in range(3, 10)
)

In [None]:
# save the cluster labels and sort by cluster
x['cluster_3'] = km_3.labels_
x['cluster_4'] = km_4.labels_
x['cluster_5'] = km_5.labels_
x['cluster_6'] = km_6.labels_
x['cluster_7'] = km_7.labels_
x['cluster_8'] = km_8.labels_
x['cluster_9'] = km_9.labels_

In [None]:
len(tfidf_vect.get_feature_names())

In [None]:
vocab = np.array(tfidf_vect.get_feature_names())
vocab

In [None]:
cluster_centers = np.array(km_5.cluster_centers_)
cluster_centers[0].argsort()

In [None]:
km_3.labels_

In [None]:
x['cluster_3'].value_counts()/sum(x['cluster_3'].value_counts())

In [None]:
x['cluster_4'].value_counts()/sum(x['cluster_4'].value_counts())

In [None]:
x['cluster_5'].value_counts()/sum(x['cluster_5'].value_counts())

In [None]:
x['cluster_6'].value_counts()/sum(x['cluster_6'].value_counts())

In [None]:
x['cluster_7'].value_counts()/sum(x['cluster_7'].value_counts())

In [None]:
x['cluster_8'].value_counts()/sum(x['cluster_8'].value_counts())

Either 5 or 6 clusters is the optimal solution for our data.

### Evaluation clusters

### 1. Silhouette Coefficient(Higher the better)

In [None]:
from sklearn import  metrics
metrics.silhouette_score(x_tdm, labels=km_3.labels_)

In [None]:
# silhouette score of every fitted model, in the order k = 3 .. 9
scores = [
    metrics.silhouette_score(x_tdm, labels=fitted.labels_)
    for fitted in (km_3, km_4, km_5, km_6, km_7, km_8, km_9)
]
scores

In [None]:
# silhouette score against the number of clusters (k = 3 .. 9)
plt.plot(range(3,10), scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.grid(True)   # a real bool: the string 'True' only worked because any non-empty string is truthy

7 Cluster seems to be optimal

In [None]:
# sorting the cluster centers for 5 clusters
sorted_vals = [km_5.cluster_centers_[i].argsort() for i in range(0,np.shape(km_5.cluster_centers_)[0])]

In [None]:
# get top 10 words from that cluster
words=set()
for i in range(len(km_5.cluster_centers_)):
    words = set(vocab[sorted_vals[i][-10:]])
    print(words)

In [None]:
# sorting the cluster centers for 6 clusters
sorted_vals = [km_6.cluster_centers_[i].argsort() for i in range(0,np.shape(km_6.cluster_centers_)[0])]

In [None]:
# get top 10 words from that cluster
words=set()
for i in range(len(km_6.cluster_centers_)):
    words = set(vocab[sorted_vals[i][-10:]])
    print(words)

The above analysis shows that in the 6-cluster solution the top tokens are repeated across clusters, i.e. the clusters are more similar to each other. That means the **5-cluster** solution is optimal.

# IT Job Classification

In [None]:
# Keep only the posts in which every text field used by the classifier is
# present. .copy() makes class_data an independent frame, so the 'it' label
# column assigned below is not written into a slice of `jobs`.
text_cols = ['title', 'jobrequirment', 'requiredqual', 'jobdescription', 'aboutc', 'company']
class_data = jobs.dropna(subset=text_cols).copy()

In [None]:
class_data.shape

In [None]:
class_data.isna().sum()

In [None]:
#identify Y variable
# 1 = IT job, 0 = non-IT job. The cast goes through bool instead of the
# identity test `x is False`: that test is only true for the Python singleton
# False, so once the column holds 0/1 (e.g. after the cell has run once)
# every row would be relabelled 1. The cast is stable on re-runs.
class_data['it'] = class_data['it'].astype(bool).astype(int)
y=class_data['it']

In [None]:
y.value_counts()

In [None]:
# class balance of the target; the Series is passed as x= because the first
# positional parameter of countplot is `data`, not the variable to count
sns.countplot(x=y)

In [None]:
# Concatenate all text fields of a post into one space-separated document per
# row. The result is stored directly in class_df and class_data is left as the
# filtered frame: overwriting class_data with the joined Series would make the
# column look-ups in this cell fail when it is run a second time.
class_df = class_data['title'].str.cat(
    class_data[['jobrequirment', 'requiredqual', 'jobdescription', 'aboutc', 'company']],
    sep =" ")

In [None]:
class_df.head()

### Text pre-processing

In [None]:
class_df = class_df.apply(lambda x : clean_data(str(x)))
class_df.head()

In [None]:
#Lemmatization
class_df_1 = class_df.apply(lambda x : lemma(x))

In [None]:
class_df_1.head()

In [None]:
#stop word removal
class_df_1 = class_df_1.apply(lambda x : ' '.join(x for x in x.split() if x not in stop))

In [None]:
class_df_1.head()

In [None]:
#Tokenization
tfidf_vect = TfidfVectorizer(ngram_range=(1,1), min_df = 0.05, max_df=0.95, stop_words='english')
x_tdm = tfidf_vect.fit_transform(class_df_1)
#print(x_tdm)

In [None]:
df_clust = pd.DataFrame(x_tdm.toarray(), columns=tfidf_vect.get_feature_names())

In [None]:
df_clust.head()

## Building a Random Forest Model

#### Divide the data into train and test

In [None]:
df_clust.shape

In [None]:
from sklearn.model_selection import train_test_split

train_x, test_x,train_y, test_y = train_test_split(df_clust,y, test_size = 0.2, random_state = 5)
print(train_x.shape, test_x.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Grid over forest size and the number of features tried at each split.
param_grid = {'n_estimators':[130,150,160,180,200],
              'max_features':[13,15,17,19]}

# random_state pins the bootstrap samples and feature draws, so the CV scores
# and the selected parameters are the same on every run of the notebook.
grid_rf = GridSearchCV(estimator= RandomForestClassifier(random_state=5),
                      param_grid=param_grid,
                      cv = 10,
                      n_jobs=-1, verbose=True)

grid_rf.fit(train_x,train_y)

In [None]:
grid_rf.best_score_

In [None]:
grid_rf.best_params_

In [None]:
grid_rf.best_estimator_

In [None]:
#Take the fitted model
# grid_rf was created with the default refit=True, so best_estimator_ has
# already been fitted on the whole of train_x/train_y with the best
# parameters. Fitting it again is redundant and, for an unseeded forest,
# would replace it with a different model than the one the search produced.
rf_model = grid_rf.best_estimator_

#### Predict the output for the train and test sets

In [None]:
rf_train_predict = pd.DataFrame({'actual' : train_y,
                                 'predicted' : rf_model.predict(train_x)})
rf_train_predict.head()

In [None]:
rf_test_predict = pd.DataFrame({'actual' : test_y,
                                 'predicted' : rf_model.predict(test_x)})
rf_test_predict.head()

# Model Evaluation

In [None]:
#1. Check accuracy score on train and test

print('Accuracy Score for train dataset : ' , metrics.accuracy_score(rf_train_predict.actual, rf_train_predict.predicted))
print('Accuracy Score for test dataset : ' , metrics.accuracy_score(rf_test_predict.actual, rf_test_predict.predicted))

In [None]:
#2. Check roc_auc score on train and test
# ROC-AUC measures how well the model ranks positives above negatives, so it
# is scored with the predicted probability of class 1. Hard 0/1 predictions
# give a curve with a single operating point.
pos_col = list(rf_model.classes_).index(1)   # column of predict_proba that holds P(it == 1)
print('ROC-AUC Score for train dataset : ' , metrics.roc_auc_score(train_y, rf_model.predict_proba(train_x)[:, pos_col]))
print('ROC-AUC Score for validation dataset : ' , metrics.roc_auc_score(test_y, rf_model.predict_proba(test_x)[:, pos_col]))

In [None]:
#3. Create confusion matrix
#for test
# rows = actual class, columns = predicted class, both ordered [1, 0] to match
# the tick labels. `labels` is passed by keyword (it is keyword-only in current
# scikit-learn) and the cells are integer counts, hence fmt='d'.
conn_cm_test = metrics.confusion_matrix(rf_test_predict.actual, rf_test_predict.predicted, labels=[1,0])
sns.heatmap(conn_cm_test, fmt= 'd', annot=True,  xticklabels=['IT', 'NOT IT'], yticklabels=['IT', 'NOT IT'])

In [None]:
#4. Create classification report
print(metrics.classification_report(rf_test_predict.actual, rf_test_predict.predicted))

In [None]:
# Rank the features by importance, most important first. The frame is built
# in one step from arrays so that 'rank' and 'importance' get numeric dtypes;
# filling an empty frame row by row leaves object columns, on which
# .round(3) has no effect.
importances = rf_model.feature_importances_
indices = np.argsort(importances)[::-1]
feature_rank = pd.DataFrame({
    'rank': np.arange(1, len(indices) + 1),
    'feature': train_x.columns[indices],
    'importance': importances[indices],
})
feature_rank.round(3)

In [None]:
feature_rank[:17]

The presence of keywords such as software, developer, web, design and cs is among the most important features when classifying a job as **IT or NON-IT**.

# Similarity of Jobs

In [None]:
###Using TF-IDF as cosine similarity

In [None]:
#from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def get_cosine_sim(doc):
    """Return the pairwise cosine-similarity matrix of the texts in ``doc``.

    Parameters
    ----------
    doc : iterable of str
        The texts to compare; they are vectorised with ``get_vectors``.

    Returns
    -------
    numpy.ndarray
        Square matrix; entry (i, j) is the cosine similarity between the
        count vectors of text i and text j.
    """
    # Imported locally: the notebook-level import of cosine_similarity is
    # commented out, so the name would otherwise be undefined here.
    from sklearn.metrics.pairwise import cosine_similarity
    return cosine_similarity(get_vectors(doc))
    
def get_vectors(doc):
    """Turn the texts in ``doc`` into dense token-count vectors.

    Parameters
    ----------
    doc : iterable of str
        The texts to vectorise. The vocabulary is learned from these same
        texts.

    Returns
    -------
    numpy.ndarray
        One row per text, one column per vocabulary term.
    """
    text = list(doc)
    # The corpus is given to fit_transform only. The first constructor
    # parameter of CountVectorizer is `input` (how documents are read), not
    # the documents themselves.
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(text).toarray()

### Using word embeddings Doc2Vec

In [None]:
from gensim import utils
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec

In [None]:
jobpost_df_1.shape

In [None]:
#Pre-processed text of jobpost column
jobpost_df_1.head(10)

In [None]:
title_df_1.shape

In [None]:
#Pre-processed text of title column column
title_df_1.head()

#### 1. Create the tags with each post

In [None]:
df_sim = pd.concat([jobpost_df_1, title_df_1], axis = 1)
df_sim.loc[1810]

In [None]:
docs=[]
def fn_tag_doc(jobpost, title):
    """Append one TaggedDocument to ``docs``: the whitespace-split tokens of
    ``jobpost``, tagged with ``title``."""
    tagged = TaggedDocument(words = jobpost.split(), tags = [title])
    docs.append(tagged)

# one tagged document per row of df_sim
df_sim.apply(lambda row : fn_tag_doc(row['jobpost'], row['title']), axis = 1)

In [None]:
docs[1]

#### 2. Build a Model to convert each document(jobpost) into vectors to be used to check similarity

In [None]:
# Only configure the model and build its vocabulary here; all training is done
# by the explicit loop in the next cell. Passing `docs` to the constructor
# would already run a complete training pass before that loop starts.
model_sim = Doc2Vec(dm=0, alpha = 0.025, min_alpha = 0.025, min_count = 0)  # use fixed learning rate
model_sim.build_vocab(docs)

In [None]:
n_passes = 10
lr_step = 0.002
for _ in range(n_passes):
    model_sim.train(docs, total_examples= model_sim.corpus_count, epochs=model_sim.epochs)
    # lower the learning rate for the next pass, then pin min_alpha to the
    # same value so the rate stays fixed within that pass
    model_sim.alpha = model_sim.alpha - lr_step
    model_sim.min_alpha = model_sim.alpha

#### 3.Check the similarity of a given job title and get top 10 jobposts similar to that job_title

In [None]:
model_sim.docvecs.most_similar(positive=[model_sim.infer_vector('chief financial officer'.split())],topn=10)

Above shows **top 10 titles** which are similar to the title 'chief financial officer'.

In [None]:
#docs[0].tags

In [None]:
#tags_list=[]
#for i in range(0, df_sim.shape[0]):
#    c = str(docs[i].tags).replace('[', '')
#    c = c.replace(']', '')
#    c = c.replace("'", '')
#    tags_list.append(c)

In [None]:
#tags_list[0]

In [None]:
#sim_list = []
#for tag in tags_list:
    #print(model_sim.docvecs.similarity('bcc specialist', tag))
#    sim_list.append(model_sim.docvecs.similarity('software developer', tag))
    
#sim_list[0]

In [None]:
#sim_score_df = pd.concat([pd.Series(jobpost_df_1),pd.Series(tags_list), pd.Series(sim_list)], axis =1)
#sim_score_df.columns=['jobpost', 'title', 'similarity_score']
#sim_score_df
#sim_score_df.sort_values(by = 'similarity_score', ascending = False)