In [40]:
import numpy as np
import pandas as pd
from gensim import models,corpora
from gensim import similarities
import pyLDAvis.gensim
import warnings
import re
import spacy

In [41]:
pd.set_option('max_colwidth',400)
warnings.filterwarnings('ignore')
pyLDAvis.enable_notebook()
PATH ='../data/'

In [46]:
nlp = spacy.load('en')
def clean_up_spacy(text):
    text_out = set()
    doc= nlp(text)
    for token in doc:         
        if  len(token)<15 and token.is_punct is False :
            if token.text != '':
                text_out.add(token.text)
    text_out = list(text_out)
    return text_out

# Loading the pretrained LDA model

In [12]:
Lda = models.LdaMulticore
lda_final =Lda.load(PATH+'lda_final')
dictionary = corpora.Dictionary.load(PATH+'dictionary')
doc_term_matrix = corpora.MmCorpus(PATH+'doc_term_matrix.mm')

# Feature Engineering

## Document-topic matrix : Used for automated Document tagging

### Tagging training data

In [13]:
doc2topic =lda_final.get_document_topics(doc_term_matrix,minimum_probability=0)
doc2topic = pd.DataFrame(list(doc2topic))
num_topics = lda_final.num_topics
doc2topic.columns = ['Topic'+str(i+1) for i in range(num_topics)]
for i in range(len(doc2topic.columns)):
    doc2topic.iloc[:,i]=doc2topic.iloc[:,i].apply(lambda x: x[1])
doc2topic['Automated_topic_id'] =doc2topic.apply(lambda x: np.argmax(x),axis=1)
doc2topic.head()

Unnamed: 0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Automated_topic_id
0,0.00641,0.00641,0.00641,0.00641,0.00641,0.00641,0.736178,0.00641,0.00641,0.199718,0.00641,0.006411,Topic7
1,0.003968,0.003968,0.003968,0.003968,0.003968,0.003968,0.003968,0.003968,0.003969,0.956348,0.003968,0.003968,Topic10
2,0.005208,0.005208,0.005208,0.005208,0.005208,0.005208,0.005209,0.005208,0.005208,0.005209,0.005208,0.942708,Topic12
3,0.002604,0.042106,0.002604,0.002604,0.002604,0.002604,0.002604,0.002604,0.200676,0.245217,0.002604,0.491168,Topic12
4,0.004902,0.412774,0.004902,0.004902,0.004902,0.004902,0.004902,0.004902,0.004902,0.004902,0.205122,0.337986,Topic2


In [42]:
data = pd.read_csv(PATH+'input_data.csv',index_col='Unnamed: 0')
data_final =pd.concat([data,doc2topic],axis=1)
data_final.head(1)

Unnamed: 0,id,Skills,Job title,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Automated_topic_id
0,1.0,"['TECHNICAL SKILLS\xa0', <br/>, '\xa0', <br/>, '• R • Tableau • Machine Learning\xa0', <br/>, '• D3.js • SQL, PostgreSQL, pgadmin 4 • JavaScript\xa0', <br/>, '• Python • HTML/CSS • ...",Data Scientist Intern,0.00641,0.00641,0.00641,0.00641,0.00641,0.00641,0.736178,0.00641,0.00641,0.199718,0.00641,0.006411,Topic7


### Tagging unseen data

In [51]:
text = "pandas, scikit, numpy, matplotlib, regression, classification, python, r, sql, tableau, hadoop"
bow = dictionary.doc2bow(clean_up_spacy(text))

# lets see how our model fares with the given data. The above skills matches that of a data scientist. 


Topic_probability =pd.DataFrame(lda_final[bow])
Topic_probability.columns = ['Topic','Probability']
Topic_probability['Topic'] = Topic_probability['Topic'].apply(lambda x: 'Topic'+str(x+1))

Topic_probability.sort_values(by='Probability',ascending =False)

Unnamed: 0,Topic,Probability
6,Topic7,0.923611
11,Topic12,0.006945
8,Topic9,0.006945
9,Topic10,0.006945
1,Topic2,0.006944
4,Topic5,0.006944
0,Topic1,0.006944
2,Topic3,0.006944
5,Topic6,0.006944
7,Topic8,0.006944


In [43]:
num_topics =12
vis = pyLDAvis.gensim.prepare(lda_final, doc_term_matrix, dictionary,sort_topics=False)
pyLDAvis.save_html(vis,f'pyLDAvis_{num_topics}.html')
vis

## Calculating similarities

In [36]:
lda_index = similarities.MatrixSimilarity(lda_final[doc_term_matrix])
similarities = lda_index[lda_final[doc_term_matrix]]

def find_similar(doc_id,topn=10):
    similar_docs =sorted(list(enumerate(similarities[doc_id])),key=lambda x: -x[1])[1:]
    return similar_docs[:topn]

In [37]:
find_similar(10,10)

[(10, 1.0),
 (36, 1.0),
 (48, 1.0),
 (62, 1.0),
 (76, 1.0),
 (102, 1.0),
 (117, 1.0),
 (133, 1.0),
 (149, 1.0),
 (166, 1.0)]