In [34]:
import numpy as np
import pandas as pd
from gensim import models,corpora
from gensim import similarities
import warnings

In [6]:
pd.set_option('max_colwidth',400)
warnings.filterwarnings('ignore')

# Loading the pretrained LDA model

In [35]:
Lda = models.LdaMulticore
lda_final =Lda.load('lda_final')
dictionary = corpora.Dictionary.load('dictionary')
doc_term_matrix = corpora.MmCorpus('doc_term_matrix.mm')

# Feature Engineering

## Document-topic matrix : Used for automated Document tagging

In [43]:
doc2topic =lda_final.get_document_topics(doc_term_matrix,minimum_probability=0)
doc2topic = pd.DataFrame(list(doc2topic))
num_topics = lda_final.num_topics
doc2topic.columns = ['Topic'+str(i+1) for i in range(num_topics)]
for i in range(len(doc2topic.columns)):
    doc2topic.iloc[:,i]=doc2topic.iloc[:,i].apply(lambda x: x[1])
doc2topic['Automated_topic_id'] =doc2topic.apply(lambda x: np.argmax(x),axis=1)
doc2topic.head()

In [48]:
data = pd.read_csv('./Topic-Modeling/data/input_data.csv',index_col='Unnamed: 0')
data_final =pd.concat([data,doc2topic],axis=1)
data_final.head()

Unnamed: 0,id,Skills,Job title,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Automated_topic_id
0,1.0,"['TECHNICAL SKILLS\xa0', <br/>, '\xa0', <br/>, '• R • Tableau • Machine Learning\xa0', <br/>, '• D3.js • SQL, PostgreSQL, pgadmin 4 • JavaScript\xa0', <br/>, '• Python • HTML/CSS • ...",Data Scientist Intern,0.00641,0.00641,0.00641,0.00641,0.00641,0.00641,0.736321,0.00641,0.00641,0.199575,0.00641,0.006411,Topic7
1,2.0,"['TECHNICAL SKILLS:\xa0', <br/>, 'Languages Java, C, C++, Python, R, Scala, SQL\xa0', <br/>, 'Web Services SOAP, REST\xa0', <br/>, 'Web Technologies HTML, CSS, JavaScript, PHP\xa0', <br/>, 'Database DB2, MySQL\xa0', <br/>, 'Software Android Studio, Eclipse, IntelliJ IDEA, NetBeans, GIT']",Junior Data Scientist,0.003968,0.003968,0.003968,0.003968,0.003968,0.003968,0.003968,0.003968,0.003969,0.956348,0.003968,0.003968,Topic10
2,3.0,"['TECHNICAL SKILLS\xa0', <br/>, '• Proficient: Java, Haskell, Python, R, SQL, C, C++, Matlab, Excel, VBA\xa0', <br/>, '• Familiar with: JMP, Tableau, ArcGIS, Weka, TensorFlow']",Data Scientist Intern,0.005208,0.005208,0.005208,0.005208,0.005208,0.005208,0.005209,0.005208,0.005208,0.005209,0.005208,0.942708,Topic12
3,4.0,"['TECHNICAL SKILLS\xa0', <br/>, 'Relevant Coursework: Artificial Intelligence, Machine Learning, Data Mining, Statistics, Multivariate and\xa0', <br/>, 'Exploratory Data Analysis.\xa0', <br/>, 'Big Data Technologies: Hadoop, Apache Hive, Apache Pig, Apache Spark.\xa0', <br/>, 'Visualization Tools: Tableau, ggplot2, Sci2.\xa0', <br/>, 'Programming Languages: Python, Java, R, SQL, C.\xa0', <br/>...",Data Scientist,0.002604,0.042107,0.002604,0.002604,0.002604,0.002604,0.002604,0.002604,0.201023,0.245,0.002604,0.491036,Topic12
4,5.0,"['SKILLS\xa0', <br/>, '\xa0', <br/>, 'SOFTWARE:\xa0', <br/>, '• System Checks and trouble shooting\xa0', <br/>, '• Computer/hardware support and troubleshooting\xa0', <br/>, '• Programming proficient ( Python, C/C++, MATLAB)\xa0', <br/>, '• CAD (PTC Creo),\xa0', <br/>, '• Mastery in Microsoft Excel, Microsoft Office, Microsoft Power Point\xa0', <br/>, 'DESIGN:\xa0', <br/>, '• Art composition\x...",DATA SCIENTIST,0.004902,0.412762,0.004902,0.004902,0.004902,0.004902,0.004902,0.004902,0.004902,0.004902,0.20511,0.33801,Topic2


## Calculating similarities

In [36]:
lda_index = similarities.MatrixSimilarity(lda_final[doc_term_matrix])
similarities = lda_index[lda_final[doc_term_matrix]]

def find_similar(doc_id,topn=10):
    similar_docs =sorted(list(enumerate(similarities[doc_id])),key=lambda x: -x[1])[1:]
    return similar_docs[:topn]

In [37]:
find_similar(10,10)

[(10, 1.0),
 (36, 1.0),
 (48, 1.0),
 (62, 1.0),
 (76, 1.0),
 (102, 1.0),
 (117, 1.0),
 (133, 1.0),
 (149, 1.0),
 (166, 1.0)]