# Parse JDs, extract skills


In [1]:
import os
import re
import json
from pprint import pprint
from bs4 import BeautifulSoup
from bs4.element import Comment

import nltk
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.corpus import stopwords
from nltk import RegexpTokenizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

import gensim
from pprint import pprint

import codecs
import os
import time

from stop_words import get_stop_words

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/goodgame/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/goodgame/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /Users/goodgame/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/goodgame/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/goodgame/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/goodgame/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
import boto3
s3 = boto3.resource('s3')

In [5]:
BUCKET_NAME = 'tech-salary-project'

In [6]:
%%time

bucket = s3.Bucket(BUCKET_NAME)
all_job_titles = []
all_job_ids = []
all_summaries = []
for o in bucket.objects.all():
    if o.key.startswith('salaries'):
        continue
    
    object = bucket.Object(o.key)
    try :
        lines = object.get()['Body'].read().decode('utf-8').splitlines()
        for line in lines:
            d = json.loads(line)
            
            title = d['title']
            jid = d['id']
            summary = d['summary']
            
            all_job_ids.append(jid)
            all_job_titles.append(title)
            all_summaries.append(summary)
    except Exception as e:
        continue

CPU times: user 13.3 s, sys: 1.5 s, total: 14.8 s
Wall time: 7min 10s


In [7]:
print('Number of all job titles:', len(all_job_titles))
uniq_job_titles = set(all_job_titles)
print('Number of unique job titles:', len(uniq_job_titles))

print('Number of all job ids:', len(all_job_ids))
uniq_job_ids = set(all_job_ids)
print('Number of unique job ids:', len(uniq_job_ids))

uniq_job_ids = set(all_job_ids)
print('Number of unique job ids:', len(uniq_job_ids))

print('\nNumber of summaries:',len(all_summaries))
unique_summaries = set(all_summaries)
print('Number of unique summaries:', len(unique_summaries))

Number of all job titles: 41777
Number of unique job titles: 21917
Number of all job ids: 41777
Number of unique job ids: 39692
Number of unique job ids: 39692

Number of summaries: 41777
Number of unique summaries: 39300


## Use BS4 to convert the 'summary' field HTML into visible text, and load the files into memory

In [8]:
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

def make_text(text):
    soup = BeautifulSoup(text, "html5lib")
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.strip() for t in visible_texts)

In [9]:
%%time
texts = []

for item in unique_summaries:
    texts.append(make_text(item))

CPU times: user 4min 26s, sys: 771 ms, total: 4min 26s
Wall time: 4min 27s


In [14]:
# Write locally

for idx, text in enumerate(texts):
    title = all_job_titles[idx].replace(' ', '_').replace('/','_').replace('.','')
    job_id = str(all_job_ids[idx])
    doc_title = '/Users/goodgame/Desktop/texts/'+title+'_'+job_id+'.txt'
    with open(doc_title,'w+') as outfile:
        outfile.write(text)

In [12]:
print(all_job_titles[4].replace(' ', '_').replace('/','_'))

Specialist,_IT_App_Development_-_Application_Developer_(Java)


## Extract noun phrases

In [128]:
stopwords = stopwords.words('english')
lemmatizer = nltk.WordNetLemmatizer()

In [129]:
def leaves(tree):
    """Finds NP (nounphrase) leaf nodes of a chunk tree."""
    for subtree in tree.subtrees(filter = lambda t: t.label()=='NP'):
        yield subtree.leaves()

def normalise(word):
    word = word.lower().replace('/','').replace('-','').replace('•','')
    # word = stemmer.stem_word(word) #if we consider stemmer then results comes with stemmed word, but in this case word will not match with comment
    word = lemmatizer.lemmatize(word)
    return word

def acceptable_word(word):
    """Checks conditions for acceptable word: length, stopword. We can increase the length if we want to consider large phrase"""
    accepted = bool(2 <= len(word) <= 40
        and word.lower() not in stopwords)
    return accepted

def get_terms(tree):
    for leaf in leaves(tree):
        term = [ normalise(w) for w,t in leaf if acceptable_word(w) ]
        yield term

In [131]:
# combine functions above
def noun_phrases(text):
    tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')    
    lemmatizer = nltk.WordNetLemmatizer()
    stemmer = nltk.stem.porter.PorterStemmer()
    grammar = r"""
        NBAR:
            {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns

        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
    """
    chunker = nltk.RegexpParser(grammar)
    toks = tokenizer.tokenize(text)
    postoks = nltk.tag.pos_tag(toks)
    tree = chunker.parse(postoks)
    terms = get_terms(tree)
    bad_words = ['opportunity', 'ideal candidate', 'team', 'year', 'knowledge','experience']
    clean_terms = []
    
    for term in terms:
        term = ' '.join(term).replace('\n','').replace(',','').replace('(','')
        term = term.replace(')','')
        term = term.strip()
        if term not in bad_words:
            clean_terms.append(term)
    return clean_terms

In [134]:
print(len(texts))

39300


In [135]:
%%time
parsed_dataset = []

def parse_input_docs(doc):
    return ' '.join(noun_phrases(doc))

for item in texts:
    parsed_dataset.append(parse_input_docs(item))
print(len(parsed_dataset))

39300
CPU times: user 17min 13s, sys: 5.33 s, total: 17min 18s
Wall time: 17min 21s


In [136]:
print(parsed_dataset[234])

make .tv make .tv provider cloud based live video acquisition management solution live video cloud “lvc” platform think video router switcher cloud company germany where development team headquarter seattle company unmatched solution top medium entertainment technology company esl mtv viacom mlb bamtech fox sport brasil swiss broadcasting corp srf. make .tv’s live video cloud content programmer service provider field news sport esports entertainment bring demand video screen source mobile video traditional camera .tv's investor microsoft venture voyager capital vulcan capital agile flat hierarchy lot freedom ownership regard self organization selection tool responsibility due distributed organization office seattle usa cologne germany satellite office los angeles remote location place beautiful place structure mistake "startup quick thing information www .make.tv [http:www.make.tv] responsibility implementation lvc backend service availability latency scalability make .tv’s service sol

We'll clearly need a better way to capture certain tech skills (C and C++ are cut off, etc.), but it's not bad for now.

### Transform POS vectors into TF-IDF space. 
The terms should be more useful than the terms from the entire document.

In [33]:
stop_words = get_stop_words('en')

In [137]:
%%time
# TF-IDF transformation in sklearn

pos_vect = TfidfVectorizer(stop_words=stop_words, ngram_range=(1,2), analyzer='word')  
pos_tfidf = pos_vect.fit_transform(texts)
print("\n Here are the dimensions of our two-gram dataset: \n", pos_tfidf.shape, "\n")


 Here are the dimensions of our two-gram dataset: 
 (39300, 2070759) 

CPU times: user 54 s, sys: 1.64 s, total: 55.6 s
Wall time: 56 s


### Clustering using Kmeans

In [138]:
%%time
true_k = 30
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(pos_tfidf)

CPU times: user 16min 15s, sys: 50.5 s, total: 17min 5s
Wall time: 17min 11s


In [139]:
# Pickling the model to avoid waiting another 30 min
import pickle
filename = 'kmeans_100_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [140]:
!ls -lah kmeans_100_model.sav

-rw-r--r--  1 goodgame  staff   474M Mar 20 21:26 kmeans_100_model.sav


_This model is huge._

In [141]:
# Reload from disk, to be sure it works
loaded_model = pickle.load(open('kmeans_100_model.sav', 'rb'))

In [142]:
print("Top terms per cluster:")
order_centroids = loaded_model.cluster_centers_.argsort()[:, ::-1]
terms = pos_vect.get_feature_names()
for i in range(true_k):
    print("\n\nCluster %d:" % i, "\n")
    for ind in order_centroids[i, :40]:
        print(' %s' % terms[ind])

Top terms per cluster:


Cluster 0: 

 experience
 job
 required
 skills
 software
 data
 work
 will
 management
 must
 time
 years
 00
 support
 team
 project
 development
 type
 full
 knowledge
 customer
 technical
 job type
 full time
 ability
 days
 systems
 services
 business
 company
 position
 ago
 year
 days ago
 client
 quality
 service
 requirements
 strong
 education


Cluster 1: 

 data
 business
 analyst
 analysis
 experience
 reporting
 data analyst
 analytics
 reports
 skills
 ability
 work
 management
 sql
 will
 information
 statistical
 data analysis
 excel
 required
 analytical
 support
 health
 team
 strong
 knowledge
 insights
 requirements
 quality
 job
 tools
 years
 systems
 research
 analyze
 including
 project
 processes
 related
 sources


Cluster 2: 

 amazon
 experience
 amazon com
 software
 technical
 will
 aws
 development
 alexa
 com
 basic qualifications
 qualifications
 team
 software development
 computer
 services
 systems
 preferred qualifications


### Topic modeling using Gensim

Note that there's a memory-friendly way to do this by streaming docs.

https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/Corpora_and_Vector_Spaces.ipynb

This _should_ work in memory, but if not then we have a streaming method to fall back on.

In [98]:
# Save the corpus in its current state before making modifications
original_texts = texts

In [143]:
# remove words that appear only once
texts = [[word for word in document.lower().split() if word not in stop_words]
         for document in parsed_dataset]

from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1] for text in texts]

In [145]:
# Build and store a dictionary
dictionary = gensim.corpora.Dictionary(texts)
dictionary.save('/tmp/jobs.dict')  # store the dictionary, for future reference

In [146]:
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)  # All three words appear in the dictionary

[(29, 1), (777, 1), (1014, 1)]


In [147]:
corpus = [dictionary.doc2bow(text) for text in texts]
# store to disk, for later use
gensim.corpora.MmCorpus.serialize('/tmp/jobs.mm', corpus)  

In [148]:
if (os.path.exists("/tmp/jobs.dict")):
    dictionary = gensim.corpora.Dictionary.load('/tmp/jobs.dict')
    corpus = gensim.corpora.MmCorpus('/tmp/jobs.mm')
    print("Used saved dictionary and corpus")
else:
    print("No dictionary or corpus available at that path")

Used saved dictionary and corpus


In [149]:
# Sanity check the dictionary
print(dictionary[1])
print(dictionary[1044])
print(dictionary[2500])

.net
seeker
humanity


In [150]:
%%time
# Transformation using tf_idf
tfidf = gensim.models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

CPU times: user 14.6 s, sys: 70.4 ms, total: 14.6 s
Wall time: 14.7 s


In [151]:
%%time
# construct a model using Latent Semantic Analysis
lsi = gensim.models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=500)

CPU times: user 1min 56s, sys: 6.74 s, total: 2min 2s
Wall time: 1min 12s


In [152]:
%%time
# construct another model using Hierarchical Dirichlet Processing
hdp = gensim.models.HdpModel(corpus, id2word=dictionary)

CPU times: user 6min 41s, sys: 19.6 s, total: 7min
Wall time: 6min 36s


In [154]:
with open('sample_resume.txt', 'r') as infile:
    resume = infile.read()
doc = parse_input_docs(resume)
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow] # convert the query to LSI space

In [155]:
# transform corpus to LSI space and index it
index = gensim.similarities.MatrixSimilarity(lsi[corpus])

In [156]:
index.save('/tmp/jobs.index')

In [161]:
sims = index[vec_lsi]
sims_sorted = sorted(enumerate(sims), key=lambda item: -item[1])
for item in sims_sorted[:5]:
    print(all_job_titles[item[0]]+'\nID: '+all_job_ids[item[0]],"\n\tIndex:",item[0],"\n\tSimilarity:",item[1])

Java Software Engineer ( Microservices)
ID: 8872b775eb7cab9f 
	Index: 38415 
	Similarity: 0.78755414
Web DevOps Engineer
ID: f6c96bb60e22c12b 
	Index: 4892 
	Similarity: 0.78742665
Software Developer
ID: da95cfcf76084c53 
	Index: 16792 
	Similarity: 0.78210187
Software Engineer
ID: c4872866cc29f339 
	Index: 35603 
	Similarity: 0.7818614
Database Administrator
ID: a60da84dcba79745 
	Index: 28238 
	Similarity: 0.7818416


#### Reflection
Hmmm, those results don't seem super helpful. Moving onto SpACy.

## SpaCy
From [this example on Github](https://github.com/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb)

Installation had some oddities; run the following:
```bash
$ pip install -U spacy
$ python -m spacy download en
```

I tried first without preprocessed "skill phrase" text as the corpus. Then, I ran back through with the "skill phrase" text. Both types of models and text output docs are saved in this directory.

In [1]:
import spacy
import pandas as pd
import itertools as it
import en_core_web_sm

import spacy
nlp = spacy.load('en')

In [167]:
parsed_jd = nlp(job_str)
print(parsed_jd)

Data Scientist/Machine Learning Engineer, Adaptive Authentication
Position Description:

This is an opportunity to join our fast-growing Adaptive Authentication team to develop cutting-edge risk-based adaptive authentication policies. We are looking for a Data Scientist/Machine Learning Engineer to build large-scale distributed systems while using machine learning to solve business problems. The ideal candidate has experience building models from complex systems, developing enterprise-grade software in an object-oriented language, and experience or knowledge in security, authentication or identity.

Our elite team is fast, innovative and flexible; with a weekly release cycle and individual ownership we expect great things from our engineering and reward them with stimulating new projects and emerging technologies.


Job Duties and Responsibilities:

Build and own models that identify risk associated with anomalous activity in the cloud for authentication
Build Machine Learning pipeline

In [165]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

1. Segment texts (JDs) into sentences & normalize text
2. First-order phrase modeling $\rightarrow$ apply first-order phrase 
3. model to transform sentences
4. Second-order phrase modeling $\rightarrow$ apply second-order 
5. phrase model to transform sentences
6. Apply text normalization and second-order phrase model to text of complete reviews

In [168]:
def punct_space(token):
    """
    helper function to eliminate tokens
    that are pure punctuation or whitespace
    """
    
    return token.is_punct or token.is_space

def line_review(filename):
    """
    SRG: modified for a list
    generator function to read in reviews from the file
    and un-escape the original line breaks in the text
    """
    
    for review in filename:
        yield review.replace('\\n', '\n')
            
def lemmatized_sentence_corpus(filename):
    """
    generator function to use spaCy to parse reviews,
    lemmatize the text, and yield sentences
    """
    
    for parsed_review in nlp.pipe(line_review(filename),
                                  batch_size=10000, n_threads=4):
        
        for sent in parsed_review.sents:
            yield u' '.join([token.lemma_ for token in sent
                             if not punct_space(token)])

In [172]:
%%time

import codecs
# This is time consuming; make the if statement True to run
if 0 == 0:
    with codecs.open('spacy_jds_concat_parsed.txt', 'w', encoding='utf_8') as f:
        for sentence in original_texts:
            f.write(sentence + '\n')

CPU times: user 287 ms, sys: 105 ms, total: 393 ms
Wall time: 397 ms


In [174]:
unigram_sentences = LineSentence('spacy_jds_concat_parsed.txt')

In [175]:
# print some examples
# for unigram_sentence in it.islice(unigram_sentences, 230, 240):
#     print(u' '.join(unigram_sentence))
#     print(u'')

Job Description Visa's Digital and Mobile Product Development (DMPD) team is building a new generation of products to facilitate commerce in everyone's digital and mobile lives. Our focus is to build intuitive features that expose profound new value for our customers, merchants and developers. As a Senior DevOps Engineer for Platform Engineering and Operations Support, you will join a team focused on making DMPD applications always available. The team is responsible for continuous integration, extending scalability, availability, and resiliency of DMPD products like Visa Checkout, Visa Direct, Visa Token Services, and Visa Digital Commerce Apps. You will help build and roll-out a platform for containerizing applications with monitoring, logging, and features like auto-scaling and auto-healing. Responsibilities: Develop continuous integration and management solutions for containers Build, enhance and scale critical pipeline, application, and infrastructure technologies in DMPD Build sol

In [177]:
%%time

bigram_model = Phrases(unigram_sentences)
bigram_model.save('spacy_bigram_model_all_PARSED')

CPU times: user 39.6 s, sys: 796 ms, total: 40.4 s
Wall time: 40.5 s


In [178]:
# load the finished model from disk
bigram_model = Phrases.load('spacy_bigram_model_all_PARSED')

In [179]:
%%time

with codecs.open('spacy_bigram_sentences_PARSED.txt', 'w', encoding='utf_8') as f:
    for unigram_sentence in unigram_sentences:
        bigram_sentence = u' '.join(bigram_model[unigram_sentence])
        f.write(bigram_sentence + '\n')



CPU times: user 1min 7s, sys: 141 ms, total: 1min 8s
Wall time: 1min 8s


In [180]:
bigram_sentences = LineSentence('spacy_bigram_sentences_PARSED.txt')

In [182]:
# print examples; certain bigrams are underlined
# for bigram_sentence in it.islice(bigram_sentences, 240, 250):
#     print(u' '.join(bigram_sentence))
#     print(u'')

In [183]:
%%time
trigram_model = Phrases(bigram_sentences)
trigram_model.save('spacy_trigram_model_all_PARSED')

CPU times: user 38.2 s, sys: 832 ms, total: 39 s
Wall time: 39.4 s


In [184]:
# load the finished model from disk
trigram_model = Phrases.load('spacy_trigram_model_all_PARSED')

In [185]:
%%time

with codecs.open('spacy_trigram_sentences_PARSED.txt', 'w', encoding='utf_8') as f:
    for bigram_sentence in bigram_sentences:
        trigram_sentence = u' '.join(trigram_model[bigram_sentence])
        f.write(trigram_sentence + '\n')



CPU times: user 1min 8s, sys: 406 ms, total: 1min 8s
Wall time: 1min 9s


In [186]:
trigram_sentences = LineSentence('spacy_trigram_sentences_PARSED.txt')

In [187]:
# for trigram_sentence in it.islice(trigram_sentences, 240, 250):
#     print(u' '.join(trigram_sentence))
#     print(u'')

#### Final preprocessing

In [190]:
%%time

with codecs.open('spacy_trigram_transformed_jds_all_PARSED.txt', 'w', encoding='utf_8') as f:
    for parsed_review in nlp.pipe(line_review(original_texts),
                                  batch_size=10000, n_threads=4):
        # lemmatize the text, removing punctuation and whitespace
        unigram_review = [token.lemma_ for token in parsed_review
                          if not punct_space(token)]

        # apply the first-order and second-order phrase models
        bigram_review = bigram_model[unigram_review]
        trigram_review = trigram_model[bigram_review]

        # remove any remaining stopwords
        trigram_review = [term for term in trigram_review
                          if term not in stopwords]

        # write the transformed review as a line in the new file
        trigram_review = u' '.join(trigram_review)
        f.write(trigram_review + '\n')



CPU times: user 1h 40min 23s, sys: 1h 37min 49s, total: 3h 18min 13s
Wall time: 2h 17min 38s


In [191]:
print(u'Original:' + u'\n')

for review in it.islice(line_review(original_texts), 11, 12):
    print(review)

print(u'----' + u'\n')
print(u'Transformed:' + u'\n')

with codecs.open('spacy_trigram_transformed_jds_all_PARSED.txt', encoding='utf_8') as f:
    for review in it.islice(f, 11, 12):
        print(review)

Original:

Position Overview  Are you a full stack Software Engineer looking for an opportunity to work in a fast paced and fun environment? Then consider joining us here at Skillsoft in Rochester, NY. We are building SaaS platforms and client applications for our content delivery products with the goal of being the “best in the industry”. Our products allow 6000+ customers consume content from a variety of sources with a strong focus on video, from any device, with features that fit the needs of learner. We believe that our customers deserve the best learning experiences and we are delivering on those needs with the products you will be working on. You could be the one to help us achieve this goal!  We are looking for exceptional developers to join our team and willing to learn new technology in this rapidly evolving world. You will build front end customer facing product features using cutting edge technologies while implementing REST APIs and other back-end functionality as needed. 

In [17]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import LsiModel

import pyLDAvis
import pyLDAvis.gensim
import warnings
import pickle

  params = attr.ib(convert=attr.converters.optional(tuple))
  ids = attr.ib(default=None, convert=_ensure_immutable_ids)


In [193]:
%%time

trigram_reviews = LineSentence('spacy_trigram_sentences_PARSED.txt')

# learn the dictionary by iterating over all of the reviews
trigram_dictionary = Dictionary(trigram_reviews)

# filter tokens that are very rare or too common from
# the dictionary (filter_extremes) and reassign integer ids (compactify)
trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
trigram_dictionary.compactify()

trigram_dictionary.save('spacy_trigram_dict_all.dict')
    
# load the finished dictionary from disk
trigram_dictionary = Dictionary.load('spacy_trigram_dict_all.dict')

CPU times: user 16.4 s, sys: 245 ms, total: 16.7 s
Wall time: 17.1 s


In [194]:
def trigram_bow_generator(filepath):
    """
    generator function to read reviews from a file
    and yield a bag-of-words representation
    """    
    for review in LineSentence(filepath):
        yield trigram_dictionary.doc2bow(review)

In [195]:
%%time

# generate bag-of-words representations for all JDs and save them as a matrix
MmCorpus.serialize('spacy_trigram_bow_corpus_all.mm',
                   trigram_bow_generator('spacy_trigram_sentences_PARSED.txt'))
    
# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus('spacy_trigram_bow_corpus_all.mm')

CPU times: user 19.3 s, sys: 147 ms, total: 19.4 s
Wall time: 19.4 s


### Modeling

In [196]:
%%time

with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    # workers => sets the parallelism, and should be
    # set to your number of physical cores minus one
    lsi = gensim.models.LsiModel(trigram_bow_corpus, 
                                 id2word=trigram_dictionary, 
                                 num_topics=500)
    
    lsi.save('spacy_lsi_model_all')
    
# load the finished LDA model from disk
lsi = LsiModel.load('spacy_lsi_model_all')

CPU times: user 3min 2s, sys: 11.3 s, total: 3min 13s
Wall time: 1min 41s


In [197]:
def explore_topic(topic_number, topn=10):
    """
    accept a user-supplied topic number and
    print out a formatted list of the top terms
    """
        
#     print(u'{:20} {}'.format(u'term', u'frequency') + u'')

    for term, frequency in lsi.show_topic(topic_number, topn=10):
        print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))

In [198]:
for i in range(600):
    print("\n\nTopic %s" % str(i+1))
    explore_topic(topic_number=i)



Topic 1
data                 0.266
product              0.257
support              0.157
solutions            0.146
design               0.136
The                  0.132
Ability              0.132
including            0.131
products             0.130
systems              0.117


Topic 2
data                 0.695
product              -0.584
products             -0.172
sales                -0.129
systems              0.081
customer             -0.078
database             0.076
customers            -0.074
marketing            -0.073
market               -0.070


Topic 3
data                 -0.565
product              -0.556
design               0.130
-                    0.126
systems              0.125
applications         0.107
solutions            0.103
you                  0.100
application          0.098
your                 0.098


Topic 4
-                    -0.353
your                 -0.280
support              0.250
you                  -0.249
Ability              0.181
dat

will                 -0.303
management           0.294
,                    0.284
you                  0.227
teams                0.206
services             -0.206
from                 0.198
at                   -0.188
project              -0.180
your                 -0.172


Topic 48
management           0.336
,                    -0.331
services             -0.310
have                 0.270
you                  -0.233
tools                0.211
.                    -0.211
will                 -0.165
project              -0.163
your                 0.149


Topic 49
and/or               -0.414
from                 0.372
will                 0.247
at                   0.184
system               -0.179
management           0.174
The                  -0.165
application          0.159
ability              0.158
engineering          0.155


Topic 50
,                    -0.391
.                    -0.248
customer             0.240
customers            -0.220
requirements         0.217
at   

ensure               -0.266
internal             0.264
network              -0.252
analysis             -0.229
complex              0.202
as_well_as           0.196
help                 0.176
drive                0.167
AWS                  -0.156
reporting            -0.148


Topic 104
we                   0.369
We                   -0.278
information          0.260
as_well_as           0.227
projects             0.212
ensure               -0.211
Our                  -0.194
processes            -0.193
performance          0.191
complex              -0.168


Topic 105
technologies         -0.412
use                  -0.234
as_well_as           -0.213
ensure               0.193
processes            -0.185
people               -0.147
As                   0.146
into                 0.145
such_as              0.145
Strong               -0.135


Topic 106
technologies         -0.339
analytics            -0.304
be                   -0.297
as_well_as           0.248
use                  0.227


QA                   -0.251
DevOps               -0.223
features             -0.192
market               -0.184
people               -0.180
digital              0.162
web                  -0.157
automation           0.154
opportunity          0.149
integration          0.145


Topic 159
integration          -0.230
Familiarity_with     0.219
make                 -0.192
leading              -0.177
client               0.173
clients              -0.165
organization         0.164
key                  0.164
A                    -0.152
web                  -0.150


Topic 160
implement            0.265
digital              -0.245
-_30+_days_ago       -0.218
understand           -0.186
Familiarity_with     0.176
QA                   -0.173
solution             0.173
make                 0.154
existing             -0.129
integration          0.128


Topic 161
market               -0.273
clients              0.195
multiple             0.180
understand           -0.173
professional         -0.161

which                0.222
Responsibilities     -0.190
supporting           0.156
Software             0.153
Qualifications       -0.147
programming          0.144
like                 0.138
any                  -0.126
(                    0.125
creating             0.124


Topic 217
activities           -0.229
which                -0.208
expertise            0.203
needs                -0.165
Support              0.149
role                 -0.140
has                  0.139
features             0.130
Must_have            0.125
strategy             0.124


Topic 218
environment.         0.273
deployment           -0.225
documentation        -0.177
supporting           -0.170
time                 0.156
any                  0.136
Linux                -0.129
release              -0.127
Support              0.121
developers           0.119


Topic 219
expertise            -0.245
Support              -0.187
operational          0.155
solutions.           -0.149
strategy             0.147
has 

position             -0.147
This_is              -0.142
define               0.136
critical             0.128
its                  0.121
delivering           0.119
who                  0.119
design,              0.119
assigned             0.110
Python               0.110


Topic 274
systems.             0.190
critical             -0.160
core                 -0.150
Proficiency          0.147
Manage               0.140
hardware             -0.132
Lead                 -0.129
This_is              -0.125
you_will             0.118
opportunities        0.117


Topic 275
how                  0.172
This_is              0.163
products.            0.147
based_on             -0.144
applications.        -0.138
appropriate          0.135
current              -0.134
providing            0.133
hardware             -0.127
assigned             -0.125


Topic 276
access               0.230
standards            -0.189
operational          -0.176
?                    0.160
current              -0.137
base

2+_years             -0.181
You_are              -0.152
Android              -0.148
3+_years             0.146
success              0.128
strategic            -0.126
Participate_in       -0.124
critical             -0.122
iOS                  -0.116
Responsible_for      0.115


Topic 330
communicate          -0.163
Responsible_for      -0.148
functional           -0.130
assigned             0.128
appropriate          -0.121
Working              -0.120
stakeholders         -0.117
Linux                0.113
Computer_Science_or  -0.111
plan                 0.108


Topic 331
applications.        0.194
functional           -0.169
overall              -0.151
Computer_Science_or  0.143
Android              -0.139
Participate_in       0.127
You_are              0.117
team,                0.115
way                  -0.110
Able                 -0.110


Topic 332
stakeholders         -0.175
monitoring           0.159
writing              0.153
preferred.           -0.145
advanced             -0.1

team,                -0.209
planning             0.124
game                 0.121
changes              -0.121
communication        0.119
stakeholders         -0.118
portfolio            -0.111
them                 0.105
advanced             0.104
job                  -0.104


Topic 389
environments         0.157
UI                   -0.148
team,                0.139
analysis,            0.129
effective            -0.127
experiences          0.122
management,          0.113
components           0.110
practices            0.110
Maintain             0.101


Topic 390
Proficient           -0.192
or_more              -0.145
creative             0.133
execution            0.132
perform              0.122
communicate          -0.121
City                 0.113
following            0.110
analysis,            -0.110
years                0.109


Topic 391
developer            -0.161
systems,             0.149
job                  0.146
team,                -0.145
creative             0.142
Requir

is_looking_for       0.147
you_will_be          -0.124
efforts              -0.115
tasks                -0.114
written              -0.108
results              -0.104
procedures           -0.103
contribute           -0.101
video                0.092
is_responsible_for   0.090


Topic 448
â€¢                  0.138
focus_on             -0.136
background           0.135
services.            0.113
contribute           0.106
requirements.        0.100
degree               0.100
Ensure               -0.096
Windows              -0.092
tasks                0.090


Topic 449
degree               -0.119
management,          -0.117
tools,               0.114
Manager              -0.112
assist               -0.111
focus_on             -0.111
collaboration        0.104
PS                   0.096
data_center          -0.096
API                  -0.095


Topic 450
Expertise            0.165
data,                -0.137
while                -0.122
(e.g.                0.119
â€¢                  -0.118

## SpaCy, this time using a corpus that only consists of skill-phrases

In [205]:
%%time
# Note that the data itself (line 9) is not "original_texts", which is the raw text;
# it's "parsed_dataset", which is the collection of POS-tagged skill-phrases

import codecs
# This is time consuming; make the if statement True to run
if 0 == 0:
    with codecs.open('spacy_jds_concat_parsed_POS.txt', 'w', encoding='utf_8') as f:
        for sentence in parsed_dataset:
            f.write(sentence + '\n')

CPU times: user 146 ms, sys: 36.1 ms, total: 182 ms
Wall time: 183 ms


In [206]:
unigram_sentences = LineSentence('spacy_jds_concat_parsed_POS.txt')

In [207]:
%%time

bigram_model = Phrases(unigram_sentences)
bigram_model.save('spacy_bigram_model_all_PARSED_POS')

  if all(parameter in getargspec(scoring)[0] for parameter in scoring_parameters):
  if all(parameter in getargspec(scoring)[0] for parameter in scoring_parameters):
  if all(parameter in getargspec(scoring)[0] for parameter in scoring_parameters):
  if all(parameter in getargspec(scoring)[0] for parameter in scoring_parameters):
  if all(parameter in getargspec(scoring)[0] for parameter in scoring_parameters):
  if all(parameter in getargspec(scoring)[0] for parameter in scoring_parameters):


CPU times: user 20.9 s, sys: 284 ms, total: 21.2 s
Wall time: 21.2 s


In [208]:
# load the finished model from disk
bigram_model = Phrases.load('spacy_bigram_model_all_PARSED_POS')

In [209]:
%%time

with codecs.open('spacy_bigram_sentences_PARSED_POS.txt', 'w', encoding='utf_8') as f:
    for unigram_sentence in unigram_sentences:
        bigram_sentence = u' '.join(bigram_model[unigram_sentence])
        f.write(bigram_sentence + '\n')



CPU times: user 41.4 s, sys: 212 ms, total: 41.7 s
Wall time: 41.9 s


In [210]:
bigram_sentences = LineSentence('spacy_bigram_sentences_PARSED_POS.txt')

In [211]:
# print examples; certain bigrams are underlined
# for bigram_sentence in it.islice(bigram_sentences, 240, 250):
#     print(u' '.join(bigram_sentence))
#     print(u'')

In [212]:
%%time
trigram_model = Phrases(bigram_sentences)
trigram_model.save('spacy_trigram_model_all_PARSED_POS')

  if all(parameter in getargspec(scoring)[0] for parameter in scoring_parameters):
  if all(parameter in getargspec(scoring)[0] for parameter in scoring_parameters):
  if all(parameter in getargspec(scoring)[0] for parameter in scoring_parameters):
  if all(parameter in getargspec(scoring)[0] for parameter in scoring_parameters):
  if all(parameter in getargspec(scoring)[0] for parameter in scoring_parameters):
  if all(parameter in getargspec(scoring)[0] for parameter in scoring_parameters):


CPU times: user 20.8 s, sys: 305 ms, total: 21.1 s
Wall time: 21.1 s


In [213]:
# load the finished model from disk
trigram_model = Phrases.load('spacy_trigram_model_all_PARSED_POS')

In [214]:
%%time

with codecs.open('spacy_trigram_sentences_PARSED_POS.txt', 'w', encoding='utf_8') as f:
    for bigram_sentence in bigram_sentences:
        trigram_sentence = u' '.join(trigram_model[bigram_sentence])
        f.write(trigram_sentence + '\n')



CPU times: user 37.4 s, sys: 246 ms, total: 37.6 s
Wall time: 38 s


In [215]:
trigram_sentences = LineSentence('spacy_trigram_sentences_PARSED_POS.txt')

In [216]:
# for trigram_sentence in it.islice(trigram_sentences, 240, 250):
#     print(u' '.join(trigram_sentence))
#     print(u'')

#### Final preprocessing

In [217]:
%%time

with codecs.open('spacy_trigram_transformed_jds_all_PARSED_POS.txt', 'w', encoding='utf_8') as f:
    for parsed_review in nlp.pipe(line_review(original_texts),
                                  batch_size=10000, n_threads=4):
        # lemmatize the text, removing punctuation and whitespace
        unigram_review = [token.lemma_ for token in parsed_review
                          if not punct_space(token)]

        # apply the first-order and second-order phrase models
        bigram_review = bigram_model[unigram_review]
        trigram_review = trigram_model[bigram_review]

        # remove any remaining stopwords
        trigram_review = [term for term in trigram_review
                          if term not in stopwords]

        # write the transformed review as a line in the new file
        trigram_review = u' '.join(trigram_review)
        f.write(trigram_review + '\n')



CPU times: user 1h 39min 13s, sys: 1h 46min 23s, total: 3h 25min 37s
Wall time: 2h 47min 42s


In [218]:
print(u'Original:' + u'\n')

for review in it.islice(line_review(original_texts), 11, 12):
    print(review)

print(u'----' + u'\n')
print(u'Transformed:' + u'\n')

with codecs.open('spacy_trigram_transformed_jds_all_PARSED_POS.txt', encoding='utf_8') as f:
    for review in it.islice(f, 11, 12):
        print(review)

Original:

Position Overview  Are you a full stack Software Engineer looking for an opportunity to work in a fast paced and fun environment? Then consider joining us here at Skillsoft in Rochester, NY. We are building SaaS platforms and client applications for our content delivery products with the goal of being the “best in the industry”. Our products allow 6000+ customers consume content from a variety of sources with a strong focus on video, from any device, with features that fit the needs of learner. We believe that our customers deserve the best learning experiences and we are delivering on those needs with the products you will be working on. You could be the one to help us achieve this goal!  We are looking for exceptional developers to join our team and willing to learn new technology in this rapidly evolving world. You will build front end customer facing product features using cutting edge technologies while implementing REST APIs and other back-end functionality as needed. 

In [220]:
%%time

trigram_reviews = LineSentence('spacy_trigram_sentences_PARSED_POS.txt')

# learn the dictionary by iterating over all of the reviews
trigram_dictionary = Dictionary(trigram_reviews)

# filter tokens that are very rare or too common from
# the dictionary (filter_extremes) and reassign integer ids (compactify)
trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
trigram_dictionary.compactify()

trigram_dictionary.save('spacy_trigram_dict_all_POS.dict')
    
# load the finished dictionary from disk
trigram_dictionary = Dictionary.load('spacy_trigram_dict_all_POS.dict')

CPU times: user 7.83 s, sys: 178 ms, total: 8.01 s
Wall time: 8.22 s


In [221]:
def trigram_bow_generator(filepath):
    """
    generator function to read reviews from a file
    and yield a bag-of-words representation
    """    
    for review in LineSentence(filepath):
        yield trigram_dictionary.doc2bow(review)

In [226]:
%%time

# generate bag-of-words representations for all JDs and save them as a matrix
MmCorpus.serialize('spacy_trigram_bow_corpus_all_POS.mm',
                   trigram_bow_generator('spacy_trigram_sentences_PARSED_POS.txt'))
    
# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus('spacy_trigram_bow_corpus_all_POS.mm')

CPU times: user 11.8 s, sys: 135 ms, total: 11.9 s
Wall time: 12 s


### Modeling

In [230]:
%%time

with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    # workers => sets the parallelism, and should be
    # set to your number of physical cores minus one
    lsi = gensim.models.LsiModel(trigram_bow_corpus, 
                                 id2word=trigram_dictionary, 
                                 num_topics=50)
    
    lsi.save('spacy_lsi_model_all_POS')

CPU times: user 20.7 s, sys: 662 ms, total: 21.4 s
Wall time: 16.7 s


In [238]:
# load the finished LDA model from disk
lsi = LsiModel.load('spacy_lsi_model_all_POS')

In [239]:
def explore_topic(topic_number, topn=10):
    """
    accept a user-supplied topic number and
    print out a formatted list of the top terms
    """
        
#     print(u'{:20} {}'.format(u'term', u'frequency') + u'')

    for term, frequency in lsi.show_topic(topic_number, topn=60):
        print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))

In [240]:
for i in range(50):
    print("\n\nTopic %s" % str(i+1))
    explore_topic(topic_number=i)



Topic 1
sale                 0.179
support              0.178
client               0.152
position             0.144
platform             0.135
information          0.134
analysis             0.133
’s                   0.130
program              0.130
knowledge            0.126
manager              0.125
role                 0.124
industry             0.121
problem              0.120
quality              0.120
database             0.111
user                 0.108
marketing            0.108
web                  0.106
organization         0.105
security             0.105
test                 0.104
performance          0.101
level                0.100
or                   0.100
need                 0.098
strategy             0.097
market               0.096
practice             0.095
cloud                0.093
people               0.093
partner              0.091
research             0.090
job                  0.089
complex              0.089
professional         0.086
issue             

performance          0.234
manager              -0.233
’s                   -0.194
analysis             -0.169
professional         0.169
network              0.149
code                 0.145
analytics            0.130
automation           -0.129
job                  -0.121
healthcare           0.116
information          -0.113
aws                  0.103
infrastructure       -0.103
science              -0.101
issue                0.101
problem              0.099
knowledge            -0.097
market               0.096
implementation       -0.095
world                -0.091
enterprise           -0.090
need                 0.089
health               0.084
mobile               -0.081
web                  -0.081
integration          -0.080
strategy             -0.076
model                -0.076
complex              0.075
cloud                0.069
report               0.066
level                0.065
employee             0.063
role                 0.060
analyst              0.059
control    