In [1]:
import argparse
import spacy
import gensim
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from nltk.corpus import stopwords
import en_core_web_sm

# yes it's a thing
import nltk

In [2]:
# Read the sample resume that serves as the example user input.
# NOTE(review): absolute, machine-specific path; it will not resolve on
# another machine.
with open("/Users/jlamb/repos/w210/skills/sandbox/data_science/app_files/sample_resume.txt", "r") as resume_file:
    user_input_text = resume_file.read()

In [24]:
# Relative paths of the serialized artifacts that this notebook loads with
# Dictionary.load, Phrases.load and LdaModel.load.
trigram_dict_file = "models/trigram_dictionary.dict"
bigram_model_file = "models/bigram_model_pos"
trigram_model_file = "models/trigram_model_pos"
lda_model_file = "models/lda_alpha_eta_auto_27"

In [22]:
# Load the trained dictionary, phrase models and LDA model from the
# configured paths.
trigram_dictionary = Dictionary.load(trigram_dict_file)
bigram_model = Phrases.load(bigram_model_file)
trigram_model = Phrases.load(trigram_model_file)
# Fixed: the path variable is `lda_model_file`. `lda_model` is never defined
# at module level in this notebook, so the original line raised NameError on
# a fresh kernel.
lda = LdaModel.load(lda_model_file)

In [5]:
# Get list of hard skills (one entry per line, surrounding whitespace stripped)
with open('models/hard_skills.txt', 'r') as infile:
    all_hard_skills = [line.strip() for line in infile]

# Load globals
nlp = spacy.load('en')
# Resolve the corpus through the `nltk` package rather than through the
# imported `stopwords` name. This assignment rebinds `stopwords` to a plain
# list, so the original `stopwords.words(...)` raised AttributeError as soon
# as the cell was executed a second time.
stopwords = nltk.corpus.stopwords.words('english')

In [27]:

# Human-readable label for each topic number. Labels starting with
# "* Meta Job Description Topic" are the ones the author marked as meta topics.
# NOTE: a later cell in this notebook rebinds `topic_names` with revised labels.
topic_names = {
    1: 'Consulting and Contracting',
    2: 'DevOps',
    3: '* Meta Job Description Topic: Students and Education',
    4: 'Finance and Risk',
    5: '* Meta Job Description Topic: Benefits',
    6: '* Meta Job Description Topic: Facebook Advertising',
    7: 'Aerospace and Flight Technology',
    8: '* Meta Job Description Topic: Soft Skills',
    9: 'Product Management',
    10: 'Compliance and Process/Program Management',
    11: 'Project and Program Management',
    12: '* Meta Job Description Topic: Generic',
    13: '* Meta Job Description Topic: EO and Disability',
    14: 'Healthcare',
    15: 'Software Engineering and QA',
    16: 'Accounting and Finance',
    17: 'Human Resources and People',
    18: 'Sales',
    19: '* Meta Job Description Topic: Startup-Focused',
    20: 'Federal Government and Defense Contracting',
    21: 'Web Development and Front-End Software Engineering',
    22: 'UX and Design',
    23: '* Meta Job Description Topic: Education-Focused',
    24: 'Academic and Medical Research',
    25: 'Data Science',
    26: '* Meta Job Description Topic: Non-Discrimination',
    27: 'Business Strategy',
}

In [6]:
def vectorize_input(input_doc, bigram_model, trigram_model, trigram_dictionary):
    """
    Turn a raw text document into phrase tokens and an LDA representation.

    (1) parse the input doc with spaCy (module-level `nlp`)
    (2) lemmatize, dropping punctuation and whitespace tokens
    (3) apply the bigram and then the trigram phrase model
    (4) create a bag-of-words representation
    (5) look the bag-of-words up in the module-level `lda` model

    Returns a tuple `(terms, document_lda)`: `terms` is the phrase-token
    list without entries found in the module-level `stopwords`, and
    `document_lda` is the value of `lda[bag_of_words]`.
    """
    # lemmas of every token that is neither punctuation nor whitespace
    lemmas = [tok.lemma_
              for tok in nlp(input_doc)
              if not tok.is_punct and not tok.is_space]

    # first-order, then second-order phrase model
    phrased = trigram_model[bigram_model[lemmas]]

    # stopwords are dropped only from the returned term list; the
    # bag-of-words below is built from the unfiltered phrase tokens
    kept_terms = []
    for term in phrased:
        if term not in stopwords:
            kept_terms.append(term)

    bag_of_words = trigram_dictionary.doc2bow(phrased)
    return kept_terms, lda[bag_of_words]

In [8]:
# Scratch cell: run the loaded spaCy pipeline on the resume text directly.
input_doc = user_input_text
parsed_doc = nlp(input_doc)

In [11]:
# Vectorize the resume: `user_skills` receives the stopword-filtered phrase
# tokens and `my_lda` the LDA representation returned by vectorize_input.
user_skills, my_lda = vectorize_input(user_input_text,
                                      bigram_model,
                                      trigram_model,
                                      trigram_dictionary)



In [82]:
class SkillRecommender:
    """
    Parse skills out of a text document and match the document against the
    topics of a trained LDA model.

    Typical use: construct, call `fit(text)` once per document, then call
    `predict(...)` to get the best-matching topics and their skills.
    """

    def __init__(self,
                 trigram_dictionary,
                 bigram_model,
                 trigram_model,
                 lda_model,
                 topic_names,
                 hard_skills=None):
        """
        Create an instance of the thing we'll use to do skill
        parsing and recommendation.

        Args:
            trigram_dictionary: object providing `doc2bow(tokens)`.
            bigram_model: first-order phrase model, applied as `model[tokens]`.
            trigram_model: second-order phrase model, applied as `model[tokens]`.
            lda_model: topic model supporting `model[bow]` and
                `show_topic(topic_number, topn=...)`.
            topic_names: mapping from topic number to display name.
            hard_skills: optional iterable of skill names to label as "hard".
                Defaults to no hard skills.
        """
        self.trigram_dictionary = trigram_dictionary
        self.bigram_model = bigram_model
        self.trigram_model = trigram_model
        self.lda_model = lda_model
        self.topic_names = topic_names
        # `None` instead of a mutable `[]` default; stored as a set for
        # fast intersection/difference in predict().
        self.hard_skills = set() if hard_skills is None else set(hard_skills)
        self.nlp = spacy.load('en')

    def fit(self, input_text):
        """
        Given a string with an input document, build its trigram document
        representation using the bigram and trigram models.

        Sets:
            self.trigram_doc: list of phrase tokens for the document.
            self.skills: set of those tokens that are not English stopwords.
        """
        # Imported locally under a distinct name: elsewhere in this notebook
        # the module-level name `stopwords` is rebound to a plain list, which
        # made the original `stopwords.words(...)` call raise AttributeError.
        from nltk.corpus import stopwords as nltk_stopwords

        # parse the input text with spaCy
        parsed_doc = self.nlp(input_text)

        # lemmatize the text and remove punctuation and whitespace
        unigram_doc = [token.lemma_
                       for token in parsed_doc
                       if not (token.is_punct or token.is_space)]

        # apply the first-order and second-order phrase models; materialize
        # the result because it is iterated here and again in predict()
        bigram_doc = self.bigram_model[unigram_doc]
        trigram_doc = list(self.trigram_model[bigram_doc])

        # Parse out skills (set membership is O(1) per token)
        stopword_set = set(nltk_stopwords.words('english'))

        self.trigram_doc = trigram_doc
        self.skills = {term for term in trigram_doc
                       if term not in stopword_set}

    def _label_skills(self, skill_set):
        """Split a set of skills into all / hard / other, as sorted lists."""
        return {
            "all": sorted(skill_set),
            "labeled": {
                "hard": sorted(skill_set.intersection(self.hard_skills)),
                "other": sorted(skill_set.difference(self.hard_skills))
            }
        }

    def predict(self, skills_text, num_jobs=3, skills_per_job=10):
        """
        Produce a JSON-style dict with an element called "predictions"
        that holds a list of job area matches, best match first. Each job
        has the following things:
        1. "job_name"
        2. "match_percent" = float in [0,1]. Match between user resume and job
        3. "skills" = a dictionary with lists of skills the document
           "has" and is "missing", each split into "all" and
           "labeled" ("hard" / "other")

        Args:
            skills_text: currently unused; predictions are based on the
                document passed to the most recent `fit` call.
            num_jobs: maximum number of job areas to return. Fewer are
                returned when the model reports fewer topics for the document.
            skills_per_job: number of top terms taken from each topic.

        Raises:
            AttributeError: if `fit` has not been called first.
        """

        # create a bag-of-words representation
        doc_bow = self.trigram_dictionary.doc2bow(self.trigram_doc)

        # create an LDA representation
        document_lda = self.lda_model[doc_bow]

        # sort topics in descending order by match probability
        sorted_doc_lda = sorted(document_lda,
                                key=lambda topic_match: -topic_match[1])

        # Initialize a dictionary for predictions
        preds = {
            "predictions": []
        }

        # Slicing (instead of indexing range(num_jobs)) avoids an IndexError
        # when fewer than `num_jobs` topics were returned for this document.
        for topic_number, match_percent in sorted_doc_lda[:max(num_jobs, 0)]:

            # get skills for this particular job
            skills_with_freq = self.lda_model.show_topic(topic_number,
                                                         topn=skills_per_job)

            # Get just the set of skill names for this job
            just_skills = {skill for skill, _ in skills_with_freq}

            # Get all the relevant skills designations
            has = just_skills.intersection(self.skills)
            missing = just_skills.difference(self.skills)

            # Grab the relevant information to serve back
            # NOTE(review): gensim topic ids are typically 0-based; confirm
            # that the keys of `topic_names` line up with the ids the model
            # returns, otherwise this lookup raises KeyError.
            prediction = {
                "job_name": self.topic_names[topic_number],
                # builtin float so the result can be JSON-serialized even if
                # the model hands back NumPy scalars
                "match_percent": float(match_percent),
                "skills": {
                    "has": self._label_skills(has),
                    "missing": self._label_skills(missing)
                }
            }

            preds["predictions"].append(prediction)

        return preds

In [83]:
# Create a SkillRecommender. The dictionary, phrase models and LDA model are
# loaded from disk again here rather than reusing the objects loaded in the
# earlier cell.
model = SkillRecommender(
    trigram_dictionary=Dictionary.load(trigram_dict_file),
    bigram_model=Phrases.load(bigram_model_file),
    trigram_model=Phrases.load(trigram_model_file),
    lda_model=LdaModel.load(lda_model_file),
    topic_names=topic_names,
    hard_skills=all_hard_skills
)

In [84]:
# fit to input text (populates model.trigram_doc and model.skills)
model.fit(user_input_text)

# get skills: the set of non-stopword phrase tokens found in the resume
skills = model.skills



In [85]:
# The `skills_text` value is a placeholder: predict() never reads that
# argument and works from the document given to the preceding fit() call.
prediction = model.predict(skills_text="hey")

In [101]:
import json

# JSON object keys must be double-quoted strings, so this payload is invalid.
# Catch and display the error so the cell demonstrates the failure without
# aborting a Restart & Run All.
try:
    x = json.loads("{1: 'hey'}")
except json.JSONDecodeError as err:
    x = None
    print("JSONDecodeError:", err)

JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)

In [87]:
# Display the phrase tokens that fit() stored for the resume.
model.trigram_doc

['leadership',
 'and',
 'experience',
 'head',
 'of',
 'data_science',
 'shift',
 '|',
 'berkeley',
 'ca|',
 '2017',
 'august',
 '2017',
 'present',
 'building',
 'a',
 'machine_learn',
 'system',
 'to',
 'match',
 'transition',
 'military',
 'service',
 'member',
 'to',
 'tech',
 'job',
 'additional',
 'focus',
 'on',
 'growth',
 'bi',
 'datum',
 'communication',
 'and',
 'evangelism',
 'serve',
 'as',
 'pm',
 'for',
 'data',
 'product',
 'data_science',
 'advisor',
 'commonlit',
 '|',
 'washington',
 'd.c.',
 '|',
 'september',
 '2017',
 'present',
 'advising',
 'a',
 'literacy',
 'nonprofit',
 'grow',
 'at',
 '500k',
 'user',
 'month',
 'to',
 'be',
 'data',
 'drive',
 'create',
 'business',
 'intelligence',
 'workflow',
 'and',
 'teach',
 'commonlit',
 'employee',
 'sql',
 'a',
 'b',
 'testing',
 'and',
 'datum',
 'visualization',
 'implement',
 'ml',
 'psychometric',
 'model',
 'design',
 'and',
 'implement',
 'a',
 'complex',
 'randomize',
 'control',
 'trial',
 'that',
 'success

In [88]:
# Scratch check: the type of an integer literal.
type(3)

int

In [92]:
# Scratch check: the type of the object returned by Dictionary.load.
type(trigram_dictionary)

gensim.corpora.dictionary.Dictionary

In [102]:
import pickle

In [103]:
# Revised topic labels: this assignment replaces the `topic_names` mapping
# defined earlier in the notebook. Topics 15 and 21 deliberately share the
# label 'Software Engineer'.
topic_names = {
    1: 'Consulting and Contracting',
    2: 'DevOps',
    3: '* Meta Job Description Topic: Students and Education',
    4: 'Finance and Risk',
    5: '* Meta Job Description Topic: Benefits',
    6: '* Meta Job Description Topic: Facebook Advertising',
    7: 'Aerospace and Flight Technology',
    8: '* Meta Job Description Topic: Soft Skills',
    9: 'Product Manager',
    10: 'Compliance and Process/Program Management',
    11: 'Project and Program Management',
    12: '* Meta Job Description Topic: Generic',
    13: '* Meta Job Description Topic: EO and Disability',
    14: 'Healthcare',
    15: 'Software Engineer',
    16: 'Accounting and Finance',
    17: 'Human Resources and People',
    18: 'Sales',
    19: '* Meta Job Description Topic: Startup-Focused',
    20: 'Federal Government and Defense Contracting',
    21: 'Software Engineer',
    22: 'UX Designer',
    23: '* Meta Job Description Topic: Education-Focused',
    24: 'Academic and Medical Research',
    25: 'Data Scientist',
    26: '* Meta Job Description Topic: Non-Discrimination',
    27: 'Business Strategy',
}

In [108]:
# Persist the topic-name mapping. The `with` block guarantees the file is
# flushed and closed; the original passed a bare open() handle that was
# never closed.
# NOTE(review): absolute, machine-specific path.
with open("/Users/jlamb/repos/w210/skills/app/resume_parsing/models/topic_names.pkl", "wb") as topic_names_file:
    pickle.dump(obj=topic_names, file=topic_names_file)

In [109]:
# Read the pickled mapping back to verify the round trip. The `with` block
# closes the file handle that the original left open.
# Only unpickle files you trust; this one was written by the cell above.
with open("/Users/jlamb/repos/w210/skills/app/resume_parsing/models/topic_names.pkl", "rb") as topic_names_file:
    x = pickle.load(topic_names_file)

In [111]:
# Confirm the type of the object that came back from pickle.load.
type(x)

dict

In [112]:
# Scratch check: parse a JSON array literal with json.loads.
thing = json.loads('[1,2,3,4]')

In [113]:
# Display the parsed value.
thing

[1, 2, 3, 4]