In [99]:
from job_database import JobsDb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.pipeline import Pipeline
import inspect
from IPython.display import Markdown, display
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(word) for word in word_tokenize(doc) if word.isalpha()]

def display_object(obj):
    """Summary of display_object. Get source code for the provided object 
    and display in notebook as markdown.
    """
    source = inspect.getsource(obj)
    wrapped_source = f'```python\n{source}\n```'
    markdown_source = Markdown(wrapped_source)
    display(markdown_source)

In [100]:
query = """
SELECT * from jobs
WHERE title LIKE '%data%'
OR description LIKE '%data%';
"""
db = JobsDb()
data = db.load_query_as_df(query)
db.close()

In [101]:
descriptions = df['description'].tail(100)

In [102]:
pipe = Pipeline(
    steps = [
        ('vectorizer', CountVectorizer(
            encoding='utf-8',
            decode_error='ignore',
            strip_accents='unicode',
            lowercase=True,
            preprocessor=None,
            tokenizer=LemmaTokenizer(),
            stop_words='english',
            ngram_range=(1,4),
            analyzer='word',
            max_df=0.95,
            min_df=0.05,
            max_features=None
                                      )),
        ('topicModel', LatentDirichletAllocation())
    ]
)

In [103]:
%time pipe.fit(descriptions)

CPU times: user 5.04 s, sys: 116 ms, total: 5.16 s
Wall time: 3.31 s


Pipeline(steps=[('vectorizer',
                 CountVectorizer(decode_error='ignore', max_df=0.95,
                                 min_df=0.05, ngram_range=(1, 4),
                                 stop_words='english', strip_accents='unicode',
                                 tokenizer=<__main__.LemmaTokenizer object at 0x7f169c6c25f8>)),
                ('topicModel', LatentDirichletAllocation())])

In [104]:
%time pipe.transform(descriptions)

CPU times: user 1.43 s, sys: 11.8 ms, total: 1.45 s
Wall time: 1.34 s


array([[5.46676568e-04, 5.46528237e-04, 5.46510236e-04, 5.46507308e-04,
        5.46477617e-04, 5.46595745e-04, 5.46526860e-04, 5.46513389e-04,
        9.95081159e-01, 5.46504695e-04],
       [5.74825930e-04, 5.74904934e-04, 5.74830156e-04, 3.69754485e-02,
        5.74750819e-04, 4.37970036e-01, 5.74811690e-04, 5.74798378e-04,
        2.43050384e-01, 2.78555209e-01],
       [6.85083670e-04, 6.85116094e-04, 4.65633396e-02, 8.42943267e-02,
        6.84956736e-04, 4.91637511e-01, 6.85101458e-04, 6.85058393e-04,
        1.36190723e-01, 2.37888783e-01],
       [4.42589172e-04, 4.42600319e-04, 4.42546655e-04, 4.42556945e-04,
        4.42552687e-04, 9.26220712e-01, 7.02386683e-02, 4.42598115e-04,
        4.42606936e-04, 4.42568493e-04],
       [4.44538986e-04, 4.44587295e-04, 4.44529561e-04, 1.25851347e-01,
        4.44482039e-04, 8.70592313e-01, 4.44566809e-04, 4.44560027e-04,
        4.44565122e-04, 4.44510234e-04],
       [9.85480861e-01, 1.61315353e-03, 1.61320693e-03, 1.61312168e-03,
   

In [105]:
pipe.named_steps['vectorizer'].get_feature_names()

['ability',
 'ability communicate',
 'ability manage',
 'ability work',
 'ability work effectively',
 'ability work independently',
 'able',
 'accept',
 'access',
 'accommodation',
 'according',
 'account',
 'accountability',
 'accuracy',
 'accurate',
 'achieve',
 'acquisition',
 'act',
 'action',
 'action employer',
 'action employer qualified',
 'action employer qualified applicant',
 'actionable',
 'active',
 'actively',
 'activity',
 'acumen',
 'addition',
 'additional',
 'address',
 'advance',
 'advanced',
 'advanced degree',
 'advancing',
 'affirmative',
 'affirmative action',
 'age',
 'age marital',
 'age marital status',
 'age national',
 'age national origin',
 'agency',
 'agile',
 'ai',
 'aid',
 'air',
 'airflow',
 'algorithm',
 'alignment',
 'allow',
 'analysis',
 'analyst',
 'analytic',
 'analytical',
 'analytical skill',
 'analytics',
 'analytics data',
 'analytics solution',
 'analytics team',
 'analytics tool',
 'analyze',
 'analyzing',
 'ancestry',
 'annual',
 'answer',