In [None]:
pip install -e ..

In [1]:
from src.data import data_cleaning, job_database

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.pipeline import Pipeline
import inspect
from IPython.display import Markdown, display
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return data_cleaning.doc_processor(doc)

def display_object(obj):
    """Summary of display_object. Get source code for the provided object 
    and display in notebook as markdown.
    """
    source = inspect.getsource(obj)
    wrapped_source = f'```python\n{source}\n```'
    markdown_source = Markdown(wrapped_source)
    display(markdown_source)

In [16]:
query = """
SELECT * from jobs
WHERE title LIKE '%data%'
OR description LIKE '%data%';
"""
db = job_database.JobsDb()
data = db.load_query_as_df(query)
db.close()

In [17]:
descriptions = data['description'].tail(100)

In [61]:
pipe = Pipeline(
    steps = [
        ('vectorizer', CountVectorizer(
            encoding='utf-8',
            decode_error='ignore',
            strip_accents='unicode',
            lowercase=True,
            preprocessor=None,
            tokenizer=LemmaTokenizer(),
            stop_words='english',
            ngram_range=(1,4),
            analyzer='word',
            max_df=0.95,
            min_df=0.05,
            max_features=None
                                      )),
        ('topicModel', LatentDirichletAllocation())
    ]
)

In [62]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [63]:
%time pipe.fit(descriptions)



CPU times: user 8.37 s, sys: 56 ms, total: 8.43 s
Wall time: 8.43 s


Pipeline(steps=[('vectorizer',
                 CountVectorizer(decode_error='ignore', max_df=0.95,
                                 min_df=0.05, ngram_range=(1, 4),
                                 stop_words='english', strip_accents='unicode',
                                 tokenizer=<__main__.LemmaTokenizer object at 0x7fbc5c7d6460>)),
                ('topicModel', LatentDirichletAllocation())])

In [64]:
%time pipe.transform(descriptions)

CPU times: user 6.87 s, sys: 40 ms, total: 6.91 s
Wall time: 6.92 s


array([[6.09650405e-02, 4.07564412e-02, 2.06638066e-01, 5.55672608e-04,
        3.98447445e-01, 5.55642794e-04, 5.55705428e-04, 5.55727681e-04,
        2.90414634e-01, 5.55625070e-04],
       [2.49653325e-01, 6.58067083e-04, 3.57972842e-01, 6.57996687e-04,
        8.38707931e-02, 6.58026698e-04, 6.58070926e-04, 6.58083094e-04,
        3.04554769e-01, 6.58026196e-04],
       [4.23862900e-04, 4.23923402e-04, 5.58352142e-01, 4.23848506e-04,
        2.31846475e-01, 4.23817032e-04, 2.06834454e-01, 4.23772735e-04,
        4.23845023e-04, 4.23860672e-04],
       [4.38732418e-04, 4.38790536e-04, 5.24694024e-01, 4.38700535e-04,
        4.38746855e-04, 4.38733036e-04, 2.52734438e-01, 4.38689229e-04,
        1.74439322e-01, 4.54998236e-02],
       [1.47091472e-03, 1.47072310e-03, 1.47094980e-03, 1.96813583e-01,
        3.95402405e-01, 1.47078503e-03, 3.00070024e-01, 1.47084269e-03,
        1.47079990e-03, 9.88889725e-02],
       [1.70695158e-04, 1.70684779e-04, 6.45388285e-01, 1.70696104e-04,
   

In [65]:
tokens2 = pipe.named_steps['vectorizer'].get_feature_names()

In [66]:
len(tokens)

1620

In [67]:
tokens

['000',
 '10',
 '10 years',
 '100',
 '12',
 '15',
 '2020',
 '40',
 '401',
 'aa',
 'abilities',
 'ability',
 'ability communicate',
 'ability manage',
 'ability work',
 'ability work effectively',
 'ability work independently',
 'able',
 'accept',
 'access',
 'accommodation',
 'accommodations',
 'according',
 'account',
 'accountability',
 'accuracy',
 'accurate',
 'achieve',
 'acquisition',
 'act',
 'action',
 'action employer',
 'action employer qualified',
 'action employer qualified applicants',
 'actionable',
 'active',
 'activities',
 'acumen',
 'ad',
 'addition',
 'additional',
 'address',
 'advanced',
 'advanced degree',
 'advancing',
 'affirmative',
 'affirmative action',
 'affirmative action employer',
 'affirmative action employer qualified',
 'age',
 'age marital',
 'age marital status',
 'age national',
 'age national origin',
 'agencies',
 'agency',
 'agile',
 'ai',
 'air',
 'airflow',
 'algorithms',
 'alignment',
 'allow',
 'analyses',
 'analysis',
 'analyst',
 'analytic'

In [68]:
tokens2

['ability',
 'ability communicate',
 'ability manage',
 'ability work',
 'ability work effectively',
 'ability work independently',
 'able',
 'accept',
 'access',
 'accommodation',
 'accord',
 'account',
 'accountability',
 'accuracy',
 'accurate',
 'achieve',
 'acquisition',
 'act',
 'action',
 'action employer',
 'action employer qualify',
 'action employer qualify applicant',
 'actionable',
 'active',
 'activity',
 'acumen',
 'addition',
 'additional',
 'address',
 'administer',
 'advance',
 'advance diversity',
 'advanced',
 'affirmative',
 'affirmative action',
 'age',
 'age marital',
 'age marital status',
 'age national',
 'age national origin',
 'agency',
 'agile',
 'ai',
 'aid',
 'air',
 'airflow',
 'algorithm',
 'algorithms',
 'align',
 'alignment',
 'allow',
 'analysis',
 'analyst',
 'analytic',
 'analytical',
 'analytical skill',
 'analytics',
 'analytics data',
 'analytics solution',
 'analytics team',
 'analytics tool',
 'analyze',
 'analyze data',
 'ancestry',
 'annual',