# Assumption 1

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from glob import glob
import re
import seaborn as sns
import pickle
import joblib
from typing import Union
import nltk
import concurrent.futures

# Lift the column display limit so wide DataFrames are shown without elided columns.
pd.set_option('display.max_columns', None) 

%matplotlib inline

In [None]:
# Candidate subset of posting columns (15 names), one per line for easy diffing.
small_list = [
    'JobID',
    'CleanJobTitle',
    'CanonState',
    'CanonCounty',
    'CanonCity',
    'JobText',
    'JobDate',
    'CanonEmployer',
    'Latitude',
    'Longitude',
    'InternshipFlag',
    'IsDuplicate',
    'CanonPostalCode',
    'CanonYearsOfExperienceLevel',
    'BGTSubOcc',
]

In [None]:
# Minimal column subset: the names listed here are what `usecols` receives when the CSV is read.
even_smaller_list = [
    'JobID',
    'CleanJobTitle',
    'JobText',
    'CanonEmployer',
    'JobDate',
]

In [None]:
# Column -> dtype mapping handed to `pd.read_csv`.
# The builtin `str` is used for text columns: `np.str` was only an alias for it,
# was deprecated in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
dtypes = {
    'JobID': str, 'CleanJobTitle': str, 'JobDomain': str,
    'CanonCity': str, 'CanonCountry': str, 'CanonState': str,
    'JobText': str, 'JobURL': str, 'PostingHTML': np.float64,
    'Source': str, 'JobReferenceID': str, 'Email': str,
    'CanonEmployer': str, 'Latitude': str, 'Longitude': str,
    'CanonIntermediary': str, 'Telephone': str, 'CanonJobTitle': 'object',
    'CanonCounty': str, 'DivisionCode': np.float64, 'MSA': str, 'LMA': str,
    'InternshipFlag': str, 'ConsolidatedONET': np.float64, 'CanonCertification': str,
    'CanonSkillClusters': str, 'CanonSkills': str, 'IsDuplicate': str,
    'IsDuplicateOf': np.float64, 'CanonMaximumDegree': str, 'CanonMinimumDegree': str,
    'CanonOtherDegrees': str, 'CanonPreferredDegrees': str,
    'CanonRequiredDegrees': str, 'CIPCode': str, 'StandardMajor': str,
    'MaxExperience': np.float64, 'MinExperience': np.float64, 'ConsolidatedInferredNAICS': np.float64,
    'BGTOcc': str, 'MaxAnnualSalary': np.float64, 'MaxHourlySalary': np.float64,
    'MinAnnualSalary': np.float64, 'MinHourlySalary': np.float64, 'YearsOfExperience': str,
    'CanonJobHours': str, 'CanonJobType': str, 'CanonPostalCode': str,
    'CanonYearsOfExperienceCanonLevel': str, 'CanonYearsOfExperienceLevel': str,
    'ConsolidatedTitle': str, 'Language': str, 'BGTSubOcc': str, 'JobDate': str,
    'ConsolidatedDegreeLevels': str, 'MaxDegreeLevel': np.float64, 'MinDegreeLevel': np.float64,
}

Add below the dataset for the week you'd like to test.

In [None]:
%%time

# Read only the columns named in `even_smaller_list`, typed via `dtypes`,
# with `JobDate` parsed into datetimes.
df = pd.read_csv(
    'data_18_0806_0812.csv',
    usecols=even_smaller_list,
    dtype=dtypes,
    parse_dates=['JobDate'],
    low_memory=False,
)

Check the true memory it is occupying in your computer.

In [None]:
# memory_usage='deep' makes pandas inspect the objects held in object-dtype columns,
# so the reported footprint includes the string contents themselves.
df.info(memory_usage='deep')

Calculate the percentage of missing values.

In [None]:
# The mean of the boolean null-mask is the missing fraction per column; scale to percent.
df.isna().mean() * 100

Filter out observations without a job description.

In [None]:
# Keep only postings that actually have a description.
df = df.dropna(subset=['JobText'])

Clean the `JobText` column, compute the length (in characters) of the job descriptions, and convert clean text into lower case.

In [None]:
%%time

# Flatten each description onto one line: split on newlines, drop empty pieces,
# and rejoin the remaining pieces with single spaces.
df['clean_text'] = df['JobText'].apply(
    lambda text: ' '.join(piece for piece in text.split('\n') if piece)
)
# Character count of the flattened description.
df['len_text'] = df['clean_text'].map(len)
# Lower-cased copy used for keyword matching.
df['low_clean'] = df['clean_text'].str.lower()

### Keywords

In [None]:
# Keyword lists used for substring matching against the lower-cased description.
# Entries padded with spaces only match when surrounded by spaces; unpadded entries
# match anywhere, including inside longer words.
# Note: ' interns ' and ' intern ' appear in BOTH lists.

# Keywords treated as "downward looking".
down_ward = [
    ' will supervise ',
    'supervises',
    ' interns ',
    ' intern ',
    ' guides ',
    ' mentors ',
    ' leads ',
    ' lead ',
    'oversees',
    'will guide',
    ' be in charge of ',
    ' mentor ',
    'coaching',
    'mentoring',
    'coordinating',
    'building teams',
    'guiding',
    'advising',
    'setting performance standards',
    'resolving conflicts',
    'responsibility for outcomes',
    'directs',
    'appoints',
    'instructs',
    'recruits',
    'manages',
]

# Keywords treated as "upward looking".
up_ward = [
    ' interns ',
    ' intern ',
    'reports to ',
    'report to ',
    'answers to',
    ' managed by ',
    ' responds to ',
    ' directed by ',
    ' receives guidance ',
    ' supervised by ',
    'assists',
    'supports',
    'helps',
]

##### Keywords approach.

1. Identify the keywords above
2. Convert the boolean result into integer type
3. Replace downward looking keywords with a 3
4. Subtract upward looking from downward looking to get the mid
5. Replace negative instances of upward looking with a positive 1
6. Change 0's to `NaN`
7. Create a bucket with labels
    - High == downward looking
    - mid == both downward and upward looking keywords present
    - low == upward looking
8. Print value counts

In [None]:
%%time

# Alternation patterns for the two keyword families. Every alternative here is padded
# with spaces, which is stricter than some unpadded entries in the `down_ward` /
# `up_ward` lists (e.g. 'supervises').
down_pattern = (
    ' will supervise | supervises | interns | intern | guides | mentors '
    '| leads | lead | oversees | will guide | be in charge of | mentor '
    '| coaching | mentoring | coordinating | building teams | guiding '
    '| advising | setting performance standards | resolving conflicts '
    '| responsibility for outcomes | directs | appoints | instructs '
    '| recruits | manages'
)
up_pattern = (
    ' interns | intern | reports to | report to | answers to | managed by '
    '| responds to | directed by | receives guidance | supervised by '
    '| assists | supports | helps'
)

has_down = df['low_clean'].str.contains(down_pattern, regex=True)
has_up = df['low_clean'].str.contains(up_pattern, regex=True)

# Downward hits are encoded as 3 and upward hits as 1.
df['down_ward'] = has_down.astype(np.int8).replace(1, 3)
df['upward'] = has_up.astype(np.int8)

# 3 - 1 = 2 (both), 3 - 0 = 3 (down only), 0 - 1 = -1 -> 1 (up only), 0 -> NaN (neither).
df['all_levels'] = (
    (df['down_ward'] - df['upward'])
    .replace(-1, 1)
    .replace(0, np.nan)
)

labels_dict = {1.0: 'low', 2.0: 'mid', 3.0: 'high'}
df['bucket_label'] = df['all_levels'].map(labels_dict)
df['bucket_label'].value_counts()

## Smaller sample for testing assumption

Filter out the observations that did not receive a bucket label.

In [None]:
%%time

# Keep only rows that matched at least one keyword family (i.e. have a label).
labelled_mask = df['bucket_label'].notna()
df_dos = df.loc[labelled_mask].copy()

Release some memory from your computer.

In [None]:
# Drop the name bound to the full frame so its memory can be reclaimed.
# Cells above that use `df` must be re-run from the load step after this point.
del df

Take a random sample of 50k.

In [None]:
# Draw up to 50,000 rows. The request is capped at the number of available rows so a
# small week does not raise "Cannot take a larger sample than population", and the
# seed is fixed so the same rows are drawn when the notebook is re-run from the top.
sample_size = min(50000, len(df_dos))
df_dos = df_dos.sample(n=sample_size, random_state=42)
df_dos.shape

In [None]:
# stop_words = nltk.corpus.stopwords.words('english')

Deep cleaning function. We want to keep the stop words in the text, so the stop-word list above and the filtering line inside the function are commented out.

In [None]:
def normalize_doc(doc):
    """
    Normalize a single document.

    Removes every character that is not an ASCII letter, digit or whitespace,
    lower-cases and strips the text, tokenizes it with `nltk.word_tokenize`,
    and joins the tokens back with single spaces. Stop words are kept on
    purpose.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        The normalized document.
    """
    # The flags must be passed by keyword: the fourth positional argument of re.sub
    # is `count`, so passing `re.I|re.A` there capped the substitution at 258 matches
    # and applied no flags at all.
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower().strip()
    tokens = nltk.word_tokenize(doc)
    # Stop words are intentionally not filtered out here.
    return ' '.join(tokens)

# Element-wise wrapper so the normalizer can be applied to arrays of documents.
corp_normalizer = np.vectorize(normalize_doc)

Clean the clean text using all of the cores in your computer.

In [None]:
%%time

# chunksize batches documents per inter-process round trip; the default of 1 ships
# every document to a worker individually. The results are materialized into a list
# so later cells can be re-run without hitting an exhausted iterator.
with concurrent.futures.ProcessPoolExecutor() as executor:
    results = list(
        executor.map(corp_normalizer, df_dos['clean_text'].values, chunksize=500)
    )

Extract the elements out and assign it back to the same variable.

In [None]:
%%time

# Pull every normalized document out of `results` and overwrite `low_clean` with them.
extract_results = list(results)
df_dos['low_clean'] = extract_results

In [None]:
# Spot-check the first normalized document.
extract_results[0]

In [None]:
# Cast to plain Python strings. The builtin `str` replaces `np.str`, an alias that
# was removed in NumPy 1.24 and raises AttributeError there.
df_dos['low_clean'] = df_dos['low_clean'].astype(str)

In [None]:
# Summary statistics of description length in characters (measured on `clean_text`).
df_dos['len_text'].describe()

Get every word instance as a boolean variable.

In [None]:
%%time

# Add one boolean column per keyword, named after the stripped keyword: True when the
# keyword occurs in the normalized text, False otherwise.
# regex=False matches the keyword literally, so a keyword containing a regex
# metacharacter cannot change the meaning of the search.
# ' interns ' and ' intern ' are in both lists; the second pass rewrites the same
# column with identical values.
for word in down_ward + up_ward:
    df_dos[word.strip()] = df_dos['low_clean'].str.contains(word, regex=False)

In [None]:
# Column names of the per-keyword flags: the keywords without their padding spaces.
up_stripped = list(map(str.strip, up_ward))
down_stripped = list(map(str.strip, down_ward))

Sum the number of distinct keywords found in each observation.

In [None]:
# Row-wise count of upward keywords that were found (True counts as 1).
up_flags = df_dos[up_stripped]
df_dos['up_instances'] = up_flags.sum(axis=1)
df_dos['up_instances'].head()

In [None]:
# Row-wise count of downward keywords that were found (True counts as 1).
down_flags = df_dos[down_stripped]
df_dos['down_instances'] = down_flags.sum(axis=1)
df_dos['down_instances'].head()

In [None]:
# Preview the frame with the per-keyword flag columns and the two instance counts.
df_dos.head()

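For each keyword, extract a snippet of up to 60 characters starting at the keyword's first occurrence in the text (empty when the keyword is absent). This overwrites the boolean keyword columns created above.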

In [None]:
%%time

def get_words(word: str, string: str) -> Union[str, None]:
    """
    Return up to 60 characters of `string` starting at the first occurrence
    of `word`, or None when `word` does not occur in `string`.
    """
    start = string.find(word)
    if start == -1:
        return None
    return string[start:start + 60]
    
# Replace each keyword column with the snippet found by `get_words`
# (None where the keyword is absent), upward keywords first, then downward.
for keyword in [*up_ward, *down_ward]:
    column = keyword.strip()
    df_dos[column] = df_dos['low_clean'].apply(
        lambda text, kw=keyword: get_words(kw, text)
    )

In [None]:
# Preview the frame now that the keyword columns hold text snippets.
df_dos.head()

Save the dataset.

In [None]:
# Output directory, relative to the kernel's working directory. The trailing slash is
# required because the file name is appended with plain string concatenation.
path = 'Dropbox/Burning Glass/Analysis/company_data/'

In [None]:
# Report the in-memory size of the sample, including string contents, before writing it out.
df_dos.info(memory_usage='deep')

In [None]:
%%time

# Write the sample, without the index column, into the output directory.
output_file = f'{path}keywords_check_2018_august.csv'
df_dos.to_csv(output_file, index=False)