# 09 Clustering Specific Companies

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import re, csv, os
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from joblib import parallel_backend
from dask.distributed import Client
import joblib
import nltk
import concurrent.futures as cf
from glob import glob
from sklearn.preprocessing import FunctionTransformer


# Do not truncate columns when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Raise the csv module's per-field size limit to 10,000,000.
# NOTE(review): presumably for the long `clean_text` fields; this only affects
# readers built on the stdlib csv module - confirm it is still needed.
csv.field_size_limit(10000000)

# Render matplotlib figures inside the notebook output.
%matplotlib inline

In [2]:
# Create a dask.distributed client; the fitting cell below runs under
# joblib.parallel_backend('dask').
# NOTE(review): processes=False presumably keeps the workers as threads in this
# process - confirm against the Dask version in use.
client = Client(processes=False)

In [24]:
# Folder containing the filtered input CSVs (trailing slash is required: the
# file name is appended with plain string concatenation when loading).
path = '~/Dropbox/Burning Glass/Data/companies_76k/filtered_data_16/'
# Root folder for the CSV outputs written at the end of the notebook.
path_out = '~/Dropbox/Burning Glass/Analysis/approach_8'
# Tag interpolated into the saved model and output CSV file names.
num = 16
# Selects which `data_filtered_<fil_num>.csv` input file is loaded.
fil_num = '06'

In [25]:
# Columns actually read from the filtered postings file.
col_names = ['JobID', 'CleanJobTitle', 'CanonCity', 'CanonState', 'CanonPostalCode',
             'BGTOcc', 'clean_text', 'EmployerClean', 'JobDate']

# Force identifier-like columns to be read as text.
# The builtin `str` is used instead of `np.str`: `np.str` was only an alias of
# `str`, was deprecated in NumPy 1.20 and no longer exists in NumPy >= 1.24.
dtypes = {'JobID': str, 'CanonJobTitle': str, 'EmployerClean': str,
          'CleanJobTitle': str, 'CanonCity': str, 'CanonCounty': str,
          'CanonState': str, 'ConsolidatedTitle': str, 'BGTOcc': str,
          'JobDate': str, 'CanonPostalCode': str}

In [26]:
# Seniority / rank wording that `remove_titles` looks for in job titles.
to_remove = [
    'Assistant Manager', 'Deputy Manager', 'Manager', 'Senior Manager', 'General Manager',
    'Assistant Director', 'Deputy Director', 'Director', 'Senior Director',
    'Deputy Vice President', 'Vice President', 'Senior Vice President',
    'President', 'Chief',
]
# Lower-cased copy, in the same order, for comparison against lower-cased titles.
to_remove_low = list(map(str.lower, to_remove))

In [27]:
def remove_titles(doc):
    """Strip the rank phrases listed in `to_remove_low` from one job title.

    The title is tokenized with `nltk.word_tokenize` and every run of tokens
    that equals one of the phrases in `to_remove_low` is dropped. Phrases are
    tried longest first, so 'senior vice president' is removed as a whole
    rather than leaving 'senior vice' behind. (Comparing single tokens against
    the multi-word entries, as was done before, could never match them.)

    Matching is case-sensitive; callers lower-case the title first.

    Parameters
    ----------
    doc : str
        A single job title.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (may be empty).
    """
    tokens = nltk.word_tokenize(doc)

    # Each entry becomes a tuple of words; empty entries are skipped so that a
    # zero-length phrase can never "match" without consuming a token.
    phrases = sorted(
        {tuple(entry.split()) for entry in to_remove_low if entry.split()},
        key=len,
        reverse=True,
    )

    kept = []
    pos = 0
    while pos < len(tokens):
        for phrase in phrases:
            width = len(phrase)
            if tuple(tokens[pos:pos + width]) == phrase:
                pos += width  # skip the whole matched phrase
                break
        else:
            kept.append(tokens[pos])
            pos += 1

    return ' '.join(kept)

def normalize_doc(doc):
    """Normalize a single document to lower-case alphanumeric tokens.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed, the text is lower-cased and stripped, then re-tokenized with
    `nltk.word_tokenize` and joined back with single spaces.

    No stop-word filtering is applied; filter `tokens` before the join if
    that is ever needed.

    Parameters
    ----------
    doc : str
        One document (e.g. a job title).

    Returns
    -------
    str
        The normalized document.
    """
    # `flags` must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing the flags positionally silently capped the
    # number of replacements instead of setting any flag.
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower().strip()
    tokens = nltk.word_tokenize(doc)
    return ' '.join(tokens)


# Element-wise version of `normalize_doc` for arrays of documents.
corp_normalizer = np.vectorize(normalize_doc)

In [28]:
# files = glob('random_data/rand*.csv')
# files[:4]

In [29]:
# %%time


# def get_files(file):
#     return pd.read_csv(file, dtype=dtypes, 
#                      usecols=best_list, parse_dates=['JobDate'], low_memory=False)


# with cf.ThreadPoolExecutor() as executor:
#     results = executor.map(get_files, files)
    
# df = pd.concat(results)
# df.reset_index(drop=True, inplace=True)
# df.head()

In [30]:
# ddf = dd.read_csv(os.path.join(path, 'da*.csv'), 
#                  engine='python',
#                  dtype=dtypes,
#                  assume_missing=True,
#                  error_bad_lines=False,
#                  blocksize=None,
#                  usecols=col_names,
#                 )
# ddf

In [31]:
# ddf1 = ddf.map_partitions(lambda data: data.drop_duplicates(subset='CleanJobTitle'))

In [32]:
# Read the selected filtered postings file and keep one row per distinct job title.
postings_file = path + f'data_filtered_{fil_num}.csv'
df = pd.read_csv(
    postings_file,
    usecols=col_names,
    dtype=dtypes,
    low_memory=False,
)
df = df.drop_duplicates(subset='CleanJobTitle')
df.head()

Unnamed: 0,CanonCity,CanonState,CleanJobTitle,JobDate,JobID,CanonPostalCode,BGTOcc,EmployerClean,clean_text
0,New York,NY,Visual Merchandiser,2016-02-06,38018841872,10001,27-1026.92,Ethan Allen,Advertisement Visual Merchandiser - Part Ti...
1,New York,NY,Delivery Driver/Warehouse Teammate,2016-02-06,38019329664,10001,53-3032.00,Worldpac,Delivery Driver/Warehouse Teammate Company:...
2,Palatka,FL,Phlebotomist Prn,2016-02-09,38020414142,32177,31-9097.00,Hospital Corporation of America,Phlebotomist PRN( Job Number: 26760-349) ...
3,Waco,TX,Cosmetics Central Market Pl 296,2016-02-11,38021484381,76701,41-2031.00,Belk,Cosmetics Central Tx Market Pl Belk #296 Wa...
4,Sumter,SC,Cosmetics Mall 503,2016-02-11,38021482934,29150,41-1011.00,Belk,"Cosmetics Jessamine Mall Belk #503 Sumter, ..."


In [33]:
%%time

X_train, y_test = train_test_split(df, test_size=.2, random_state=42, shuffle=True)

X_train_2, y_test_2 = X_train.copy(), y_test.copy()

# %%time

# X_train_2['CleanJobTitle'] = X_train_2['CleanJobTitle'].str.lower().apply(lambda x: remove_titles(x, to_remove_low))
# y_test_2['CleanJobTitle'] = y_test_2['CleanJobTitle'].str.lower().apply(lambda x: remove_titles(x, to_remove_low))
# X_train_2.head(10)

rm_titles = np.vectorize(remove_titles)
X_train_2['CleanJobTitle'] = rm_titles(X_train_2['CleanJobTitle'].str.lower().values)
y_test_2['CleanJobTitle'] = rm_titles(y_test_2['CleanJobTitle'].str.lower().values)
y_test_2.head()

CPU times: user 6.74 s, sys: 223 ms, total: 6.97 s
Wall time: 7.56 s


Unnamed: 0,CanonCity,CanonState,CleanJobTitle,JobDate,JobID,CanonPostalCode,BGTOcc,EmployerClean,clean_text
46849,Memphis,TN,associate scrum master,2016-02-16,38023624246,37501,15-1199.95,Express Scripts,"Associate Scrum Master Locations: Memphis, ..."
26250,Kennesaw,GA,detailer,2016-02-11,38021250379,30144,53-7061.00,Carmax,"CARMAX Detailer in Kennesaw, Georgia Automo..."
10250,Reidsville,NC,cosmetics mall 31,2016-02-11,38021483110,27320,41-1011.00,Belk,Cosmetics Penrose Mall Belk # 31 Reidsville...
43130,Eden Prairie,MN,supervalu inc - front end developer,2016-02-17,38024197371,55343,15-1134.92,SuperValu,SUPERVALU Inc SUPERVALU Inc - Front End Develo...
2046,Sterling,VA,permit expeditor,2016-02-06,38019110077,20163,43-5061.00,Bohler Engineering,Permit Expeditor Permit Expeditor ...


In [34]:
def build_cluster_pipeline(suffix='', n_clusters=500, max_iter=1000, n_init=15,
                           min_df=10, max_df=.85, random_state=42):
    """Build the normalize -> TF-IDF -> KMeans pipeline used to cluster titles.

    Parameters
    ----------
    suffix : str
        Appended to every step name, so the second pipeline keeps its
        'preprocessor2' / 'km_model2' / 'km2' step names.
    n_clusters : int
        Number of clusters KMeans is asked for.
    max_iter : int
        `max_iter` passed to KMeans.
    n_init : int
        `n_init` passed to KMeans.
    min_df, max_df : int or float
        Document-frequency bounds passed to TfidfVectorizer.
    random_state : int
        Seed passed to KMeans.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Outer pipeline with a 'preprocessor<suffix>' step (normalizer + TF-IDF)
        and a 'km_model<suffix>' step (a one-step pipeline wrapping KMeans).
    """
    preprocessor = Pipeline([
        ('normalizer' + suffix, FunctionTransformer(corp_normalizer)),
        ('vect' + suffix, TfidfVectorizer(ngram_range=(1, 1), min_df=min_df, max_df=max_df)),
    ])

    # `n_jobs` is intentionally not passed to KMeans: the parameter was
    # deprecated in scikit-learn 0.23 and removed in 1.0, where it raises a
    # TypeError.
    # NOTE(review): on older releases n_jobs=None should defer to the joblib
    # backend that is active while fitting - confirm for the pinned version.
    km_model = Pipeline([
        ('km' + suffix, KMeans(n_clusters=n_clusters,
                               max_iter=max_iter,
                               n_init=n_init,
                               random_state=random_state)),
    ])

    return Pipeline([
        ('preprocessor' + suffix, preprocessor),
        ('km_model' + suffix, km_model),
    ])


# Pipeline fitted on the titles as they are ("with titles").
pipe = build_cluster_pipeline()
preprocessing = pipe.named_steps['preprocessor']
km_pipe = pipe.named_steps['km_model']

# Pipeline fitted on the titles with rank wording removed ("without titles").
pipe2 = build_cluster_pipeline(suffix='2')
preprocessing2 = pipe2.named_steps['preprocessor2']
km_pipe2 = pipe2.named_steps['km_model2']

In [None]:
%%time

with joblib.parallel_backend('dask'):
    pipe.fit(X_train['CleanJobTitle'].values)
    preprocessed_data = pipe.transform(X_train['CleanJobTitle'].values)
    pipe2.fit(X_train_2['CleanJobTitle'].values)
    preprocessed_data2 = pipe2.transform(X_train_2['CleanJobTitle'].values)



## With Titles

In [None]:
# Peek at the first ten cluster labels assigned to the training titles.
pipe.named_steps['km_model'].named_steps['km'].labels_[:10]

In [None]:
# --- Training set -----------------------------------------------------------
clusters = pipe['km_model']['km'].labels_
X_train['clusters'] = clusters
# `preprocessed_data` is the full pipeline's transform output; it is summed
# per row to give one score per title.
X_train['distance'] = preprocessed_data.sum(axis=1).round(2)
# Flag the rows whose score falls below the 5th percentile.
X_train['dist_dummy'] = np.where(X_train['distance'] < np.percentile(X_train['distance'], 5), 1, 0)

# --- Test set ---------------------------------------------------------------
test_transformed = pipe['preprocessor'].transform(y_test['CleanJobTitle'].values)
test_predict = pipe['km_model'].predict(test_transformed)
# Run the TF-IDF features through the KMeans step as well, so the test score
# is computed from the same quantity as the training score. Summing
# `test_transformed` directly would add up TF-IDF weights instead.
test_distances = pipe['km_model'].transform(test_transformed)
y_test['predictions'] = test_predict
y_test['distance'] = test_distances.sum(axis=1).round(2)
y_test['dist_dummy'] = np.where(y_test['distance'] < np.percentile(y_test['distance'], 5), 1, 0)

## Without Titles

In [None]:
# Peek at the first ten cluster labels from the title-stripped pipeline.
pipe2.named_steps['km_model2'].named_steps['km2'].labels_[:10]

In [None]:
# --- Training set -----------------------------------------------------------
clusters2 = pipe2['km_model2']['km2'].labels_
X_train_2['clusters2'] = clusters2
# `preprocessed_data2` is the full pipeline's transform output; it is summed
# per row to give one score per title.
X_train_2['distance2'] = preprocessed_data2.sum(axis=1).round(2)
# Flag the rows whose score falls below the 5th percentile.
X_train_2['dist_dummy2'] = np.where(X_train_2['distance2'] < np.percentile(X_train_2['distance2'], 5), 1, 0)

# --- Test set ---------------------------------------------------------------
test_transformed2 = pipe2['preprocessor2'].transform(y_test_2['CleanJobTitle'].values)
test_predict2 = pipe2['km_model2'].predict(test_transformed2)
# Run the TF-IDF features through the KMeans step as well, so the test score
# is computed from the same quantity as the training score. Summing
# `test_transformed2` directly would add up TF-IDF weights instead.
test_distances2 = pipe2['km_model2'].transform(test_transformed2)
y_test_2['predictions2'] = test_predict2
y_test_2['distance2'] = test_distances2.sum(axis=1).round(2)
y_test_2['dist_dummy2'] = np.where(y_test_2['distance2'] < np.percentile(y_test_2['distance2'], 5), 1, 0)

## Back into the Main Dataset

In [None]:
# Copy the "without titles" results onto the main train frame. `X_train` and
# `X_train_2` hold the same rows in the same order, so positional assignment
# lines up.
X_train['clusters2'] = clusters2
X_train['distance2'] = preprocessed_data2.sum(axis=1).round(2)
X_train['dist_dummy2'] = np.where(X_train['distance2'] < np.percentile(X_train['distance2'], 5), 1, 0)

# Same for the test frame. The score is taken from the KMeans step's transform
# output, matching how the training score is built; summing
# `test_transformed2` directly would add up TF-IDF weights instead.
test_distances2 = pipe2['km_model2'].transform(test_transformed2)
y_test['predictions2'] = test_predict2
y_test['distance2'] = test_distances2.sum(axis=1).round(2)
y_test['dist_dummy2'] = np.where(y_test['distance2'] < np.percentile(y_test['distance2'], 5), 1, 0)

In [None]:
# Preview the training frame with the columns added above.
X_train.head()

In [None]:
# Preview the test frame with the columns added above.
y_test.head()

## Save Trained Pipelines

In [None]:
# Make sure the target folder exists before writing the pickles; without it
# the dump fails on a fresh checkout.
os.makedirs('models', exist_ok=True)

# Persist both fitted pipelines; the trailing `;` hides the returned file list.
joblib.dump(pipe, f'models/titles_500c_pipe_{num}.pkl')
joblib.dump(pipe2, f'models/no_titles_500c_pipe_{num}.pkl');

In [None]:
%%time

X_train.to_csv(path_out + '/clustering' + f'/train_titles_in_500c_{num}.csv', index=False)
y_test.to_csv(path_out + '/clustering' + f'/test_titles_in_500c_{num}.csv', index=False)
X_train_2.to_csv(path_out + '/clustering' + f'/train_titles_out_500c_{num}.csv', index=False)
y_test_2.to_csv(path_out + '/clustering' + f'/test_titles_out_500c_{num}.csv', index=False)