# 09 Clustering Specific Companies

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import re, csv, os
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from joblib import parallel_backend
from dask.distributed import Client
import joblib
import nltk
import concurrent.futures as cf
from glob import glob
from sklearn.preprocessing import FunctionTransformer
from typing import List


pd.set_option('display.max_columns', None)
csv.field_size_limit(10000000)

%matplotlib inline

In [2]:
client = Client(processes=False)

In [3]:
path = '~/Dropbox/Burning Glass/Data/companies_76k/filtered_data_07/'
path_out = '~/Dropbox/Burning Glass/Analysis/approach_8'
num = 7
fil_num = '05'

In [4]:
col_names = ['JobID', 'CleanJobTitle', 'CanonCity', 'CanonState', 'CanonPostalCode',
             'BGTOcc', 'clean_text', 'EmployerClean', 'JobDate']

dtypes={'JobID': np.str, 'CanonJobTitle': np.str, 'EmployerClean': np.str,
        'CleanJobTitle': np.str, 'CanonCity': np.str, 'CanonCounty': np.str,
        'CanonState': np.str, 'ConsolidatedTitle': np.str, 'BGTOcc': np.str,
        'JobDate': np.str, 'CanonPostalCode': np.str}

In [5]:
to_remove = ['Assistant Manager', 'Deputy Manager', 'Manager', 'Senior Manager', 'General Manager', 'Assistant Director',
             'Deputy Director', 'Director', 'Senior Director', 'Deputy Vice President', 'Vice President', 'Senior Vice President',
             'President', 'Chief']
to_remove_low = [word.lower() for word in to_remove]

In [9]:
def remove_titles(array: pd.Series, list_of_words: List[str]) -> pd.Series:
    """
    This function takes in a pandas series containing string and a 
    list of words with the words to remove from the series. It returns
    the clean series back.
    """
    
    for word in list_of_words:
        array = array.str.lower().str.replace(word, '')

    return array

def normalize_doc(doc):
    """
    This function normalizes your list of documents by taking only
    words, numbers, and spaces in between them. It then filters out
    stop words if you want to.
    """
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    tokens = nltk.word_tokenize(doc)
    filtered_tokens = [token for token in tokens]
    # filtered_tokens = [token for token in tokens if token not in stop_words]
    doc = ' '.join(filtered_tokens)
    return doc

corp_normalizer = np.vectorize(normalize_doc)

In [7]:
# files = glob('random_data/rand*.csv')
# files[:4]

In [8]:
# %%time


# def get_files(file):
#     return pd.read_csv(file, dtype=dtypes, 
#                      usecols=best_list, parse_dates=['JobDate'], low_memory=False)


# with cf.ThreadPoolExecutor() as executor:
#     results = executor.map(get_files, files)
    
# df = pd.concat(results)
# df.reset_index(drop=True, inplace=True)
# df.head()

In [9]:
# ddf = dd.read_csv(os.path.join(path, 'da*.csv'), 
#                  engine='python',
#                  dtype=dtypes,
#                  assume_missing=True,
#                  error_bad_lines=False,
#                  blocksize=None,
#                  usecols=col_names,
#                 )
# ddf

In [10]:
# ddf1 = ddf.map_partitions(lambda data: data.drop_duplicates(subset='CleanJobTitle'))

In [7]:
df = pd.read_csv(path + f'data_filtered_{fil_num}.csv', low_memory=False, usecols=col_names, dtype=dtypes).drop_duplicates(subset='CleanJobTitle')
df.head()

Unnamed: 0,CanonCity,CanonState,CleanJobTitle,JobDate,JobID,CanonPostalCode,BGTOcc,EmployerClean,clean_text
0,Rock Springs,WY,L P N Or Certified Nursing Assistant/Medical/S...,2007-05-24,263335574,17365,29-2061.00,Memorial Hospital of Sweetwater County,Job Category: Licensed Practical or Vocational...
1,Weirton,WV,Registered Nurse,2007-05-21,263299016,26062,29-1141.00,Weirton Medical Center,Registered Nurse- RN MS -Med-Surg | Cirrus Med...
3,Athens,WV,Faculty,2007-05-25,263296995,24712,25-1199.91,Baystate Health,Specialty: Trauma Surgery Company: Baystate He...
4,Rock Springs,WY,Registered Nurse/Intensive Care Unit,2007-05-24,263345697,17365,29-1141.03,Memorial Hospital of Sweetwater County,Caregiver Jobs Clearinghouse carecareers.net H...
6,Rawlins,WY,Psychologist,2007-05-21,263346693,16774,19-3031.00,Spectrum Health,Company: Spectrum Healthcare Resources Facilit...


In [10]:
%%time

X_train, y_test = train_test_split(df, test_size=.2, random_state=42, shuffle=True)
X_train_2, y_test_2 = X_train.copy(), y_test.copy()

X_train_2['CleanJobTitle'] = remove_titles(X_train_2['CleanJobTitle'], to_remove_low)
y_test_2['CleanJobTitle'] = remove_titles(y_test_2['CleanJobTitle'], to_remove_low)
y_test_2.head()

CPU times: user 595 ms, sys: 26.6 ms, total: 622 ms
Wall time: 621 ms


Unnamed: 0,CanonCity,CanonState,CleanJobTitle,JobDate,JobID,CanonPostalCode,BGTOcc,EmployerClean,clean_text
36260,Minneapolis,MN,licensed practical nurse - home care and hospice,2007-05-29,336708720,-10131,29-2061.00,Fairview Health Services,jtext dummybgt
46939,Lincoln City,OR,supervisor sales associate,2007-06-09,318197938,31831,41-2031.00,Tanger Outlets,jtext dummybgt
20409,Antioch,CA,staff nurse ii pn075 kfhp bm,2007-05-21,337625639,28995,29-1141.00,Kaiser Permanente,| | | | Kaiser Permanente Staff Nurse II (...
47470,White Plains,NY,patent paralegal i,2007-06-06,318767943,10602,23-2011.00,Dorsey & Whitney,"Dorsey & Whitney LLP - New York, NY Work Type ..."
17180,Dallas,TX,operating room registered nurse 657687,2007-05-24,336374497,9683,29-1141.00,Texas Health Resources,jtext dummybgt


In [11]:
X_train_2.loc[X_train_2.CleanJobTitle.str.contains('manager'), 'CleanJobTitle']

Series([], Name: CleanJobTitle, dtype: object)

In [9]:
# %%time

# with cf.ThreadPoolExecutor(max_workers=28) as e:
#     CleanJobTitle_X = e.map(remove_titles, X_train_2['CleanJobTitle'].str.lower().values)
#     CleanJobTitle_y = e.map(remove_titles, y_test_2['CleanJobTitle'].str.lower().values)

CPU times: user 4.98 s, sys: 281 ms, total: 5.27 s
Wall time: 5.08 s


In [13]:
preprocessing = Pipeline([
    ('normalizer', FunctionTransformer(corp_normalizer)),
    ('vect', TfidfVectorizer(ngram_range=(1, 1), min_df=10, max_df=.85))
])

km_pipe = Pipeline([
    ('km', KMeans(n_clusters=500, # how many clusters do we want
            max_iter=1000, # reshuffle each centroid x number of times
            n_init=15, # that x num of times can be set here
            random_state=42,
            n_jobs=-1))
])

pipe = Pipeline([
    ('preprocessor', preprocessing),
    ('km_model', km_pipe)
])

##################################################

preprocessing2 = Pipeline([
    ('normalizer2', FunctionTransformer(corp_normalizer)),
    ('vect2', TfidfVectorizer(ngram_range=(1, 1), min_df=10, max_df=.85))
])

km_pipe2 = Pipeline([
    ('km2', KMeans(n_clusters=500, # how many clusters do we want
            max_iter=1000, # reshuffle each centroid x number of times
            n_init=15, # that x num of times can be set here
            random_state=42,
            n_jobs=-1))
])

pipe2 = Pipeline([
    ('preprocessor2', preprocessing2),
    ('km_model2', km_pipe2)
])

In [14]:
%%time

with joblib.parallel_backend('dask'):
    pipe.fit(X_train['CleanJobTitle'].values)
    preprocessed_data = pipe.transform(X_train['CleanJobTitle'].values)
    pipe2.fit(X_train_2['CleanJobTitle'].values)
    preprocessed_data2 = pipe2.transform(X_train_2['CleanJobTitle'].values)



CPU times: user 8min 13s, sys: 27.7 s, total: 8min 40s
Wall time: 3min 53s


## With Titles

In [66]:
pipe['km_model']['km'].labels_[:10]

array([481, 126,  16, 213,  95,  77, 473,  16,  72, 323], dtype=int32)

In [67]:
clusters = pipe['km_model']['km'].labels_
X_train['clusters'] = clusters
X_train['distance'] = preprocessed_data.sum(axis=1).round(2)
X_train['dist_dummy'] = np.where(X_train['distance'] < np.percentile(X_train['distance'], 5), 1, 0)

test_transformed = pipe['preprocessor'].transform(y_test['CleanJobTitle'].values)
test_predict = pipe['km_model'].predict(test_transformed)
y_test['predictions'] = test_predict
y_test['distance'] = test_transformed.sum(axis=1).round(2)
y_test['dist_dummy'] = np.where(y_test['distance'] < np.percentile(y_test['distance'], 5), 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

## Without Titles

In [68]:
pipe2['km_model2']['km2'].labels_[:10]

array([361, 487,   4, 356, 169, 281,  50,   4,  20, 338], dtype=int32)

In [69]:
clusters2 = pipe2['km_model2']['km2'].labels_
X_train_2['clusters2'] = clusters2
X_train_2['distance2'] = preprocessed_data2.sum(axis=1).round(2)
X_train_2['dist_dummy2'] = np.where(X_train_2['distance2'] < np.percentile(X_train_2['distance2'], 5), 1, 0)

test_transformed2 = pipe2['preprocessor2'].transform(y_test_2['CleanJobTitle'].values)
test_predict2 = pipe2['km_model2'].predict(test_transformed2)
y_test_2['predictions2'] = test_predict2
y_test_2['distance2'] = test_transformed2.sum(axis=1).round(2)
y_test_2['dist_dummy2'] = np.where(y_test_2['distance2'] < np.percentile(y_test_2['distance2'], 5), 1, 0)

Back into the main Dataset

In [70]:
X_train['clusters2'] = clusters2
X_train['distance2'] = preprocessed_data2.sum(axis=1).round(2)
X_train['dist_dummy2'] = np.where(X_train_2['distance2'] < np.percentile(X_train_2['distance2'], 5), 1, 0)

y_test['predictions2'] = test_predict2
y_test['distance2'] = test_transformed2.sum(axis=1).round(2)
y_test['dist_dummy2'] = np.where(y_test_2['distance2'] < np.percentile(y_test_2['distance2'], 5), 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [71]:
X_train.head()

Unnamed: 0,CanonCity,CanonState,CleanJobTitle,JobDate,JobID,CanonPostalCode,BGTOcc,EmployerClean,clean_text,clusters,distance,dist_dummy,clusters2,distance2,dist_dummy2
26147,Dallas,TX,Event And Promotions Assistant,2020-03-27,38737495428,75201,27-3031.94,Ultimate Solutions,Events and Promotions Assistant\n\nUltimate So...,481,611.66,0,361,611.87,0
19575,Purchase,NY,"Chart Retrieval Specialist In Purchase, | Care...",2020-03-27,38737382645,10577,21-1012.92,Ciox Health,Please Enable Cookies to Continue\nPlease enab...,126,604.61,0,487,605.89,0
4859,Phoenix,AZ,Sandwich Maker,2020-03-26,38736769337,85001,35-3021.00,Phoenix Group,Sandwich Maker\n\nIke's Love and Sandwiches - ...,16,360.73,1,4,362.71,1
71862,Dallas,TX,Dietary/Food Service Team Member,2020-03-28,38740084037,75201,35-3021.00,Capital Senior Living,APPLY TODAY! Hiring for Dietary/Food Service T...,213,612.43,0,356,613.07,0
1321,Vineland,NJ,Certified Home Health Aide Certified Home Heal...,2020-03-26,38736305424,8360,31-1011.00,Bayada Home Health Care,Certified Home Health Aide (CHHA)-Per Diem\n\n...,95,611.18,0,169,611.82,0


In [72]:
y_test.head()

Unnamed: 0,CanonCity,CanonState,CleanJobTitle,JobDate,JobID,CanonPostalCode,BGTOcc,EmployerClean,clean_text,predictions,distance,dist_dummy,predictions2,distance2,dist_dummy2
6735,Des Moines,IA,Commercial Lines - Small Business Account Manager,2020-03-26,38736811036,50301,13-2011.94,Reynolds & Reynolds,Commercial Lines - Small Business Account Mana...,225,2.35,0,306,2.19,0
1850,Chicago,IL,Ips Employment Specialist - Deaf,2020-03-26,38736349855,60290,21-1012.91,Thresholds,IPS Employment Specialist - Deaf\n\nThresholds...,119,1.0,0,33,1.0,0
62054,San Jose,CA,"Engineer Principal, Quality",2020-03-27,38737588900,95002,17-2112.00,Lumentum Operations,"Engineer Principal, Quality\n\nLumentum Operat...",390,1.7,0,233,1.7,0
49536,Philadelphia,PA,Colorectal Surgeon,2020-03-27,38738369873,19019,29-1062.00,Palm Health Resources,Colorectal Surgeon\n\nPalm Health Resources\n\...,16,1.0,0,4,1.0,0
16752,Mechanicsville,VA,Licensed Practical Nurse Prn,2020-03-26,38736742551,23111,29-2061.00,Sheltering Arms Physical Rehabilitation Centers,LPN PRN\n\nSheltering Arms Physical Rehabilita...,108,1.97,0,292,1.97,0


Save Trained Pipelines

In [73]:
joblib.dump(pipe, f'models/titles_500c_pipe_{num}.pkl');
joblib.dump(pipe2, f'models/no_titles_500c_pipe_{num}.pkl');

In [None]:
%%time

X_train.to_csv(path_out + '/clustering' + f'/train_titles_in_500c_{num}.csv', index=False)
y_test.to_csv(path_out + '/clustering' + f'/test_titles_in_500c_{num}.csv', index=False)
X_train_2.to_csv(path_out + '/clustering' + f'/train_titles_out_500c_{num}.csv', index=False)
y_test_2.to_csv(path_out + '/clustering' + f'/test_titles_out_500c_{num}.csv', index=False)