# 09 Clustering Specific Companies

In [1]:
import dask, dask.dataframe as dd
import matplotlib.pyplot as plt
import pandas as pd
import re, csv, os
import numpy as np
from typing import List, Union
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from joblib import parallel_backend
# import dask_ml.joblib
from dask.distributed import Client
import joblib
import nltk
import concurrent.futures as cf
from glob import glob
from sklearn.preprocessing import FunctionTransformer


pd.set_option('display.max_columns', None)
csv.field_size_limit(10000000)

%matplotlib inline

In [None]:
client = Client(processes=False)

In [None]:
path = '~/Dropbox/Burning Glass/Data/companies_76k/filtered_data_14/'
path_out = '~/Dropbox/Burning Glass/Analysis/approach_8'
num = 14
fil_num = '04'

In [None]:
col_names = ['JobID', 'CleanJobTitle', 'CanonCity', 'CanonState', 'CanonPostalCode',
             'BGTOcc', 'clean_text', 'EmployerClean', 'JobDate']

dtypes={'JobID': np.str, 'CanonJobTitle': np.str, 'EmployerClean': np.str,
        'CleanJobTitle': np.str, 'CanonCity': np.str, 'CanonCounty': np.str,
        'CanonState': np.str, 'ConsolidatedTitle': np.str, 'BGTOcc': np.str,
        'JobDate': np.str, 'CanonPostalCode': np.str}

In [None]:
to_remove = ['Assistant Manager', 'Deputy Manager', 'Manager', 'Senior Manager', 'General Manager', 'Assistant Director',
             'Deputy Director', 'Director', 'Senior Director', 'Deputy Vice President', 'Vice President', 'Senior Vice President',
             'President', 'Chief']
to_remove_low = [word.lower() for word in to_remove]

In [None]:
def remove_titles(doc):
    tokens = nltk.word_tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in to_remove_low]
    clean = ' '.join(filtered_tokens)
    return clean

def normalize_doc(doc):
    """
    This function normalizes your list of documents by taking only
    words, numbers, and spaces in between them. It then filters out
    stop words if you want to.
    """
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    tokens = nltk.word_tokenize(doc)
    filtered_tokens = [token for token in tokens]
    # filtered_tokens = [token for token in tokens if token not in stop_words]
    doc = ' '.join(filtered_tokens)
    return doc

corp_normalizer = np.vectorize(normalize_doc)

In [None]:
# files = glob('random_data/rand*.csv')
# files[:4]

In [None]:
# %%time


# def get_files(file):
#     return pd.read_csv(file, dtype=dtypes, 
#                      usecols=best_list, parse_dates=['JobDate'], low_memory=False)


# with cf.ThreadPoolExecutor() as executor:
#     results = executor.map(get_files, files)
    
# df = pd.concat(results)
# df.reset_index(drop=True, inplace=True)
# df.head()

In [None]:
# ddf = dd.read_csv(os.path.join(path, 'da*.csv'), 
#                  engine='python',
#                  dtype=dtypes,
#                  assume_missing=True,
#                  error_bad_lines=False,
#                  blocksize=None,
#                  usecols=col_names,
#                 )
# ddf

In [6]:
# ddf1 = ddf.map_partitions(lambda data: data.drop_duplicates(subset='CleanJobTitle'))

In [None]:
df = pd.read_csv(path + f'data_filtered_{fil_num}.csv', low_memory=False, usecols=col_names, dtype=dtypes).drop_duplicates(subset='CleanJobTitle')
df.head()

In [None]:
%%time

X_train, y_test = train_test_split(df, test_size=.2, random_state=42, shuffle=True)

X_train_2, y_test_2 = X_train.copy(), y_test.copy()

# %%time

# X_train_2['CleanJobTitle'] = X_train_2['CleanJobTitle'].str.lower().apply(lambda x: remove_titles(x, to_remove_low))
# y_test_2['CleanJobTitle'] = y_test_2['CleanJobTitle'].str.lower().apply(lambda x: remove_titles(x, to_remove_low))
# X_train_2.head(10)

rm_titles = np.vectorize(remove_titles)
X_train_2['CleanJobTitle'] = rm_titles(X_train_2['CleanJobTitle'].str.lower().values)
y_test_2['CleanJobTitle'] = rm_titles(y_test_2['CleanJobTitle'].str.lower().values)
y_test_2.head()

In [None]:
preprocessing = Pipeline([
    ('normalizer', FunctionTransformer(corp_normalizer)),
    ('vect', TfidfVectorizer(ngram_range=(1, 1), min_df=10, max_df=.85))
])

km_pipe = Pipeline([
    ('km', KMeans(n_clusters=500, # how many clusters do we want
            max_iter=1000, # reshuffle each centroid x number of times
            n_init=15, # that x num of times can be set here
            random_state=42,
            n_jobs=-1))
])

pipe = Pipeline([
    ('preprocessor', preprocessing),
    ('km_model', km_pipe)
])

##################################################

preprocessing2 = Pipeline([
    ('normalizer2', FunctionTransformer(corp_normalizer)),
    ('vect2', TfidfVectorizer(ngram_range=(1, 1), min_df=10, max_df=.85))
])

km_pipe2 = Pipeline([
    ('km2', KMeans(n_clusters=500, # how many clusters do we want
            max_iter=1000, # reshuffle each centroid x number of times
            n_init=15, # that x num of times can be set here
            random_state=42,
            n_jobs=-1))
])

pipe2 = Pipeline([
    ('preprocessor2', preprocessing2),
    ('km_model2', km_pipe2)
])

In [10]:
%%time

with joblib.parallel_backend('dask'):
    pipe.fit(X_train['CleanJobTitle'].values)
    preprocessed_data = pipe.transform(X_train['CleanJobTitle'].values)
    pipe2.fit(X_train_2['CleanJobTitle'].values)
    preprocessed_data2 = pipe2.transform(X_train_2['CleanJobTitle'].values)



CPU times: user 4min 38s, sys: 14.2 s, total: 4min 52s
Wall time: 2min 20s


## With Titles

In [11]:
pipe['km_model']['km'].labels_[:10]

array([ 91, 484, 346, 248,   5,  79, 345, 419, 426, 203], dtype=int32)

In [12]:
clusters = pipe['km_model']['km'].labels_
X_train['clusters'] = clusters
X_train['distance'] = preprocessed_data.sum(axis=1).round(2)
X_train['dist_dummy'] = np.where(X_train['distance'] < np.percentile(X_train['distance'], 5), 1, 0)

test_transformed = pipe['preprocessor'].transform(y_test['CleanJobTitle'].values)
test_predict = pipe['km_model'].predict(test_transformed)
y_test['predictions'] = test_predict
y_test['distance'] = test_transformed.sum(axis=1).round(2)
y_test['dist_dummy'] = np.where(y_test['distance'] < np.percentile(y_test['distance'], 5), 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

## Without Titles

In [13]:
pipe2['km_model2']['km2'].labels_[:10]

array([ 97,  37,  98,  56,  87, 428, 105, 178, 453, 353], dtype=int32)

In [14]:
clusters2 = pipe2['km_model2']['km2'].labels_
X_train_2['clusters2'] = clusters2
X_train_2['distance2'] = preprocessed_data2.sum(axis=1).round(2)
X_train_2['dist_dummy2'] = np.where(X_train_2['distance2'] < np.percentile(X_train_2['distance2'], 5), 1, 0)

test_transformed2 = pipe2['preprocessor2'].transform(y_test_2['CleanJobTitle'].values)
test_predict2 = pipe2['km_model2'].predict(test_transformed2)
y_test_2['predictions2'] = test_predict2
y_test_2['distance2'] = test_transformed2.sum(axis=1).round(2)
y_test_2['dist_dummy2'] = np.where(y_test_2['distance2'] < np.percentile(y_test_2['distance2'], 5), 1, 0)

Back into the main Dataset

In [15]:
X_train['clusters2'] = clusters2
X_train['distance2'] = preprocessed_data2.sum(axis=1).round(2)
X_train['dist_dummy2'] = np.where(X_train_2['distance2'] < np.percentile(X_train_2['distance2'], 5), 1, 0)

y_test['predictions2'] = test_predict2
y_test['distance2'] = test_transformed2.sum(axis=1).round(2)
y_test['dist_dummy2'] = np.where(y_test_2['distance2'] < np.percentile(y_test_2['distance2'], 5), 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [16]:
X_train.head()

Unnamed: 0,CanonCity,CanonState,CleanJobTitle,JobDate,JobID,CanonPostalCode,BGTOcc,EmployerClean,clean_text,clusters,distance,dist_dummy,clusters2,distance2,dist_dummy2
825,La Crosse,WI,Garden Shop/Seasonal Lead,2014-01-30,425388452,54601,41-2031.00,Sears,Garden Shop/Seasonal Lead job in LA CROSSE\n\n...,91,629.13,0,97,629.59,0
345,Atlanta,GA,Drupal Developer,2014-02-03,425346420,30301,15-1134.92,Saviance Technologies,Company: saviance Technologies\nJob Title: Dru...,484,626.85,0,37,627.37,0
10783,Arlington,TX,"Surgical Services Technician Prn, Park Surgery...",2014-02-01,425784967,76001,29-2055.00,Hospital Corporation of America,"Surgical Services Tech PRN, Trinity Park Surge...",346,622.27,0,98,622.29,0
29333,Everett,WA,"Regional Director, Accountable Care & Transformat",2014-02-05,425924784,98201,11-2022.00,Providence Health & Services,* Medical/Healthcare\n* Providence Health & Se...,248,623.4,0,56,626.04,0
3523,Maywood,IL,10028 Registered Nurse,2014-01-31,425480440,60153,29-1141.00,Loyola Medicine,10028 REGISTERED NURSE 0803\n\nCompany: Loyola...,5,614.33,1,87,611.51,1


In [17]:
y_test.head()

Unnamed: 0,CanonCity,CanonState,CleanJobTitle,JobDate,JobID,CanonPostalCode,BGTOcc,EmployerClean,clean_text,predictions,distance,dist_dummy,predictions2,distance2,dist_dummy2
19978,Orlando,FL,"Fuel Island Attendant Wtih Cdl Licensed, Pm",2014-02-04,426380066,32801,53-6031.00,Publix,Position Description\n\n performing a number o...,402,1.99,0,102,1.99,0
27491,Naperville,IL,"Sales Manager - , Ilin",2014-02-06,425767881,60540,11-2022.00,Rent-A-Center,"Sales Manager - #9448 - Naperville, IL\nin NAP...",179,1.41,0,84,1.0,0
2774,Medford,OR,Polysomnographic Technologist,2014-02-01,425451603,97501,29-2032.00,Providence Health & Services,Posted: 2014-01-28 8:52am\n\nPolysomnographic ...,118,1.0,0,332,1.0,0
4270,Provo,UT,Patient Service Representative - Prov,2014-02-02,425504775,84601,43-4051.03,Intermountain Healthcare,Patient Service Representative - (part-time) P...,25,1.73,0,209,1.73,0
15787,Grand Forks,ND,Senior Leasing Consultant,2014-02-03,426103684,58201,41-9021.00,Campus Crest,Campus Crest Senior Leasing Consultant in Gran...,82,1.66,0,168,1.66,0


Save Trained Pipelines

In [18]:
joblib.dump(pipe, f'models/titles_500c_pipe_{num}.pkl');
joblib.dump(pipe2, f'models/no_titles_500c_pipe_{num}.pkl');

In [19]:
%%time

X_train.to_csv(path_out + '/clustering' + f'/train_titles_in_500c_{num}.csv', index=False)
y_test.to_csv(path_out + '/clustering' + f'/test_titles_in_500c_{num}.csv', index=False)
X_train_2.to_csv(path_out + '/clustering' + f'/train_titles_out_500c_{num}.csv', index=False)
y_test_2.to_csv(path_out + '/clustering' + f'/test_titles_out_500c_{num}.csv', index=False)

CPU times: user 4.82 s, sys: 555 ms, total: 5.38 s
Wall time: 5.72 s


In [None]:
pcadf = pd.DataFrame(
    pipe["preprocessor"].transform(data),
    columns=["component_1", "component_2"],
)

pcadf["predicted_cluster"] = pipe["clusterer"]["kmeans"].labels_
pcadf["true_label"] = label_encoder.inverse_transform(true_labels)



plt.style.use("fivethirtyeight")
plt.figure(figsize=(8, 8))

scat = sns.scatterplot(
    "component_1",
    "component_2",
    s=50,
    data=pcadf,
    hue="predicted_cluster",
    style="true_label",
    palette="Set2",
)

scat.set_title(
    "Clustering results from TCGA Pan-Cancer\nGene Expression Data"
)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)

plt.show()