In [45]:
import numpy as np
# Loading pickled objects
import pickle

# Latent Dirichlet Allocation model
from gensim.models import LdaModel

# Build corpus
from gensim import corpora

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
# Clustering Evaluation
from sklearn.metrics import calinski_harabasz_score, silhouette_score, confusion_matrix

# Plotting
import plotly.express as px

## Load Processed Data

In [9]:
# Read the preprocessed records back from disk.
# NOTE: unpickling can execute arbitrary code, so only load files produced
# by this project's own preprocessing step.
with open('../data/processed_data.pkl', 'rb') as file:
    processed_data = pickle.loads(file.read())

## Load Title Tokens

In [10]:
# One space-joined string per record, built from its 'title_tokens' list,
# so that each title can be handed to a text vectorizer as a document.
titles = list(map(lambda record: ' '.join(record['title_tokens']), processed_data))
# Peek at the first title as a sanity check.
titles[0]

'railcar team entry'

In [70]:
# Spot-check a second title (index 69) further into the dataset.
titles[69]

'azure data warehouse engineer'

## Make Title Bag of Words

In [11]:
# Bag-of-words encoder for the titles, using CountVectorizer's default settings.
vectorizer = CountVectorizer()
# Learn the vocabulary from `titles` and encode them in one pass; X holds one
# row of token counts per title.
X = vectorizer.fit_transform(titles)

## Load Description Tokens

In [12]:
# Pull the already-tokenized description (a list of tokens) out of every record.
description_tokens = list(map(lambda record: record['description_tokens'], processed_data))
# Peek at the first description's tokens as a sanity check.
description_tokens[0]

['company_overview',
 'come_join_win',
 'team',
 'since',
 'plastic_express',
 'lead',
 'bulk',
 'trucking',
 'bulk',
 'terminal',
 'packaging',
 'warehouse',
 'need',
 'plastic',
 'industry',
 'strategic',
 'location',
 'modern',
 'system',
 'dedicate',
 'employee',
 'allow_us',
 'provide',
 'custom',
 'tailored',
 'logistical',
 'solution',
 'fulfill',
 'challenging',
 'need',
 'customer',
 'plastic_express',
 'operate',
 'warehouse',
 'location',
 'rail',
 'terminal',
 'across',
 'us',
 'many',
 'plastic_express',
 'site',
 'also',
 'handle',
 'commodity',
 'include',
 'paper',
 'roll',
 'steel',
 'building',
 'material',
 'dry',
 'bulk',
 'material',
 'plastic_express',
 'operate',
 'roughly',
 'truck',
 'approximately',
 'trailer',
 'perform',
 'full',
 'bulk',
 'truck',
 'distribution',
 'business',
 'plastic_express',
 'headquarter',
 'city',
 'industry',
 'ca',
 'employee',
 'nationwide',
 'goal',
 'always',
 'exceed',
 'customer',
 'expectation',
 'attitude',
 'differentiate_u

In [71]:
# Spot-check the description tokens of the same record inspected for titles (index 69).
description_tokens[69]

['billion_dollar_publicly_trade',
 'company',
 'around_since',
 'operate',
 'like',
 'startup_still',
 'grow_webby',
 'award_win',
 'application',
 'around',
 'year',
 'subscriber_bring',
 'massive_amount',
 'data',
 'look',
 'senior',
 'data',
 'engineer',
 'experience',
 'data',
 'warehouse',
 'scratch_highly_proficient',
 'azure',
 'technology',
 'migrate_top_reason',
 'work',
 'us',
 'stable_yet',
 'grow',
 'company',
 'base',
 'annual',
 'targeted',
 'bonus_stock_option',
 'full_benefit',
 'pto',
 'etc',
 'ability',
 'mentor',
 'team',
 'edm_initiative',
 'data',
 'pipeline',
 'end_end',
 'participate',
 'technology',
 'discovery',
 'implementation',
 'architecture',
 'design',
 'team',
 'coordination',
 'etc',
 'develop',
 'high_performance',
 'script',
 'enterprise',
 'data',
 'business_intelligence',
 'analytics',
 'etc',
 'manage',
 'code',
 'version',
 'source',
 'control',
 'partner',
 'team',
 'work',
 'system',
 'owner',
 'solve_complex',
 'data',
 'issue',
 'need',
 'posi

## Make Description Bag of Words

In [13]:
# Build the token <-> integer-id mapping from the tokenized descriptions.
dictionary = corpora.Dictionary(description_tokens)
# Drop tokens that occur in fewer than 3 documents; filter_extremes' other
# thresholds are left at their gensim defaults.
dictionary.filter_extremes(no_below=3)
# Encode every description as a sparse bag of words: (token_id, count) pairs.
# NOTE(review): the LDA model loaded further down presumably expects the id
# mapping it was trained with — confirm this rebuilt dictionary matches it.
description_corpus = list(map(dictionary.doc2bow, description_tokens))
description_corpus[0]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 1),
 (16, 4),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 1),
 (24, 1),
 (25, 3),
 (26, 1),
 (27, 1),
 (28, 1),
 (29, 1),
 (30, 1),
 (31, 1),
 (32, 3),
 (33, 1),
 (34, 1),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 2),
 (39, 1),
 (40, 1),
 (41, 3),
 (42, 1),
 (43, 2),
 (44, 1),
 (45, 1),
 (46, 1),
 (47, 1),
 (48, 1),
 (49, 1),
 (50, 1),
 (51, 1),
 (52, 1),
 (53, 1),
 (54, 1),
 (55, 1),
 (56, 1),
 (57, 1),
 (58, 2),
 (59, 1),
 (60, 1),
 (61, 1),
 (62, 1),
 (63, 1),
 (64, 1),
 (65, 2),
 (66, 1),
 (67, 1),
 (68, 1),
 (69, 1),
 (70, 2),
 (71, 2),
 (72, 1),
 (73, 1),
 (74, 2),
 (75, 3),
 (76, 1),
 (77, 2),
 (78, 4),
 (79, 1),
 (80, 1),
 (81, 1),
 (82, 2),
 (83, 1),
 (84, 1),
 (85, 2),
 (86, 1),
 (87, 1),
 (88, 1),
 (89, 3),
 (90, 1),
 (91, 1),
 (92, 1),
 (93, 1),
 (94, 1),
 (95, 1),
 (96, 1),
 (97, 1),
 (98, 2),
 (99, 1),
 (100, 1),

## Load Best Model

In [14]:
# Location of the saved LDA model, relative to this notebook.
# NOTE(review): the file name suggests a 24-topic model — confirm.
file_path = '../model/LDA-24topics'
# Restore the previously trained gensim LdaModel from disk.
model = LdaModel.load(file_path)

## Get Description Topic Distributions

In [15]:
# Dense document-topic matrix: one row per description, one column per topic.
# The width is taken from the loaded model instead of a hard-coded 24, so the
# cell stays correct if a model with a different topic count is loaded.
topic_distributions = np.zeros((len(description_corpus), model.num_topics))
for row, description in enumerate(description_corpus):
    # minimum_probability=0.0 requests every topic. With the default, gensim
    # drops topics below the model's threshold, which would leave spurious
    # zeros and rows that no longer sum to (approximately) 1.
    topics = model.get_document_topics(description, minimum_probability=0.0)
    for topic_id, prob in topics:
        topic_distributions[row, topic_id] = prob

## KMeans Clustering of Description Distributions

In [29]:
# Sweep candidate cluster counts and record, for each one:
#   CH_score   - Calinski-Harabasz index
#   WCSS_score - within-cluster sum of squares (KMeans inertia_)
#   S_score    - silhouette score
outcomes = {
    'n_clusters': [],
    'CH_score': [],
    'WCSS_score': [],
    'S_score': []
}

for n_clusters in range(2, 40):
    # A fixed seed makes the centroid initialisation, and therefore every
    # score below, reproducible across kernel restarts.
    clusterer = KMeans(n_clusters=n_clusters, random_state=42)
    labels = clusterer.fit_predict(topic_distributions)
    outcomes['CH_score'].append(calinski_harabasz_score(topic_distributions, labels))
    outcomes['WCSS_score'].append(clusterer.inertia_)
    outcomes['S_score'].append(silhouette_score(topic_distributions, labels))
    outcomes['n_clusters'].append(n_clusters)
    print(f'Finished evaluating for {n_clusters} clusters.')

Finished evaluating for 2 clusters.
Finished evaluating for 3 clusters.
Finished evaluating for 4 clusters.
Finished evaluating for 5 clusters.
Finished evaluating for 6 clusters.
Finished evaluating for 7 clusters.
Finished evaluating for 8 clusters.
Finished evaluating for 9 clusters.
Finished evaluating for 10 clusters.
Finished evaluating for 11 clusters.
Finished evaluating for 12 clusters.
Finished evaluating for 13 clusters.
Finished evaluating for 14 clusters.
Finished evaluating for 15 clusters.
Finished evaluating for 16 clusters.
Finished evaluating for 17 clusters.
Finished evaluating for 18 clusters.
Finished evaluating for 19 clusters.
Finished evaluating for 20 clusters.
Finished evaluating for 21 clusters.
Finished evaluating for 22 clusters.
Finished evaluating for 23 clusters.
Finished evaluating for 24 clusters.
Finished evaluating for 25 clusters.
Finished evaluating for 26 clusters.
Finished evaluating for 27 clusters.
Finished evaluating for 28 clusters.
Finished 

In [30]:
# Calinski-Harabasz score as a function of the number of clusters tried.
fig = px.line(
    x=outcomes['n_clusters'],
    y=outcomes['CH_score'],
    labels=dict(x='Number of Clusters', y='CH Score'),
    title='Calinski Harabasz Score',
)
fig.show()

In [31]:
# Silhouette score as a function of the number of clusters tried.
fig = px.line(
    x=outcomes['n_clusters'],
    y=outcomes['S_score'],
    labels=dict(x='Number of Clusters', y='Silhouette Score'),
    title='Silhouette Score',
)
fig.show()

In [33]:
# Within-cluster sum of squares (KMeans inertia) as a function of the number
# of clusters tried.
fig = px.line(
    x=outcomes['n_clusters'],
    y=outcomes['WCSS_score'],
    labels=dict(x='Number of Clusters', y='WCSS Score'),
    title='Within Cluster Sum of Squared Residuals',
)
fig.show()

In [37]:
# Final clustering: assign every description to one of `n_clusters` groups.
# These cluster ids become the targets for the title classifier below.
# NOTE(review): 17 was presumably read off the evaluation curves — confirm.
n_clusters = 17
# A fixed seed keeps the cluster ids stable across kernel restarts; without it
# the labels, and everything trained on them, change from run to run.
clusterer = KMeans(n_clusters=n_clusters, random_state=42)
y = clusterer.fit_predict(topic_distributions)
print(y.shape)
y[:10]

(9485,)


array([16,  5,  3,  7,  1,  7,  7,  1,  3,  2], dtype=int32)

In [38]:
# Hold out 20% of the titles (features X, cluster ids y) for evaluation.
# The split is seeded so the reported train/test accuracies are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [40]:
# Multinomial naive Bayes classifier (default settings) that will predict a
# description cluster from the title's token counts.
title_model = MultinomialNB()

In [41]:
# Train the title classifier on the training split only.
title_model.fit(X_train, y_train)

MultinomialNB()

In [64]:
# Predicted cluster ids for the training and held-out titles.
pred_train, pred_test = (
    title_model.predict(features) for features in (X_train, X_test)
)
# Mean accuracy of the classifier on each split.
score_train, score_test = (
    title_model.score(features, targets)
    for features, targets in ((X_train, y_train), (X_test, y_test))
)

In [69]:
# Heatmap of observed (rows) versus predicted (columns) clusters on the training split.
# labels=np.unique(y) pins the matrix to every cluster id produced by the
# clustering step, so a cluster missing from this split still gets a row and
# column and the axes line up with the test-set matrix.
px.imshow(
    confusion_matrix(y_train, pred_train, labels=np.unique(y)),
    title = f'Training Set Confusion Matrix (Accuracy {round(score_train,2)})',
    color_continuous_scale='Blues',
    height = 800,
    width = 800,
    labels = {
        'x': 'Predicted',
        'y': 'Observed',
        'color': 'Frequency'
    }
)

In [68]:
# Heatmap of observed (rows) versus predicted (columns) clusters on the held-out split.
# labels=np.unique(y) pins the matrix to every cluster id produced by the
# clustering step. Without it, a cluster absent from the small test split is
# silently dropped, which shrinks the matrix and shifts its axes.
px.imshow(
    confusion_matrix(y_test, pred_test, labels=np.unique(y)),
    title = f'Test Set Confusion Matrix (Accuracy {round(score_test,2)})',
    color_continuous_scale='Blues',
    height = 800,
    width = 800,
    labels = {
        'x': 'Predicted',
        'y': 'Observed',
        'color': 'Frequency'
    }
)