# Initial Setup

## Imports

In [1]:
# Set random seed
import numpy as np
import pandas as pd
np.random.seed(42)

# Plotting
import plotly.express as px

# Database
from JobsDb import JobsDb

# Verification
from sklearn.model_selection import train_test_split

# Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Dimension Reduction
from sklearn.decomposition import TruncatedSVD

# Clustering
from sklearn.cluster import KMeans

# Pipeline
from sklearn.pipeline import Pipeline

# Clustering Evaluation
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import silhouette_score

## Loading the Data

In [2]:
# Load the 'jobs' table into a DataFrame. The close() call sits in a
# `finally` so the database handle is released even if the read raises.
db = JobsDb()
try:
    df = db.load_table_as_df('jobs')
finally:
    db.close()

# Keep only the rows from positional offset 9680 onwards.
# NOTE(review): the reason for skipping the first 9680 rows is not recorded
# in this notebook — confirm and document it.
df = df.iloc[9680:]
print(df.shape)
df.head()

(9485, 4)


Unnamed: 0,id,title,url,description
9680,9681,Railcar Verifier/Transload Team Member/Data Entry,https://www.careerjet.com/jobad/us5194732b36a6...,\nCompany Overview Come join a Winning Team! ...
9681,9682,Data Entry Clerk,https://www.careerjet.com/jobad/us83f88fb60b47...,"\n prepare, compile and sort documents for dat..."
9682,9683,Data Scientist,https://www.careerjet.com/jobad/us466d6146a815...,\n \n Data Scientist is responsible for co...
9683,9684,Provider Data Specialist,https://www.careerjet.com/jobad/uscb5cda0893f6...,\n \n Title: Provider Data Specialist Loc...
9684,9685,Security Data Architect,https://www.careerjet.com/jobad/us00dc3c284dbd...,"\nOur Mission At Dobbs Defense, we deliver mi..."


## Extracting Job Descriptions

In [3]:
# Pull out the free-text job description column; it is the only model input.
descriptions = df.loc[:, 'description']

## Making Train/Test Split

No hold-out set is created yet: all descriptions are used for training.

In [4]:
# No split is performed here: `train` is simply every extracted description.
train = descriptions

# Model Architecture 

## Clustering Hyper-Parameter Selection
**TODO:** Refactor this hyper-parameter search to use the clustering pipeline (ending in the KMeans step).

In [10]:
# Grid-search the SVD dimensionality and the number of K-Means clusters,
# recording three clustering-quality metrics for every combination.
outcomes = {
    'n_components': [],
    'n_clusters': [],
    'CH_score': [],      # Calinski-Harabasz score
    'WCSS_score': [],    # K-Means inertia (within-cluster sum of squares)
    'S_score': []        # Silhouette score
}

# The TF-IDF matrix does not depend on either hyper-parameter: build it once.
vec = TfidfVectorizer(stop_words='english', max_features=10000, ngram_range=(1, 3))
train_vec = vec.fit_transform(train)

for n_components in range(6, 10):
    # The projection depends only on n_components, so fit it once per value
    # instead of once per (n_components, n_clusters) pair.
    dimr = TruncatedSVD(n_components=n_components, random_state=42)
    train_t = dimr.fit_transform(train_vec)

    for n_clusters in range(2, 11):
        # Pin random_state so the scores do not depend on the global NumPy RNG
        # state (and therefore on the order in which cells were executed).
        clusterer = KMeans(n_clusters=n_clusters, random_state=42)
        target_train = clusterer.fit_predict(train_t)

        outcomes['CH_score'].append(calinski_harabasz_score(train_t, target_train))
        outcomes['WCSS_score'].append(clusterer.inertia_)
        outcomes['S_score'].append(silhouette_score(train_t, target_train))
        outcomes['n_components'].append(n_components)
        outcomes['n_clusters'].append(n_clusters)
        print(f'Finished evaluating for {n_components} components and {n_clusters} clusters.')

Finished evaluating for 6 components and 2 clusters.
Finished evaluating for 6 components and 3 clusters.
Finished evaluating for 6 components and 4 clusters.
Finished evaluating for 6 components and 5 clusters.
Finished evaluating for 6 components and 6 clusters.
Finished evaluating for 6 components and 7 clusters.
Finished evaluating for 6 components and 8 clusters.
Finished evaluating for 6 components and 9 clusters.
Finished evaluating for 6 components and 10 clusters.
Finished evaluating for 7 components and 2 clusters.
Finished evaluating for 7 components and 3 clusters.
Finished evaluating for 7 components and 4 clusters.
Finished evaluating for 7 components and 5 clusters.
Finished evaluating for 7 components and 6 clusters.
Finished evaluating for 7 components and 7 clusters.
Finished evaluating for 7 components and 8 clusters.
Finished evaluating for 7 components and 9 clusters.
Finished evaluating for 7 components and 10 clusters.
Finished evaluating for 8 components and 2 c

In [35]:
# Fit a much wider SVD so the variance captured per component can be inspected.
n_components = 1800
dimr = TruncatedSVD(
    n_components=n_components,
    random_state=42,
)
train_t = dimr.fit_transform(train_vec)

In [36]:
# Cumulative share of total variance captured as SVD components are added.
# The ratio (rather than the raw variance) makes the y-axis readable as a
# fraction of the variance in the TF-IDF matrix.
cumulative_ratio = np.cumsum(dimr.explained_variance_ratio_)
px.line(
    x=np.arange(1, len(cumulative_ratio) + 1),
    y=cumulative_ratio,
    title='Cumulative Explained Variance',
    labels={
        'x': 'Number of Components',
        'y': 'Cumulative Explained Variance Ratio'
    }
)

In [11]:
# Calinski-Harabasz score against cluster count, one line per SVD dimensionality.
fig = px.line(
    title='Calinski Harabasz Score',
    x=outcomes['n_clusters'],
    y=outcomes['CH_score'],
    color=outcomes['n_components'],
    labels=dict(
        x='Number of Clusters',
        y='CH Score',
        color='Principal Components',
    ),
)
fig.show()

In [12]:
# K-Means inertia against cluster count, one line per SVD dimensionality.
fig = px.line(
    title='Within Cluster Sum of Squared Residuals',
    x=outcomes['n_clusters'],
    y=outcomes['WCSS_score'],
    color=outcomes['n_components'],
    labels=dict(
        x='Number of Clusters',
        y='WCSS Score',
        color='Principal Components',
    ),
)
fig.show()

In [13]:
# Silhouette score against cluster count, one line per SVD dimensionality.
fig = px.line(
    title='Silhouette Score',
    x=outcomes['n_clusters'],
    y=outcomes['S_score'],
    color=outcomes['n_components'],
    labels=dict(
        x='Number of Clusters',
        y='Silhouette Score',
        color='Principal Components',
    ),
)
fig.show()

In [14]:
# Project the TF-IDF matrix onto two SVD components and cluster the result,
# so the clustering can be drawn on a 2-D scatter plot.
n_components = 2
n_clusters = 2
dimr = TruncatedSVD(n_components=n_components, random_state=42)
train_t = dimr.fit_transform(train_vec)
# Pin random_state so the cluster assignment is the same on every run,
# regardless of the global NumPy RNG state left behind by earlier cells.
clusterer = KMeans(n_clusters=n_clusters, random_state=42)
target_train = clusterer.fit_predict(train_t)

In [15]:
# Scatter of the 2-D projection coloured by cluster. Labels are cast to str so
# plotly assigns one discrete colour per cluster instead of drawing a
# continuous colour bar over what are categorical ids.
px.scatter(
    x=train_t[:, 0],
    y=train_t[:, 1],
    color=target_train.astype(str),
    title='K-Means Clusters in 2-D SVD Space',
    labels={
        'x': 'SVD Component 1',
        'y': 'SVD Component 2',
        'color': 'Cluster'
    }
)

## Define Clustering Pipeline

In [None]:
# End-to-end clustering pipeline: raw text -> TF-IDF -> 2-component SVD -> K-Means.
# Both stochastic steps pin random_state so the fitted pipeline is reproducible
# independently of the global NumPy RNG state.
clustering_pipe = Pipeline(
    steps=[
        ('vectorization', TfidfVectorizer(stop_words='english', max_features=10000, ngram_range=(1, 3))),
        ('dimensionReduction', TruncatedSVD(n_components=2, random_state=42)),
        ('clustering', KMeans(n_clusters=5, random_state=42))
    ]
)

## Fit Final Clustering Pipeline

In [None]:
# Fit every pipeline step (vectorization, dimensionReduction, clustering) on
# the training descriptions.
clustering_pipe.fit(train)

In [None]:
# Component matrix of the fitted SVD step: one row per component, one column
# per TF-IDF feature.
svd_step = clustering_pipe.named_steps['dimensionReduction']
vecs = svd_step.components_
vecs.shape

In [None]:
# Term -> column-index mapping learned by the fitted TF-IDF step.
vocab = clustering_pipe.named_steps['vectorization'].vocabulary_

In [None]:
# One row per vocabulary term, positioned by its weight in the first two
# SVD components. Terms and their column indices are read in the same
# (dict iteration) order so rows stay aligned.
terms = list(vocab.keys())
term_columns = list(vocab.values())
word_cloud = pd.DataFrame({
    'word': terms,
    'x': vecs[0, term_columns],
    'y': vecs[1, term_columns],
})

In [None]:

# Plot every term at its (component 1, component 2) loading; hover shows the term.
px.scatter(
    word_cloud,
    x='x',
    y='y',
    hover_name='word',
)