IMPORTS

In [63]:

import os
from pathlib import Path
import random
import re
import string

import nltk
import numpy as np
import pandas as pd

import gensim
from gensim.test.utils import get_tmpfile

from sklearn.cluster import Birch


from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

# Fetch the NLTK resources this notebook uses (stopword lists and the
# "punkt" tokenizer data, presumably required by word_tokenize).
for nltk_resource in ("stopwords", "punkt"):
    nltk.download(nltk_resource)

# Seed the random number generators so repeated runs are comparable.
SEED = 42
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting
# it here most likely only affects child processes - confirm this is intended.
os.environ["PYTHONHASHSEED"] = str(SEED)
for seed_rng in (random.seed, np.random.seed):
    seed_rng(SEED)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\slavr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\slavr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


HELPER METHODS

In [64]:
def clean_text(text, tokenizer, stopwords):
    """Normalise a raw document and split it into filtered tokens.

    Args:
        text: Document to process; it is converted with ``str()`` first.
        tokenizer: Callable that receives the cleaned string and returns
            an iterable of string tokens.
        stopwords: Container of tokens to discard (used with ``in``).

    Returns:
        List of tokens that are not stopwords, not purely digits and
        longer than one character.
    """
    # Applied in order; each pair is (regex, replacement).
    substitutions = (
        (r"\[(.*?)\]", ""),  # drop bracketed parts such as "[+XYZ chars]"
        (r"\s+", " "),  # collapse runs of whitespace
        (r"\w+…|…", ""),  # drop ellipsis together with the cut-off word
        (r"(?<=\w)-(?=\w)", " "),  # split words joined by a dash
        (f"[{re.escape(string.punctuation)}]", ""),  # strip punctuation
    )

    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return [
        token
        for token in tokenizer(cleaned)
        if token not in stopwords and not token.isdigit() and len(token) > 1
    ]


In [65]:
# n_clusters=10
# birch_clusterizer = cluster.Birch(n_clusters=None, threshold=0.2)
# for i in range(0,len(X),20):
#     birch_clusterizer.partial_fit(X[i:i+20])

# birch_clusterizer.set_params(n_clusters=n_clusters)
# birch_clusterizer.partial_fit()

In [66]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import plotly.express as px


def tsne_3d_plot(X, birch_clusterizer):
    """Project X to three dimensions with t-SNE and show a 3-D scatter plot.

    Every point is coloured by the cluster label that ``birch_clusterizer``
    predicts for it.

    Args:
        X: Feature matrix with one row per sample, accepted by both
            ``birch_clusterizer.predict`` and ``TSNE.fit_transform``.
        birch_clusterizer: Fitted estimator whose ``predict`` returns one
            cluster label per row of ``X``.
    """
    # One label per row of X. ``subcluster_labels_`` holds one entry per
    # subcluster rather than per sample, so it cannot be attached to X.
    point_labels = pd.Series(birch_clusterizer.predict(X), name='label')

    tsne_model = TSNE(n_components=3, learning_rate='auto',
                      init='random', perplexity=3)
    projections = tsne_model.fit_transform(X)

    fig = px.scatter_3d(
        projections, x=0, y=1, z=2,
        color=point_labels, labels={'color': 'label'}
    )
    fig.update_traces(marker_size=8)
    # Render the figure; previously it was built but never displayed.
    fig.show()

 

# tsne_3d_plot(X, birch_clusterizer)

In [67]:
from itertools import cycle

import matplotlib.pyplot as plt
import matplotlib.colors as colors

def matpotlib_cluster_plot(X, birch_clusterizer):
    """Scatter-plot the first two features of X, coloured by predicted cluster.

    Args:
        X: 2-D feature matrix with one row per sample; only columns 0 and 1
            are drawn.
        birch_clusterizer: Fitted estimator exposing ``predict`` and
            ``subcluster_centers_``.
    """
    X = np.asarray(X)

    # Use all colors that matplotlib provides by default.
    colors_ = cycle(colors.cnames.keys())

    # Per-sample labels are needed to mask the rows of X;
    # ``subcluster_labels_`` has one entry per subcluster instead.
    labels = birch_clusterizer.predict(X)
    centroids = birch_clusterizer.subcluster_centers_
    cluster_ids = np.unique(labels)

    print("n_clusters : %d" % cluster_ids.size)
    for k, col in zip(cluster_ids, colors_):
        mask = labels == k
        plt.scatter(X[mask, 0], X[mask, 1], c="w", edgecolor=col, marker=".", alpha=0.5)

    # Mark every subcluster centroid. Pairing centroids with cluster ids one
    # by one would only mark the first few subclusters once several
    # subclusters share a label.
    plt.scatter(centroids[:, 0], centroids[:, 1], marker="+", c="k", s=155)

In [68]:
def vec2word(model, row):
    """Return the word most similar to the vector stored in ``row``.

    Args:
        model: Object exposing ``wv.most_similar``.
        row: Object whose ``.values`` attribute holds the query vector
            (callers in this notebook pass a pandas Series).

    Returns:
        First element of the top-ranked ``most_similar`` result.
    """
    query_vector = row.values
    ranked_matches = model.wv.most_similar(positive=[query_vector])
    top_match = ranked_matches[0]
    return top_match[0]



In [132]:
def centroid_to_df_relabel_avg(clusterizer, model):
    """Average the subcluster centroids per label and name each average.

    Args:
        clusterizer: Object exposing ``subcluster_centers_`` and
            ``subcluster_labels_``.
        model: Embedding model handed to ``vec2word``.

    Returns:
        DataFrame indexed by ``label`` whose numeric columns are the mean
        centroid of that label, plus a ``label_txt`` column with the word
        ``vec2word`` returns for the mean centroid.
    """
    centroid_frame = pd.DataFrame(clusterizer.subcluster_centers_)
    centroid_frame['label'] = clusterizer.subcluster_labels_

    mean_centroids = centroid_frame.groupby(by='label').mean()

    def nearest_word(mean_vector):
        # mean_vector is one row (a Series) of the averaged centroids.
        return vec2word(model, mean_vector)

    mean_centroids['label_txt'] = mean_centroids.apply(nearest_word, axis=1)
    return mean_centroids

In [109]:
def centroid_to_df_relabel_first(clusterizer, model):
    """Keep the first subcluster centroid of every label and name it.

    Args:
        clusterizer: Object exposing ``subcluster_labels_`` and
            ``subcluster_centers_`` (a NumPy array).
        model: Object exposing ``wv.most_similar``.

    Returns:
        DataFrame with one row per distinct label (in sorted label order):
        the first centroid carrying that label, plus a ``label_txt`` column
        with the top ``most_similar`` word for that centroid.
    """
    subcluster_labels = clusterizer.subcluster_labels_
    # Position of the first occurrence of each distinct label.
    _, first_positions = np.unique(subcluster_labels, return_index=True)

    first_centroids = pd.DataFrame(clusterizer.subcluster_centers_[first_positions])
    # Built before the text column exists, so each row is purely numeric.
    first_centroids['label_txt'] = [
        model.wv.most_similar(positive=[first_centroids.loc[row_id, :].values])[0][0]
        for row_id in range(len(first_positions))
    ]
    return first_centroids
    
def centroid_to_df(birch_clusterizer):
    """Build a DataFrame with the first subcluster centroid of every label.

    Args:
        birch_clusterizer: Object exposing ``subcluster_labels_`` and
            ``subcluster_centers_`` (a NumPy array).

    Returns:
        DataFrame with one row per distinct label (in sorted label order):
        the first centroid carrying that label, plus a ``label_txt`` column
        holding the label value itself.
    """
    unique_labels, first_positions = np.unique(
        birch_clusterizer.subcluster_labels_, return_index=True
    )
    representatives = pd.DataFrame(
        birch_clusterizer.subcluster_centers_[first_positions]
    )
    representatives['label_txt'] = unique_labels
    return representatives

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import plotly.express as px

def plotly_3d_centroid_plot(df_c):
    """Show a 3-D t-SNE scatter plot of the cluster centres in ``df_c``.

    Args:
        df_c: DataFrame whose ``label_txt`` column holds the text used for
            colouring; every other column is treated as a feature.
    """
    # Everything except the text label is fed to t-SNE.
    is_feature = df_c.columns != 'label_txt'

    reducer = TSNE(n_components=3, learning_rate='auto',
                   init='random', perplexity=3)
    embedded = pd.DataFrame(reducer.fit_transform(df_c.loc[:, is_feature]))
    # .values drops the original index so labels line up positionally.
    embedded['label_txt'] = df_c['label_txt'].values

    fig = px.scatter_3d(
        embedded, x=0, y=1, z=2,
        color=embedded.label_txt, labels={'color': 'label_txt'}
    )
    fig.update_traces(marker_size=8)
    fig.show()

In [71]:

def vectorize(list_of_docs, model, clusterizer):
    """Look up token embeddings per document and feed them to a clusterizer.

    For every document the embeddings of its known tokens are stacked into
    one matrix, which is passed to ``clusterizer.partial_fit`` and appended
    to the result.

    Args:
        list_of_docs: Iterable of token lists, one per document.
        model: Embedding model exposing ``vector_size`` and ``wv``
            (``token in model.wv`` and ``model.wv[token]``).
        clusterizer: Estimator with a ``partial_fit`` method; it is updated
            in place with the token vectors of each document.

    Returns:
        Tuple ``(features, clusterizer)``. ``features`` is a list with one
        2-D array per document, shaped ``(n_known_tokens, vector_size)``.
        A document without any known token yields a single all-zero row of
        shape ``(1, vector_size)`` and is not passed to the clusterizer.
    """
    features = []

    for tokens in list_of_docs:
        vectors = []
        for token in tokens:
            if token not in model.wv:
                continue
            try:
                vectors.append(model.wv[token])
            except KeyError:
                # Lookup can still fail after the membership test; skip it.
                continue

        if vectors:
            doc_matrix = np.asarray(vectors)
            clusterizer.partial_fit(doc_matrix)
        else:
            # Keep the placeholder 2-D like every other entry so callers
            # (e.g. clusterizer.predict) can treat all documents uniformly.
            doc_matrix = np.zeros((1, model.vector_size))
        features.append(doc_matrix)

    return features, clusterizer

In [72]:
def mbkmeans_clusters(
    X,
    k,
    mb,
    print_silhouette_values,
):
    """Generate clusters and print Silhouette metrics using MBKmeans

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches.
        print_silhouette_values: Print silhouette values per cluster.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(n_clusters=k, batch_size=mb).fit(X)
    labels = km.labels_
    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {silhouette_score(X, labels):0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, labels)
        print("Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[labels == i]
            if cluster_silhouette_values.size == 0:
                # A centre may end up with no samples assigned; min()/max()
                # on an empty array raise ValueError, so skip such clusters.
                continue
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        # Best average silhouette first.
        silhouette_values.sort(key=lambda tup: tup[2], reverse=True)
        for cluster_id, size, avg, low, high in silhouette_values:
            print(
                f"    Cluster {cluster_id}: Size:{size} | Avg:{avg:.2f} | Min:{low:.2f} | Max: {high:.2f}"
            )
    return km, km.labels_

DATA INGESTION

In [73]:
# Parent of the current working directory; shown as the cell output.
parent_folder_path = Path.cwd().parent
parent_folder_path

WindowsPath('d:/project/ai_job_autopilot/gitlab')

In [74]:
# Location of the job-description JSON file under the parent folder.
job_descr_path = parent_folder_path / 'original_data' / 'jd1.json'

job_descr_path

WindowsPath('d:/project/ai_job_autopilot/gitlab/original_data/jd1.json')

In [75]:
# Read the job-description JSON file into a DataFrame.
df = pd.read_json(job_descr_path)

In [76]:
# Print one 'description' entry (key 1) as a sample of the raw text.
print(df['description'][1])

We would like to present to you a new job opportunity and I think you may find it interesting.

If you are interested kindly send the following documents to by Thursday, April 06 at 01:00 PM EST if that interests you and matches your profile.

Without mandatory documents, we cannot submit a candidate.
• Updated Resume in word format (Mandatory)
• Skills Matrix and References (Mandatory)
• Expected hourly rate (Mandatory)

Job Title: RQ05180 - DevOPS/Cloud Engineer - Senior

Client: Ministry of Health

Work Location: 5700 Yonge Street, Toronto, Ontario, Hybrid

Estimated Start Date: 2023-06-02

Estimated End Date: 2024-03-29

#Business Days: 207.00

Extension: Probable after the initial mandate

Hours per day or Week: 7.25 hours per day

Security Level: CRJMC

If you are interested to learn more about this opportunity or ifnot, please feel free to send over any names or forward this email to anyone who may be interested. Please check it out on our career site.

(Click above for Skill...

Tokenize

In [77]:
# NLTK's English stopwords plus a few extra words to discard.
extra_stopwords = ["job", "title", "qualifications", "requirements"]
custom_stopwords = set(stopwords.words("english")) | set(extra_stopwords)

# Clean and tokenize every description; keep the token lists as an array.
df["tokens"] = df["description"].apply(
    clean_text, args=(word_tokenize, custom_stopwords)
)
tokenized_docs = df["tokens"].values
 


Load model and vectorize

In [78]:
# Path of the saved embedding model. It is built from an absolute folder, so
# it is passed to ``load`` directly: ``gensim.test.utils.get_tmpfile`` is a
# test helper that places a file name under a temporary directory and is not
# meant for project files.
# NOTE(review): the file is called "word2vec.model" but is loaded through
# FastText - confirm the class matches the one the model was saved with.
fname = str(parent_folder_path / 'prep_data' / 'models' / "word2vec.model")
model = gensim.models.FastText.load(fname)


In [79]:
# Birch clusterer created with n_clusters=None and threshold=0.2; n_clusters
# is set with set_params in a later cell, after the data has been fed in.
clusterizer = Birch(n_clusters=None, threshold=0.2)

In [80]:
# Look up the token vectors of every document; vectorize also calls
# clusterizer.partial_fit on each document's vectors and returns the clusterizer.
vectorized_docs, clusterizer = vectorize(tokenized_docs, model=model, clusterizer = clusterizer)
# Display the number of documents and the shape of the first document's array.
len(vectorized_docs), vectorized_docs[0].shape


(1208, (359, 200))

In [81]:
# Request a fixed number of clusters, then call partial_fit() without data
# so that only the final clustering of the existing subclusters is run.
n_clusters = 200
clusterizer.set_params(n_clusters=n_clusters).partial_fit()

In [133]:
# One row per cluster label: averaged subcluster centroid plus its nearest word.
df_c = centroid_to_df_relabel_avg(clusterizer, model)

# 3-D t-SNE view of those cluster centres, coloured by their word label.
plotly_3d_centroid_plot(df_c)

In [83]:
# The word labels live in the 'label_txt' column. 'label' is the index of the
# frame returned by centroid_to_df_relabel_avg, not a column, so `df_c.label`
# raises AttributeError on a fresh run.
print(df_c["label_txt"].tolist())

['ticket', 'viral', 'notices', 'die', 'labor', 'webapi', 'pcg', 'sprint', 'rmo', 'back', 'twitter', 'confident', 'mockups', 'markup', 'construction', 'issue', 'asynchronous', 'relationship_building', 'bdo', 'bcne', 'rd', 'webchat', 'cto', 'biologic', 'waste', 'profiles', 'coso', 'balancing', 'emailing', 'aluminium', 'wfh', 'reconciliations', 'creo', 'scope', 'event', 'logo', 'restful', 'positive', 'vsphere', 'turnaround', 'brokers', 'web', 'country', 'spices', 'direct', 'vitro', 'curry', 'landscape', 'metadata', 'selenium_webdriver', 'neural', 'dispatch', 'rabbitmq', 'awareness', 'drive', 'dba', 'hedging', 'lgd', 'installations', 'game', 'language', 'phone', 'finalization', 'payment', 'pivot', 'documents', 'rockwell', 'qa', 'ey', 'adserving', 'wanted', 'wellness', 'promise', 'shell', 'clinical_operation', 'evaluation', 'gathering', 'wfm', 'feature', 'tied', 'contracts', 'queries', 'django', 'ensuring', 'per', 'enhancements', 'fico', 'display_drivers', 'headcount', 'vulnerability', 'rlc

In [87]:
# Embedding vector of the word 'senior'.
vec = model.wv['senior']
# labelind_2_labeltext(clusterizer, labelinds)

In [134]:

from collections import Counter

def print_cluster_centroid_text_doc_ind(df, ind):
    """Print a document and the word labels of its most frequent clusters.

    Relies on the notebook-level ``clusterizer``, ``vectorized_docs`` and
    ``df_c`` objects in addition to its arguments.

    Args:
        df: DataFrame with a ``description`` column.
        ind: Key of the document, used both for ``df['description']`` and
            for ``vectorized_docs``.
    """
    # Cluster label predicted for each vector of the chosen document.
    predicted_labels = clusterizer.predict(vectorized_docs[ind])
    top_labels = Counter(predicted_labels).most_common(10)

    print(df['description'][ind])
    print('------------')

    # Pair the text of each cluster label with how often it occurred.
    labelled_counts = []
    for label_id, freq in top_labels:
        labelled_counts.append((df_c.loc[label_id, 'label_txt'], freq))
    print(labelled_counts)

In [138]:
# Show document 2 together with the word labels of its most frequent clusters.
print_cluster_centroid_text_doc_ind(df, ind=2)

Requisition ID: 174456

Join a purpose driven winning team, committed to results, in an inclusive and high-performing culture.

The Team

We are part of the GBME – Global Banking Markets and Engineering - Data Platform team who acts as a key business functionality within the bank leveraged by regulatory and non-regulatory systems. Data platform currently supports 100+ source systems in terms of development and solutioning using some of the key cloud technologies like Hadoop / Kafka / NiFi / Spark / Elasticsearch / Minio & Dremio.

The role:

The GBM Data Platform DevOps is responsible for providing onboarding solutions to the above-mentioned technology stack as well as maintain stability and integrity of the data platform. This role is a dedicated resource to support regulatory / compliance data platform onboarding guidance and production support.

To achieve this, your responsibilities include:

• Design Recommendations: Work very closely with project teams on understanding the... bus

(1208, 6)

We would like to present to you a new job opportunity and I think you may find it interesting.

If you are interested kindly send the following documents to by Thursday, April 06 at 01:00 PM EST if that interests you and matches your profile.

Without mandatory documents, we cannot submit a candidate.
• Updated Resume in word format (Mandatory)
• Skills Matrix and References (Mandatory)
• Expected hourly rate (Mandatory)

Job Title: RQ05180 - DevOPS/Cloud Engineer - Senior

Client: Ministry of Health

Work Location: 5700 Yonge Street, Toronto, Ontario, Hybrid

Estimated Start Date: 2023-06-02

Estimated End Date: 2024-03-29

#Business Days: 207.00

Extension: Probable after the initial mandate

Hours per day or Week: 7.25 hours per day

Security Level: CRJMC

If you are interested to learn more about this opportunity or ifnot, please feel free to send over any names or forward this email to anyone who may be interested. Please check it out on our career site.

(Click above for Skill...

In [85]:
# print("Most representative terms per cluster (based on centroids):")
# for i in range(50):
#     tokens_per_cluster = ""
#     most_representative = model.wv.most_similar(positive=[clustering.cluster_centers_[i]], topn=5)
#     for t in most_representative:
#         tokens_per_cluster += f"{t[0]} "
#     print(f"Cluster {i}: {tokens_per_cluster}")

In [86]:
# test_cluster = 29
# most_representative_docs = np.argsort(
#     np.linalg.norm(vectorized_docs - clustering.cluster_centers_[test_cluster], axis=1)
# )
# for d in most_representative_docs[:3]:
#     print(df['description'][d])
#     print("-------------")
#     most_representative = model.wv.most_similar(positive=[clustering.cluster_centers_[test_cluster]], topn=5)
#     print(most_representative)
