IMPORTS

In [42]:

import os
from pathlib import Path
import random
import re
import string

import nltk
import numpy as np
import pandas as pd

import gensim
from gensim.test.utils import get_tmpfile


from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

# Fix one seed for every random number generator seeded in this notebook.
SEED = 42
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# assigning it here presumably only affects child processes — confirm.
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)

# NLTK resources: the stopword list and the "punkt" tokenizer data.
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\slavr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\slavr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


HELPER METHODS

In [43]:
def clean_text(text, tokenizer, stopwords):
    """Normalise raw text and split it into filtered tokens.

    The text is lowercased, stripped of bracketed fragments, Unicode
    ellipses (together with the word fragment in front of them) and ASCII
    punctuation, then tokenized and filtered.

    Note: punctuation is removed before the stopword filter runs, so
    stopword entries that themselves contain punctuation will generally
    not match any token.

    Args:
        text: Raw input; coerced with ``str()`` before processing.
        tokenizer: Callable taking the cleaned string and returning an
            iterable of string tokens.
        stopwords: Container of tokens to discard (tested with ``in``).

    Returns:
        List of tokens without stopwords, all-digit tokens, or tokens of
        length <= 1.
    """
    # Applied strictly in this order; later patterns rely on earlier ones.
    substitutions = (
        (r"\[(.*?)\]", ""),  # drop "[+XYZ chars]" style fragments
        (r"\s+", " "),  # collapse runs of whitespace
        (r"\w+…|…", ""),  # drop ellipsis and the truncated word before it
        (r"(?<=\w)-(?=\w)", " "),  # split hyphenated words
        (f"[{re.escape(string.punctuation)}]", ""),  # strip ASCII punctuation
    )

    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    kept = []
    for token in tokenizer(cleaned):
        if token in stopwords:
            continue
        # Pure numbers and single-character tokens carry no signal here.
        if token.isdigit() or len(token) <= 1:
            continue
        kept.append(token)
    return kept


In [45]:
def mbkmeans_clusters(
    X,
    k,
    mb,
    print_silhouette_values,
    random_state=None,
):
    """Fit MiniBatchKMeans on X and print silhouette / inertia diagnostics.

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches.
        print_silhouette_values: If truthy, also print per-cluster silhouette
            statistics (size, mean, min, max), highest mean first.
        random_state: Optional seed forwarded to MiniBatchKMeans. The default
            (None) keeps the estimator's default seeding behaviour.

    Returns:
        Tuple of (fitted MiniBatchKMeans model, its ``labels_`` array).
    """
    km = MiniBatchKMeans(
        n_clusters=k, batch_size=mb, random_state=random_state
    ).fit(X)
    labels = km.labels_
    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {silhouette_score(X, labels):0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, labels)
        print("Silhouette values:")
        silhouette_values = []
        empty_clusters = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[labels == i]
            if cluster_silhouette_values.size == 0:
                # A cluster id with no assigned samples has no statistics;
                # min()/max() on an empty array would raise ValueError.
                empty_clusters.append(i)
                continue
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        # Best-separated clusters (highest mean silhouette) first.
        silhouette_values.sort(key=lambda tup: tup[2], reverse=True)
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
        if empty_clusters:
            print(f"    Empty clusters (no samples assigned): {empty_clusters}")
    return km, labels

DATA INGESTION

In [46]:
# Parent of the current working directory; the data and model paths in this
# notebook are built from it. `cwd` is a classmethod, so it is called on the
# class instead of on a throwaway Path() instance.
parent_folder_path = Path.cwd().parent
parent_folder_path

WindowsPath('d:/project/ai_job_autopilot/gitlab')

In [47]:
# Location of the raw job-description JSON, resolved relative to
# parent_folder_path.
job_descr_path = parent_folder_path / 'original_data/jd1.json'

# Bare expression: shown as the cell output to confirm the resolved path.
job_descr_path

WindowsPath('d:/project/ai_job_autopilot/gitlab/original_data/jd1.json')

In [48]:
# Load the job-description file; the `description` column is the text that
# gets tokenized and clustered in the cells below.
df = pd.read_json(job_descr_path)

In [49]:
# Spot-check one raw posting (the `description` entry selected by key 1)
# before any cleaning is applied.
print(df['description'][1])

We would like to present to you a new job opportunity and I think you may find it interesting.

If you are interested kindly send the following documents to by Thursday, April 06 at 01:00 PM EST if that interests you and matches your profile.

Without mandatory documents, we cannot submit a candidate.
• Updated Resume in word format (Mandatory)
• Skills Matrix and References (Mandatory)
• Expected hourly rate (Mandatory)

Job Title: RQ05180 - DevOPS/Cloud Engineer - Senior

Client: Ministry of Health

Work Location: 5700 Yonge Street, Toronto, Ontario, Hybrid

Estimated Start Date: 2023-06-02

Estimated End Date: 2024-03-29

#Business Days: 207.00

Extension: Probable after the initial mandate

Hours per day or Week: 7.25 hours per day

Security Level: CRJMC

If you are interested to learn more about this opportunity or ifnot, please feel free to send over any names or forward this email to anyone who may be interested. Please check it out on our career site.

(Click above for Skill...

Tokenize

In [50]:
# NLTK's English stopwords plus a few words added for this corpus.
custom_stopwords = set(stopwords.words("english")) | {
    "job",
    "title",
    "qualifications",
    "requirements",
}

# Clean and tokenize every description; the extra positional arguments are
# forwarded to clean_text after the description text itself.
df["tokens"] = df["description"].apply(
    clean_text, args=(word_tokenize, custom_stopwords)
)
tokenized_docs = df["tokens"].to_numpy()
 


Load model and vectorize

In [51]:
# Path of the saved embedding model under prep_data/models. The path is used
# directly: gensim's get_tmpfile() is a test helper for building paths inside
# a temporary directory and only returned this location because the suffix
# passed to it was already absolute.
fname = str(parent_folder_path / 'prep_data' / 'models' / "word2vec.model")
# NOTE(review): the file is named "word2vec.model" but is loaded as FastText —
# confirm which model class actually wrote it.
model = gensim.models.FastText.load(fname)


In [52]:
# NOTE(review): `vectorize` is not defined in any visible cell (In [44] is
# absent between the two helper cells) — confirm it is defined before this
# cell, otherwise a Restart & Run All fails here with NameError.
# Presumably returns one embedding vector per tokenized document — verify
# against its definition.
vectorized_docs = vectorize(tokenized_docs, model=model)
# Number of document vectors and the length of the first one.
len(vectorized_docs), len(vectorized_docs[0])

(1208, 200)

In [53]:

# Sanity check: vocabulary entries most similar to the vector of the
# document at position 1 in vectorized_docs.
ret_vals = model.wv.similar_by_vector(vectorized_docs[1])
# Bare expression: displayed as the cell output.
ret_vals

[('give', 0.8144811391830444),
 ('interactions', 0.8008865118026733),
 ('initiation', 0.793384313583374),
 ('acumen', 0.7828260660171509),
 ('sharing', 0.7811826467514038),
 ('assertive', 0.7791082859039307),
 ('ve', 0.7785332202911377),
 ('qe', 0.775906503200531),
 ('principles', 0.7741429209709167),
 ('ex', 0.772027850151062)]

In [54]:
# Cluster the document vectors (50 clusters, mini-batches of 500) and print
# the per-cluster silhouette breakdown.
clustering, cluster_labels = mbkmeans_clusters(
    X=vectorized_docs,
    k=50,
    mb=500,
    print_silhouette_values=True,
)

# One row per document: raw text, its space-joined tokens, and the cluster
# it was assigned to.
df_clusters = pd.DataFrame(
    {
        "text": df['description'],
        "tokens": list(map(" ".join, tokenized_docs)),
        "cluster": cluster_labels,
    }
)



For n_clusters = 50
Silhouette coefficient: 0.15
Inertia:117.9930543711749
Silhouette values:
    Cluster 6: Size:9 | Avg:1.00 | Min:1.00 | Max: 1.00
    Cluster 12: Size:7 | Avg:1.00 | Min:1.00 | Max: 1.00
    Cluster 17: Size:9 | Avg:1.00 | Min:1.00 | Max: 1.00
    Cluster 23: Size:7 | Avg:1.00 | Min:1.00 | Max: 1.00
    Cluster 25: Size:9 | Avg:1.00 | Min:1.00 | Max: 1.00
    Cluster 29: Size:2 | Avg:1.00 | Min:1.00 | Max: 1.00
    Cluster 36: Size:2 | Avg:1.00 | Min:1.00 | Max: 1.00
    Cluster 40: Size:2 | Avg:1.00 | Min:1.00 | Max: 1.00
    Cluster 44: Size:3 | Avg:1.00 | Min:1.00 | Max: 1.00
    Cluster 47: Size:2 | Avg:1.00 | Min:1.00 | Max: 1.00
    Cluster 48: Size:9 | Avg:1.00 | Min:1.00 | Max: 1.00
    Cluster 49: Size:2 | Avg:1.00 | Min:1.00 | Max: 1.00
    Cluster 14: Size:14 | Avg:1.00 | Min:1.00 | Max: 1.00
    Cluster 27: Size:7 | Avg:1.00 | Min:1.00 | Max: 1.00
    Cluster 34: Size:7 | Avg:1.00 | Min:1.00 | Max: 1.00
    Cluster 4: Size:9 | Avg:1.00 | Min:1.00 | Max: 

In [55]:
print("Most representative terms per cluster (based on centroids):")
# Walk the fitted centroids themselves instead of a hard-coded range(50), so
# this cell stays correct when the number of clusters is changed.
for i, centroid in enumerate(clustering.cluster_centers_):
    most_representative = model.wv.most_similar(positive=[centroid], topn=5)
    # Each result is a (term, score) pair; every term keeps a trailing space
    # so the printed line is unchanged.
    tokens_per_cluster = "".join(f"{t[0]} " for t in most_representative)
    print(f"Cluster {i}: {tokens_per_cluster}")

Most representative terms per cluster (based on centroids):
Cluster 0: leaders improvements ex prospect give 
Cluster 1: ex acumen improvements professionalism give 
Cluster 2: boarding ex prospect improvements leaders 
Cluster 3: de partie sous curry cuisine 
Cluster 4: boarding improvements prospect professionalism prospects 
Cluster 5: interactions boarding initiation acumen ex 
Cluster 6: ex prospect improvements track professionalism 
Cluster 7: acumen improvements units prospect ex 
Cluster 8: give assertive approach deliver sharing 
Cluster 9: give qe interactions sse initiation 
Cluster 10: give qe interactions sse initiation 
Cluster 11: prospect boarding improvements ex prospects 
Cluster 12: acumen improvements objection prospect ex 
Cluster 13: improvements ex prospect acumen prospects 
Cluster 14: units acumen initiation improvements ex 
Cluster 15: ex improvements acumen leaders boarding 
Cluster 16: ex give initiation improvements interactions 
Cluster 17: acumen units i

In [56]:
# Cluster to inspect by hand.
test_cluster = 29
centroid = clustering.cluster_centers_[test_cluster]

# Positions of all documents, ordered by Euclidean distance to the centroid
# (closest first).
most_representative_docs = np.argsort(
    np.linalg.norm(np.asarray(vectorized_docs) - centroid, axis=1)
)

# The centroid's nearest vocabulary terms do not depend on the document, so
# they are looked up once instead of on every loop iteration.
most_representative = model.wv.most_similar(positive=[centroid], topn=5)

for d in most_representative_docs[:3]:
    # argsort yields positions, so select by position (.iloc), not by label.
    print(df['description'].iloc[d])
    print("-------------")
    print(most_representative)


Description

The Network Administrator is responsible for design and development of secure Network system solutions. This position is expected to work on projects, which may include evolving existing systems or implementing new technology. This position is also responsible for performing analytical, technical and support work in the planning, implementation; documentation and administration of all Networking systems.

The Kal Tire network is a critical technology that connects our global operations and Team Members while ensuring cyber security, reliability, and performance standards are maintained. The network is built using design principles from Zero Trust security model and CIS hardening guidelines and modern technology such as NGFW and SD-WAN.

Core Responsibilities

Network Administration
• Documentation and administration of all Network Infrastructure.
• Serves as a technical troubleshooter and coordinator on complex network issues affecting mission critical applications.
•... E

In [6]:
# Scratch check of how escaped newlines render when printed.
# Named `sample_text` rather than `string` so the stdlib `string` module
# imported at the top of the notebook is not shadowed; clean_text reads
# string.punctuation and would fail after such a rebinding.
sample_text = '''str1\nstr2....\nstrN'''
print(sample_text)


str1
str2....
strN
