In [24]:
import re
import nltk
import string
import numpy as np
import pandas as pd

from typing import List, Dict
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

In [3]:
stop_words = set(stopwords.words("english"))

In [10]:
def preprocess_text(text: str) -> List[str]:
    text = text.lower()
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", "", text
    )
    text_tokens = word_tokenize(text)
    text_without_stopwords = [word for word in text_tokens if word not in stop_words]
    return text_without_stopwords

In [11]:
preprocess_text("Hi there, this is a test text !!!")

['hi', 'test', 'text']

In [13]:
texts_filepath = "text_to_cluster.txt"

with open(texts_filepath) as infile:
    data = infile.readlines()
    
texts_df = pd.DataFrame(data={"texts": data}, columns=["texts"])
texts_df.head()

Unnamed: 0,texts
0,Ransomware attack at Mexico's Pemex halts work...
1,#city | #ransomware | Ransomware Attack At Mex...
2,"Mexico's Pemex Oil Suffers Ransomware Attack, ..."
3,A Mexican oil company was hit by ransomware at...
4,Pemex Struck by Ransomware Attack\n


In [14]:
texts_df["cleaned_text"] = texts_df["texts"].apply(lambda x: preprocess_text(x))
texts_df.head()

Unnamed: 0,texts,cleaned_text
0,Ransomware attack at Mexico's Pemex halts work...,"[ransomware, attack, mexicos, pemex, halts, wo..."
1,#city | #ransomware | Ransomware Attack At Mex...,"[city, ransomware, ransomware, attack, mexico,..."
2,"Mexico's Pemex Oil Suffers Ransomware Attack, ...","[mexicos, pemex, oil, suffers, ransomware, att..."
3,A Mexican oil company was hit by ransomware at...,"[mexican, oil, company, hit, ransomware, attack]"
4,Pemex Struck by Ransomware Attack\n,"[pemex, struck, ransomware, attack]"


In [15]:
cleaned_text_tokens = texts_df["cleaned_text"].values.tolist()
cleaned_text_tokens[0]

['ransomware',
 'attack',
 'mexicos',
 'pemex',
 'halts',
 'work',
 'threatens',
 'cripple',
 'computers']

In [16]:
w2v_model = Word2Vec(sentences=cleaned_text_tokens, vector_size=100, workers=2)

In [17]:
w2v_model.wv.most_similar("ransomware")

[('ransomwareasaservice', 0.845509946346283),
 ('new', 0.8409730792045593),
 ('attack', 0.8195723295211792),
 ('malware', 0.8159984350204468),
 ('’', 0.8083234429359436),
 ('attacks', 0.7910552024841309),
 ('pemex', 0.7884857058525085),
 ('web', 0.7850452661514282),
 ('says', 0.7849128246307373),
 ('windows', 0.7797765731811523)]

In [20]:
def create_vectors(docs: List[List[str]]) -> List[List[float]]:
    all_vectors = []

    for text_tokens in docs:
        zero_vector = np.zeros(w2v_model.vector_size)
        vectors = []
        for token in text_tokens:
            if token in w2v_model.wv:
                try:
                    vectors.append(w2v_model.wv[token])
                except KeyError:
                    continue
        if vectors:
            vectors = np.asarray(vectors)
            avg_vector = vectors.mean(axis=0)
            all_vectors.append(avg_vector)
        else:
            all_vectors.append(zero_vector)
    return all_vectors

In [21]:
all_text_vectors = create_vectors(cleaned_text_tokens)

In [22]:
all_text_vectors[0]

array([-5.5727372e-03,  8.3173057e-03, -2.0547304e-06,  5.9127840e-03,
        2.8268525e-03, -1.4639927e-02,  5.6710178e-03,  2.2862189e-02,
       -9.2939995e-03, -9.5118741e-03, -1.0346512e-03, -7.2166752e-03,
       -1.9704122e-03,  3.7084701e-03,  4.5130742e-03, -7.2766491e-03,
        1.9144794e-03, -7.2980174e-03, -4.5835874e-03, -2.0325737e-02,
        1.7756679e-03,  8.7142759e-04,  9.0232799e-03, -8.0896141e-03,
       -1.3344975e-03, -1.9431966e-03, -1.0051595e-02, -7.8808181e-03,
       -5.6690080e-03,  5.0887247e-03,  2.2517827e-03, -1.2107082e-03,
        9.9226218e-03, -6.0237329e-03, -3.0643758e-03,  1.0659773e-02,
        5.5624386e-03, -2.6292414e-03, -2.9476839e-03, -9.4883842e-03,
        4.2398134e-03, -9.3127470e-03, -6.3075712e-03, -5.3480186e-04,
        8.3259689e-03,  1.8627252e-03, -1.4845835e-04, -6.8645040e-04,
        7.3149544e-03,  5.2429810e-03,  6.3971882e-03, -8.6655710e-03,
        2.5357906e-04, -1.9455819e-03, -3.5632057e-03,  6.6816951e-03,
      

In [23]:
len(all_text_vectors[0])

100

In [25]:
kmeans_model = KMeans(n_clusters=6).fit(all_text_vectors)

In [27]:
texts_df["cluster_label"] = kmeans_model.labels_

In [28]:
texts_df.head()

Unnamed: 0,texts,cleaned_text,cluster_label
0,Ransomware attack at Mexico's Pemex halts work...,"[ransomware, attack, mexicos, pemex, halts, wo...",4
1,#city | #ransomware | Ransomware Attack At Mex...,"[city, ransomware, ransomware, attack, mexico,...",4
2,"Mexico's Pemex Oil Suffers Ransomware Attack, ...","[mexicos, pemex, oil, suffers, ransomware, att...",4
3,A Mexican oil company was hit by ransomware at...,"[mexican, oil, company, hit, ransomware, attack]",4
4,Pemex Struck by Ransomware Attack\n,"[pemex, struck, ransomware, attack]",0
