### Importing the required libraries

In [None]:
import pandas as pd
import spacy
import numpy as np
from tqdm import tqdm

In [2]:
import gensim
import gensim.downloader as api

In [3]:
from gensim.models import Word2Vec

In [4]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

In [5]:
df0 = pd.read_csv("medium_articles.csv")
df0.head()

Unnamed: 0,title,text,url,authors,timestamp,tags
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"['Mental Health', 'Health', 'Psychology', 'Sci..."
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"['Mental Health', 'Coronavirus', 'Science', 'P..."
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,"['Biotechnology', 'Neuroscience', 'Brain', 'We..."
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"['Health', 'Neuroscience', 'Mental Health', 'P..."
4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"['Brain', 'Health', 'Development', 'Psychology..."


In [6]:
df0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192368 entries, 0 to 192367
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   title      192363 non-null  object
 1   text       192368 non-null  object
 2   url        192368 non-null  object
 3   authors    192368 non-null  object
 4   timestamp  192366 non-null  object
 5   tags       192368 non-null  object
dtypes: object(6)
memory usage: 8.8+ MB


In [7]:
df_ner = pd.read_csv("NER_entities_final.csv")
df_ner

Unnamed: 0.1,Unnamed: 0,articles_ner
0,0,"['Josh Riemer Unsplash', 'Ryan', 'Juliette', '..."
1,1,"['Pexels', 'Zoom', 'ACE2', 'New England Journa..."
2,2,"['Ann-Sophie Barwich', 'Johannes Frasnelli', '..."
3,3,[]
4,4,"['Phineas Gage', 'Cherry', 'Phineas Gage', 'Ph..."
...,...,...
192362,192362,"['Cleaners', 'North Shore', 'Sydney North Shore']"
192363,192363,"['Hemp', 'Cotton']"
192364,192364,"['DIY', 'Bond Cleaning Adelaide']"
192365,192365,"['Grin', 'Bucharest', 'Transylvania', 'SEH-RA'..."


In [8]:
df0.drop(33875,inplace = True)

In [9]:
df0.reset_index(inplace = True, drop = True)

In [10]:
df_ner.iloc[33875]

Unnamed: 0         33875
articles_ner    ['APIS']
Name: 33875, dtype: object

In [11]:
df_fin = pd.concat([df0,df_ner],axis = 1)
df_fin

Unnamed: 0.1,title,text,url,authors,timestamp,tags,Unnamed: 0,articles_ner
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"['Mental Health', 'Health', 'Psychology', 'Sci...",0,"['Josh Riemer Unsplash', 'Ryan', 'Juliette', '..."
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"['Mental Health', 'Coronavirus', 'Science', 'P...",1,"['Pexels', 'Zoom', 'ACE2', 'New England Journa..."
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,"['Biotechnology', 'Neuroscience', 'Brain', 'We...",2,"['Ann-Sophie Barwich', 'Johannes Frasnelli', '..."
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"['Health', 'Neuroscience', 'Mental Health', 'P...",3,[]
4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"['Brain', 'Health', 'Development', 'Psychology...",4,"['Phineas Gage', 'Cherry', 'Phineas Gage', 'Ph..."
...,...,...,...,...,...,...,...,...
192362,Why do you need a cleaning service?,What could be more important than having a tid...,https://medium.com/@ozneedcleaningau/why-do-yo...,[],2021-11-16 08:17:08.950000+00:00,"['Cleaning', 'Cleaning Services', 'Cleaning Co...",192362,"['Cleaners', 'North Shore', 'Sydney North Shore']"
192363,Daily cleaning and maintenance of bedding,Daily cleaning and maintenance of bedding\n\nW...,https://medium.com/@a198blwt/daily-cleaning-an...,[],2021-11-16 05:27:05.359000+00:00,"['Bedding', 'Cleaning', 'Maintain']",192363,"['Hemp', 'Cotton']"
192364,Beneficial Advice on Bond Cleaning!,The most important chore at the end is bond cl...,https://medium.com/@princegohil/beneficial-adv...,['Prince Shrawan'],2021-11-26 08:20:27.660000+00:00,"['Cleaning', 'End Of Lease Cleaning', 'Cleaners']",192364,"['DIY', 'Bond Cleaning Adelaide']"
192365,How I Learned Romanian in 37 Easy Steps,How I Learned Romanian in 37 Easy Steps\n\nHey...,https://medium.com/@lifeinromania/how-i-learne...,['Sam Ursu'],2017-11-27 08:09:19.025000+00:00,"['Romania', 'Language Learning', 'Storyofmylife']",192365,"['Grin', 'Bucharest', 'Transylvania', 'SEH-RA'..."


### loading the pretrained model

In [14]:
wv = api.load('word2vec-google-news-300')

In [16]:
wv

<gensim.models.keyedvectors.KeyedVectors at 0x1c984330550>

In [19]:
wv.most_similar("Zoom")

[('Speed_Craw', 0.5614227056503296),
 ('Sweden_Orsa', 0.49537304043769836),
 ('Olympus_Camedia_C', 0.4901363253593445),
 ('Tierpark_Neumuenster', 0.48986688256263733),
 ('Fujifilm_FinePix_F###', 0.48315995931625366),
 ('Fujifilm_FinePix_A###', 0.48149988055229187),
 ('drummer_DJ_Bonebrake', 0.47490739822387695),
 ('www.zoom.com', 0.47464051842689514),
 ('Gorillapod_SLR', 0.4675545394420624),
 ('Gyration_Air', 0.4626712501049042)]

In [22]:
try:
    model = gensim.models.KeyedVectors.load_word2vec_format(wv, binary=True)
except TypeError:
    print("ignored")
    



ignored


### vectorizing the words (word2vec)

In [24]:
def vectorize(list_of_docs, model):
    """Generate vectors for list of documents using a Word Embedding

    Args:
        list_of_docs: List of documents
        model: Gensim's Word Embedding

    Returns:
        List of document vectors
    """
    features = []

    for tokens in list_of_docs:
        zero_vector = np.zeros(model.vector_size)
        vectors = []
        for token in tokens:
            if token in model.wv:
                try:
                    vectors.append(model.wv[token])
                except KeyError:
                    continue
        if vectors:
            vectors = np.asarray(vectors)
            avg_vec = vectors.mean(axis=0)
            features.append(avg_vec)
        else:
            features.append(zero_vector)
    return features
    
vectorized_docs = vectorize(df_fin.articles_ner, model=model)
len(vectorized_docs), len(vectorized_docs[0])

(192367, 100)

### clustering the text with kmeans

In [30]:
def mbkmeans_clusters(X, 
    k, 
    mb, 
    print_silhouette_values, 
):
    """Generate clusters and print Silhouette metrics using MBKmeans

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches.
        print_silhouette_values: Print silhouette values per cluster.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(n_clusters=k, batch_size=mb).fit(X)
    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {silhouette_score(X, km.labels_):0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, km.labels_)
        print(f"Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[km.labels_ == i]
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        silhouette_values = sorted(
            silhouette_values, key=lambda tup: tup[2], reverse=True
        )
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
    return km, km.labels_

In [36]:
clustering, cluster_labels = mbkmeans_clusters(
X = vectorized_docs,
    k=5,
    mb=25000,
    print_silhouette_values=True,
)

For n_clusters = 5
Silhouette coefficient: 0.13
Inertia:10.943522789412539
Silhouette values:
    Cluster 2: Size:13561 | Avg:1.00 | Min:0.04 | Max: 1.00
    Cluster 4: Size:3151 | Avg:0.14 | Min:-0.23 | Max: 0.44
    Cluster 1: Size:77717 | Avg:0.13 | Min:-0.02 | Max: 0.30
    Cluster 0: Size:72009 | Avg:0.06 | Min:-0.05 | Max: 0.24
    Cluster 3: Size:25929 | Avg:-0.14 | Min:-0.39 | Max: 0.12


#### Top clusters are 2,4,1

In [34]:
df_clusters = pd.DataFrame({
    "text": df_fin.text,
    "cluster": cluster_labels
})

In [35]:
df_clusters

Unnamed: 0,text,cluster
0,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,1
1,Your Brain On Coronavirus\n\nA guide to the cu...,1
2,Mind Your Nose\n\nHow smell training can chang...,1
3,Passionate about the synergy between science a...,2
4,"You’ve heard of him, haven’t you? Phineas Gage...",1
...,...,...
192362,What could be more important than having a tid...,0
192363,Daily cleaning and maintenance of bedding\n\nW...,0
192364,The most important chore at the end is bond cl...,1
192365,How I Learned Romanian in 37 Easy Steps\n\nHey...,1
