In [1]:
import pandas as pd
import numpy as np
from nltk.cluster import KMeansClusterer
import nltk
from sentence_transformers import SentenceTransformer

In [2]:
data = pd.read_csv('../../data/paragraph_clean_data.csv',dtype={'article_text':'string'})

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79 entries, 0 to 78
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  79 non-null     int64 
 1   paragraph   79 non-null     object
dtypes: int64(1), object(1)
memory usage: 1.4+ KB


In [4]:
encoder_model = SentenceTransformer('all-MiniLM-L6-v2')
# Utility function for generating sentence embedding from the text
def get_embeddinngs(text):
    return encoder_model.encode(text)


In [5]:

# Generating sentence embedding from the text
data['embeddings'] = data['paragraph'].apply(get_embeddinngs)

In [12]:
X = np.array(data['embeddings'].tolist())

In [26]:


def clustering_news(data,NUM_CLUSTERS = 15):

    X = np.array(data['embeddings'].tolist())

    nltk_cluster_model = KMeansClusterer(
        NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance,
        repeats=25,avoid_empty_clusters=True)

    assigned_clusters = nltk_cluster_model.cluster(X, assign_clusters=True)

    data['cluster'] = pd.Series(assigned_clusters, index=data.index)
    data['centroid'] = data['cluster'].apply(lambda x: nltk_cluster_model.means()[x])

    return data,nltk_cluster_model


 

In [27]:
clust_data,nltk_cluster_model = clustering_news(data,7)

In [28]:
type(clust_data)

pandas.core.frame.DataFrame

In [29]:
clust_data.head()

Unnamed: 0.1,Unnamed: 0,paragraph,embeddings,cluster,centroid
0,0,The TESLA logo is seen outside a dealership in...,"[-0.027433814, 0.014448788, 0.050572325, 0.063...",3,"[-0.034090396, -0.0013210782, 0.045484196, 0.0..."
1,1,Business Tesla weighs China sales reset Posted...,"[0.0013775178, 0.0052929916, 0.052649338, 0.03...",3,"[-0.034090396, -0.0013210782, 0.045484196, 0.0..."
2,2,A Jeep logo is seen on a car at a showroom of ...,"[-0.10454293, 0.026172081, 0.005620138, -0.030...",0,"[-0.026943447, 0.015383076, 0.028313272, 0.013..."
3,3,Register now for FREE unlimited access to Reut...,"[-0.04478642, 0.008372527, 0.050375167, 0.0615...",1,"[-0.04939384, 0.016903222, 0.010598234, 0.0331..."
4,4,A view of Tesla Inc is U S vehicle factory in ...,"[-0.038692232, 0.09202919, -0.012038713, 0.040...",4,"[-0.04657135, 0.057493903, 0.040230125, 0.0185..."


In [31]:
clust_data[clust_data['cluster']==4]

Unnamed: 0.1,Unnamed: 0,paragraph,embeddings,cluster,centroid
4,4,A view of Tesla Inc is U S vehicle factory in ...,"[-0.038692232, 0.09202919, -0.012038713, 0.040...",4,"[-0.04657135, 0.057493903, 0.040230125, 0.0185..."
6,6,Register now for FREE unlimited access to Reut...,"[-0.0505838, 0.06705427, 0.06424659, 0.0210842...",4,"[-0.04657135, 0.057493903, 0.040230125, 0.0185..."
7,7,The logo of car manufacturer Tesla is seen at ...,"[-0.049751263, 0.041635334, 0.0655589, 0.05343...",4,"[-0.04657135, 0.057493903, 0.040230125, 0.0185..."
8,8,A Tesla logo is seen on a wheel rim during the...,"[-0.062302493, 0.03082036, 0.046739396, -0.030...",4,"[-0.04657135, 0.057493903, 0.040230125, 0.0185..."
9,9,Register now for FREE unlimited access to Reut...,"[-0.04074379, 0.002250556, 0.045334995, 0.0202...",4,"[-0.04657135, 0.057493903, 0.040230125, 0.0185..."
64,65,Register now for FREE unlimited access to Reut...,"[-0.055110168, 0.033948712, 0.019066695, 0.029...",4,"[-0.04657135, 0.057493903, 0.040230125, 0.0185..."
65,66,The logo of car manufacturer Tesla is seen at ...,"[-0.02881586, 0.1347212, 0.05270187, -0.005016...",4,"[-0.04657135, 0.057493903, 0.040230125, 0.0185..."


In [32]:
import pickle
pickle.dump(nltk_cluster_model, open("../models/nltk_cluster_model.pkl", "wb"))

In [34]:
# Inference Code
saved_model = pickle.load(open('../models/nltk_cluster_model.pkl', 'rb'))
test_text = "Tesla is in Austin,Texas.Tesla CEO is Elon Musk.The logo of car manufacturer Tesla is seen recently"
vector = get_embeddinngs(test_text)
print(saved_model.classify(vector))

4
