In [1]:
#Importing the library and loading the file

In [2]:
from tensorflow import keras
import pandas as pd
import numpy as np
from bpemb import BPEmb
import tensorflow as tf
import pickle

In [3]:
pwd

'/home/rahul/Oriserve'

In [4]:
# Load the preprocessed corpus, then drop every column whose name matches "Unnamed".
corpus = pd.read_csv("/home/rahul/Oriserve/preprocessed_data")
corpus = corpus.drop(columns=corpus.filter(regex="Unnamed").columns)

In [5]:
# Work on the first 3000 rows only.
corpus = corpus.iloc[:3000]

In [6]:
# Replace missing texts with empty strings. `corpus` is a row slice of the loaded
# frame at this point, so writing a column into it triggers pandas'
# SettingWithCopyWarning; assign() builds a new frame instead and leaves the
# cell safe to re-run.
corpus = corpus.assign(Text=corpus["Text"].fillna(""))

In [7]:
corpus.head(3)

Unnamed: 0,Text
0,moeller student run newspaper the crusader con...
1,in the crusader first place second highest hon...
2,the squire student literary journal features s...


In [8]:
# Plain Python list of the texts, in corpus row order.
training_data = corpus["Text"].tolist()

In [9]:
#Bpemb pretrained word embeddings

In [10]:
# Load the pretrained English BPEmb model, requested with vs=50000 and dim=100
# (presumably vocabulary size and embedding dimension - confirm against the BPEmb docs).
# The 100 here has to match the last entry of the input_shape given to rnn.build().
bpemb_en = BPEmb(lang="en", vs=50000, dim=100)

In [11]:
# Embed every text with BPEmb. The per-text results differ in length, so they are
# kept in a plain list: wrapping ragged arrays in np.array() is deprecated and
# raises ValueError on NumPy >= 1.24. Everything downstream only needs len(),
# iteration and positional indexing, which a list provides.
sentences = [np.asarray(bpemb_en.embed(text)) for text in training_data]

  """Entry point for launching an IPython kernel.


In [12]:
# SimpleRNN with 3 units: input kernel initialised to ones, recurrent kernel
# initialised to zeros, tanh activation.
rnn = keras.layers.SimpleRNN(3, kernel_initializer=keras.initializers.ones, recurrent_initializer=keras.initializers.zeros, activation="tanh")
# Length of the longest embedded sentence; in the code shown it is only used to
# give build() a concrete sequence length.
max_len = max(map(len, sentences))
# Build the layer now so its weights exist before calculate_distances() calls
# get_weights()/set_weights() on it. The trailing 100 is the embedding width.
rnn.build(input_shape=(1,max_len,100))

In [13]:
#Create an RNN that turns each sentence's sequence of embeddings into a new vector by summing over the embedding dimensions

In [14]:
def calculate_distances(sentences):
    """Return the pairwise absolute distances between RNN sentence encodings.

    Every embedded sentence is run through the module-level ``rnn`` after the
    layer's recurrent kernel and bias have been set to ``1 / len(sentence)``
    (the input kernel is left as it is). Only the first output unit of the
    layer is kept as the sentence's scalar encoding.

    Encodings are stored by position rather than keyed by sentence text, so
    duplicate texts keep their own row and column, and index ``k`` of the result
    always refers to ``sentences[k]``.

    Args:
        sentences: sequence of 2-D arrays, one per text, each holding that
            text's embedding vectors (one row per token).

    Returns:
        A list of ``len(sentences)`` lists of ``numpy.float32``; entry
        ``[a][b]`` is ``abs(encoding_a - encoding_b)``.
    """
    encodings = np.zeros(len(sentences), dtype=np.float32)
    for position, embedded in enumerate(sentences):
        token_count = len(embedded)
        if token_count == 0:
            # An empty text has no timesteps and 1 / len is undefined; its
            # encoding is left at 0.0 instead of raising ZeroDivisionError.
            continue
        scale = 1 / token_count
        rnn.set_weights([
            rnn.get_weights()[0],                       # input kernel, unchanged
            np.full((3, 3), scale, dtype=np.float32),   # recurrent kernel
            np.full((3,), scale, dtype=np.float32),     # bias
        ])
        output = rnn(np.array([embedded]))              # batch of one sentence
        encodings[position] = float(np.asarray(output)[0, 0])
    # Broadcasting builds the whole |e_a - e_b| matrix in one step instead of
    # n**2 individual tensor subtractions.
    pairwise = np.absolute(encodings[:, None] - encodings[None, :])
    return [list(row) for row in pairwise]

In [15]:
# Pairwise distance matrix over all embedded sentences; the work grows
# quadratically with the number of sentences.
d = calculate_distances(sentences)

In [16]:
# Persist the distance matrix to disk using the newest pickle protocol.
with open("distance.pickle", "wb") as pickle_file:
    pickle.dump(d, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [17]:
# Turn the nested list of distances into a 2-D NumPy array.
d = np.asarray(d)

In [19]:
#Clustering using the threshold value

In [20]:
import torch
from torch import Tensor, device

In [21]:
def cos_sim(a: Tensor, b: Tensor):
    """Cosine similarity between every row of ``a`` and every row of ``b``.

    Inputs that are not tensors are converted with ``torch.tensor``; a 1-D
    input is treated as a single row.

    Returns:
        A matrix whose entry ``[i][j]`` is ``cos_sim(a[i], b[j])``.
    """
    def _as_matrix(value):
        # Convert to a tensor if needed and promote a vector to a 1-row matrix.
        if not isinstance(value, torch.Tensor):
            value = torch.tensor(value)
        return value.unsqueeze(0) if len(value.shape) == 1 else value

    rows_a, rows_b = _as_matrix(a), _as_matrix(b)
    # After L2-normalising the rows, the dot products are the cosine similarities.
    unit_a = torch.nn.functional.normalize(rows_a, p=2, dim=1)
    unit_b = torch.nn.functional.normalize(rows_b, p=2, dim=1)
    return torch.mm(unit_a, unit_b.transpose(0, 1))

In [22]:
def cluster_detection(embeddings, threshold, min_community_size, batch_size=32):
    """Group rows of ``embeddings`` into non-overlapping clusters by cosine similarity.

    Step 1: for each row, if its ``min_community_size``-th best cosine score
    against all rows reaches ``threshold``, collect every row scoring at least
    ``threshold`` as a candidate cluster.
    Step 2: walk the candidates from largest to smallest and keep only members
    not already claimed by an earlier cluster; a candidate survives if at least
    ``min_community_size`` members remain.

    Args:
        embeddings: 2-D tensor (or anything ``torch.tensor`` accepts), one row per item.
        threshold: minimum cosine similarity for membership.
        min_community_size: minimum cluster size (capped at the number of rows).
        batch_size: number of rows scored against all rows at a time.

    Returns:
        A list of clusters, largest first; each cluster is a list of row indices.
    """
    if not isinstance(embeddings, torch.Tensor):
        embeddings = torch.tensor(embeddings)
    n_items = len(embeddings)
    threshold = torch.tensor(threshold, device=embeddings.device)

    min_community_size = min(min_community_size, n_items)
    # How many top scores to fetch per row; grows (and stays grown) when a row
    # turns out to have more neighbours above the threshold than this.
    sort_max_size = min(max(2 * min_community_size, 50), n_items)

    candidates = []
    for batch_start in range(0, n_items, batch_size):
        batch_scores = cos_sim(embeddings[batch_start:batch_start + batch_size], embeddings)
        best_scores, _ = batch_scores.topk(k=min_community_size, largest=True)
        for row, row_best in enumerate(best_scores):
            # The last of the top-k scores is the k-th best; skip the row
            # unless even that one reaches the threshold.
            if not (row_best[-1] >= threshold):
                continue
            scores, indices = batch_scores[row].topk(k=sort_max_size, largest=True)
            while scores[-1] > threshold and sort_max_size < n_items:
                sort_max_size = min(2 * sort_max_size, n_items)
                scores, indices = batch_scores[row].topk(k=sort_max_size, largest=True)
            members = []
            for member, score in zip(indices.tolist(), scores):
                if score < threshold:
                    break  # scores are sorted descending, nothing further qualifies
                members.append(member)
            candidates.append(members)
        del batch_scores

    # Largest candidates claim their members first.
    candidates = sorted(candidates, key=len, reverse=True)
    unique_communities = []
    claimed = set()
    for candidate in candidates:
        remaining = [member for member in sorted(candidate) if member not in claimed]
        if len(remaining) >= min_community_size:
            unique_communities.append(remaining)
            claimed.update(remaining)
    return sorted(unique_communities, key=len, reverse=True)

In [56]:
# NOTE(review): `d` is the pairwise distance matrix, so each sentence's row of
# distances is what gets compared by cosine similarity here - confirm that this,
# rather than clustering the sentence encodings themselves, is intended.
# NOTE(review): the execution counter jumps from 22 to 56 at this cell; re-run
# from a fresh kernel to confirm the results reproduce.
clusters = cluster_detection(d, min_community_size=1, threshold=0.95)

In [57]:
#Results of clustering

In [58]:
# Print a header per cluster followed by up to five of its sentences.
for cluster_number, members in enumerate(clusters, start=1):
    print("\nCluster {}, #{} Elements ".format(cluster_number, len(members)))
    for sentence_id in members[:5]:
        print("\t", training_data[sentence_id])


Cluster 1, #1501 Elements 
	 moeller student run newspaper the crusader consistently recognized one top region
	 paul keels play play announcer ohio state university football basketball wbns fm
	 joe uecker ohio state senator r
	 tony hunter professional football player buffalo bills los angeles rams
	 jack norris president co founder vegan outreach

Cluster 2, #1407 Elements 
	 in the crusader first place second highest honor high school newspaper receive american scholastic press association ref
	 the squire student literary journal features stories poems essays written moeller students
	 rico murray undrafted free agent signee cincinnati bengals
	 moeller high school opened doors september along la salle high school fellow cincinnati archdiocesan school
	 when opened doors moeller high school received students parishes northeastern part greater cincinnati area drawing roger bacon high school purcell marian high school two male comprehensive cincinnati archdiocesan schools

Cluster 

In [59]:
# Shallow copy of the clusters, in the same order.
cluster_list = list(clusters)

In [60]:
# Flatten the clusters into [cluster_no, sentence] rows. The member index is
# named sentence_id instead of `id`, which as a notebook-level loop variable
# would shadow the builtin id() for every later cell.
sen = [
    [cluster_no, training_data[sentence_id]]
    for cluster_no, cluster in enumerate(clusters)
    for sentence_id in cluster
]

In [61]:
# One row per clustered sentence, labelled with its zero-based cluster number.
df = pd.DataFrame(data=sen, columns=["cluster_no", "sentences"])
df.head(20)

Unnamed: 0,cluster_no,sentences
0,0,moeller student run newspaper the crusader con...
1,0,paul keels play play announcer ohio state univ...
2,0,joe uecker ohio state senator r
3,0,tony hunter professional football player buffa...
4,0,jack norris president co founder vegan outreach
5,0,archbishop moeller high school established fal...
6,0,starting freshman class moeller high school ad...
7,0,the lacrosse team two state titles constant fo...
8,0,this say circular flow diagram useful understa...
9,0,the circular flow income concept better unders...


In [62]:
#Saving csv file of cluster

In [63]:
# Write the clustered sentences to the working directory. index=False is not
# passed, so the DataFrame index is written as an extra, unnamed first column.
df.to_csv('bpemb_clustering.csv')