### Importing the library and loading the file

In [None]:
!pip install autoreload

In [2]:
%matplotlib inline
from random import randint
import numpy as np
import torch
import shutil
import string
import nltk.data
import pandas as pd
import pickle

In [3]:
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
corpus = pd.read_csv("/content/preprocessed_data")
corpus.drop(corpus.filter(regex="Unnamed"),axis=1, inplace=True)

In [5]:
corpus = corpus[:20000]
corpus['Text'] = corpus['Text'].replace(np.nan, '')
corpus.head(3)

Unnamed: 0,Text
0,moeller student run newspaper the crusader con...
1,in the crusader first place second highest hon...
2,the squire student literary journal features s...


In [6]:
training_data = corpus["Text"]
training_data = list(training_data)

### Loading infersent model and embedding

In [34]:
model_version = 1
MODEL_PATH = "/content/gdrive/MyDrive/infersent1.pkl" 
W2V_PATH = '/content/gdrive/MyDrive/glove.840B.300d.txt'
VOCAB_SIZE = 1e5  

In [35]:
from models import InferSent
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))

<All keys matched successfully>

In [36]:
model.set_w2v_path(W2V_PATH)
model.build_vocab_k_words(K=VOCAB_SIZE)

Vocab size : 100000.0


In [37]:
#embeddings = model.encode(training_data, bsize=128, tokenize=False, verbose=True)
#print('nb sentences encoded : {0}'.format(len(embeddings)))

In [38]:
#with open('infersent_sentence_embeddings.pickle', 'wb') as handle:
#    pickle.dump(embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [39]:
with open('/content/gdrive/MyDrive/infersent_sentence_embeddings.pickle', 'rb') as handle:
    embeddings = pickle.load(handle)

In [40]:
#embeddings

### Clustering using the threshold value

In [26]:
import torch
from torch import Tensor, device

In [27]:
def cos_sim(a: Tensor, b: Tensor):
    if not isinstance(a, torch.Tensor):
        a = torch.tensor(a)
    if not isinstance(b, torch.Tensor):
        b = torch.tensor(b)
    if len(a.shape) == 1:
        a = a.unsqueeze(0)
    if len(b.shape) == 1:
        b = b.unsqueeze(0)
    a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
    b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
    return torch.mm(a_norm, b_norm.transpose(0, 1))

In [28]:
def cluster_detection(embeddings, threshold, min_community_size, batch_size=32):
    if not isinstance(embeddings, torch.Tensor):
        embeddings = torch.tensor(embeddings)
    threshold = torch.tensor(threshold, device=embeddings.device)
    extracted_communities = []
    min_community_size = min(min_community_size, len(embeddings))
    sort_max_size = min(max(2 * min_community_size, 50), len(embeddings))
    for start_idx in range(0, len(embeddings), batch_size):
        cos_scores = cos_sim(embeddings[start_idx:start_idx + batch_size], embeddings)
        top_k_values, _ = cos_scores.topk(k=min_community_size, largest=True)
        for i in range(len(top_k_values)):
            if top_k_values[i][-1] >= threshold:
                new_cluster = []
                top_val_large, top_idx_large = cos_scores[i].topk(k=sort_max_size, largest=True)
                while top_val_large[-1] > threshold and sort_max_size < len(embeddings):
                    sort_max_size = min(2 * sort_max_size, len(embeddings))
                    top_val_large, top_idx_large = cos_scores[i].topk(k=sort_max_size, largest=True)
                for idx, val in zip(top_idx_large.tolist(), top_val_large):
                    if val < threshold:
                        break
                    new_cluster.append(idx)
                extracted_communities.append(new_cluster)
        del cos_scores
    extracted_communities = sorted(extracted_communities, key=lambda x: len(x), reverse=True)
    unique_communities = []
    extracted_ids = set()
    for cluster_id, community in enumerate(extracted_communities):
        community = sorted(community)
        non_overlapped_community = []
        for idx in community:
            if idx not in extracted_ids:
                non_overlapped_community.append(idx)
        if len(non_overlapped_community) >= min_community_size:
            unique_communities.append(non_overlapped_community)
            extracted_ids.update(non_overlapped_community)
    unique_communities = sorted(unique_communities, key=lambda x: len(x), reverse=True)
    return unique_communities

In [25]:
clusters = cluster_detection(embeddings, min_community_size=5, threshold=0.65)

### Results of clustering

In [44]:
for i, cluster in enumerate(clusters):
    print("\nCluster {}, #{} Elements ".format(i+1, len(cluster)))
    for sentence_id in cluster[0:3]:
        print("\t", training_data[sentence_id])


Cluster 1, #11082 Elements 
	 moeller student run newspaper the crusader consistently recognized one top region
	 in the crusader first place second highest honor high school newspaper receive american scholastic press association ref
	 the squire student literary journal features stories poems essays written moeller students

Cluster 2, #1064 Elements 
	 tony hunter professional football player buffalo bills los angeles rams
	 jack norris president co founder vegan outreach
	 in government sector the leakage government sector provides collection revenue taxes t provided households firms government

Cluster 3, #336 Elements 
	 measurement national income national income estimation aggregation economic activity circular flow
	 households spend income y goods services consumption c
	 real flow exchange goods services household firms whereas money flow monetary exchange two sectors

Cluster 4, #313 Elements 
	 knowledge interdependence circular flow income signifies interdependence activ

In [45]:
cluster_list = []
for i, cluster in enumerate(clusters):
  cluster_list.append(cluster)

In [46]:
sen = []
for i, cluster in enumerate(clusters):
  for id in cluster[:]:
    sen.append([i, training_data[id]])

In [47]:
df=pd.DataFrame(sen,columns=['cluster_no','sentences'])
df.head(20)

Unnamed: 0,cluster_no,sentences
0,0,moeller student run newspaper the crusader con...
1,0,in the crusader first place second highest hon...
2,0,the squire student literary journal features s...
3,0,archbishop moeller high school established fal...
4,0,when opened doors moeller high school received...
5,0,quite number countermeasures installed counter...
6,0,the moeller art program nationally recognized ...
7,0,the lacrosse team two state titles constant fo...
8,0,moeller high school athletic teams exception b...
9,0,recently moeller unveiled plans build multipur...


### Saving csv file of cluster

In [48]:
df.to_csv('infersent_clustering.csv')

### Sentence Embeddings using Siamese BERT-Networks

In [10]:
import tensorflow_hub as hub
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences



In [None]:
!pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [53]:
#sentence_embeddings = model.encode(training_data)

In [54]:
#with open('sentence_embeddings_bert.pickle', 'wb') as handle:
#    pickle.dump(sentence_embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [55]:
with open('/content/gdrive/MyDrive/sentence_embeddings_bert.pickle', 'rb') as handle:
    sentence_embeddings = pickle.load(handle)

In [56]:
print('Sample BERT embedding vector - length', len(sentence_embeddings[0]))

Sample BERT embedding vector - length 768


In [58]:
clusters1 = cluster_detection(sentence_embeddings, min_community_size=5, threshold=0.65)

### Results of clustering

In [59]:
for i, cluster in enumerate(clusters1):
    print("\nCluster {}, #{} Elements ".format(i+1, len(cluster)))
    for sentence_id in cluster[0:3]:
        print("\t", training_data[sentence_id])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Cluster 253, #14 Elements 
	 afm probes manufactured mems technology
	 system environment mode feature as used run ocl rpg ii programs os
	 disk space system organized blocks

Cluster 254, #14 Elements 
	 chapters collected heading from side action takes place argentina
	 horacio returns buenos aires stopping la maga hometown montevideo way
	 south american explorers founded journalist donald james montague south american explorers club

Cluster 255, #14 Elements 
	 a complete khmer sentence consists four basic elements include optional topic optional subject obligatory predicate various adverbials particles ref
	 passing four digit code assembler routine returned associated text
	 there four different ways contracts set aside

Cluster 256, #13 Elements 
	 her second volume cables rage mainly written tenure poet residence tougaloo college mississippi addressed themes love betrayal childbirth complexities raising children

In [60]:
cluster_list1 = []
for i, cluster in enumerate(clusters1):
  cluster_list1.append(cluster)

In [61]:
sen1 = []
for i, cluster in enumerate(clusters1):
  for id in cluster[:]:
    sen1.append([i, training_data[id]])

In [62]:
df1=pd.DataFrame(sen1,columns=['cluster_no','sentences'])

### Saving the csv file of clustering

In [63]:
df1.to_csv('cluster_using_Siamese_BERT-Networks.csv')

### Sentence Embeddings Elmo, loading the model

In [17]:
corpus1 = corpus[:8000]
corpus1['Text'] = corpus1['Text'].replace(np.nan, '')
corpus1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Text
0,moeller student run newspaper the crusader con...
1,in the crusader first place second highest hon...
2,the squire student literary journal features s...
3,paul keels play play announcer ohio state univ...
4,joe uecker ohio state senator r
...,...
7995,the presence kingdom
7996,false presence kingdom
7997,ex g se des nouveaux lieux communs
7998,the politics god politics man


In [18]:
training_data1 = corpus1["Text"]
training_data1 = list(training_data1)

In [19]:
elmo_url = "https://tfhub.dev/google/elmo/3"
embed = hub.KerasLayer(elmo_url)

In [20]:
embedded_lines = []
for line in corpus1.values:
    embedded_line = embed(np.array(line))
    embedded_lines.append(embedded_line)

In [None]:
embedded_lines

In [22]:
len(embedded_lines) 

8000

In [23]:
with open('sentence_embeddings_elmo.pickle', 'wb') as handle:
    pickle.dump(embedded_lines, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [24]:
with open('sentence_embeddings_elmo.pickle', 'rb') as handle:
    embedded_lines = pickle.load(handle)

In [34]:
embedded_lines = np.vstack(embedded_lines)

In [35]:
embedded_lines = np.array(embedded_lines)

### Clustering results

In [36]:
clusters2 = cluster_detection(embedded_lines, min_community_size=5, threshold=0.65)

In [37]:
for i, cluster in enumerate(clusters2):
    print("\nCluster {}, #{} Elements ".format(i+1, len(cluster)))
    for sentence_id in cluster[0:3]:
        print("\t", training_data[sentence_id])


Cluster 1, #2522 Elements 
	 paul keels play play announcer ohio state university football basketball wbns fm
	 joe uecker ohio state senator r
	 rico murray undrafted free agent signee cincinnati bengals

Cluster 2, #621 Elements 
	 moeller high school athletic teams exception boxing lacrosse skiing volleyball sanctioned ohio high school athletic association ohsaa compete greater catholic league south along elder st xavier la salle high schools
	 the watts family one central families introduced viewers eastenders began created show co creators tony holland julia smith
	 the callen lorde community health center organization new york city named michael callen audre lorde dedicated providing medical health care city lgbt population without regard ability pay

Cluster 3, #207 Elements 
	 quite number countermeasures installed counteract misuse computers including web filtering monitor students use computers
	 the abstraction ignores linear throughput matter energy must power continuous m

In [38]:
cluster_list2 = []
for i, cluster in enumerate(clusters2):
  cluster_list2.append(cluster)

In [39]:
sen2 = []
for i, cluster in enumerate(clusters2):
  for id in cluster[:]:
    sen2.append([i, training_data[id]])

In [40]:
df2=pd.DataFrame(sen2,columns=['cluster_no','sentences'])

In [41]:
df2.to_csv('cluster_using_Elmo-Networks.csv')