In [63]:
from sklearn import datasets

categories = ["misc.forsale","sci.space","sci.electronics","soc.religion.christian"]
train_dict = {} 
test_dict = {} 
for cat in categories:
    train_dict[cat] = datasets.fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=[cat]).data
    test_dict[cat] = datasets.fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), categories=[cat]).data

In [64]:
import gensim

def tokenize(text, stopwords, max_len = 20):
    return [token for token in gensim.utils.simple_preprocess(text, max_len=max_len) if token not in stopwords]

tagged_train_dict = {} 
tagged_test_dict = {} 

offset = 0 
for k, v in train_dict.items():
    tagged_train_dict[k] = [gensim.models.doc2vec.TaggedDocument(tokenize(text, [], max_len=200), [i+offset]) for i, text in enumerate(v)]
    offset += len(v)

offset = 0
for k, v in test_dict.items():
    tagged_test_dict[k] = [tokenize(text, [], max_len=200) for i, text in enumerate(v)]
    offset += len(v)
    
corpus = [taggeddoc for taggeddoc_list in list(tagged_train_dict.values()) for taggeddoc in taggeddoc_list]
print(corpus)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



TFIDF for doc similarity


In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer 
tfidfV=TfidfVectorizer(max_features=100)
tfidfV.fit(tagged_train_dict)
tfidf_vectors=tfidfV.transform(tagged_train_dict)
tfidf_vectors.shape

(4, 8)

In [66]:
tfidf_vectors=tfidf_vectors.toarray()
print (tfidf_vectors[0])

[0.         0.         0.70710678 0.70710678 0.         0.
 0.         0.        ]


In [67]:
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
pairwise_similarities=np.dot(tfidf_vectors,tfidf_vectors.T)
pairwise_differences=euclidean_distances(tfidf_vectors)

In [68]:
print (tfidf_vectors[0])
print (pairwise_similarities.shape)
print (pairwise_similarities[0][:])

[0.         0.         0.70710678 0.70710678 0.         0.
 0.         0.        ]
(4, 4)
[1. 0. 0. 0.]


COSINE, Euclidean distance for doc similarity TFIDF


In [69]:
def most_similar(doc_id,similarity_matrix,matrix):
    if matrix=='Cosine Similarity':
        similar_ix=np.argsort(similarity_matrix[doc_id])[::-1]
    elif matrix=='Euclidean Distance':
        similar_ix=np.argsort(similarity_matrix[doc_id])
    print(similar_ix)
    for ix in similar_ix:
        if ix==doc_id:
            continue
        print('\n')
        print (f'{matrix} : {similarity_matrix[doc_id][ix]}')
        
#change the number to see the number of similar documents
most_similar(2,pairwise_similarities,'Cosine Similarity')
most_similar(2,pairwise_similarities,'Euclidean Distance')


[2 1 3 0]


Cosine Similarity : 0.38332232403179184


Cosine Similarity : 0.0


Cosine Similarity : 0.0
[0 3 1 2]


Euclidean Distance : 0.0


Euclidean Distance : 0.0


Euclidean Distance : 0.38332232403179184


Word2vector for similarity


In [70]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer=Tokenizer()
tokenizer.fit_on_texts(tagged_train_dict)
tokenized_documents=tokenizer.texts_to_sequences(tagged_train_dict)
tokenized_paded_documents=pad_sequences(tokenized_documents,maxlen=64,padding='post')
vocab_size=len(tokenizer.word_index)+1

print (tokenized_paded_documents)

[[2 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [6 7 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


BERT Model


In [None]:
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

Collecting sentence-transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 3.2 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 11.5 MB/s 
[?25hCollecting tokenizers>=0.10.3
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 38.8 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 25.7 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 404 kB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
document_embeddings = sbert_model.encode(list(tagged_train_dict))
pairwise_similarities = cosine_similarity(document_embeddings)
pairwise_differences = euclidean_distances(document_embeddings)
print(pairwise_differences)
print(pairwise_similarities)
cosine_bert = most_similar(3,pairwise_similarities,'Cosine Similarity')
most_similar(0,pairwise_differences,'Euclidean Distance')

[[ 0.       15.978504 15.331786 16.298649]
 [15.978504  0.       14.099411 18.80374 ]
 [15.331786 14.099411  0.       17.949936]
 [16.298649 18.80374  17.949936  0.      ]]
[[1.0000002  0.5106809  0.5459242  0.4771697 ]
 [0.5106809  1.0000002  0.65020376 0.36872968]
 [0.5459242  0.65020376 1.         0.42050904]
 [0.4771697  0.36872968 0.42050904 0.99999964]]
[3 0 2 1]


Cosine Similarity : 0.4771696925163269


Cosine Similarity : 0.4205090403556824


Cosine Similarity : 0.36872968077659607
[0 2 1 3]


Euclidean Distance : 15.331786155700684


Euclidean Distance : 15.978504180908203


Euclidean Distance : 16.298648834228516


GLOVE Embeddings:


In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip
!ls
!pwd

--2021-12-09 03:09:19--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-12-09 03:09:19--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-12-09 03:09:19--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-1

In [71]:
embeddings_index = dict()

with open('glove.6B.100d.txt') as file:
    for line in file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [72]:
embedding_matrix = np.zeros((vocab_size,100))

for word,i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [73]:
document_embeddings = np.zeros((len(tokenized_paded_documents),100))
words = tfidfV.get_feature_names()

for i in range(len(tagged_train_dict)):
    for j in range(len(words)):
        document_embeddings[i]+=embedding_matrix[tokenizer.word_index[words[j]]]*tfidf_vectors[i][j]
        
document_embeddings=document_embeddings/np.sum(tfidf_vectors,axis=1).reshape(-1,1)



In [74]:
# document_embeddings.shape
pairwise_similarities=cosine_similarity(document_embeddings)
pairwise_differences=euclidean_distances(document_embeddings)
most_similar(0,pairwise_similarities,'Cosine Similarity')
most_similar(0,pairwise_differences,'Euclidean Distance')


[0 2 1 3]


Cosine Similarity : 0.21010920695118707


Cosine Similarity : 0.08380337094280509


Cosine Similarity : 0.005325669148156862
[0 2 3 1]


Euclidean Distance : 4.574255621196699


Euclidean Distance : 4.618934603569525


Euclidean Distance : 4.978210975507444


DOC2VEC for similarity


In [75]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=30, min_count=2, epochs=40, window=2)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [76]:
metadata = {}
inferred_vectors_test = {} # Contains, category-wise, inferred doc vecs for each document in the test set
for cat, docs in tagged_test_dict.items():
    inferred_vectors_test[cat] = [model.infer_vector(doc) for doc in list(docs)]
    metadata[cat] = len(inferred_vectors_test[cat])

In [77]:
import csv

def write_to_csv(input, output_file, delimiter='\t'):
    with open(output_file, "w") as f:
        writer = csv.writer(f, delimiter=delimiter)
        writer.writerows(input)
        
veclist_metadata = []
veclist = []
for cat in train_dict.keys():
    for tag in [cat]*metadata[cat]:
        veclist_metadata.append([tag])
    for vec in inferred_vectors_test[cat]:
        veclist.append(list(vec))
write_to_csv(veclist, "doc2vec_20Newsgroups_vectors.csv")
write_to_csv(veclist_metadata, "doc2vec_20Newsgroups_vectors_metadata.csv")

In [78]:
import random
    
cat_id = {id:cat for id, cat in enumerate(categories)} # Give each category a numerical id
test_doc_pairs = {tuple(sorted([id,id2])):[] for id in cat_id for id2 in cat_id}
for pair_id in test_doc_pairs:
    if pair_id[0] == pair_id[1]:
        test_doc_pairs[pair_id] = [(doc, tagged_test_dict[cat_id[pair_id[0]]][i]) for doc_index, doc in enumerate(list(tagged_test_dict[cat_id[pair_id[0]]])) for i in range(doc_index+1, len(list(tagged_test_dict[cat_id[pair_id[0]]])))]
    else:
        test_doc_pairs[pair_id] = [(doc, doc2) for doc in list(tagged_test_dict[cat_id[pair_id[0]]]) for doc2 in list(tagged_test_dict[cat_id[pair_id[1]]])]
similarities_test = {pair_id:[] for pair_id in test_doc_pairs}
for id in cat_id:
    for id2 in cat_id:
        similarities_test[tuple(sorted([id, id2]))] = [model.docvecs.similarity_unseen_docs(model, pair[0], pair[1]) for pair in random.sample(test_doc_pairs[tuple(sorted([id,id2]))],len(test_doc_pairs[tuple(sorted([id,id2]))]))[:500]] # Create a similarity list of selected pairs

In [79]:
for id in cat_id:
    main_avg_vec_sim = 0
    avg_vec_sims = []
    for pair_id, pair_sim_list in similarities_test.items():
        if id in pair_id:
            if pair_id[0] == pair_id[1]:
                main_avg_vec_sim = sum(pair_sim_list)/len(pair_sim_list)
            else:
                avg_vec_sims.append(sum(pair_sim_list)/len(pair_sim_list))
    mean_diff = sum([main_avg_vec_sim - x for x in avg_vec_sims]) / (len(categories)-1)
    print("Category: {}".format(cat_id[id]))
    print("\tMean difference: {:.2}, Same-category average similarity: {:.2}".format(mean_diff, main_avg_vec_sim))

Category: misc.forsale
	Mean difference: 0.2, Same-category average similarity: 0.46
Category: sci.space
	Mean difference: 0.089, Same-category average similarity: 0.33
Category: sci.electronics
	Mean difference: 0.093, Same-category average similarity: 0.35
Category: soc.religion.christian
	Mean difference: 0.28, Same-category average similarity: 0.45
