<a href="https://colab.research.google.com/github/thomouvic/txtanalytics/blob/main/sim_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget https://github.com/thomouvic/txtanalytics/raw/main/txts.zip
!unzip -q txts.zip

--2023-01-22 15:40:00--  https://github.com/thomouvic/txtanalytics/raw/main/txts.zip
Resolving github.com (github.com)... 192.30.255.112
Connecting to github.com (github.com)|192.30.255.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/thomouvic/txtanalytics/main/txts.zip [following]
--2023-01-22 15:40:01--  https://raw.githubusercontent.com/thomouvic/txtanalytics/main/txts.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3274958 (3.1M) [application/zip]
Saving to: ‘txts.zip’


2023-01-22 15:40:01 (65.1 MB/s) - ‘txts.zip’ saved [3274958/3274958]



In [2]:
path = 'txts'

In [3]:
from pathlib import Path

all_txt_files =[]
for file in Path(f"./{path}/").rglob("*.txt"):
  all_txt_files.append(file.parent / file.name)

n_files = len(all_txt_files)
print(n_files)

130


In [4]:
# A pattern I use often is to load the text files into a pandas dataframe. 
# This is way I get consistent docid's of documents by using the dataframe index. 
# Also, we can later add useful columns to the dataframe, such as 'tokens', see later cells.  

import pandas as pd

# Initialize an empty list to store the data
data = []

# Loop through the file names
for filename in all_txt_files:
    # Read the file into a string
    with open(filename, 'r') as f:
        text = f.read()
    
    # Append the string to the list
    data.append( (filename, text) )

# Create a dataframe from the list of strings
docdf = pd.DataFrame(data, columns=['filename', 'text'])

# Only keep those docs with keywords
docdf = docdf[ docdf['text'].str.contains('Keywords:', case=False) ]
docdf.reset_index(drop=True, inplace=True)
docdf['docid'] = docdf.index
docdf

Unnamed: 0,filename,text,docid
0,txts/Siemens-and-The-INKE-Research-Group-2019-...,"Siemens, Lynne and INKE Research Group. 2019. ...",0
1,txts/Siemens-and-Shawver-2009-New-Paths-for-Co...,New Paths for Computing Humanists\n\n1 of 21\n...,1
2,txts/Robinson-and-Saklofske-2017-Connecting-th...,Connecting the dots: Integrating modular netwo...,2
3,txts/Arbuckle-and-Maxwell-2019-Modelling-Open-...,"Arbuckle, Alyssa and John Maxwell. 2019. Model...",3
4,txts/Siemens-et-al-2009-Implementing-New-Knowl...,"See discussions, stats, and author profiles fo...",4
5,txts/Arbuckle-et-al-2019-Introduction-Beyond-O...,"Arbuckle, Alyssa, et al. 2019. Introduction, B...",5
6,txts/Siemens-2016-Faster-Alone-Further-Togethe...,"Faster Alone, Further Together: Reflections on...",6
7,txts/Arbuckle_KULA_OpenPlus_02-27-19.txt,"Arbuckle, Alyssa. 2019. Open+: Versioning Open...",7
8,txts/Siemens-2005-Text-Analysis-and-the-Dynami...,Text Analysis and the Dynamic Edition? A Worki...,8
9,txts/El-Khatib-et-al-2019-Foundations-for-On-C...,"El Khatib, Randa, et al. 2019. Foundations for...",9


In [5]:
# Given a text, extract the "Keyword:" line
def get_keyword_line(text):
  matched_lines = [line for line in text.split('\n') if 'keywords:' in line.lower()]
  return matched_lines[0]

docdf['keywords'] = docdf['text'].apply(get_keyword_line)
docdf

Unnamed: 0,filename,text,docid,keywords
0,txts/Siemens-and-The-INKE-Research-Group-2019-...,"Siemens, Lynne and INKE Research Group. 2019. ...",0,Keywords: collaboration; project management; INKE
1,txts/Siemens-and-Shawver-2009-New-Paths-for-Co...,New Paths for Computing Humanists\n\n1 of 21\n...,1,"Keywords: Ian Lancashire, bibliography, biogra..."
2,txts/Robinson-and-Saklofske-2017-Connecting-th...,Connecting the dots: Integrating modular netwo...,2,"Keywords: Narrative, Networks, Modularity, Dig..."
3,txts/Arbuckle-and-Maxwell-2019-Modelling-Open-...,"Arbuckle, Alyssa and John Maxwell. 2019. Model...",3,Keywords: open access; open scholarship; schol...
4,txts/Siemens-et-al-2009-Implementing-New-Knowl...,"See discussions, stats, and author profiles fo...",4,"Keywords: Text, Interface, Prototype, Reading,..."
5,txts/Arbuckle-et-al-2019-Introduction-Beyond-O...,"Arbuckle, Alyssa, et al. 2019. Introduction, B...",5,Keywords: open social scholarship; scholarly c...
6,txts/Siemens-2016-Faster-Alone-Further-Togethe...,"Faster Alone, Further Together: Reflections on...",6,Keywords: Collaboration; Networked scholarship...
7,txts/Arbuckle_KULA_OpenPlus_02-27-19.txt,"Arbuckle, Alyssa. 2019. Open+: Versioning Open...",7,Keywords: open scholarship; open access; commu...
8,txts/Siemens-2005-Text-Analysis-and-the-Dynami...,Text Analysis and the Dynamic Edition? A Worki...,8,"Keywords: Electronic editing, text analysis, T..."
9,txts/El-Khatib-et-al-2019-Foundations-for-On-C...,"El Khatib, Randa, et al. 2019. Foundations for...",9,Keywords: social knowledge creation; open soci...


## TF-IDF Similarity

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

tfidf = TfidfVectorizer(stop_words=stopwords, min_df=2)
dt_tfidf = tfidf.fit_transform(docdf["text"])
dt_tfidf



<22x7956 sparse matrix of type '<class 'numpy.float64'>'
	with 35249 stored elements in Compressed Sparse Row format>

In [7]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# dt is a document matrix; each row is a document vector
# It returns a (docid1, docid2, score) dataframe of triples.
def sims(dt):
  r = cosine_similarity(dt, dt, dense_output=False)
  m = r.tocoo()
  tuples = zip(m.row, m.col, m.data)
  df = pd.DataFrame(tuples, columns =['docid1', 'docid2', 'score'])
  df

  # Filtering out duplicates. There are quite a few duplicates in the dataset. 
  dup_pairs = df[ (0.99 < df['score']) & (df['docid1']<df['docid2']) ] 
  # print( dup_pairs )
  dup_list = dup_pairs['docid2'].tolist()
  # print(dup_list)
  df = df[~df['docid1'].isin(dup_list)]
  df = df[~df['docid2'].isin(dup_list)]
  # dup_pairs = df[ (0.99 < df['score']) & (df['docid1']<df['docid2']) ]
  # print( dup_pairs )

  # A: Let's keep only rows with similarity greater than a threshold. 
  # A: The threshold is chosen so that we have an average degree of not more than 10.
  # A: We set the threshold so that each document has on average about 30 neighbors wrt similarity
  # A: Also, let's remove rows with similarity close to 1. These are self-similarities. 
  threshold = 0.18
  df = df[ threshold <= df['score'] ]
  df = df[ 0.99 > df['score'] ]
  df.reset_index(drop=True, inplace=True)
  df = df[ df['docid1'] < df['docid2'] ]
  return df.sort_values('score', ascending=False).head(10)


df_tfidf = sims(dt_tfidf)
df_tfidf

Unnamed: 0,docid1,docid2,score
103,11,13,0.780588
2,0,6,0.761722
80,7,18,0.645967
57,5,7,0.62605
30,3,7,0.620732
26,3,5,0.607011
58,5,18,0.591632
4,0,19,0.590002
55,5,9,0.577017
32,3,18,0.571092


In [8]:
def save_sim_df(df, filename):
  merged_df = pd.merge(left=df, right=docdf, left_on='docid1', right_on='docid').drop(columns=['text', 'docid'])
  merged_df2 = pd.merge(left=merged_df, right=docdf, left_on='docid2', right_on='docid').drop(columns=['text', 'docid'])
  merged_df2.to_csv(filename)

save_sim_df(df_tfidf, 'sim_tfidf.csv')

## Doc2Vec Similarity

In [9]:
from gensim.models.doc2vec import Doc2Vec
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
from scipy import sparse

model= Doc2Vec.load('/content/drive/MyDrive/shera/doc2vec.bin')



In [10]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

document_example = "This is an example sentence for the document to be compared"

def preprocess(text):
  lowered = str.lower(text)
  stop_words = set(stopwords.words('english'))
  word_tokens = word_tokenize(lowered)
  
  words = []
  for w in word_tokens:
    if w not in stop_words:
      if w not in string.punctuation:
        words.append(w)
  
  return words

preprocess(document_example)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['example', 'sentence', 'document', 'compared']

In [11]:
def doc2vec(text):
  tokens = preprocess(text)
  tokens = list(filter(lambda x: x in model.wv.vocab.keys(), tokens))
  vector = model.infer_vector(tokens)
  return vector

dt_doc2vec_dense = docdf['text'].apply(doc2vec)
dt_doc2vec_dense

0     [-0.49068642, 0.07469278, 0.31665835, -0.13006...
1     [0.903564, -0.24591884, -0.038880717, -0.14804...
2     [0.06106275, 0.048167016, 0.10426839, -0.01746...
3     [-0.20750976, -0.5585526, 0.25707212, -0.25441...
4     [0.36032456, -0.49724472, 0.20178095, -0.54525...
5     [0.7024578, -0.17134711, 0.39844993, -0.026404...
6     [-0.9408148, 0.00307966, 0.22112143, -0.353368...
7     [0.006513775, -0.03827238, 0.13243476, 0.10010...
8     [0.5070267, -0.63163304, -0.11373251, -0.21168...
9     [0.027561897, -0.20668669, 0.2344342, -0.03839...
10    [-0.3083119, -0.13399573, 0.35134232, 0.305334...
11    [-0.062051598, 0.30114388, -0.05588931, -0.338...
12    [-0.1763378, 0.06185944, 0.20127673, -0.140429...
13    [-0.44034958, -0.22691314, -0.18424396, -0.297...
14    [-0.065701395, -0.5263963, 0.40944353, 0.64940...
15    [-0.24889563, 0.034294095, -0.28235918, -0.316...
16    [-0.41453242, 0.19355202, -0.33348468, -0.0519...
17    [0.18745805, -0.16982722, 0.49092892, 0.00

In [12]:
dt_doc2vec = sparse.csr_matrix( np.array(dt_doc2vec_dense.tolist()) )
dt_doc2vec

<22x300 sparse matrix of type '<class 'numpy.float32'>'
	with 6600 stored elements in Compressed Sparse Row format>

In [13]:
df_doc2vec = sims(dt_doc2vec)
df_doc2vec

Unnamed: 0,docid1,docid2,score
235,11,13,0.701667
76,3,7,0.685127
15,0,6,0.677461
149,7,18,0.641875
191,9,18,0.631668
66,3,18,0.626445
107,5,18,0.608478
2,0,19,0.605771
42,2,21,0.601383
291,14,18,0.599329


In [14]:
save_sim_df(df_doc2vec, 'sim_doc2vec.csv')

## USE Similarity

In [15]:
import tensorflow as tf
import tensorflow_hub as hub

module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" 
model = hub.load(module_url)

In [16]:
def use(text):
  return model([text])[0]

use("This is an example sentence for the document to be compared").shape

TensorShape([512])

In [17]:
def to_numpy(tftensor):
  return tftensor.numpy()

In [18]:
dt_use_dense = docdf['text'].apply(use)
dt_use_dense = dt_use_dense.apply(to_numpy)
dt_use_dense

0     [-0.047383524, -0.047771018, 0.048405226, -0.0...
1     [-0.0064803576, 0.04243474, 0.04687779, 0.0465...
2     [0.04470262, -0.048396595, -0.020505523, 0.036...
3     [-0.018492918, -0.049685128, 0.05085665, 0.024...
4     [0.051596325, -0.023792407, 0.05370922, 0.0205...
5     [-0.017877271, -0.01642961, 0.052407853, 0.000...
6     [-0.032186873, -0.040531196, 0.04841761, -0.03...
7     [0.015427085, -0.051223624, 0.049564242, 0.048...
8     [0.027748859, -0.04240939, 0.036967475, 0.0009...
9     [0.012957092, 0.040157944, 0.04882988, -0.0492...
10    [-0.04984623, -0.049898334, 0.048771687, -0.04...
11    [0.04754907, -0.047649406, -0.04759527, 0.0476...
12    [-0.046665713, -0.040490314, -0.047341637, 0.0...
13    [-0.021947784, -0.04761666, 0.042557202, 0.047...
14    [-0.049300987, -0.050294984, 0.050025493, 0.03...
15    [-0.046769988, -0.047931947, -0.047929525, 0.0...
16    [-0.035485998, -0.047514655, 0.04760239, -0.03...
17    [-0.047818214, -0.04904259, 0.049033813, 0

In [19]:
dt_use = sparse.csr_matrix( np.array(dt_use_dense.tolist()) )
dt_use

<22x512 sparse matrix of type '<class 'numpy.float32'>'
	with 11264 stored elements in Compressed Sparse Row format>

In [20]:
df_use = sims(dt_use)
df_use

Unnamed: 0,docid1,docid2,score
15,0,6,0.889713
2,0,19,0.887812
117,5,9,0.876814
5,0,16,0.834966
108,5,18,0.829423
77,3,7,0.825007
239,11,13,0.822477
192,9,18,0.811454
131,6,16,0.804566
128,6,19,0.798055


In [21]:
save_sim_df(df_use, 'sim_use.csv')

## Bert Similarity

In [22]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m61.4 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m57.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4

In [26]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from nltk import sent_tokenize

from sentence_transformers import SentenceTransformer

base_document = "This is an example sentence for the document to be compared"

# bert_model = SentenceTransformer('bert-base-nli-mean-tokens')
bert_model = SentenceTransformer('gtr-t5-xl')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.48G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/461 [00:00<?, ?B/s]

In [27]:
def bert(text):
  # Although it is not explicitly stated in the official document of sentence transformer, 
  # the original BERT is meant for a shorter sentence. 
  # We will feed the model by sentences instead of the whole documents.
  sentences = sent_tokenize(text)
  base_embeddings_sentences = bert_model.encode(sentences)
  base_embeddings = np.mean(np.array(base_embeddings_sentences), axis=0)
  return base_embeddings

bert(base_document).shape

(768,)

In [29]:
# dt_bert_dense = docdf['text'].apply(bert)
# dt_bert_dense

# # bert is not able to run in reasonable time