# Analyze Similar Pairs
This small project completes the following tasks:
1. Load all the similar pairs output by LSH and SetJoin
2. Analyze them (count how many documents they cover and verify whether there are any false positives)
3. Compare them (check whether there is any inclusion relationship between the result sets)


## Load Similar Pairs

In [2]:
# Load From Slimpajama-LSH
import tqdm
from glob import glob
import argparse
import util
# load from LSH
dataset_name = "arxiv"
thres = 0.8
dup_dir_path = f"/research/projects/zp128/RedPajama_Analysis/LSH/similar_pairs/{dataset_name}_{thres}/"
files = glob(f"{dup_dir_path}/*")
simp_lsh = set()
for fp in files:
    with open(fp, "r") as f:
        # Each non-empty line is parsed as "<id> :: <id>".
        for line in tqdm.tqdm(f):
            fields = line.strip().split(" :: ")
            if fields == [""]:
                # Blank line: nothing to parse (this used to raise IndexError).
                continue
            first, second = int(fields[0]), int(fields[1])
            # Compare the parsed ids rather than the raw text, so that two
            # different spellings of the same id still count as a self-pair.
            if first != second:
                simp_lsh.add((first, second))

# Hand the collected pairs to util.correct_pair_order and keep its result.
simp_lsh = util.correct_pair_order(simp_lsh)


7840it [00:00, 320712.98it/s]
7757it [00:00, 332270.02it/s]
7336it [00:00, 338959.79it/s]
7462it [00:00, 324838.83it/s]
7411it [00:00, 328569.48it/s]
7984it [00:00, 332080.43it/s]
7410it [00:00, 331383.47it/s]
7165it [00:00, 339722.46it/s]
13347it [00:00, 321030.94it/s]


In [3]:
# Load From My LSH (the C++ implementation's binary output)
# The id map is read from the SetJoin sorted_sets directory.
idmap_bin_path = f"/research/projects/zp128/RedPajama_Analysis/SetJoin/sorted_sets/{dataset_name}_idmap.bin"

# The pair file is located by prefix; print the match so the reader can see
# exactly which file was picked up.
simp_dir_path = "/research/projects/zp128/RedPajama_Analysis/LSH/Lsh_C++/similar_pairs"
file_prefix = f"{dataset_name}_sim_pairs_{thres:.6f}"
simpairs_bin_path = util.find_file(simp_dir_path, file_prefix)
print(simpairs_bin_path)

idmap = util.read_ints_from_binary(idmap_bin_path)
sim_pairs = util.read_pairs_from_binary(simpairs_bin_path)

# Run the raw pairs through util.map_elements with the id map, then through
# util.correct_pair_order.
simp_mylsh = util.correct_pair_order(util.map_elements(sim_pairs, idmap))


/research/projects/zp128/RedPajama_Analysis/LSH/Lsh_C++/similar_pairs/arxiv_sim_pairs_0.800000_K128B9R13.bin


In [6]:
# load from SetJoin
import util

# Both SetJoin inputs are binary files: the id map and the similar-pair list.
idmap_bin_path = f"/research/projects/zp128/RedPajama_Analysis/SetJoin/sorted_sets/{dataset_name}_idmap.bin"
simpairs_bin_path = f"/research/projects/zp128/RedPajama_Analysis/SetJoin/similar_pairs/{dataset_name}_sim_pairs_{thres:.6f}.bin"

idmap = util.read_ints_from_binary(idmap_bin_path)
sim_pairs = util.read_pairs_from_binary(simpairs_bin_path)

# Run the raw pairs through util.map_elements with the id map, then through
# util.correct_pair_order.
simp_setjoin = util.correct_pair_order(util.map_elements(sim_pairs, idmap))

In [15]:
# load from OverlapJoin
# K and C are part of the output directory name and select which run is read.
K, C = 32, 29

idmap_bin_path = f"/research/projects/zp128/RedPajama_Analysis/OverlapJoin/similar_pairs/{dataset_name}_simPair_K{K}_C{C}/idmap.bin"
simpairs_bin_path = f"/research/projects/zp128/RedPajama_Analysis/OverlapJoin/similar_pairs/{dataset_name}_simPair_K{K}_C{C}/sim_pairs.bin"

idmap = util.read_ints_from_binary(idmap_bin_path)
sim_pairs = util.read_pairs_from_binary(simpairs_bin_path)

# Run the raw pairs through util.map_elements with the id map, then through
# util.correct_pair_order.
simp_ovlp = util.correct_pair_order(util.map_elements(sim_pairs, idmap))

## Analyze them

In [4]:
# Check whether every similar pair actually meets the Jaccard threshold requirement
def check_jaccard_similarity(dataset, simp, thres):
    """Print how many pairs in ``simp`` score below ``thres``.

    Args:
        dataset: Indexable container; ``dataset[pair[0]]`` and
            ``dataset[pair[1]]`` are handed to ``util.jaccard_similarity``.
        simp: Sized iterable of pairs to verify.
        thres: Minimum acceptable score. A pair counts as invalid when its
            score is strictly below this value.

    Returns:
        None. The outcome is reported on stdout only.
    """
    # Tally the pairs whose similarity falls short of the threshold.
    total_invalid_pairs_amount = sum(
        1
        for pair in simp
        if util.jaccard_similarity(dataset[pair[0]], dataset[pair[1]]) < thres
    )

    if total_invalid_pairs_amount:
        print(f"There are {total_invalid_pairs_amount} pairs among {len(simp)} pairs' similarity lower than threshold.")
        return
    print("All pairs have Jaccard similarity >= thres")

In [1]:
# Gather the ids referenced by the SetJoin similar pairs and report how many
# there are.
# NOTE: this cell needs `util` and `simp_setjoin` to be bound already; run on a
# fresh kernel it fails with NameError (as the saved output shows).
# NOTE(review): util.extract_elements presumably returns the set of distinct
# ids in the pairs; the result is later used with set methods. Confirm.
ids_setjoin = util.extract_elements(simp_setjoin)
# Loading the actual documents is currently disabled for SetJoin:
# docs_setjoin = util.read_pajama_dataset_selected_docs(dataset_name,ids_setjoin)
ids_setjoin_size = len(ids_setjoin)
print(f"Setjoin finds {ids_setjoin_size} unique documents.")


NameError: name 'util' is not defined

In [6]:
# Load the documents referenced by the Slimpajama-LSH pairs and verify the pairs.
# NOTE(review): util.extract_elements presumably returns the set of distinct
# ids in the pairs; the result is later used with set methods. Confirm.
ids_lsh = util.extract_elements(simp_lsh)
# Read only the selected documents; the result is indexed by pair members
# inside check_jaccard_similarity.
docs_lsh = util.read_pajama_dataset_selected_docs(dataset_name,ids_lsh)
ids_lsh_size = len(ids_lsh)
print(f"LSH Method finds {ids_lsh_size} unique documents.")
# Count the pairs whose Jaccard similarity is below `thres` and print the result.
check_jaccard_similarity(docs_lsh,simp_lsh, thres )

There are total 1558306 documents in this /research/projects/zp128/RedPajama-Data-1T/RedPajama-Data-1T/arxiv/tokenized_text_document.idx
LSH Method finds 25387 unique documents.
There are 9808 pairs among 20819 pairs' similarity lower than threshold.


In [4]:
# Gather the ids referenced by the My-LSH (C++) pairs and report how many there are.
# NOTE(review): util.extract_elements presumably returns the set of distinct
# ids in the pairs; the result is later used with set methods. Confirm.
ids_mylsh = util.extract_elements(simp_mylsh)
# Document loading is currently disabled for My-LSH:
# docs_mylsh = util.read_pajama_dataset_selected_docs(dataset_name,ids_mylsh)
ids_mylsh_size = len(ids_mylsh)
print(f"C++'s LSH Method finds {ids_mylsh_size} unique documents.")
# The Jaccard verification is disabled too, since it needs docs_mylsh:
# check_jaccard_similarity(docs_mylsh,simp_mylsh, thres )

C++'s LSH Method finds 77084 unique documents.


In [19]:
# Load the documents referenced by the OverlapJoin pairs and verify the pairs.
# NOTE(review): util.extract_elements presumably returns the set of distinct
# ids in the pairs; the result is later used with set methods. Confirm.
ids_ovlp = util.extract_elements(simp_ovlp)
# Read only the selected documents; the result is indexed by pair members
# inside check_jaccard_similarity.
docs_ovlp = util.read_pajama_dataset_selected_docs(dataset_name,ids_ovlp)
ids_ovlp_size = len(ids_ovlp)
print(f"OverlapJoin Method finds {ids_ovlp_size} unique documents.")
# Count the pairs whose Jaccard similarity is below `thres` and print the result.
check_jaccard_similarity(docs_ovlp,simp_ovlp, thres )

There are total 1558306 documents in this /research/projects/zp128/RedPajama-Data-1T/RedPajama-Data-1T/arxiv/tokenized_text_document.idx
OverlapJoin Method finds 6034 unique documents.
There are 2873 pairs among 30903 pairs' similarity lower than threshold.


## Compare them

### SetJoin VS LSH

In [35]:
# Pair-level comparison of SetJoin against Slimpajama-LSH.
# In the printed labels, B stands for SetJoin and A for the LSH result.
print("Now Analyze the property of there similar pairs")
print(f"There are {len(simp_setjoin)} pairs in simp_setjoin")
print(f"There are {len(simp_lsh)} pairs in simp_lsh")

# Derive all three sets first, then report their sizes in the usual order.
union_set = simp_setjoin.union(simp_lsh)
intersection_set = simp_setjoin.intersection(simp_lsh)
difference_set = simp_setjoin.difference(simp_lsh)  # pairs only in SetJoin
union_size, intersection_size, difference_size = (
    len(union_set),
    len(intersection_set),
    len(difference_set),
)

print(f"The union of two sets includes {union_size} unique pairs.")
print(f"The intersection of two sets includes {intersection_size} common pairs.")
print(f"The difference  of two sets(B - A) includes {difference_size} different pairs.")


Now Analyze the property of there similar pairs
There are 3285654 pairs in simp_setjoin
There are 8010 pairs in simp_lsh
The union of two sets includes 3285992 unique pairs.
The intersection of two sets includes 7672 common pairs.
The difference  of two sets(B - A) includes 3277982 different pairs.


In [36]:
# Document-level comparison of SetJoin against Slimpajama-LSH.
# In the printed labels, B stands for SetJoin and A for the LSH result.
print("Now Analyze the property of their documents")
print(f"There are {len(ids_setjoin)} documents in simp_setjoin")
print(f"There are {len(ids_lsh)} documents in simp_lsh")

# Union and intersection are symmetric, so compute them together.
union_set = ids_setjoin.union(ids_lsh)
intersection_set = ids_setjoin.intersection(ids_lsh)
union_size, intersection_size = len(union_set), len(intersection_set)
print(f"The union of two sets includes {union_size} documents.")
print(f"The intersection of two sets includes {intersection_size} common documents.")

# Documents only in SetJoin (B - A)
difference_set = ids_setjoin.difference(ids_lsh)
difference_size = len(difference_set)
print(f"The difference  of two sets(B - A) includes {difference_size} different documents.")

# Documents only in LSH (A - B); difference_set/difference_size keep this last result
difference_set = ids_lsh.difference(ids_setjoin)
difference_size = len(difference_set)
print(f"The difference  of two sets(A - B) includes {difference_size} different documents.")

Now Analyze the property of their documents
There are 8333 documents in simp_setjoin
There are 8293 documents in simp_lsh
The union of two sets includes 8610 documents.
The intersection of two sets includes 8016 common documents.
The difference  of two sets(B - A) includes 317 different documents.
The difference  of two sets(A - B) includes 277 different documents.


### SetJoin vs OverlapJoin

In [28]:
# Pair-level comparison of SetJoin against OverlapJoin.
# In the printed labels, B stands for SetJoin and A for the OverlapJoin result.
print("Now Analyze the property of there similar pairs")
print(f"There are {len(simp_setjoin)} pairs in simp_setjoin")
print(f"There are {len(simp_ovlp)} pairs in simp_ovlp")

# Derive all three sets first, then report their sizes in the usual order.
union_set = simp_setjoin.union(simp_ovlp)
intersection_set = simp_setjoin.intersection(simp_ovlp)
difference_set = simp_setjoin.difference(simp_ovlp)  # pairs only in SetJoin
union_size, intersection_size, difference_size = (
    len(union_set),
    len(intersection_set),
    len(difference_set),
)

print(f"The union of two sets includes {union_size} unique pairs.")
print(f"The intersection of two sets includes {intersection_size} common pairs.")
print(f"The difference  of two sets(B - A) includes {difference_size} different pairs.")

Now Analyze the property of there similar pairs
There are 3319410 pairs in simp_setjoin
There are 30903 pairs in simp_ovlp
The union of two sets includes 3322770 unique pairs.
The intersection of two sets includes 27543 common pairs.
The difference  of two sets(B - A) includes 3291867 different pairs.


In [29]:
# Document-level comparison of SetJoin against OverlapJoin.
# In the printed labels, B stands for SetJoin and A for the OverlapJoin result.
print("Now Analyze the property of their documents")
print(f"There are {len(ids_setjoin)} documents in simp_setjoin")
print(f"There are {len(ids_ovlp)} documents in simp_ovlp")

# Union and intersection are symmetric, so compute them together.
union_set = ids_setjoin.union(ids_ovlp)
intersection_set = ids_setjoin.intersection(ids_ovlp)
union_size, intersection_size = len(union_set), len(intersection_set)
print(f"The union of two sets includes {union_size} documents.")
print(f"The intersection of two sets includes {intersection_size} common documents.")

# Documents only in SetJoin (B - A)
difference_set = ids_setjoin.difference(ids_ovlp)
difference_size = len(difference_set)
print(f"The difference  of two sets(B - A) includes {difference_size} different documents.")

# Documents only in OverlapJoin (A - B); difference_set/difference_size keep this last result
difference_set = ids_ovlp.difference(ids_setjoin)
difference_size = len(difference_set)
print(f"The difference  of two sets(A - B) includes {difference_size} different documents.")

Now Analyze the property of their documents
There are 10695 documents in simp_setjoin
There are 6034 documents in simp_ovlp
The union of two sets includes 12664 documents.
The intersection of two sets includes 4065 common documents.
The difference  of two sets(B - A) includes 6630 different documents.
The difference  of two sets(A - B) includes 1969 different documents.


### My LSH vs Slimpajama-LSH

In [None]:
# Document-level comparison of My LSH against Slimpajama-LSH.
# In the printed labels, B stands for My LSH and A for Slimpajama-LSH.
print("Only Analyze the property of their documents")
print(f"There are {len(ids_mylsh)} documents in ids_mylsh")
print(f"There are {len(ids_lsh)} documents in ids_lsh")

# Union and intersection are symmetric, so compute them together.
union_set = ids_mylsh.union(ids_lsh)
intersection_set = ids_mylsh.intersection(ids_lsh)
union_size, intersection_size = len(union_set), len(intersection_set)
print(f"The union of two sets includes {union_size} documents.")
print(f"The intersection of two sets includes {intersection_size} common documents.")

# Documents only in My LSH (B - A)
difference_set = ids_mylsh.difference(ids_lsh)
difference_size = len(difference_set)
print(f"The difference  of two sets(B - A) includes {difference_size} different documents.")

# Documents only in Slimpajama-LSH (A - B); difference_set/difference_size keep this last result
difference_set = ids_lsh.difference(ids_mylsh)
difference_size = len(difference_set)
print(f"The difference  of two sets(A - B) includes {difference_size} different documents.")

### SetJoin vs MyLSH

In [8]:
# Pair-level comparison of SetJoin against My LSH.
# In the printed labels, B stands for SetJoin and A for My LSH.
print("Now Analyze the property of there similar pairs")
print(f"There are {len(simp_setjoin)} pairs in simp_setjoin")
print(f"There are {len(simp_mylsh)} pairs in simp_mylsh")

# Union and intersection are symmetric, so compute them together.
union_set = simp_setjoin.union(simp_mylsh)
intersection_set = simp_setjoin.intersection(simp_mylsh)
union_size, intersection_size = len(union_set), len(intersection_set)
print(f"The union of two sets includes {union_size} unique pairs.")
print(f"The intersection of two sets includes {intersection_size} common pairs.")

# Pairs only in SetJoin (B - A)
difference_set = simp_setjoin.difference(simp_mylsh)
difference_size = len(difference_set)
print(f"The difference  of two sets(B - A) includes {difference_size} different pairs.")

# Pairs only in My LSH (A - B); difference_set/difference_size keep this last result
difference_set = simp_mylsh.difference(simp_setjoin)
difference_size = len(difference_set)
print(f"The difference  of two sets(A - B) includes {difference_size} different pairs.")

Now Analyze the property of there similar pairs
There are 3319410 pairs in simp_setjoin
There are 3403710 pairs in simp_mylsh
The union of two sets includes 3417425 unique pairs.
The intersection of two sets includes 3305695 common pairs.
The difference  of two sets(B - A) includes 13715 different pairs.
The difference  of two sets(A - B) includes 98015 different pairs.


In [9]:
# Document-level comparison of SetJoin against My LSH.
# In the printed labels, B stands for SetJoin and A for My LSH.
print("Now Analyze the property of their documents")
print(f"There are {len(ids_setjoin)} documents in simp_setjoin")
print(f"There are {len(ids_mylsh)} documents in simp_mylsh")

# Union and intersection are symmetric, so compute them together.
union_set = ids_setjoin.union(ids_mylsh)
intersection_set = ids_setjoin.intersection(ids_mylsh)
union_size, intersection_size = len(union_set), len(intersection_set)
print(f"The union of two sets includes {union_size} documents.")
print(f"The intersection of two sets includes {intersection_size} common documents.")

# Documents only in SetJoin (B - A)
difference_set = ids_setjoin.difference(ids_mylsh)
difference_size = len(difference_set)
print(f"The difference  of two sets(B - A) includes {difference_size} different documents.")

# Documents only in My LSH (A - B); difference_set/difference_size keep this last result
difference_set = ids_mylsh.difference(ids_setjoin)
difference_size = len(difference_set)
print(f"The difference  of two sets(A - B) includes {difference_size} different documents.")

Now Analyze the property of their documents
There are 10695 documents in simp_setjoin
There are 77084 documents in simp_mylsh
The union of two sets includes 77809 documents.
The intersection of two sets includes 9970 common documents.
The difference  of two sets(B - A) includes 725 different documents.
The difference  of two sets(A - B) includes 67114 different documents.
