# Analyze Similar Pairs
This small project completes the following tasks:
1. Load all the similar pairs output by LSH and SetJoin
2. Analyze them (count how many distinct documents they cover, and verify whether there are any false positives)
3. Compare them (check whether one set of pairs includes the other)


## Load Similar Pairs

In [10]:
import tqdm
from glob import glob
import argparse

# Collect the pairs reported by LSH.
# Each line of every file under dup_dir_path is split on " :: "; from each part
# the second "@"-separated field is taken and parsed as an integer id.

dup_dir_path = "/research/projects/zp128/RedPajama-Data-1T/RedPajama-Data-1T/RedPajama_norm/dup"
files = glob(f"{dup_dir_path}/*")
simp_lsh = set()
for dup_file in files:
    with open(dup_file, "r") as handle:
        for record in tqdm.tqdm(handle):
            doc_ids = [field.split("@")[1] for field in record.strip().split(" :: ")]
            first_id, second_id = doc_ids[0], doc_ids[1]
            # Skip pairs whose two id strings are identical.
            if first_id == second_id:
                continue
            simp_lsh.add((int(first_id), int(second_id)))




7349it [00:00, 175442.05it/s]
7223it [00:00, 183165.91it/s]
7405it [00:00, 186342.49it/s]
7430it [00:00, 172445.60it/s]
7314it [00:00, 58968.08it/s]
7316it [00:00, 57421.49it/s]
7234it [00:00, 224626.10it/s]
7317it [00:00, 108846.56it/s]
7458it [00:00, 140151.52it/s]


In [11]:
# load from SetJoin
import util
# Binary inputs produced by the SetJoin run on the stackexchange subset.
# NOTE(review): the "0.800000" in the file name presumably is the similarity
# threshold used by SetJoin — confirm.
simpairs_bin_path = "/research/projects/zp128/RedPajama_Analysis/SetJoin/similar_pairs/stackexchange_sim_pairs_0.800000.bin"
idmap_bin_path = "/research/projects/zp128/RedPajama_Analysis/SetJoin/sorted_sets/stackexchange_idmap.bin"

# NOTE(review): presumably idmap translates the ids stored in the pairs file
# back to original document ids (via util.map_elements) — confirm against util.
idmap = util.read_ints_from_binary(idmap_bin_path)
sim_pairs = util.read_pairs_from_binary(simpairs_bin_path)
simp_setjoin = util.map_elements(sim_pairs, idmap)

# Pass both pair collections through the same util.correct_pair_order helper
# before they are compared with set operations later in the notebook
# (presumably it puts each pair's two ids in a consistent order — TODO confirm).
# Note: the second line rebinds simp_lsh, which was built in the LSH loading cell.
simp_setjoin = util.correct_pair_order(simp_setjoin)
simp_lsh = util.correct_pair_order(simp_lsh)

## Analyze them

In [12]:
import struct
# Similarity threshold; later cells pass it to the Jaccard checks.
thres = 0.8
# Load the real documents referenced by the SetJoin pairs.
ids_setjoin = util.extract_elements(simp_setjoin)
dataset = util.read_pajama_dataset_selected_docs("stackexchange",ids_setjoin)
ids_setjoin_size = len(ids_setjoin)
# ids_setjoin is what selects the documents to load, so its length is a
# document count, not a pair count — report it as such.
print(f"SetJoin pairs include {ids_setjoin_size} unique documents.")


There are total 29825086 documents in this /research/projects/zp128/RedPajama-Data-1T/RedPajama-Data-1T/stackexchange/tokenized_text_document.idx
IT includes 44793 unique pairs.


In [13]:
# Similarity threshold every reported pair is expected to reach.
thres = 0.8

# Load the real documents referenced by the LSH pairs.
# Note: this rebinds `dataset`, replacing the documents loaded for SetJoin.
ids_lsh = util.extract_elements(simp_lsh)
dataset = util.read_pajama_dataset_selected_docs("stackexchange",ids_lsh)
ids_lsh_size = len(ids_lsh)
# ids_lsh is what selects the documents to load, so its length is a
# document count, not a pair count — report it as such.
print(f"LSH pairs include {ids_lsh_size} unique documents.")
# NOTE(review): the recorded output of this cell ends with
# "TypeError: 'int' object is not subscriptable"; the notebook then defines and
# uses a local check_jaccard_similarity_ instead. Confirm whether this util
# call still fails and drop it if so.
util.check_jaccard_similarity(dataset,simp_lsh, thres )

There are total 29825086 documents in this /research/projects/zp128/RedPajama-Data-1T/RedPajama-Data-1T/stackexchange/tokenized_text_document.idx
IT includes 32104 unique pairs.


TypeError: 'int' object is not subscriptable

In [22]:
def check_jaccard_similarity_(dataset, simp, thres, similarity_fn=None):
    """Check that every pair in ``simp`` reaches the similarity threshold.

    Each failing pair's similarity is printed. The success message is printed
    only when no pair fails.

    Args:
        dataset: Container indexed by the ids in ``simp``; ``dataset[id]`` is
            passed to the similarity function.
        simp: Iterable of pairs; ``pair[0]`` and ``pair[1]`` are used as keys
            into ``dataset``.
        thres: Minimum accepted similarity; pairs strictly below it fail.
        similarity_fn: Optional two-argument callable returning the similarity
            of two documents. Defaults to ``util.jaccard_similarity``.

    Returns:
        True if every pair has similarity >= ``thres`` (including when
        ``simp`` is empty), False otherwise.
    """
    if similarity_fn is None:
        # Resolved at call time so the default stays the notebook's util helper.
        similarity_fn = util.jaccard_similarity

    num_failed = 0
    # Go through each pair in simp (all pairs are checked, not just the first failure).
    for pair in simp:
        # Calculate the similarity of the two documents in the pair.
        jacc = similarity_fn(dataset[pair[0]], dataset[pair[1]])
        if jacc < thres:
            print(jacc)
            num_failed += 1

    if num_failed:
        # Previously the function reported success even when pairs failed.
        print(f"{num_failed} pairs have Jaccard similarity < thres")
        return False
    print("All pairs have Jaccard similarity >= thres")
    return True  # All pairs passed the check


In [23]:
# Validate the LSH pairs against the currently loaded documents.
check_jaccard_similarity_(dataset=dataset, simp=simp_lsh, thres=thres)

0.625


False

## Compare them

In [11]:
# Union: pairs reported by at least one of the two methods.
union_set = simp_setjoin.union(simp_lsh)
union_size = len(union_set)
print(f"The union of two sets includes {union_size} unique pairs.")

# Intersection: pairs reported by both methods.
intersection_set = simp_setjoin.intersection(simp_lsh)
intersection_size = len(intersection_set)
print(f"The intersection of two sets includes {intersection_size} common pairs.")

# Difference: pairs reported by SetJoin but not by LSH.
difference_set = simp_setjoin.difference(simp_lsh)  # SetJoin - LSH
difference_size = len(difference_set)
# These pairs are exclusive to SetJoin, so they must not be labelled "common".
print(f"The difference of two sets (SetJoin - LSH) includes {difference_size} pairs found only by SetJoin.")


The union of two sets includes 49339 unique pairs.
The intersection of two sets includes 0 common pairs.
The intersection of two sets includes 31950 common pairs.
