In [1]:
import sys
sys.path.append("../..")

In [2]:
# Ensure the datasketch package is installed for running this notebook
!pip install datasketch

In [3]:
import pandas as pd
from tqdm.notebook import tqdm
from datasketch import MinHash, MinHashLSH
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import csv
from src.preprocess import clean_html, clean_punctuation, clean_uppercase, clean_lemmatize

# Read data

Set path to train and test datasets

In [4]:
BASE_PATH = "../../bigdata2023duplicatedetection/q_3_1/"
PATH_TO_TRAIN_DATA = BASE_PATH + "data/train_q_3.1.csv"
PATH_TO_TEST_DATA = BASE_PATH + "data/test_without_labels_q_3.1.csv"

Read train dataset

In [6]:
data = []
with open(PATH_TO_TRAIN_DATA, 'r', encoding='utf-8') as file:
    reader = csv.reader(file, delimiter='\n') 
    for row in reader:
        data.append(row[0])  

train_df = pd.DataFrame(data, columns=['Question'])
train_df = train_df.drop(0)
train_df = train_df.reset_index(drop=True)

Read test dataset

In [7]:
data = []
with open(PATH_TO_TEST_DATA, 'r', encoding='utf-8') as file:
    reader = csv.reader(file, delimiter='\n') 
    for row in reader:
        data.append(row[0])  

test_df = pd.DataFrame(data, columns=['Question'])
test_df = test_df.drop(0)
test_df = test_df.reset_index(drop=True)

# Preprocess - Transform Data

For this task, we implement a more streamlined preprocessing approach to the data, aiming to retain as much potentially valuable information as possible for effectively comparing the questions

In [8]:
def preprocess(df, modify_columns):
    df = clean_html(df, modify_columns)
    print("HTML clean done")

    df = clean_punctuation(df, modify_columns)
    print("Punctation clean done")

    df = clean_uppercase(df, modify_columns)
    print("Uppercase clean done")

    df = clean_lemmatize(df, modify_columns)
    print("Lemmatize done")

    return df

In [9]:
train_df = preprocess(train_df, ["Question"])
test_df = preprocess(test_df, ["Question"])

HTML clean done
Punctation clean done
Uppercase clean done
Lemmatize done
HTML clean done
Punctation clean done
Uppercase clean done
Lemmatize done


In [10]:
# Vectorize documents using TF-IDF for computing cosine similarities
tfidf_vectorizer = TfidfVectorizer() 
X_tfidf = tfidf_vectorizer.fit_transform(train_df['Question'].tolist() + test_df['Question'].tolist())

n_train = train_df.shape[0]
X_train_tfidf = X_tfidf[:n_train]
X_test_tfidf = X_tfidf[n_train:]

Create sets of shingles (unique words) from question

In [11]:
train_df['Shingles'] = train_df['Question'].apply(lambda x: set(x.split()))
test_df['Shingles'] = test_df['Question'].apply(lambda x: set(x.split()))
train_shingles = train_df['Shingles'].tolist()
test_shingles = test_df['Shingles'].tolist()

In [12]:
train_df

Unnamed: 0,Question,Shingles
0,what be the step by step guide to invest in sh...,"{be, in, india, to, invest, by, market, what, ..."
1,what be the story of kohinoor kohinoor diamond,"{be, kohinoor, story, diamond, what, of, the}"
2,how can i increase the speed of my internet co...,"{my, connection, i, use, vpn, while, increase,..."
3,why be i mentally very lonely how can i solve it,"{be, i, why, solve, it, very, lonely, mentally..."
4,which one dissolve in water quikly sugar salt ...,"{oxide, in, one, di, water, and, dissolve, sug..."
...,...,...
2995,what type of diet can you follow to lose 5 pou...,"{5, type, you, in, diet, pound, to, follow, 2,..."
2996,which be the best commerce college in mangalore,"{be, in, best, mangalore, which, college, the,..."
2997,be network a good field to have a career in wh...,"{good, effective, proceed, a, if, career, yes,..."
2998,can i use letter stamp a postcard stamp,"{i, use, postcard, letter, stamp, a, can}"


In [13]:
test_df

Unnamed: 0,Question,Shingles
0,what can someone do if theyve lose the wireles...,"{wireless, their, someone, keyboard, if, logit..."
1,why india need to elect prime minister,"{india, why, prime, to, minister, need, elect}"
2,how can i make money online with free of cost,"{online, i, with, make, money, can, of, how, f..."
3,do mdma affect the first and high order moment...,"{moment, affect, high, neuron, if, first, mdma..."
4,i be a saudi national and have sr 3 million in...,"{with, advice, and, a, 20, approx, sr, possibl..."
...,...,...
995,whats your story,"{whats, your, story}"
996,how do i know if a girl like me back or not,"{not, me, i, if, or, a, how, back, girl, know,..."
997,im gay how do i come out to my friend and family,"{my, i, out, gay, to, come, and, family, how, ..."
998,wheaton college do gretchen dutschkeklotz rudi...,"{craven, conservative, each, rudis, former, wh..."


In [14]:
evaluation_results = []

# Exact Cosine Similarity

In [15]:
# Compute cosine similarity for each pair of test and train documents
# return query time, number of duplicates
def exact_cosine_similarity(train_vectors, test_vectors):
    start_time = time.time()
    duplicates = 0

    for test_vec in test_vectors:
        similarities = cosine_similarity(test_vec, train_vectors)
        if any(similarity >= 0.8 for similarity in similarities[0]):
            duplicates += 1

    end_time = time.time()
    query_time = end_time - start_time
    return duplicates, query_time

In [16]:
cosine_duplicates, cosine_query_time = exact_cosine_similarity(X_train_tfidf, X_test_tfidf)
print(f"Exact Cosine Similarity found: {cosine_duplicates} duplicates after {cosine_query_time} secs")
evaluation_results.append({
        "Type": "Exact-Cosine",
        "BuildTime": 0,
        "QueryTime": cosine_query_time,
        "TotalTime": cosine_query_time,
        "#Duplicates": cosine_duplicates,
        "Parameters": "-"
    })

Exact Cosine Similarity found: 39 after 1.4869410991668701 secs


# Exact Jaccard Similarity

In [17]:
# Returns jaccard similarity of the given sets 
def jaccard_similarity(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union != 0 else 0

In [18]:
# Compute jaccard similarity for each pair of test and train documents
# return query time, number of duplciates
def exact_jaccard_similarity(train_docs, test_docs, threshold=0.8):
    start_time = time.time()
    duplicates = 0
    for test_doc in test_docs:
        for train_doc in train_docs:
            sim = jaccard_similarity(test_doc, train_doc)
            if sim > threshold:
                duplicates += 1

    return duplicates, time.time() - start_time


In [19]:
jaccard_duplicates, jaccard_query_time  = exact_jaccard_similarity(train_shingles, test_shingles)
print(f"Exact Jaccard Similarity found: {jaccard_duplicates} duplicates after {jaccard_query_time} secs")
evaluation_results.append({
        "Type": "Exact-Jaccard",
        "BuildTime": 0,
        "QueryTime": jaccard_query_time,
        "TotalTime": jaccard_query_time,
        "#Duplicates": jaccard_duplicates,
        "Parameters": "-"
    })

Exact Jaccard Similarity found: 34 after 3.991042137145996 secs


# LSH with cosine distance

In [20]:
# Generates a matrix of random vectors from a normal distribution.
def generate_random_vectors(dimensions, n_vectors):
    return np.random.randn(dimensions, n_vectors)

# Calculates the bin indices for vectors based 
# on their dot product with a set of random vectors.
# Returns an array where each element corresponds to 
# the bin index of a vector in the training dataset.
def get_bin_indices(X_vec, random_vectors):
    bin_indices_bits = X_vec.dot(random_vectors) >= 0
    powers_of_two = 1 << np.arange(random_vectors.shape[1] - 1, -1, -1)
    return bin_indices_bits.dot(powers_of_two)

# Trains a Locality Sensitive Hashing model 
# using cosine similarity for vector data.
def train_cosine_lsh(X_train_vec, n_vectors):
    # define random hyperplanes for hashing the vectors in the dataset.
    random_vectors = generate_random_vectors(X_train_vec.shape[1], n_vectors)

    # each vector in X_train_vec is hashed to a bin index based 
    # on its orientation relative to the random hyperplanes
    bin_indices = get_bin_indices(X_train_vec, random_vectors)
    
    # Group the indices of vectors into the same bin based on their bin index
    hash_table = defaultdict(list)
    for idx, bin_idx in enumerate(bin_indices):
        hash_table[bin_idx].append(idx)
    
    return hash_table, random_vectors

# Queries the LSH model to find candidates similar 
# to a given query vector, based on cosine similarity.
def query_cosine_lsh(hash_table, random_vectors, query_vector, X_train_vec, threshold=0.8):
    bin_idx = get_bin_indices(query_vector, random_vectors)[0]
    candidate_indices = hash_table.get(bin_idx, [])
    
    if not candidate_indices:
        return []
    
    candidates = X_train_vec[candidate_indices]
    similarities = cosine_similarity(query_vector, candidates).flatten()
    
    return [candidate_indices[i] for i, similarity in enumerate(similarities) if similarity > threshold]

# Compute cosine similarity for each pair of test 
# and train documents using Locality Sensitive Hashing
# return build time, query time, number of duplciates
def cosine_lsh(X_train_vec, X_test_vec, K):
    build_start = time.time()
    hash_table, random_vectors = train_cosine_lsh(X_train_vec, K)
    build_time = time.time() - build_start

    query_start = time.time()
    duplicates = 0
    for i in range(X_test_vec.shape[0]):
        similar_docs = query_cosine_lsh(hash_table, random_vectors, X_test_tfidf[i], X_train_vec, threshold=0.8)
        if similar_docs:
            duplicates += 1
    query_time = time.time() - query_start

    return duplicates, build_time, query_time


In [21]:
K_values = [i for i in range(1, 11)]

In [22]:
# Cosine Similarity with LSH
for K in tqdm(K_values, desc="LSH with cosine similarity"):
    duplicates, build_time, query_time = cosine_lsh(X_train_tfidf, X_test_tfidf, K)
    evaluation_results.append({
        "Type": "LSH-Cosine",
        "BuildTime": build_time,
        "QueryTime": query_time,
        "TotalTime": build_time + query_time,
        "#Duplicates": duplicates,
        "Parameters": f"K={K}"
    })


LSH with cosine similarity:   0%|          | 0/10 [00:00<?, ?it/s]

# LSH with jaccard distance

In [23]:
def create_minhash(doc_shingles, num_perm=128):
    m = MinHash(num_perm=num_perm)
    for shingle in doc_shingles:
        m.update(shingle.encode('utf8'))
    return m

In [24]:
def jaccard_lsh(train_data, test_data, num_perm):
    # Create MinHashes
    start_build_time = time.time()
    lsh = MinHashLSH(threshold=0.8, num_perm=num_perm)
    train_minhashes = [create_minhash(shingles, num_perm) for shingles in train_data]

    for idx, minhash in enumerate(train_minhashes):
        lsh.insert(idx, minhash)
    build_time = time.time() - start_build_time

    # Query LSH indexz
    start_query_time = time.time()
    duplicates = 0
    for test_shingles in test_data:
        test_minhash = create_minhash(test_shingles, num_perm)
        candidate_idxs = lsh.query(test_minhash)
        if candidate_idxs:
            duplicates += 1
    query_time = time.time() - start_query_time

    return duplicates, build_time, query_time


In [25]:
# Jaccard Similarity with LSH
permutations = [16, 32, 64]
for num_perm in permutations:
    duplicates, build_time, query_time = jaccard_lsh(train_shingles, test_shingles, num_perm)
    evaluation_results.append({
        "Type": "LSH-Jaccard",
        "BuildTime": build_time,
        "QueryTime": query_time,
        "TotalTime": build_time + query_time,
        "#Duplicates": duplicates,
        "Parameters": f"Perm={num_perm}"
    })


# Results in Table

In [26]:
# Convert results to a DataFrame
results_df = pd.DataFrame(evaluation_results)
display(results_df)

Unnamed: 0,Type,BuildTime,QueryTime,TotalTime,#Duplicates,Parameters
0,Exact-Cosine,0.0,1.486941,1.486941,39,-
1,Exact-Jaccard,0.0,3.991042,3.991042,34,-
2,LSH-Cosine,0.002566,1.81703,1.819596,37,K=1
3,LSH-Cosine,0.001682,1.516204,1.517886,33,K=2
4,LSH-Cosine,0.001159,1.133409,1.134567,29,K=3
5,LSH-Cosine,0.001687,1.215225,1.216912,29,K=4
6,LSH-Cosine,0.003776,1.12934,1.133116,28,K=5
7,LSH-Cosine,0.003907,1.16832,1.172227,26,K=6
8,LSH-Cosine,0.001716,1.383646,1.385362,30,K=7
9,LSH-Cosine,0.004131,1.337832,1.341964,23,K=8


# Comments on results

**Exact Similarity Searches**:

The Exact-Cosine method is faster than Exact-Jaccard. The number of duplicates found by Exact-Cosine is slightly higher (39) than that by Exact-Jaccard (34), indicating a possible difference in sensitivity to the data's similarity characteristics.

**LSH with Cosine Similarity**:

LSH-Cosine methods demonstrate varying performance with different K values (number of random projections). As K increases, there's a general trend towards faster query times until a certain point, after which the improvement plateaus. The build time is minimal across all K values, indicating efficient setup but varying effectiveness, with the highest number of duplicates found being 37 (at K=1) and the lowest being 23 (as K increases to 8, 9, and 10).
The best balance between query time and number of duplicates found seems to occur at lower K values (e.g., K=1 or K=2), suggesting that fewer hash functions might be sufficient for this dataset under the cosine similarity measure. 

**LSH with Jaccard Similarity**:

LSH-Jaccard methods show a significant increase in build time compared to LSH-Cosine, especially as the number of permutations increases. 
The query times for LSH-Jaccard are shorter than those for Exact-Jaccard, showcasing the efficiency gains from using LSH. The number of duplicates found is relatively consistent (32 to 34), which is competitive with the Exact-Jaccard method, indicating good effectiveness.
There's a clear increase in total time as the number of permutations increases, suggesting diminishing returns on computational investment for higher Perm values.

**Overall Observations**:

LSH methods provide a flexible trade-off between computational efficiency and the accuracy of similarity searches.
For cosine similarity, lower K values seem more effective, suggesting that a simpler model may suffice for this particular dataset.
For Jaccard similarity, LSH significantly reduces query times compared to the exact method, with a modest increase in build time as the number of permutations increases.