In [None]:
%reset -sf

In [None]:
import os, collections, random, itertools

import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Seed both random number generators used in this notebook (the stdlib `random`
# module and NumPy's global state) so that repeated runs draw the same values.
random.seed(42)
np.random.seed(42)

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for input_dir, _, input_files in os.walk('/kaggle/input'):
    for input_file in input_files:
        print(os.path.join(input_dir, input_file))

In [None]:
# load data
df = pd.read_csv("/kaggle/input/quora-question-pairs/train.csv.zip")

# resolve nan: cast both question columns to str so every entry is a string
QUESTION_COLUMNS = ["question1", "question2"]
df[QUESTION_COLUMNS] = df[QUESTION_COLUMNS].astype(str)

In [None]:
# Preview 10 randomly sampled rows of the loaded question pairs.
df.sample(10)

In [None]:
TEST_SET_SIZE = 1000      # number of held-out query questions collected for testing
RANKED_LIST_SIZE = 100    # number of candidate qids each method returns per query

# Preprocessing Dataset

In [None]:
# every question is identified by its qid; map each qid to its question text
qid_to_question = {}
for qid1, question1, qid2, question2 in zip(df["qid1"], df["question1"], df["qid2"], df["question2"]):
    # same write order as before (qid1 first, then qid2), so later rows still win
    qid_to_question.update({qid1: question1, qid2: question2})

In [None]:
# Hold out TEST_SET_SIZE questions to act as test queries.
# Every test query is the `qid2` side of a labelled duplicate pair, so it has
# at least one known duplicate recorded in the data.
test_query_qids = set()

# Only duplicate pairs are candidates; shuffle them with a fixed random_state
# so the selection is reproducible.
df_duplicate = df[df["is_duplicate"] == 1].sample(frac=1, random_state=42)
for qid1, qid2, is_duplicate in zip(df_duplicate["qid1"], df_duplicate["qid2"], df_duplicate["is_duplicate"]):
    # Take qid2 as a test query unless its partner qid1 is already a test query.
    if is_duplicate and qid1 not in test_query_qids:
        test_query_qids.add(qid2)
    # Abort if both ends of a pair ended up in the test set.
    if qid1 in test_query_qids and qid2 in test_query_qids:
        assert False
    # Stop as soon as enough test queries have been collected.
    if len(test_query_qids) == TEST_SET_SIZE:
        break

# Sorted lists give a stable order; every qid that is not a test query is a training qid.
test_query_qids_list = sorted(test_query_qids)
train_query_qids_list = sorted(set(qid_to_question.keys()) - test_query_qids)
assert test_query_qids_list[:3] == [145, 332, 400]   # to check random state fixed

In [None]:
# extract duplicate relationship of training set
# (pairs that involve a test query are left out entirely)

qid_to_duplicate_qids = collections.defaultdict(set)
qid_to_nonduplicate_qids = collections.defaultdict(set)

for qid1, qid2, is_duplicate in zip(df["qid1"], df["qid2"], df["is_duplicate"]):
    if qid1 in test_query_qids or qid2 in test_query_qids:
        continue  # this pair touches a held-out test query
    # pick the symmetric adjacency map that matches the label
    adjacency = qid_to_duplicate_qids if is_duplicate else qid_to_nonduplicate_qids
    adjacency[qid1].add(qid2)
    adjacency[qid2].add(qid1)

In [None]:
# complete graph of duplicate relationships
# Each connected component of the training duplicate graph becomes one group
# whose members are all marked as duplicates of each other.

qid_to_duplicate_qids_complete = collections.defaultdict(set)
qid_to_qid_group_leader = {}
qid_group_leader_to_duplicate_qid_group = collections.defaultdict(set)

visited_qids = set()
for train_qid in train_query_qids_list:
    if train_qid not in visited_qids:
        # depth-first walk from `train_qid`, which acts as the leader of its group
        qid_to_qid_group_leader[train_qid] = train_qid
        current_qids_group = {train_qid}
        stack = [train_qid]

        while len(stack) > 0:
            cur_qid = stack.pop()
            newly_reached = [
                neighbour for neighbour in qid_to_duplicate_qids[cur_qid]
                if neighbour not in current_qids_group
            ]
            for nex_qid in newly_reached:
                qid_to_qid_group_leader[nex_qid] = train_qid
            stack.extend(newly_reached)
            current_qids_group.update(newly_reached)

        # complete the graph: link every ordered pair of distinct members
        for qid1, qid2 in itertools.permutations(current_qids_group, 2):
            qid_to_duplicate_qids_complete[qid1].add(qid2)

        qid_group_leader_to_duplicate_qid_group[train_qid] = current_qids_group
        visited_qids |= current_qids_group

In [None]:
# extract duplicate relationship of the test set
# For each test query: the training qids labelled as its duplicates, plus (in the
# "complete" map) every member of those training qids' duplicate groups.

test_qid_to_duplicate_qids = collections.defaultdict(set)
test_qid_to_duplicate_qids_complete = collections.defaultdict(set)

for qid1, qid2 in zip(df_duplicate["qid1"], df_duplicate["qid2"]):
    first_is_test = qid1 in test_query_qids
    second_is_test = qid2 in test_query_qids
    if first_is_test == second_is_test:
        continue  # neither end is a test query, or both are: nothing to record

    # orient the pair as (test query, training question)
    test_side_qid, train_side_qid = (qid1, qid2) if first_is_test else (qid2, qid1)

    test_qid_to_duplicate_qids[test_side_qid].add(train_side_qid)
    test_qid_to_duplicate_qids_complete[test_side_qid].add(train_side_qid)

    group_leader = qid_to_qid_group_leader[train_side_qid]
    test_qid_to_duplicate_qids_complete[test_side_qid].update(
        qid_group_leader_to_duplicate_qid_group[group_leader]
    )

In [None]:
# count inconsistencies in dataset
# An inconsistency is an ordered pair that the completed duplicate graph marks
# as duplicates while the raw labels mark it as non-duplicate.

cnt = sum(
    1
    for qid1, qid_list in qid_to_duplicate_qids_complete.items()
    for qid2 in qid_list
    if qid1 in qid_to_nonduplicate_qids[qid2]
)
print(cnt)

In [None]:
# Rows that mention at least one test query go to test_df; all other rows form train_df.
test_mask = df["qid2"].isin(test_query_qids) | df["qid1"].isin(test_query_qids)
test_df = df.loc[test_mask].copy()
train_df = df.loc[~test_mask].copy()

In [None]:
# clean up: remove helpers that later cells should not use
del cnt
del qid_to_qid_group_leader
del qid_group_leader_to_duplicate_qid_group
del test_query_qids   # unordered set; use test_query_qids_list instead
del df                # everything you may train on is in train_df

# enable use of complete graphs: from here on the short names refer to the completed maps
qid_to_duplicate_qids = qid_to_duplicate_qids_complete
test_qid_to_duplicate_qids = test_qid_to_duplicate_qids_complete

# Evaluation Metrics

In [None]:
def method_random_guess(test_qid):
    """Baseline ranker that ignores the query.

    Draws RANKED_LIST_SIZE qids from train_query_qids_list with replacement
    (random.choices), so the same qid may appear more than once.
    """
    guessed_qids = random.choices(train_query_qids_list, k=RANKED_LIST_SIZE)
    return guessed_qids

# One ranked list per test query: 1000 x 100 (the ranked list of similar qn for each of the 1000 test qns)
method_random_guess_ranklists = list(map(method_random_guess, test_query_qids_list))
print(len(method_random_guess_ranklists), len(method_random_guess_ranklists[0]), sep="\n")

In [None]:
def sample_results(test_qid, method_ranklist, method_scores=[0]*RANKED_LIST_SIZE, num_to_show=8):
    """Print the query text followed by the top `num_to_show` ranked results.

    Each result line shows its 1-based rank, its score, whether the result is a
    registered duplicate of the query ("Registered" / "Unregistered"), and the
    result's question text. Scores are paired with results positionally.
    """
    print("Query: {}".format(qid_to_question[test_qid]))
    shown_qids = method_ranklist[:num_to_show]
    rank = 0
    for score, result_qid in zip(method_scores, shown_qids):
        rank += 1
        if result_qid in test_qid_to_duplicate_qids[test_qid]:
            relevance = "Registered"
        else:
            relevance = "Unregistered"
        print("Rank {} - Score {} - {}:  \t{}".format(rank, score, relevance, qid_to_question[result_qid]))

# Show what a random-guess ranking looks like for the first test query.
first_test_qid = test_query_qids_list[0]
sample_results(first_test_qid, method_random_guess(first_test_qid))

In [None]:
def evaluation_with_basic_methods(method_ranklists):
    """Plot and summarise the rank of the first duplicate found for each test query.

    Parameters
    ----------
    method_ranklists : sequence of shape (TEST_SET_SIZE, RANKED_LIST_SIZE)
        One ranked list of candidate qids per test query, in the order of
        test_query_qids_list.

    Prints the median, 75th and 90th percentile, the geometric mean of the
    ranks, and the proportion of queries for which no duplicate appears in the
    ranked list. A query without any hit is assigned rank RANKED_LIST_SIZE for
    the rank statistics.
    """
    assert np.array(method_ranklists).shape == (TEST_SET_SIZE, RANKED_LIST_SIZE)
    # Rank of the first duplicate per query; RANKED_LIST_SIZE stands in when nothing is found.
    ranks = [RANKED_LIST_SIZE]*TEST_SET_SIZE
    # Misses are counted separately so that a genuine hit at the last rank is not
    # reported as "out of result".
    num_out_of_result = 0
    for i, (test_qid, ranklist) in enumerate(zip(test_query_qids_list, method_ranklists)):
        for rank, result_qid in enumerate(ranklist, start=1):
            if result_qid in test_qid_to_duplicate_qids[test_qid]:
                ranks[i] = rank  # rank of the first duplicate identified
                break
        else:
            # the loop finished without a break: no duplicate in the ranked list
            num_out_of_result += 1

    plt.figure(figsize=(14,4))
    plt.title("Rank of duplicate questions")
    # Edges 1..RANKED_LIST_SIZE+1 give every integer rank its own bin. The previous
    # edges (0..RANKED_LIST_SIZE) put the two highest ranks in one bin, because the
    # last histogram bin includes its right edge.
    plt.hist(ranks, bins=np.arange(1, RANKED_LIST_SIZE+2))
    plt.xlabel("Rank")
    plt.ylabel("Frequency")
    plt.show()

    print("Median rank: {}".format(np.median(ranks)))
    print("Bottom 25% rank: {}".format(np.percentile(ranks, 75)))
    print("Bottom 10% rank: {}".format(np.percentile(ranks, 90)))
    print("Logarithmic average rank: {:.2f}".format(np.exp(np.mean(np.log(ranks)))))
    print("Proportion out of result: {:.3f}".format(num_out_of_result/len(ranks)))

In [None]:
# Reference point: rank-based metrics of the random-guess baseline.
evaluation_with_basic_methods(method_random_guess_ranklists)

In [None]:
# def evaluation_with_auc(method_ranklists, k=10):
#     # df version for more complicated computations if needed
#     assert np.array(method_ranklists).shape == (TEST_SET_SIZE, RANKED_LIST_SIZE) # method_ranklists size is (1000,100)
# #     ranks = [RANKED_LIST_SIZE]*TEST_SET_SIZE
#     auc_df = pd.DataFrame({"rank":[i for i in range(1,k+1)], "count":[0 for i in range(1,k+1)]})
    
#     ## Identify duplicates among top K ranks for each test
#     for i, (test_qid, ranklist) in enumerate(zip(test_query_qids_list, method_ranklists)): # iter over 1->1000 tests
#         # construct df to hold results for top k results
#         df = pd.DataFrame({"rank":[i for i in range(1,k+1)], "result_qid":ranklist[:k]})
#         df["is_duplicate"] = df["result_qid"].apply(lambda result_qid: 1 if result_qid in test_qid_to_duplicate_qids[test_qid] else 0)
#         df = df.assign(is_duplicate=[random.choice([0,1]) for i in range(1,k+1)]) # uncomment this to test if it works
#         auc_df["count"] += df["is_duplicate"]
    
#     ## Calculate AUC
#     print(auc_df)
#     auc = sum(auc_df["count"])
#     best_auc_at_k = TEST_SET_SIZE * k
#     return auc/best_auc_at_k
    
# evaluation_with_auc(method_random_guess_ranklists)

In [None]:
def evaluation_with_auc(method_ranklists, k=10, weights=None):
    """Weighted fraction of the top-k result slots that hold a registered duplicate.

    Comparing models with this score only makes sense if `k` (and `weights`)
    are kept constant between comparisons.

    Parameters
    ----------
    method_ranklists : sequence of shape (TEST_SET_SIZE, RANKED_LIST_SIZE)
        One ranked list of candidate qids per test query, in the order of
        test_query_qids_list.
    k : int
        Number of top ranks to inspect; must satisfy 1 <= k <= RANKED_LIST_SIZE.
    weights : sequence of k numbers, optional
        Weight of each rank (index 0 is rank 1). Defaults to 1 for every rank.
        Pass decreasing weights to make higher ranks count more.

    Returns
    -------
    float
        Score in [0, 1] for non-negative weights; 1 means every one of the
        top-k slots of every query holds a duplicate.

    Raises
    ------
    ValueError
        If `k` is out of range or `weights` does not have exactly k entries.
    """
    assert np.array(method_ranklists).shape == (TEST_SET_SIZE, RANKED_LIST_SIZE) # method_ranklists size is (1000,100)
    if not 0 < k <= RANKED_LIST_SIZE:
        raise ValueError("k must be between 1 and RANKED_LIST_SIZE, got {}".format(k))

    ## Init count and weights
    counts = np.zeros(k, dtype=int) # number of duplicates found at each of the top k ranks
    if weights is None:  # `is None`, because `== None` is elementwise on arrays
        weights = np.ones(k)
    else:
        # float dtype so fractional weights work; exact length so a short list
        # cannot silently broadcast and skew the normalisation
        weights = np.asarray(weights, dtype=float)
        if weights.shape != (k,):
            raise ValueError("weights must contain exactly k={} values, got shape {}".format(k, weights.shape))

    ## Identify duplicates among top K ranks for each test
    for test_qid, ranklist in zip(test_query_qids_list, method_ranklists): # iter over 1->1000 tests
        duplicate_qids = test_qid_to_duplicate_qids[test_qid]
        is_duplicate = np.array([1 if (result_qid in duplicate_qids) else 0 for result_qid in ranklist[:k]]) # 1 if result is a duplicate
#         is_duplicate = np.array([random.choice([0,1]) for result_qid in ranklist[:k]]) # TEST LINE: uncomment this to test if func works; expect auc=0.5
        counts += is_duplicate # increment the counts

    ## Calculate AUC
    auc = np.dot(counts, weights) # akin to scaling the breadth of each hist bar by a corresponding weight factor
    best_auc_at_k = TEST_SET_SIZE * weights.sum() # assumes for each test qn: each of the top k has a duplicate found
    return auc/best_auc_at_k # between [0,1], 1 is perfect

# Smoke-check the AUC metric on the random baseline: once with the default
# uniform weights, once with weights that decrease from rank 1 to rank 10.
print("Tests (rmb to uncomment TEST LINE before running test)")
for rank_weights in (None, [10,9,8,7,6,5,4,3,2,1]):
    print(evaluation_with_auc(method_random_guess_ranklists, weights=rank_weights))

In [None]:
def single_r_precision(test_qid, test_qid_to_duplicate_qids_dict, ranklist):
    """R-precision of one ranked list; use this to check a single test query.

    With R = number of known duplicates of `test_qid`, R-precision is the
    fraction of the first R entries of `ranklist` that are known duplicates.

    Parameters
    ----------
    test_qid : hashable
        The query's qid.
    test_qid_to_duplicate_qids_dict : mapping
        Maps a test qid to the collection of qids registered as its duplicates.
        The lookup uses this argument (not a module-level global), so the
        function follows whatever train/test split the caller passes in.
    ranklist : sequence
        Ranked candidate qids for the query, best first.

    Returns
    -------
    tuple
        (num_duplicate, num_duplicates_in_top_r, r_precision); (0, 0, 0) when
        the query has no registered duplicates.
    """
    # .get() so a plain dict without the key behaves like "no duplicates"
    duplicate_qids = test_qid_to_duplicate_qids_dict.get(test_qid, ())
    num_duplicate = len(duplicate_qids)
    if num_duplicate == 0:
        return 0, 0, 0
    top_r = ranklist[:num_duplicate]
    num_duplicates_in_top_r = sum(1 for result_qid in top_r if result_qid in duplicate_qids)
    r_precision = num_duplicates_in_top_r/num_duplicate
    return num_duplicate, num_duplicates_in_top_r, r_precision

def evaluation_with_r_precision(method_ranklists, report_k=0):
    """Average R-precision of a method over all test queries.

    Parameters
    ----------
    method_ranklists : sequence of shape (TEST_SET_SIZE, RANKED_LIST_SIZE)
        One ranked list of candidate qids per test query, in the order of
        test_query_qids_list.
    report_k : int
        If positive, also return the test qids of the `report_k` queries with
        the lowest R-precision (in no particular order) for error analysis.

    Returns
    -------
    (avg_r_precision, weighted_avg_r_precision) when `report_k` is 0 or negative,
    otherwise (avg_r_precision, weighted_avg_r_precision, k_lowest_r_precision_test_qids).
    The weighted average weights each query by its number of registered duplicates.
    """
    assert np.array(method_ranklists).shape == (TEST_SET_SIZE, RANKED_LIST_SIZE) # method_ranklists size is (1000,100)
    total_num_duplicates = np.zeros(TEST_SET_SIZE, dtype=int)
    # Must be a float array: storing a fraction in an integer array truncates it to 0.
    r_precision = np.zeros(TEST_SET_SIZE, dtype=float)

    ## Iter over 1->1000 tests
    for i, (test_qid, ranklist) in enumerate(zip(test_query_qids_list, method_ranklists)):
        total_num_duplicates[i], _, r_precision[i] = single_r_precision(test_qid, test_qid_to_duplicate_qids, ranklist)

    ## Calculate metrics
    avg_r_precision = r_precision.mean()
    duplicates_overall = total_num_duplicates.sum()
    if duplicates_overall:
        weighted_avg_r_precision = np.multiply(r_precision, total_num_duplicates).sum() / duplicates_overall
    else:
        weighted_avg_r_precision = 0.0  # no registered duplicates at all: avoid 0/0

    if not report_k or report_k <= 0:
        return avg_r_precision, weighted_avg_r_precision

    # Error analysis: pick the report_k queries with the lowest R-precision.
    # Uses `report_k` itself (not an unrelated global) and clamps it so that
    # argpartition always receives a valid pivot index.
    num_reported = min(report_k, TEST_SET_SIZE)
    pivot = min(num_reported, TEST_SET_SIZE - 1)
    k_lowest_r_precision_idx = np.argpartition(r_precision, pivot)[:num_reported]
    k_lowest_r_precision_test_qids = np.array(test_query_qids_list)[k_lowest_r_precision_idx]
    return avg_r_precision, weighted_avg_r_precision, k_lowest_r_precision_test_qids
       
# Reference point: R-precision of the random-guess baseline (shown as the cell output).
evaluation_with_r_precision(method_random_guess_ranklists)
    
    

In [None]:
def analyse_qid(test_qid, method_ranklists=None):
    """Collect what is known about one test query for error analysis.

    Parameters
    ----------
    test_qid : hashable
        The query's qid; must be in test_query_qids_list when `method_ranklists`
        is given.
    method_ranklists : sequence, optional
        Ranked lists of a method, one per test query in the order of
        test_query_qids_list. When given (and non-empty), the query's
        R-precision tuple from single_r_precision is added to the result.

    Returns
    -------
    dict with keys "test_qn", "dup_qids", "dup_qns" and, optionally, "r_prec".
    """
    qn = qid_to_question[test_qid]
    dup_qids = test_qid_to_duplicate_qids[test_qid]
    dup_qns = [qid_to_question[qid] for qid in dup_qids]

    return_dt = {"test_qn": qn, "dup_qids": dup_qids, "dup_qns":dup_qns}
    # Explicit None/length test: the truth value of a multi-element NumPy array
    # is ambiguous and raises, so `if method_ranklists:` fails for array input.
    if method_ranklists is not None and len(method_ranklists) > 0:
        idx_ranklist = test_query_qids_list.index(test_qid)
        ranklist = method_ranklists[idx_ranklist]
        r_prec = single_r_precision(test_qid, test_qid_to_duplicate_qids, ranklist)
        return_dt["r_prec" ] = r_prec

    return return_dt

Lists available for all
- `train_query_qids_list`
- `test_query_qids_list`

Objects you are allowed to train with
- `qid_to_question`  (only the questions NOT in `test_query_qids_list`)
- `qid_to_duplicate_qids`
- `qid_to_nonduplicate_qids`
- `train_df`

Objects you are NOT allowed to train with
- `qid_to_question`  (only the questions in `test_query_qids_list`)
- `test_qid_to_duplicate_qids`
- `test_df`

# Baseline Method
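Rank the training questions by the number of words they share with the query, after lower-casing and stopword removal (no stemming is applied, so "root words" here simply means the tokens that survive stopword removal).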

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# The questions are English, so use only the English stopword list. Calling
# stopwords.words() without a language returns the lists of every language in
# the corpus, which also removes ordinary English content words that happen to
# be stopwords elsewhere.
stopword_set = set(stopwords.words("english"))
stopword_set.add("?")  # almost every question ends with "?", so it carries no signal

def tokenise(sentence):
    """Return the set of distinct lower-cased tokens of `sentence` that are not in stopword_set."""
    distinct_tokens = set(word_tokenize(sentence.lower()))
    return distinct_tokens - stopword_set

In [None]:
def preprocess(train_query_qids_list):
    """Build an inverted index from token to the qids whose question contains it.

    Each qid is indexed at most once even if it occurs several times in
    `train_query_qids_list`. Progress is shown with a tqdm bar.
    """
    inverted_index = collections.defaultdict(list)
    already_indexed = set()
    for qid in tqdm.tqdm(train_query_qids_list):
        if qid not in already_indexed:
            already_indexed.add(qid)
            question_tokens = tokenise(qid_to_question[qid])
            for token in question_tokens:
                inverted_index[token].append(qid)
    return inverted_index

In [None]:
# Build the token -> qids inverted index over the training questions.
token_to_qids = preprocess(train_query_qids_list)

In [None]:
def method_overlapping_root_word_count(query_qid):
    """Rank training questions by how many tokens they share with the query.

    Parameters
    ----------
    query_qid : hashable
        qid of the query question (looked up in qid_to_question).

    Returns
    -------
    (ranked_qids, scores) : two lists of equal length
        The top RANKED_LIST_SIZE training qids, best first, and their scores.
        A score is the number of query tokens the question contains (plus 0.01
        if the qid was also drawn as padding). Ties are broken at random.
    """
    query_tokens = tokenise(qid_to_question[query_qid])
    counter = collections.Counter()

    # Pad with distinct random training qids at a tiny score so the result has
    # RANKED_LIST_SIZE entries even when few questions share a token with the
    # query. Sampling without replacement guarantees the padding is distinct.
    num_padding = min(RANKED_LIST_SIZE, len(train_query_qids_list))
    for dummy_qid in random.sample(train_query_qids_list, num_padding):
        counter[dummy_qid] = 0.01

    for query_token in query_tokens:
        # update() adds the counts in place without rescanning the whole counter,
        # and .get() avoids inserting empty entries into the shared index for
        # tokens that never occur in the training questions.
        counter.update(token_to_qids.get(query_token, ()))

    query_results = list(counter.items())
    random.shuffle(query_results)  # so that qids are not ordered
    # sorted() is stable, so equal scores keep their shuffled (random) order
    query_results = sorted(query_results, key=lambda x:x[1], reverse=True)[:RANKED_LIST_SIZE]

    return [x[0] for x in query_results], [x[1] for x in query_results]

In [None]:
# Show the baseline's top results (with scores) for the first test query.
example_qid = test_query_qids_list[0]
example_ranklist, example_scores = method_overlapping_root_word_count(example_qid)
sample_results(example_qid, example_ranklist, example_scores)

In [None]:
# Run the baseline on every test query and keep only the ranked qids (scores are dropped).
method_overlapping_root_word_count_ranklists = []
for test_qid in tqdm.tqdm(test_query_qids_list):
    ranked_qids = method_overlapping_root_word_count(test_qid)[0]
    method_overlapping_root_word_count_ranklists.append(ranked_qids)

In [None]:
# Rank-based metrics of the word-overlap baseline.
evaluation_with_basic_methods(method_overlapping_root_word_count_ranklists)

In [None]:
# Unweighted top-k duplicate rate of the baseline at three cut-offs.
# `k` and `auc` keep the values of the last cut-off after the loop.
for k in (2, 9, 51):
    auc = evaluation_with_auc(method_overlapping_root_word_count_ranklists, k=k, weights=[1] * k)
    print(f"{auc:.2%} of top {k} results are duplicates")

In [None]:
# R-precision of the baseline, plus the 10 test queries it handles worst.
r_precision_report = evaluation_with_r_precision(
    method_overlapping_root_word_count_ranklists,
    report_k=10,
)
avg_rp, weighted_avg_rp, k_lowest_r_precision_test_qids = r_precision_report
print(f"Average R-Precision = {avg_rp:.2%}; Weighted Average R-Precision by proportion of duplicates = {weighted_avg_rp:.2%}")
if weighted_avg_rp < avg_rp:
    print("A higher average R-Precisions suggests that there are many test queries with high R-Precision but there are some test queries with high number of duplicates that model is not effective with.")


In [None]:
# View the question text of each of the lowest-R-precision test qids
lowest_r_precision_questions = [qid_to_question[qid] for qid in k_lowest_r_precision_test_qids]
lowest_r_precision_questions

In [None]:
# Example: pick a qid to analyse
# Takes the first of the lowest-R-precision test qids and shows its question,
# its registered duplicates and its R-precision under the baseline's rankings.
qid_to_analyse = k_lowest_r_precision_test_qids[0]
analyse_qid(qid_to_analyse, method_ranklists=method_overlapping_root_word_count_ranklists)