In [None]:
import os, collections, random, itertools

from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the Quora question-pairs training set.
df = pd.read_csv("/kaggle/input/quora-question-pairs/train.csv.zip")

# Cast both question columns to str so a missing value becomes the literal
# string "nan" rather than a float NaN.
for text_col in ("question1", "question2"):
    df[text_col] = df[text_col].astype(str)

# Shift both id columns down by one; the next cell uses the shifted ids as
# positions in a dense list (presumably the raw ids are 1-based - confirm).
for id_col in ("qid1", "qid2"):
    df[id_col] -= 1

In [None]:
# Map every question id to its text. Each row contributes (qid1, question1)
# and then (qid2, question2); a later occurrence of an id overwrites an earlier one.
qid_to_question = dict(
    itertools.chain.from_iterable(
        ((first_id, first_text), (second_id, second_text))
        for first_id, second_id, first_text, second_text
        in zip(df["qid1"], df["qid2"], df["question1"], df["question2"])
    )
)

# Dense list in which position == qid; a gap in the ids raises KeyError here,
# and the assert confirms that no id was skipped or duplicated.
question_count = max(qid_to_question) + 1
questions_by_idx = list(map(qid_to_question.__getitem__, range(question_count)))
assert len(questions_by_idx) == len(qid_to_question)

# Obtain tokenised and spell-checked questions as token lists

## spaCy Tokeniser

In [None]:
!pip install -U spacy==2.3.5

In [None]:
import spacy
from spacy.tokenizer import Tokenizer # https://spacy.io/api/tokenizer

# Fetch the small English model through the shell, then load it. `nlp` is
# reused below for its vocab and, later, for computing question vectors.
!python3 -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
print("Loaded en_core_web_sm")
# Alternative kept for reference: a blank English pipeline.
# from spacy.lang.en import English
# nlp = English()

# Tokenizer built from the vocab only - no prefix/suffix/infix rules are passed.
# NOTE(review): this presumably splits on whitespace alone, which would match
# the examples listed in `tokenise`'s docstring - confirm against the spaCy docs.
tokenizer = Tokenizer(nlp.vocab)
tokenizer.add_special_case("[math]", [{"ORTH": "[math]"}]) # see qid=7: '[math]23^{24}[/math]' becomes one token
# add more special cases here if found

def tokenise(text, lower=False, split_last_punc=True):
    """
    Return the list of token strings for a question text.

    Tokens are produced by the module-level `tokenizer` (see its instantiation
    code for special cases, or to add more). Examples of what ends up as a
    single token: each stand-alone punctuation mark, "\\n", "'s", '(Koh-i-Noor)'.

    Args:
        text: the question string.
        lower: lowercase `text` before tokenising. Lowercase only after spell
            check, so leave this False when the tokens are still to be checked.
        split_last_punc: split a trailing "!", "?", "," or ":" off a longer
            token, e.g. "me?" => "me", "?".

    Returns:
        list of str tokens.
    """
    if lower:
        text = text.lower()
    token_list = [token.text for token in tokenizer(text)]

    if not split_last_punc:
        return token_list

    trailing_punct = ("!", "?", ",", ":")
    split_tokens = []
    for token in token_list:
        # Split only when something remains in front of the punctuation mark:
        # a token that is just "?" must stay "?" and not become "", "?".
        if len(token) > 1 and token[-1] in trailing_punct:
            split_tokens.append(token[:-1])
            split_tokens.append(token[-1])
        else:
            split_tokens.append(token)
    return split_tokens

## SymSpell Spell Checker

In [None]:
!pip install symspellpy
from symspellpy.symspellpy import SymSpell, Verbosity  # https://github.com/mammothb/symspellpy
import pkg_resources

# instantiate spellchecker
# https://symspellpy.readthedocs.io/en/latest/api/symspellpy.html
sym = SymSpell(max_dictionary_edit_distance=2, prefix_length=7, count_threshold=1)

# Frequency dictionary that ships inside the symspellpy package.
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")

# The two integers are the column positions of the term (0) and its count (1).
# load_dictionary reports a missing file by returning False instead of raising,
# so fail loudly here rather than spell-check against an empty dictionary.
if not sym.load_dictionary(dictionary_path, 0, 1): # might take a short while
    raise FileNotFoundError(f"SymSpell dictionary not found: {dictionary_path}")

## Spell checker types
The choice of spell checker - single-word or compound - depends on the pipeline, and both have trade-offs. I prefer `spellcheck_single`, after adding extra rules to make it more robust.

In [None]:
def spellcheck_single(word):
    """
    Return the top spelling correction for a single word.

    Args:
        word: one token (str).

    Returns:
        - `word` unchanged if it contains any non-ASCII character;
        - `word.lower()` if that lowercase form is among the suggestions
          (the input is treated as a legitimate word and is not corrected);
        - otherwise the first suggestion returned by the module-level `sym`.
          Because `include_unknown=True`, a word with no correction within the
          edit distance comes back as is.
    """
    # do not spellcheck non ascii words e.g. シ
    if not word.isascii():
        return word

    suggestions = sym.lookup(
        word,
        Verbosity.CLOSEST,
        max_edit_distance=2,
        include_unknown=True,  # a misspelled word with no found corrections is returned as is
        ignore_token=r"[:,.!?\\-]",  # use if want to avoid correcting certain phrases
    )
    # `term` is the public accessor on a suggestion; avoid the private `_term`.
    suggested_words = [suggestion.term for suggestion in suggestions]

    # Defensive: never index into an empty suggestion list.
    if not suggested_words:
        return word

    # do not correct if the input (ignoring case) is itself a suggested word
    word_lower = word.lower()
    if word_lower in suggested_words:
        return word_lower
    return suggested_words[0]  # top suggestion

def spellcheck_compound(sent):
    """
    Spell-check a whole sentence with the module-level `sym`.

    Args:
        sent: the sentence (str).

    Returns:
        The text of the top suggestion from `lookup_compound`.
    """
    suggestions = sym.lookup_compound(sent, max_edit_distance=2)
    # `term` is the public accessor on a suggestion; avoid the private `_term`.
    return suggestions[0].term

In [None]:
# Check behaviour of spell checker: (input, expected output, failure message)
spellcheck_expectations = [
    ("What", "what", "Common word should be preserved"),
    ("DNS", "DNS", "Abbreviations should be preserved"),
    ("シ", "シ", "Non ascii is preserved"),
    ("?![].,", "?![].,", "Punctuation preserved"),
]
for raw_word, expected_word, failure_message in spellcheck_expectations:
    assert spellcheck_single(raw_word) == expected_word, failure_message

# Spell checker & tokeniser pipelines
1. Spellcheck compound then tokenise
2. Tokenise then spellcheck each word

See last section for comparison of pipelines

In [None]:
# Pipelines
# always lower only after spell check is done
def spellcheck_then_tokenise(sent):
    """
    Pipeline 1: compound spell-check the whole sentence, then tokenise it.

    Lowercasing is requested from `tokenise`, i.e. it happens only after the
    spell check has run. Returns the resulting list of tokens.
    """
    return tokenise(spellcheck_compound(sent), lower=True)

def tokenise_then_spellcheck(sent):
    """
    Pipeline 2: tokenise the sentence, then spell-check each token on its own.

    Each checked token is lowercased only after its spell check. The original
    author noted this as 8 times faster than `spellcheck_then_tokenise`.
    Returns the list of checked, lowercased tokens.
    """
    checked_tokens = []
    for raw_token in tokenise(sent):
        checked_tokens.append(spellcheck_single(raw_token).lower())
    return checked_tokens

# Convert dataset
1. `qid_to_processed_token_list`
2. `token_to_qid`
3. `qid_to_vec` using spaCy's built-in model

In [None]:
# Process the full set with variable tokenise_pipeline_func
import time
def pe(start, end, num_iter=1):
    """
    Print the elapsed time between two timestamps and the time per iteration.

    Args:
        start: start timestamp in seconds (e.g. from `time.time()`).
        end: end timestamp in seconds.
        num_iter: number of iterations the interval covered.
    """
    elapsed = end - start
    per_iteration = elapsed / num_iter
    print(f"Duration: {elapsed:.5f}s Time/Iter: {per_iteration:.5f}")

import pickle

def convert_dataset(tokenise_pipeline_func, dir="/kaggle/working"):
    """
    Run a tokenising pipeline over every question and pickle two lookup tables.

    Reads the module-level `questions_by_idx` and writes, into `dir`:
      - qid_to_processed_token_list_<pipeline name>.pkl : {qid: [token, ...]}
      - token_to_qid_<pipeline name>.pkl                : {token: [qid, ...]}
    A qid is appended once per occurrence of the token, so a token repeated
    within one question lists that qid more than once.

    Args:
        tokenise_pipeline_func: callable taking a question string and returning
            a list of tokens; its `__name__` becomes part of the file names.
        dir: output directory for the two pickle files.

    Returns:
        None. Timing for each stage is printed through `pe`.
    """
    pipeline_name = tokenise_pipeline_func.__name__

    ## qid_to_token
    qid_to_processed_token_list = {}
    start = time.time()
    # Wrap the list itself (not the enumerate object) so tqdm can read its
    # length and show a total / ETA.
    for qid, question in enumerate(tqdm(questions_by_idx)):
        qid_to_processed_token_list[qid] = tokenise_pipeline_func(question)
    end = time.time()
    pe(start, end, num_iter=len(questions_by_idx))

    fn = f"{dir}/qid_to_processed_token_list_{pipeline_name}.pkl"
    with open(fn, "wb") as f:
        pickle.dump(qid_to_processed_token_list, f)

    ## token_to_qid
    token_to_qid = {}
    start = time.time()
    for qid, token_list in tqdm(qid_to_processed_token_list.items()):
        for token in token_list:
            # start a new list for an unseen token, then append either way
            token_to_qid.setdefault(token, []).append(qid)
    end = time.time()
    pe(start, end, num_iter=len(questions_by_idx))

    fn = f"{dir}/token_to_qid_{pipeline_name}.pkl"
    with open(fn, "wb") as f:
        pickle.dump(token_to_qid, f)


In [None]:
## Convert using the 3 pipelines (run times as noted by the original author)
pipelines = (
    tokenise_then_spellcheck,  # 33 mins
    spellcheck_then_tokenise,  # 58 mins
    tokenise,                  # 25 sec
)
for pipeline in pipelines:
    convert_dataset(pipeline, dir=".")

In [None]:
## Load pkls
# Reload the two tables written by convert_dataset(tokenise_then_spellcheck, ...).
# NOTE(review): the conversion cell writes to dir="." while these paths are
# absolute; this assumes the working directory is /kaggle/working - confirm.
# pickle.load can execute arbitrary code; only load files this notebook wrote.
fn = "/kaggle/working/qid_to_processed_token_list_tokenise_then_spellcheck.pkl"
with open(fn, "rb") as f:
    qid_to_token = pickle.load(f)  # {qid: [token, ...]}

fn = "/kaggle/working/token_to_qid_tokenise_then_spellcheck.pkl"
with open(fn, "rb") as f:
    token_to_qid = pickle.load(f)  # {token: [qid, ...]}

In [None]:
# Example: the processed token list stored under qid 0 (ids were shifted down
# by one when the data was loaded).
qid_to_token[0]

In [None]:
def to_vec(token_or_list, model=None):
    """
    Convert a token string, or a list of tokens, into a word or doc vector.

    Args:
        token_or_list: a single str, or a list/tuple of token strings. A
            sequence is joined with single spaces into one sentence first.
        model: callable that maps text to an object exposing `.vector`
            (e.g. a loaded spaCy pipeline). Defaults to the module-level `nlp`.

    Returns:
        The `.vector` attribute of `model(text)`.
    """
    if model is None:
        model = nlp
    # isinstance also accepts tuples and list subclasses, which `type(x) == list` rejected
    if isinstance(token_or_list, (list, tuple)):
        # token list needs to be joined into a sentence first
        token_or_list = ' '.join(token_or_list)
    return model(token_or_list).vector

In [None]:
# One vector per question, computed from its processed token list.
qid_to_vec = {
    qid: to_vec(question_tokens)
    for qid, question_tokens in tqdm(qid_to_token.items())
}

In [None]:
# Save
# Persist the {qid: vector} mapping so later sessions can skip recomputing it.
import pickle
with open("/kaggle/working/qid_to_vec.pkl", "wb") as f:
    pickle.dump(qid_to_vec, f)

# how to load
# (pickle.load can execute arbitrary code; only load files this notebook wrote)
# with open("/kaggle/working/qid_to_vec.pkl", "rb") as f:
#     qid_to_vec = pickle.load(f)

In [None]:
# Different spacy model
# Fetch the large English model through the shell and load it as `nlp2`,
# leaving the earlier `nlp` (en_core_web_sm) untouched.
!python3 -m spacy download en_core_web_lg
nlp2 = spacy.load("en_core_web_lg")
print("Loaded en_core_web_lg")
def to_vec2(token_or_list):
    """
    Vectorise a token string or a list of tokens with the `nlp2` model.

    A list is first joined with single spaces into one sentence; any other
    input is passed to `nlp2` unchanged. Returns the resulting `.vector`.
    """
    text = ' '.join(token_or_list) if type(token_or_list) is list else token_or_list
    return nlp2(text).vector

# Recompute the question vectors with `to_vec2` (the `nlp2` model).
# Note: this rebinds `qid_to_vec`, replacing the dict built earlier with `to_vec`.
qid_to_vec = {}
for qid, token_list in tqdm(qid_to_token.items()):
    qid_to_vec[qid] = to_vec2(token_list)

# NOTE(review): these vectors come from en_core_web_lg, yet the file name ends
# in "_trf" - confirm the intended name before other notebooks depend on it.
with open("/kaggle/working/qid_to_vec_trf.pkl", "wb") as f:
    pickle.dump(qid_to_vec, f)

# Compare performance of pipelines

In [None]:
# example_qids = [7, 10, 16, 84, 94]

# just_tokenise = []
# start=time.time()
# for qid in tqdm(example_qids):
#     text = qid_to_question[qid]
#     tokens = tokenise(text, lower=True)
#     just_tokenise.append(tokens)
# end=time.time()
# pe(start,end,num_iter=len(example_qids))
    
# sc_then_t = []
# start=time.time()
# for qid in tqdm(example_qids):
#     text = qid_to_question[qid]
#     sc_then_t.append(spellcheck_then_tokenise(text))
# end=time.time()
# pe(start,end,num_iter=len(example_qids))
# dur1 = end-start

# t_then_sc = []
# start=time.time()
# for qid in tqdm(example_qids):
#     text = qid_to_question[qid]
#     t_then_sc.append(tokenise_then_spellcheck(text))
# end=time.time()
# pe(start,end,num_iter=len(example_qids))
# dur2 = end-start

# print(f"\nspellcheck_compound then tokenise takes {dur1/dur2:.3f}x longer.")

In [None]:
# # Compare token by token
# from itertools import zip_longest

# w=30
# for qid, (l0, l1, l2) in enumerate(zip(just_tokenise,sc_then_t,t_then_sc)):
#     print("\n")
#     print("qid: ",qid)
#     print('{}{}{}'.format("T".ljust(w),"SC->T".ljust(w),"T->SC"))
#     for t0, t1, t2 in zip_longest(l0, l1,l2, fillvalue = " "):
#         print('{}{}{}'.format(t0.ljust(w),t1.ljust(w),t2))
#     if qid == 10: break