In [93]:
from ordered_set import OrderedSet
import pandas as pd
import numpy as np
from numpy.typing import NDArray
from math import log
import json

In [94]:
def clean_text(text: str) -> list[str]:
    """Normalise ``text`` into a list of lowercase alphanumeric tokens.

    Letters and digits are lowercased, every whitespace character (space,
    tab, newline, ...) acts as a token separator, and all other characters
    (punctuation, symbols) are removed without leaving a gap, so
    ``"Koh-i-Noor"`` becomes the single token ``"kohinoor"``.

    Args:
        text: Raw input string.

    Returns:
        The tokens in their original order; an empty list when ``text``
        contains no alphanumeric characters.
    """
    kept = []
    for c in text:
        if c.isalnum():
            kept.append(c.lower())
        elif c.isspace():
            # Treat every kind of whitespace as a separator, not only " ";
            # otherwise "a\tb" or "a\nb" would be fused into the token "ab".
            kept.append(" ")
    return "".join(kept).split()

In [95]:
def wordFreq(words: list[str]) -> dict[str, int]:
    """Count how many times each token occurs in ``words``.

    Args:
        words: Sequence of tokens (the callers in this notebook pass the
            list produced by ``clean_text``).

    Returns:
        A dict mapping each distinct token to its number of occurrences.
        Keys appear in order of first occurrence; an empty input yields
        an empty dict.
    """
    freqs: dict[str, int] = {}
    for word in words:
        # dict.get supplies the 0 baseline for a token seen for the first time.
        freqs[word] = freqs.get(word, 0) + 1

    return freqs

In [96]:
def createBOW(s_words: list[str], t_words: list[str]) -> list[str]:
    """Build the shared vocabulary of two token lists.

    Args:
        s_words: Tokens of the first text.
        t_words: Tokens of the second text.

    Returns:
        Every distinct token exactly once, ordered by first occurrence in
        ``s_words`` followed by ``t_words``.
    """
    # dicts preserve insertion order, so fromkeys() de-duplicates while keeping
    # the first-seen order using only the standard library.
    return list(dict.fromkeys(s_words + t_words))

In [97]:
def vectorizeBOW(s: str, t: str) -> tuple[NDArray[np.int64], NDArray[np.int64]]:
    """Turn two texts into bag-of-words count vectors over a shared vocabulary.

    Both texts are tokenised with ``clean_text``; the vocabulary is the
    ordered union returned by ``createBOW``. Position ``i`` of each vector
    holds how often vocabulary word ``i`` occurs in the respective text
    (0 when it does not occur).

    Args:
        s: First text.
        t: Second text.

    Returns:
        ``(vec_s, vec_t)``: two 1-D int64 arrays of equal length (the
        vocabulary size). Both are empty when neither text yields a token.
    """
    s_words = clean_text(s)
    t_words = clean_text(t)

    vocabulary = createBOW(s_words, t_words)

    s_counts = wordFreq(s_words)
    t_counts = wordFreq(t_words)

    # The dtype is pinned so the result matches the declared int64 even for an
    # empty vocabulary (where numpy would default to float64) and on platforms
    # whose default integer is narrower.
    vec_s = np.array([s_counts.get(word, 0) for word in vocabulary], dtype=np.int64)
    vec_t = np.array([t_counts.get(word, 0) for word in vocabulary], dtype=np.int64)

    return vec_s, vec_t


In [98]:
def calculateSimilarity(vec1: NDArray[np.int64], vec2: NDArray[np.int64]) -> float:
    """Compute the cosine similarity of two equally sized 1-D vectors.

    Only ``np.dot`` and ``np.linalg.norm`` are applied to the inputs, so
    float vectors work as well as the annotated integer ones.

    Args:
        vec1: First vector.
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)`` as a built-in ``float``, or
        ``0.0`` when either vector has zero magnitude (which includes empty
        vectors), avoiding a division by zero.
    """
    # The dot product is evaluated first so that mismatched shapes still raise
    # even when one of the vectors is all zeros.
    dot = np.dot(vec1, vec2)
    vec1_mag = np.linalg.norm(vec1)
    vec2_mag = np.linalg.norm(vec2)

    if vec1_mag == 0 or vec2_mag == 0:
        return 0.0

    # Convert the numpy scalar so both branches return the same type.
    return float(dot / (vec1_mag * vec2_mag))


In [99]:
def vectorizeTFIDF(s: str, t: str) -> tuple[NDArray[np.float64], NDArray[np.float64]]:
    """Turn two texts into TF-IDF vectors over their shared vocabulary.

    The two texts form the whole corpus. For each vocabulary word:

    * term frequency is the word's count divided by the number of tokens in
      that text (0 when the word is absent from it);
    * inverse document frequency is ``ln(2 / (df + 1)) + 1``, where ``df`` is
      the number of the two texts containing the word. Every vocabulary word
      occurs in at least one text, so the weight is 1 for a word found in one
      text and ``ln(2/3) + 1`` for a word found in both.

    Args:
        s: First text.
        t: Second text.

    Returns:
        ``(vec_s, vec_t)``: two 1-D float64 arrays of equal length (the
        vocabulary size), empty when neither text yields a token.
    """
    s_words = clean_text(s)
    t_words = clean_text(t)

    words = createBOW(s_words, t_words)

    s_freq = wordFreq(s_words)
    t_freq = wordFreq(t_words)

    n_docs = 2  # the corpus consists of exactly the two input texts
    s_len = len(s_words)
    t_len = len(t_words)

    vec_s = []
    vec_t = []

    for word in words:
        in_s = word in s_freq
        in_t = word in t_freq

        # The membership guard also protects against dividing by a zero token
        # count: a word can only be present in a non-empty text.
        tf_s = s_freq[word] / s_len if in_s else 0.0
        tf_t = t_freq[word] / t_len if in_t else 0.0

        # in_s + in_t is the document frequency (booleans sum as 0/1).
        idf = log(n_docs / (in_s + in_t + 1)) + 1

        vec_s.append(tf_s * idf)
        vec_t.append(tf_t * idf)

    return (np.array(vec_s, dtype=np.float64), np.array(vec_t, dtype=np.float64))

In [100]:
def generateMarkovChain(s: str, t: str) -> tuple[NDArray[np.float64], NDArray[np.float64]]:
    """Build first-order word-transition matrices for two texts.

    Both texts are tokenised with ``clean_text`` and share the vocabulary
    returned by ``createBOW``, so the two matrices have identical shape and
    index meaning. Entry ``[i, j]`` is the fraction of occurrences of
    vocabulary word ``i`` (as a predecessor) that are immediately followed by
    word ``j`` in that text. Rows of words that never precede another word
    stay all zero.

    Args:
        s: First text.
        t: Second text.

    Returns:
        ``(s_matrix, t_matrix)``: two float64 arrays of shape ``(V, V)``,
        where ``V`` is the vocabulary size (``(0, 0)`` when neither text
        yields a token).
    """
    s_words = clean_text(s)
    t_words = clean_text(t)

    bow = createBOW(s_words, t_words)
    # One dict lookup per token instead of a linear list.index() scan.
    index_of = {word: i for i, word in enumerate(bow)}
    size = len(bow)

    matrices = []
    for words in (s_words, t_words):
        matrix = np.zeros((size, size), dtype=np.float64)

        # Count every adjacent (previous word -> current word) pair.
        for prev_word, curr_word in zip(words, words[1:]):
            matrix[index_of[prev_word], index_of[curr_word]] += 1

        # Normalise each row to sum to 1; rows without any outgoing transition
        # are skipped by the mask and therefore remain zero.
        totals = matrix.sum(axis=1, keepdims=True)
        np.divide(matrix, totals, out=matrix, where=totals != 0)

        matrices.append(matrix)

    return (matrices[0], matrices[1])

In [101]:
def calculateSimilarityBOW(data):
    """Append bag-of-words similarity columns to a question-pair frame.

    For every row, ``question1`` and ``question2`` are vectorised with
    ``vectorizeBOW`` and compared with ``calculateSimilarity``.

    Args:
        data: DataFrame with string columns ``question1`` and ``question2``.

    Returns:
        A new DataFrame: ``data`` plus the columns ``cos_BOW`` (cosine
        similarity), ``q1_vecBoW`` and ``q2_vecBoW`` (the count vectors, one
        numpy array per cell). ``data`` itself is not modified.
    """
    sims = []
    q1_vecs = []
    q2_vecs = []

    for q1, q2 in zip(data["question1"], data["question2"]):
        v1, v2 = vectorizeBOW(q1, q2)
        sims.append(calculateSimilarity(v1, v2))
        q1_vecs.append(v1)
        q2_vecs.append(v2)

    # Build the result frame once instead of enlarging it row by row with
    # .loc, and reuse data's index so the column-wise concat lines up
    # row for row.
    result = pd.DataFrame(
        {"cos_BOW": sims, "q1_vecBoW": q1_vecs, "q2_vecBoW": q2_vecs},
        index=data.index,
    )

    return pd.concat([data, result], axis=1)

In [102]:
def calculateSimilarityTFIDF(data):
    """Append TF-IDF similarity columns to a question-pair frame.

    For every row, ``question1`` and ``question2`` are vectorised with
    ``vectorizeTFIDF`` and compared with ``calculateSimilarity``.

    Args:
        data: DataFrame with string columns ``question1`` and ``question2``.

    Returns:
        A new DataFrame: ``data`` plus the columns ``cos_TFID`` (cosine
        similarity), ``q1_vecTFIDF`` and ``q2_vecTFIDF`` (the TF-IDF vectors,
        one numpy array per cell). ``data`` itself is not modified.
    """
    sims = []
    q1_vecs = []
    q2_vecs = []

    for q1, q2 in zip(data["question1"], data["question2"]):
        v1, v2 = vectorizeTFIDF(q1, q2)
        sims.append(calculateSimilarity(v1, v2))
        q1_vecs.append(v1)
        q2_vecs.append(v2)

    # Build the result frame once instead of enlarging it row by row with
    # .loc, and reuse data's index so the column-wise concat lines up
    # row for row.
    # NOTE(review): "cos_TFID" (no trailing F) is kept as-is because it is the
    # column name written to the output; rename only together with consumers.
    result = pd.DataFrame(
        {"cos_TFID": sims, "q1_vecTFIDF": q1_vecs, "q2_vecTFIDF": q2_vecs},
        index=data.index,
    )

    return pd.concat([data, result], axis=1)

In [None]:
def calculateSimilarityMarkov(data):
    """Append Markov-chain similarity columns to a question-pair frame.

    For every row, the word-transition matrices of ``question1`` and
    ``question2`` are built with ``generateMarkovChain``, flattened to 1-D
    vectors and compared with ``calculateSimilarity``.

    Args:
        data: DataFrame with string columns ``question1`` and ``question2``.

    Returns:
        A new DataFrame: ``data`` plus the columns ``cos_MARK`` (cosine
        similarity), ``q1_vecMARK`` and ``q2_vecMARK`` (the flattened
        transition matrices, one numpy array per cell). ``data`` itself is
        not modified.
    """
    sims = []
    q1_vecs = []
    q2_vecs = []

    for q1, q2 in zip(data["question1"], data["question2"]):
        m1, m2 = generateMarkovChain(q1, q2)
        # ndarray.flatten() already returns a fresh 1-D copy in row-major
        # order; no detour through np.matrix or an extra np.array() is needed.
        v1 = m1.flatten()
        v2 = m2.flatten()
        sims.append(calculateSimilarity(v1, v2))
        q1_vecs.append(v1)
        q2_vecs.append(v2)

    # Build the result frame once instead of enlarging it row by row with
    # .loc, and reuse data's index so the column-wise concat lines up
    # row for row.
    result = pd.DataFrame(
        {"cos_MARK": sims, "q1_vecMARK": q1_vecs, "q2_vecMARK": q2_vecs},
        index=data.index,
    )

    return pd.concat([data, result], axis=1)

In [104]:
# Pipeline: load the question pairs, drop every row with a missing value,
# keep the first 1000 remaining rows, then add the three groups of
# similarity columns (bag-of-words, TF-IDF, Markov chain) in that order.
data = (
    pd.read_csv("quora.csv")
    .dropna()
    .iloc[:1000]
    .pipe(calculateSimilarityBOW)
    .pipe(calculateSimilarityTFIDF)
    .pipe(calculateSimilarityMarkov)
)

# Persist the scored frame, index included.
# NOTE(review): the vector columns hold numpy arrays, which to_csv presumably
# writes as their text form - confirm downstream readers can parse that.
data.to_csv("similarity.csv")

  data = pd.read_csv("quora.csv")


In [105]:
# Preview the first five scored rows (five is also head()'s default).
data.head(n=5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,cos_BOW,q1_vecBoW,q2_vecBoW,cos_TFID,q1_vecTFIDF,q2_vecTFIDF,cos_MARK,q1_vecMARK,q2_vecMARK
0,0.0,1.0,2.0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0.0,0.944911,"[1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1]","[1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 0]",0.900277,"[0.04246677799227397, 0.04246677799227397, 0.0...","[0.049544574324319635, 0.049544574324319635, 0...",0.923381,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1.0,3.0,4.0,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0.0,0.613572,"[1, 1, 1, 1, 1, 2, 1, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 2, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1]",0.372365,"[0.07431686148647945, 0.125, 0.074316861486479...","[0.04573345322244889, 0.0, 0.09146690644489779...",0.071067,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
2,2.0,5.0,6.0,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0.0,0.338062,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...","[1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, ...",0.1537,"[0.04246677799227397, 0.04246677799227397, 0.0...","[0.05945348918918356, 0.05945348918918356, 0.0...",0.09245,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,3.0,7.0,8.0,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0.0,0.0,"[1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, ...",0.0,"[0.09090909090909091, 0.09090909090909091, 0.1...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,4.0,9.0,10.0,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0.0,0.419314,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]","[1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1]",0.208544,"[0.04573345322244889, 0.07692307692307693, 0.0...","[0.08493355598454794, 0.0, 0.0, 0.084933555984...",0.0,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
