In [None]:
import sys, time, json, re, random
from anytree import Node, RenderTree, PreOrderIter, PostOrderIter, Walker
from anytree.exporter import JsonExporter
from anytree.util import commonancestors
import pandas as pd
from tqdm import tqdm
from itertools import product, combinations
from scipy import spatial
from sentence_transformers import SentenceTransformer

# Raise the interpreter's recursion limit: this notebook defines recursive
# helpers (e.g. argumentTree2argumentPairTree recurses once per tree level).
sys.setrecursionlimit(100000)

In [None]:

def group_arguments(tableau):
    """Merge continuation lines into the argument they belong to.

    A line containing a ``Pro:`` or ``Con:`` marker starts a new argument.
    Every other line is appended (separated by a single space) to the
    argument currently being built. The first line always opens the first
    group, whether or not it carries a marker.

    The grouping is done iteratively so that debates with thousands of
    arguments neither hit the recursion limit nor repeatedly copy the list.

    Args:
        tableau (list[str]): Lines of the debate, in file order.

    Returns:
        list[str]: One string per argument, in input order. An empty input
        yields an empty list.
    """
    if not tableau:
        return []

    stance_marker = re.compile(r"(Con|Pro)(?::)")

    # Each group is the list of physical lines forming one argument.
    groups = [[tableau[0]]]
    for line in tableau[1:]:
        if stance_marker.search(line) is None:
            # No stance marker: this line continues the current argument.
            groups[-1].append(line)
        else:
            groups.append([line])

    return [" ".join(parts) for parts in groups]


In [None]:
def rawKialo2Json(input_file):
    """Parse a Kialo debate exported as text and build its argument tree.

    Based on the kialoParser script by Edoardo Guido
    (edoardo.guido.93@gmail.com, https://edoardoguido.com).

    Args:
        input_file (str): Filename of the Kialo debate downloaded as txt.

    Returns:
        dict: anytree ``Node`` objects keyed by their tree position string
        (the numeric prefix of each argument line). The root is stored under
        ``'1.'`` with ``node_id=-1``; every other node carries the attributes
        ``tree``, ``level``, ``stance``, ``toneInput``, ``subject`` and
        ``node_id``.

    Raises:
        ValueError: If an argument line has no position prefix or no
            ``Pro:``/``Con:`` marker followed by text.
        IndexError: If the file has fewer than four non-empty lines before
            the ``Sources:`` section.
        KeyError: If an argument's parent position was not seen earlier.
    """
    # Read everything before the "Sources:" section, dropping blank lines.
    # NOTE(review): Kialo exports are assumed to be UTF-8 — confirm.
    lines = []
    with open(input_file, 'r', encoding='utf-8') as fi:
        for raw_line in fi:
            if raw_line.startswith("Sources:"):
                break
            stripped = raw_line.strip()
            if stripped:
                lines.append(stripped)

    # The first four non-empty lines are treated as the header and removed;
    # the second of them is used as the subject of the debate.
    header = [lines.pop(0) for _ in range(4)]
    subject = header[1]

    lines = group_arguments(lines)

    # One dictionary per parsed argument, in file order.
    result = []
    for counter, line in enumerate(lines, start=1):
        # Position of the comment in the tree (leading numeric prefix).
        tree = re.search(r"^(\d{1,}.)+", line)
        # Whether the comment is Pro or Con.
        stance = re.search(r"(Con|Pro)(?::)", line)
        # Text of the comment (everything after the stance marker).
        content = re.search(r"((Con|Pro)(?::\s))(.*)", line)

        if tree is None or stance is None or content is None:
            raise ValueError(
                f"Cannot parse argument {counter} of {input_file!r}: {line!r}"
            )

        # The level is the number of numeric segments in the position,
        # minus one (so direct children of the root '1.' are level 1).
        parsed = re.findall(r"(\d{1,}(?=\.))+", tree.group())
        level = len(parsed) - 1

        result.append({
            "Tree": tree.group(),
            "Level": level,
            "Stance": stance.group(1),
            "ToneInput": content.group(3),
            "node_id": subject.replace(" ", "_") + "_" + str(counter)
        })

    # The root has no line of its own in `result`, so add it explicitly.
    trees = ['1.'] + [entry["Tree"] for entry in result]
    resultAsDict = {entry["Tree"]: entry for entry in result}

    id2Node = {}
    for idNode in trees:
        if idNode == '1.':
            id2Node[idNode] = Node(idNode, node_id=-1)
            continue

        # Parent position = own position without its last numeric segment.
        parentId = idNode[:idNode[:-1].rfind(".") + 1]
        entry = resultAsDict[idNode]
        id2Node[idNode] = Node(
            idNode,
            parent=id2Node[parentId],
            tree=entry["Tree"],
            level=entry["Level"],
            stance=entry["Stance"],
            toneInput=entry["ToneInput"],
            subject=subject,
            node_id=entry["node_id"],
        )

    return id2Node

In [None]:
def pickRandomNodePair(tree):
    """Draw two distinct non-root node names at random.

    Args:
        tree (dict): dictionary of nodes; the root is stored under '1.'.

    Returns:
        tuple(str, str): Pair of node names
    """
    # Pre-order traversal yields the root first, so slicing it off leaves
    # only the argument nodes as candidates.
    candidates = [node.name for node in PreOrderIter(tree['1.'])][1:]
    first_name, second_name = random.sample(candidates, 2)
    return first_name, second_name

def getCommonAncestor(tree, node1_name, node2_name):
    """Return the ancestors shared by the two named nodes.

    Args:
        tree (dict): dictionary of nodes
        node1_name (str): name of the first node
        node2_name (str): name of the second node

    Returns:
        The result of ``anytree.util.commonancestors`` for both nodes.
    """
    first_node = tree[node1_name]
    second_node = tree[node2_name]
    return commonancestors(first_node, second_node)

def distanceBetweenPair(tree, node1_name, node2_name):
    """Measure the length of the tree walk between the two named nodes.

    Args:
        tree (dict): dictionary of nodes
        node1_name (str): name of the start node
        node2_name (str): name of the end node

    Returns:
        int: Combined length of the upward and downward parts of the walk.
    """
    # Walker.walk returns a 3-tuple; the middle element is not needed here.
    walk = Walker().walk(tree[node1_name], tree[node2_name])
    going_up, _, going_down = walk
    return len(going_up) + len(going_down)

def getNeutralPair(tree, threshold):
    """Randomly pick two unrelated arguments from the given tree.

    Two nodes qualify when the root is their only common ancestor and the sum
    of their levels is at least `threshold`.

    Args:
        tree (dict): dictionary of nodes
        threshold (int): minimum distance between nodes

    Raises:
        LookupError: Raised if no qualifying pair is found after 1000 attempts

    Returns:
        tuple(str, str): Pair of node names
    """
    attempt_limit = 1000
    attempts_left = attempt_limit
    while attempts_left > 0:
        attempts_left -= 1
        first, second = pickRandomNodePair(tree)

        shared_ancestors = getCommonAncestor(tree, first, second)
        if len(shared_ancestors) != 1:
            # The nodes share more than the root: they are related.
            continue

        # The nodes only meet at the root, so the path between them has
        # as many edges as the sum of their levels (no tree walk needed).
        levels_sum = tree[first].level + tree[second].level
        if levels_sum < threshold:
            continue

        return first, second

    raise LookupError(f"Could not find a pair of nodes that satisfy the conditions after {attempt_limit} attempts")

def getAllNeutralPairsFromSameTree(tree, threshold):
    """Get all pairs of unrelated arguments from one tree.

    A pair is kept when its nodes sit in different branches below the root
    (so the root is their only common ancestor) and the sum of their levels
    is at least `threshold`.

    Args:
        tree (dict): dictionary of nodes representing the tree
        threshold (int): minimum distance between nodes

    Returns:
        list(tuple(str, str)): List of pairs of node names
    """
    root = tree['1.']
    # anytree exposes children as a (possibly empty) tuple, never None.
    if not root.children:
        return []

    # One list of node names per branch hanging directly off the root.
    branches = [
        [node.name for node in PostOrderIter(child)]
        for child in root.children
    ]

    # Cross product of every pair of distinct branches.
    nodePairs = [
        pair
        for left, right in combinations(branches, 2)
        for pair in product(left, right)
    ]

    neutralPairs = []
    # Set mirror of neutralPairs: constant-time duplicate detection instead
    # of scanning the growing result list for every candidate.
    accepted = set()
    for n1, n2 in tqdm(nodePairs, desc="Finding neutral pairs"):
        if n1 == n2 or (n1, n2) in accepted:
            continue
        rootIsOnlyCommonAncestor = len(getCommonAncestor(tree, n1, n2)) == 1
        if not rootIsOnlyCommonAncestor:
            continue
        # The nodes only meet at the root, so the sum of their levels is the
        # length of the path between them (no tree walk needed).
        distance = tree[n1].level + tree[n2].level
        if distance < threshold:
            continue
        # Here, could add test via embedding similarity
        accepted.add((n1, n2))
        neutralPairs.append((n1, n2))
    return neutralPairs

def getAllNeutralPairsFromDiffTrees(t1, t2):
    """Pair every argument of one tree with every argument of another tree.

    Arguments from different debates are considered unrelated.

    Args:
        t1 (dict): dictionary of nodes for the first tree
        t2 (dict): dictionary of nodes for the second tree

    Returns:
        list[tuple[str, str]]: List of pairs of node names
    """
    # Post-order traversal yields the root last; dropping the final element
    # removes it from the candidates.
    first_names = [node.name for node in PostOrderIter(t1['1.'])][:-1]
    second_names = [node.name for node in PostOrderIter(t2['1.'])][:-1]
    return list(product(first_names, second_names))

def getNNeutralPairsFromSameTrees(tree, threshold, n = 1000):
    """Get up to `n` random pairs of unrelated arguments from one tree.

    A pair is kept when its nodes sit in different branches below the root
    (so the root is their only common ancestor) and the sum of their levels
    is at least `threshold`. Fewer than `n` pairs are returned when the tree
    does not contain enough qualifying pairs.

    Args:
        tree (dict): dictionary of nodes
        threshold (int): minimum distance between nodes
        n (int, optional): Number of pairs to generate. Defaults to 1000.

    Returns:
        list[tuple[str, str]]: List of pairs of node names
    """
    root = tree['1.']
    # anytree exposes children as a (possibly empty) tuple, never None.
    if not root.children:
        return []

    # Only generate pairs of arguments that are in different branches
    # This guarantees that the root is the only common ancestor
    branches = [
        [node.name for node in PostOrderIter(child)]
        for child in root.children
    ]

    # Cross product of every pair of distinct branches.
    nodePairs = [
        pair
        for left, right in combinations(branches, 2)
        for pair in product(left, right)
    ]

    # Shuffle the list of pairs to avoid bias
    random.shuffle(nodePairs)
    neutralPairs = []
    # Set mirror of neutralPairs: constant-time duplicate detection instead
    # of scanning the growing result list for every candidate.
    accepted = set()
    # Keep consuming candidates until we have enough or run out
    while len(neutralPairs) < n and nodePairs:
        n1, n2 = nodePairs.pop()
        # The three tests below are redundant, but kept just in case
        if n1 == n2 or (n1, n2) in accepted:
            continue
        rootIsOnlyCommonAncestor = len(getCommonAncestor(tree, n1, n2)) == 1
        if not rootIsOnlyCommonAncestor:
            continue
        # The nodes only meet at the root, so the sum of their levels is the
        # length of the path between them (no tree walk needed).
        distance = tree[n1].level + tree[n2].level
        if distance < threshold:
            continue
        # Here, could add test via embedding similarity
        accepted.add((n1, n2))
        neutralPairs.append((n1, n2))

    return neutralPairs

def getNNeutralPairsFromDiffTrees(t1, t2, n = 1000):
    """Get up to `n` random pairs of arguments taken from two different trees.

    Arguments from different debates are considered unrelated. When fewer
    than `n` cross-tree pairs exist, all of them are returned (in random
    order) instead of raising, mirroring getNNeutralPairsFromSameTrees.

    Args:
        t1 (dict): dictionary of nodes for the first tree
        t2 (dict): dictionary of nodes for the second tree
        n (int, optional): Number of pairs to generate. Defaults to 1000.

    Returns:
        list[tuple[str, str]]: List of pairs of node names
    """
    allNeutralPairs = getAllNeutralPairsFromDiffTrees(t1, t2)
    # random.sample raises ValueError when asked for more items than exist,
    # which happens for small trees; clamp the sample size instead.
    sample_size = min(n, len(allNeutralPairs))
    return random.sample(allNeutralPairs, sample_size)


In [None]:
def argumentTree2argumentPairTree(node, domains):
    """Collect (parent, child) argument pairs from the subtree rooted at `node`.

    Pairs are emitted in pre-order. Children of the root node (named "1.")
    do not produce a pair with the root, but their own subtrees are still
    visited. A child with stance "Con" yields an "attack" relation; any other
    stance yields "support".

    Args:
        node: tree node exposing `name`, `children` and `toneInput`.
        domains: value stored under the "domain" key of every pair.

    Returns:
        list[dict]: One dictionary per parent/child pair.
    """
    collected = []

    for child in node.children:
        if node.name != "1.":
            entry = {
                "topArgument": node.toneInput,
                "subArgument": child.toneInput,
                "subject": child.subject,
                "subArgumentLevel": child.level,
                "domain": domains,
                "sameTree": True,
            }
            entry["relation"] = "attack" if child.stance == "Con" else "support"
            collected.append(entry)

        # Descend into the child's own subtree.
        collected.extend(argumentTree2argumentPairTree(child, domains))

    return collected

In [None]:
def nodePair2NeutralArgPair(node1, node2, domains_n1, domains_n2, same_tree):
    """Build the dictionary describing a "neutral" relation between two nodes.

    Args:
        node1: node used as top argument (exposes `toneInput`, `subject`).
        node2: node used as sub argument (exposes `toneInput`, `subject`).
        domains_n1: domain(s) of the tree `node1` belongs to.
        domains_n2: domain(s) of the tree `node2` belongs to; ignored when
            `same_tree` is true, and skipped when falsy.
        same_tree: whether both nodes come from the same tree.

    Returns:
        dict: Pair with keys topArgument, subArgument, relation, subject,
        domain and sameTree.
    """
    pair = dict(
        topArgument=node1.toneInput,
        subArgument=node2.toneInput,
        relation="neutral",
    )

    if not same_tree:
        # Cross-tree pair: combine the subjects and domains of both trees.
        pair["subject"] = node1.subject + " & " + node2.subject
        if domains_n2:
            pair["domain"] = domains_n1 + " & " + domains_n2
        else:
            pair["domain"] = domains_n1
        pair["sameTree"] = False
        return pair

    pair["subject"] = node1.subject
    pair["domain"] = domains_n1
    pair["sameTree"] = True
    return pair

def namePairs2NeutralArgPairs(tree, nodes, domains_n1, tree2=None, domains_n2=None, same_tree=True):
    """Turn pairs of node names into neutral argument pairs, in both directions.

    Every input pair yields two entries: (node1, node2) followed by the
    reversed (node2, node1).

    Args:
        tree (dict): dictionary of nodes the first name of each pair refers to.
        nodes: iterable of (name1, name2) tuples.
        domains_n1: domain(s) of `tree`.
        tree2 (dict, optional): dictionary of nodes the second name refers to
            when `same_tree` is false.
        domains_n2 (optional): domain(s) of `tree2`.
        same_tree (bool, optional): whether both names refer to `tree`.

    Returns:
        list[dict]: Neutral argument pairs.
    """
    pairs = []
    for first_name, second_name in nodes:
        first = tree[first_name]
        second = (tree if same_tree else tree2)[second_name]

        pairs.append(
            nodePair2NeutralArgPair(first, second, domains_n1, domains_n2, same_tree)
        )

        # Within a single tree both nodes share the same domain(s).
        if same_tree:
            domains_n2 = domains_n1
        pairs.append(
            nodePair2NeutralArgPair(second, first, domains_n2, domains_n1, same_tree)
        )

    return pairs

In [None]:
import pandas as pd

# Build the list of argument pairs (support / attack / neutral) from every
# debate listed in the index file. Each row provides the debate id
# (`kialoUrlId`) and its `tags`, which are used as the pair's domain.
kialoUrlIds = pd.read_csv("../../rawData/kialo/kialo-url-ids.csv", index_col=0)
pairs = []

# Previous successfully processed debate, used to build cross-tree neutrals.
prev_d = None
prev_kialoUrlId = None
prev_t = None

for i, x in tqdm(kialoUrlIds.iterrows(), total=kialoUrlIds.shape[0]):
  try:
    d = x.tags
    kialoUrlId = x.kialoUrlId

    t = rawKialo2Json("../../rawData/kialo/debates/en/"+ kialoUrlId +".txt")

    # Support / attack pairs come straight from the parent-child links.
    pairs.extend(argumentTree2argumentPairTree(t['1.'], d))

    # neutralPairsSameTree = getAllNeutralPairsFromSameTree(t, 10)
    neutralPairsSameTree = getNNeutralPairsFromSameTrees(t, 10, len(t))

    pairs.extend(namePairs2NeutralArgPairs(t, neutralPairsSameTree, d))

    if prev_d is not None and prev_t is not None:
      # neutralPairsDiffTree = getAllNeutralPairsFromDiffTrees(t, prev_t)
      neutralPairsDiffTree = getNNeutralPairsFromDiffTrees(t, prev_t, max(len(t), len(prev_t)))

      pairs.extend(namePairs2NeutralArgPairs(t, neutralPairsDiffTree, d, prev_t, prev_d, same_tree=False))

    prev_d = d
    prev_kialoUrlId = kialoUrlId
    prev_t = t
  except Exception as e:
    # Best effort: keep going, but report what was skipped instead of
    # hiding it. Pairs added before the failure are kept.
    debate_id = getattr(x, "kialoUrlId", i)
    tqdm.write(f"Skipping (part of) debate {debate_id!r}: {type(e).__name__}: {e}")
    continue

In [None]:
# Display the collected pairs for inspection.
pairs

In [None]:
# Flatten the pair dictionaries into one list per output column.
# The sub argument is the source of the relation, the top argument its target.
topic = [pair["domain"] for pair in pairs]
relations = [pair["relation"] for pair in pairs]
argSrc = [pair["subArgument"] for pair in pairs]
argTrg = [pair["topArgument"] for pair in pairs]
sameTree = [pair["sameTree"] for pair in pairs]

# Column order matters: later cells index rows by position.
d = pd.DataFrame(
  {
    "topic": topic,
    "relation": relations,
    "argSrc": argSrc,
    "argTrg": argTrg,
    "sameTree": sameTree,
  }
)

d

In [None]:
# Number of pairs per relation type.
d.value_counts("relation")

In [None]:
# Number of pairs per topic.
# NOTE(review): in a notebook only the last expression of a cell is
# displayed, so this first result is presumably not shown — confirm.
d.value_counts('topic')
# Number of pairs without a topic.
d['topic'].isnull().sum()

# Clean up 

The arguments still contain source references, denoted by markers such as `[124]`. We want to remove those.  
Ideally, we should also remove leftover artifacts, such as page-number mentions like `(p. i.)`, but that is another hassle for another day. 

In [None]:
# Remove rows with "See" in either argument (cross-references to another node).
# Non-capturing group: pandas warns when a `str.contains` pattern has
# capturing groups, and the matches are the same either way.
pattern = r"-> See (?:\d\.)*"
has_cross_reference = d['argSrc'].str.contains(pattern) | d['argTrg'].str.contains(pattern)
# Take an explicit copy so the column assignments below modify an
# independent frame rather than a filtered view of the original.
d = d[~has_cross_reference].copy()

# Remove sources from argSrc and argTrg
pattern = r"\s*\[\d+\]"
d['argSrc'] = d['argSrc'].str.replace(pattern, "", regex=True)
d['argTrg'] = d['argTrg'].str.replace(pattern, "", regex=True)

# Remove artifacts like (p. 1), (p. i), (p. ii), (p. 65-66), etc.
pattern = r"\(\s*p\.\s*[\di]+(?:-\d+)*\s*\)"
d['argSrc'] = d['argSrc'].str.replace(pattern, "", regex=True)
d['argTrg'] = d['argTrg'].str.replace(pattern, "", regex=True)

d

# Compute Cosine similarity from embeddings

The intuition is that neutral arguments tend to have orthogonal embeddings, and thus a cosine similarity close to 0.

In [None]:
# Alternative embedding models that were tried:
# model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
# model = SentenceTransformer("nomic-ai/nomic-embed-text-v2-moe", trust_remote_code=True)

# This model provides the similarity close to 0 for neutrals as opposed to the two above
# NOTE(review): trust_remote_code=True is carried over from the nomic models
# above — confirm whether this model actually needs it.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", trust_remote_code=True)

def getEmbeddingSimilarity(arg1, arg2):
    """Return the cosine similarity between two embedding vectors.

    Args:
        arg1: first vector (1-D array-like).
        arg2: second vector (1-D array-like).

    Returns:
        float: one minus the cosine distance computed by SciPy.
    """
    cosine_distance = spatial.distance.cosine(arg1, arg2)
    return 1 - cosine_distance

def getEmbeddingsFromArgs(args, model):
    """Encode a collection of argument texts with the given model.

    Args:
        args: iterable of argument strings.
        model: object exposing ``encode(list_of_sentences)``.

    Returns:
        Whatever ``model.encode`` returns for the list of sentences.
    """
    # For nomic models with prompt prefix
    # prompt_prefix = 'clustering: '
    prompt_prefix = ''
    prefixed_sentences = [prompt_prefix + text for text in args]
    return model.encode(prefixed_sentences)

In [None]:
# Sanity check of the embedding model on one hand-picked example per relation.

# SUPPORT
embedsSupp = getEmbeddingsFromArgs(
    [
        "Purity pledges lead to people not having a proper understanding of the potential consequences of sex.",
        "Purity pledges lead to people having a poorer understanding of sexuality.",
    ],
    model,
)

# ATTACK
embedsAtt = getEmbeddingsFromArgs(
    [
        "There is no guarantee that a pledger will be part of an abstinence programme.",
        "Abstinence programmes are likely to lead to people engaging in unsafe sex.",
    ],
    model,
)

# NEUTRAL
embedsNeut = getEmbeddingsFromArgs(
    [
        "In the US, around 2.2 million people are employed in the banking industry in 2023.",
        "Some coffee producers argue that both Fairtrade and Direct Trade standards are necessary to continue sustainable operations.",
    ],
    model,
)

# Print the similarity of each example pair, in the order above.
for label, embeds in (("Support : ", embedsSupp), ("Attack", embedsAtt), ("Neutral", embedsNeut)):
    print(label, getEmbeddingSimilarity(embeds[0], embeds[1]))

In [None]:
# Register `progress_apply` on pandas objects so the long-running similarity
# computation below shows a progress bar.
tqdm.pandas(desc="Computing similarity for all pairs")

def computeRowCosineSimilarity(row):
    """Return the cosine similarity between a row's two argument embeddings.

    Args:
        row: mapping with the "argSrc" and "argTrg" texts.

    Returns:
        Cosine similarity of the two embeddings produced by the global model.
    """
    src_embedding, trg_embedding = getEmbeddingsFromArgs([row["argSrc"], row["argTrg"]], model)
    return getEmbeddingSimilarity(src_embedding, trg_embedding)

d['similarity'] = d.progress_apply(computeRowCosineSimilarity, axis=1)

# Split the pairs by relation; neutrals are further split by origin.
is_neutral = d['relation'] == 'neutral'
supp = d[d['relation'] == 'support']
att = d[d['relation'] == 'attack']
neut_sameTree = d[is_neutral & (d['sameTree'] == True)]
neut_diffTree = d[is_neutral & (d['sameTree'] == False)]
neut = pd.concat([neut_sameTree, neut_diffTree], ignore_index=True)

# Persist the full, unfiltered set of pairs with their similarity scores.
d.to_csv("kialoPairsRaw.csv", index=False)

In [None]:
# Mean cosine similarity and std for each relation
for label, subset in (("Support : ", supp), ("Attack : ", att), ("Neutral : ", neut)):
    similarities = subset["similarity"]
    print(label, similarities.mean(), '+/-', similarities.std())

### Cosine similarity stats on mixed Neutral relations (same and different tree)

In [None]:
# Summary statistics followed by a histogram of the similarity scores.
neut_similarity = neut["similarity"]
print(neut_similarity.describe())
neut_similarity.hist()

### Cosine similarity stats on neutral relations from same Tree

In [None]:
# Summary statistics followed by a histogram of the similarity scores.
same_tree_similarity = neut_sameTree["similarity"]
print(same_tree_similarity.describe())
same_tree_similarity.hist()

### Cosine similarity stats on neutral relations from different Trees

In [None]:
# Summary statistics followed by a histogram of the similarity scores.
diff_tree_similarity = neut_diffTree["similarity"]
print(diff_tree_similarity.describe())
diff_tree_similarity.hist()

# Post processing

The idea here is to keep only the pairs of neutral arguments that are most neutral, by using the computed Cosine similarity between their embeddings.

In [None]:
import pandas as pd

# Reload the raw pairs (with similarity scores) written by the earlier
# `d.to_csv("kialoPairsRaw.csv", ...)` cell.
kp = pd.read_csv("./kialoPairsRaw.csv")

kp 

## Checking the neutrality based on different similarity scores

Let's see if the pairs with negative or close to 0.0 similarity scores are more neutral than others.

### Same Tree

In [None]:
from textwrap import fill

# Neutral pairs whose two arguments come from the same debate, and the subset
# of those with a negative cosine similarity.
kp_neut_sameTree= kp[(kp['relation'] == 'neutral') & (kp['sameTree'] == True)]
below_zero = kp_neut_sameTree[kp_neut_sameTree['similarity'] < 0]

# Show up to three random examples: asking for more rows than exist would
# raise. Columns are read by name so the output does not depend on the
# column order of the CSV.
samples = below_zero.sample(min(3, len(below_zero)))
for _, sample in samples.iterrows():
    print("'topic : ", sample["topic"])
    print("Arg 1 : ", fill(sample["argSrc"], 140))
    print("Arg 2 : ", fill(sample["argTrg"], 140))
    print("Similarity ", sample["similarity"])
    print("------")

In [None]:
# Same-tree neutral pairs whose similarity lies strictly within one standard
# deviation of the mean.
mean_sim = kp_neut_sameTree['similarity'].mean()
std_sim = kp_neut_sameTree['similarity'].std()
print("Mean", mean_sim, "\nStd", std_sim,"\n")
around_mean = kp_neut_sameTree[(kp_neut_sameTree['similarity'] > mean_sim - std_sim) & (kp_neut_sameTree['similarity'] < mean_sim + std_sim)]

# Show up to three random examples: asking for more rows than exist would
# raise. Columns are read by name so the output does not depend on the
# column order of the CSV.
samples = around_mean.sample(min(3, len(around_mean)))
for _, sample in samples.iterrows():
    print("'topic : ", sample["topic"])
    print("Arg 1 : ", fill(sample["argSrc"], 140))
    print("Arg 2 : ", fill(sample["argTrg"], 140))
    print("Similarity ", sample["similarity"])
    print("------")

In [None]:
# Both subsets were filtered from neutral-only rows, so their length is the
# number of neutral pairs. Using len() also works when a subset is empty,
# where looking up 'neutral' in value_counts would raise KeyError.
print("Number of rows below zero : ", len(below_zero))
print("Number of rows around the mean : ", len(around_mean))

The pairs with similarity below zero are clearly unrelated; they almost look like they come from different trees altogether. It is a safe bet to use them for our dataset.

### Different Trees

In [None]:
# Neutral pairs whose two arguments come from different debates, and the
# subset of those with a negative cosine similarity.
kp_neut_diffTree= kp[(kp['relation'] == 'neutral') & (kp['sameTree'] == False)]
below_zero = kp_neut_diffTree[kp_neut_diffTree['similarity'] < 0]

# Show up to three random examples: asking for more rows than exist would
# raise. Columns are read by name so the output does not depend on the
# column order of the CSV.
samples = below_zero.sample(min(3, len(below_zero)))
for _, sample in samples.iterrows():
    print("'topic : ", sample["topic"])
    print("Arg 1 : ", fill(sample["argSrc"], 140))
    print("Arg 2 : ", fill(sample["argTrg"], 140))
    print("Similarity ", sample["similarity"])
    print("------")

In [None]:
# Different-tree neutral pairs whose similarity lies strictly within one
# standard deviation of the mean.
mean_sim = kp_neut_diffTree['similarity'].mean()
std_sim = kp_neut_diffTree['similarity'].std()
print("Mean", mean_sim, "\nStd", std_sim,"\n")
around_mean = kp_neut_diffTree[(kp_neut_diffTree['similarity'] > mean_sim - std_sim) & (kp_neut_diffTree['similarity'] < mean_sim + std_sim)]

# Show up to three random examples: asking for more rows than exist would
# raise. Columns are read by name so the output does not depend on the
# column order of the CSV.
samples = around_mean.sample(min(3, len(around_mean)))
for _, sample in samples.iterrows():
    print("'topic : ", sample["topic"])
    print("Arg 1 : ", fill(sample["argSrc"], 140))
    print("Arg 2 : ", fill(sample["argTrg"], 140))
    print("Similarity ", sample["similarity"])
    print("------")

In [None]:
# Print the per-relation row counts of both subsets (as pandas Series).
print("Number of rows below zero : ", below_zero.value_counts('relation'))
print("Number of rows around the mean : ", around_mean.value_counts('relation'))

### Filter out neutral rows

The objective is to have roughly 100k neutrals with a 50:50 split between sameTree and !sameTree in order to have a balanced dataset to sample from.  
Using the conclusion from the previous exploration, we can determine that it is safe to keep only the most dissimilar pairs (i.e. based on the similarity score in ascending order).

In [None]:
# Count the support and attack pairs; the number of neutrals to keep is
# half of their combined total.
relation_counts = kp.value_counts('relation')
nb_supp = relation_counts['support']
nb_att = relation_counts['attack']

target_nb_neut = (nb_supp + nb_att) // 2

nb_supp, nb_att, target_nb_neut, target_nb_neut // 2

In [None]:
# Sort by similarity (ascending) and keep enough rows to reach half of
# target_nb_neut. A new sorted frame is built instead of sorting in place,
# because kp_neut_sameTree is a filtered slice of kp.
kp_neut_sameTree = kp_neut_sameTree.sort_values('similarity').head(target_nb_neut//2)
kp_neut_sameTree

In [None]:
# Sort by similarity (ascending) and keep enough rows to reach half of
# target_nb_neut. A new sorted frame is built instead of sorting in place,
# because kp_neut_diffTree is a filtered slice of kp.
kp_neut_diffTree = kp_neut_diffTree.sort_values('similarity').head(target_nb_neut//2)
kp_neut_diffTree

# Concatenate enough neutrals to create a balanced dataset

In [None]:
# All support and attack pairs are kept as they are.
relation_column = kp['relation']
kp_supp = kp[relation_column == 'support']
kp_att = kp[relation_column == 'attack']

# Stack them with the filtered neutral pairs into the final dataset.
final_parts = [kp_supp, kp_att, kp_neut_sameTree, kp_neut_diffTree]
kp_final = pd.concat(final_parts, ignore_index=True)

kp_final.value_counts('relation')

# Save the final Dataset

In [None]:
# Write the final dataset to disk without the DataFrame index.
kp_final.to_csv("kialoPairs.csv", index=False)