In [1]:
import jsonlines
input_file = '/root/Xiangpeng/RAG/paperag/ask-question/jsonl/raw_entity.jsonl'

# Materialise every record of the JSONL file into a list; the context manager
# closes the reader once all records have been consumed.
with jsonlines.open(input_file, mode='r') as reader:
    documents = list(reader)

# Entity Resolution

In [2]:
# Peek at the raw 'entity' field of the first loaded record.
print(documents[0]['entity'])

entity{tuple_delimiter}STUDIES{tuple_delimiter}RESEARCH, ANALYSIS{tuple_delimiter}STUDIES refer to investigations and examinations conducted to gather information and draw conclusions about a particular topic or phenomenon. In this case, the studies are focused on the impact of autonomous vehicles on urban parking demand. 
entity{tuple_delimiter}PARKING DEMAND{tuple_delimiter}SOCIAL PHENOMENON, TRANSPORTATION{tuple_delimiter}PARKING DEMAND refers to the amount of space required for parking vehicles in a given area, typically within urban cities. It is influenced by factors such as population density, vehicle ownership rates, and land use patterns.
entity{tuple_delimiter}URBAN CITIES{tuple_delimiter}GEOGRAPHICAL LOCATION, POPULATION CENTER{tuple_delimiter}URBAN CITIES are densely populated areas characterized by high concentrations of buildings, infrastructure, and residents. They often face challenges related to transportation, parking, and urban planning.
entity{tuple_delimiter}AUTONO

In [3]:
# Function to parse the input data
import re

tuple_delimiter = "{tuple_delimiter}"

def standardize_delimiter(input_string, standard_delimiter="{tuple_delimiter}"):
    """Rewrite every case variant of ``{tuple_delimiter}`` to a single spelling.

    Args:
        input_string: Text that may contain the delimiter token in any mix of
            upper and lower case (e.g. ``{TUPLE_DELIMITER}``).
        standard_delimiter: Text substituted for each match. It is inserted
            literally, character for character.

    Returns:
        ``input_string`` with every match replaced by ``standard_delimiter``.
    """
    # Match the literal token (braces escaped) regardless of letter case.
    pattern = re.compile(r'\{tuple_delimiter\}', re.IGNORECASE)

    # A callable replacement is used on purpose: a plain string would be treated
    # by re.sub as a template, so backslashes in a custom delimiter would be
    # interpreted as escapes (or rejected) instead of being inserted as-is.
    return pattern.sub(lambda _match: standard_delimiter, input_string)
def parse_input(data):
    """Split raw extraction records into entity and relationship tuples.

    Each record's ``'entity'`` text is processed line by line. A line is split
    on the module-level ``tuple_delimiter`` after its delimiter spelling has been
    normalised, and the first field decides how the line is interpreted.

    Args:
        data: Iterable of mappings, each providing an ``'entity'`` string and a
            ``'document'`` value.

    Returns:
        A pair ``(entities, relationships)`` where
        ``entities`` holds ``(field1, field2, field3, document)`` tuples and
        ``relationships`` holds ``(field1, field2, field3, int(field4), document)``
        tuples, in input order.

    Lines whose first field is neither ``"entity"`` nor ``"relationship"`` are
    ignored. Lines that are cut short (too few fields) or whose strength field
    is not an integer are skipped rather than aborting the whole parse.
    """
    entities = []
    relationships = []

    for record in data:
        for line in record['entity'].split('\n'):
            parts = standardize_delimiter(line).split(tuple_delimiter)
            kind = parts[0]

            if kind == "entity":
                # Need the tag plus three payload fields; truncated lines are dropped.
                if len(parts) < 4:
                    continue
                entities.append((parts[1], parts[2], parts[3], record['document']))
            elif kind == "relationship":
                # Need the tag plus four payload fields, the last being numeric.
                if len(parts) < 5:
                    continue
                try:
                    strength = int(parts[4])
                except ValueError:
                    continue
                relationships.append((parts[1], parts[2], parts[3], strength, record['document']))

    return entities, relationships

In [4]:
# Parse every loaded record into flat entity and relationship tuples.
entities, relationships = parse_input(documents)

In [28]:
# Remove duplicates based on entity name only (the first occurrence of each name is kept)
def remove_duplicates(tuples_list):
    """Return the tuples with duplicate first elements dropped.

    The first element of each tuple is the deduplication key. When several
    tuples share a key, only the earliest one is kept; the result preserves the
    order in which keys were first encountered.
    """
    first_seen = {}
    for entry in tuples_list:
        # setdefault stores the entry only if its key has not been seen yet.
        first_seen.setdefault(entry[0], entry)
    return list(first_seen.values())

# Keep one tuple per distinct entity name.
unique_entities = remove_duplicates(entities)

In [43]:
# add embeddings to entities
import ollama

# Function to calculate embedding for each point
def calculate_embedding(point, model='mxbai-embed-large'):
    """Request an embedding for ``point`` from Ollama.

    Args:
        point: Text passed to Ollama as the prompt.
        model: Name of the embedding model to use. Defaults to the model this
            notebook was originally written against.

    Returns:
        The ``"embedding"`` entry of the Ollama response.
    """
    response = ollama.embeddings(model=model, prompt=point)
    return response["embedding"]

# Attach an embedding as a fifth element to every entity tuple.
for i, entity in enumerate(unique_entities):
    name, entity_type, description, document = entity[:4]
    # Put each field on its own line; without a separator the text sent to the
    # model reads "name: <NAME>type: <TYPE>description: ..." with fields fused.
    prompt = f"name: {name}\ntype: {entity_type}\ndescription: {description}"
    embedding = calculate_embedding(prompt)
    unique_entities[i] = (name, entity_type, description, document, embedding)

In [50]:
import igraph as ig
import leidenalg as la
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Stack the embeddings already stored on each entity tuple (element 4) into one matrix
embeddings = np.array([entity[4] for entity in unique_entities])

# Construct similarity graph using cosine similarity
similarity_matrix = cosine_similarity(embeddings)
np.fill_diagonal(similarity_matrix, 0)  # Remove self-similarity
# NOTE(review): the similarity of a matrix with itself is symmetric, so argwhere
# returns both (i, j) and (j, i) and every pair is added to the graph twice.
# resolution_parameter below appears to have been tuned with that in place —
# confirm before de-duplicating the edges.
edges = np.argwhere(similarity_matrix > 0)
weights = similarity_matrix[edges[:, 0], edges[:, 1]]

# Create igraph graph: one vertex per entity, one weighted edge per listed pair
g = ig.Graph()
g.add_vertices(len(unique_entities))
g.add_edges(edges)
g.es['weight'] = weights

# Apply Leiden Algorithm; a fixed seed makes the partition reproducible across runs
partition = la.find_partition(
    g,
    la.CPMVertexPartition,
    weights='weight',
    resolution_parameter=1.5,
    seed=42,
)


# Organize entity by cluster labels
clusters = {}
for idx, cluster_id in enumerate(partition.membership):
    clusters.setdefault(cluster_id, []).append(unique_entities[idx])

In [64]:
# Access and print each group
# for label, group in clusters.items():
#     print(f"Group {label}:")
#     for data_point in group:
#         print((data_point[0]))

In [62]:
from collections import Counter
# entity resolution mapping
# in each group, find the most common name as the entity name
def most_common_or_first(my_list):
    """Return the most frequent item of ``my_list``.

    Args:
        my_list: Sequence of hashable items.

    Returns:
        ``None`` when ``my_list`` is empty; otherwise the item with the highest
        count. When several items tie, the one that appears first in
        ``my_list`` is returned, so a list of all-distinct items yields its
        first element.
    """
    if not my_list:
        return None

    # A Counter built from a non-empty sequence always has at least one entry,
    # so most_common(1) is guaranteed to yield exactly one (item, count) pair
    # and no separate "first element" fallback is needed.
    return Counter(my_list).most_common(1)[0][0]

# Map every entity name to the representative name chosen for its cluster.
entity_mapping = {}
for group in clusters.values():
    names = [item[0] for item in group]
    common_value = most_common_or_first(names)
    # Every name in the cluster resolves to the same representative.
    entity_mapping.update(dict.fromkeys(names, common_value))

In [61]:
# Show `names` as left by the final iteration of the entity-mapping loop
# (i.e. the names of the last cluster that loop processed).
names

['STUDIES', 'RESEARCH', 'SURVEY']

# Leiden hierarchical clustering

In [67]:
import jsonlines
input_file1 = '/root/Xiangpeng/RAG/paperag/ask-question/jsonl/resolution_entities.jsonl'
input_file2 = '/root/Xiangpeng/RAG/paperag/ask-question/jsonl/resolution_relationships.jsonl'

# Load both JSONL files fully into memory. Note that this rebinds `entities`
# and `relationships` to the records read from these files.
with jsonlines.open(input_file1, mode='r') as reader:
    entities = list(reader)
with jsonlines.open(input_file2, mode='r') as reader:
    relationships = list(reader)

# Names that are allowed to appear as relationship endpoints
valid_entity_names = set(entity['name'] for entity in entities)

# Keep only relationships whose source AND target are both known entities
filtered_relationships = [
    rel
    for rel in relationships
    if rel['source'] in valid_entity_names
    if rel['target'] in valid_entity_names
]

In [68]:
import networkx as nx
import leidenalg
import igraph as ig
import matplotlib.pyplot as plt

# Build an undirected networkx graph from the loaded entities and relationships
G = nx.Graph()

# One node per entity, keyed by name and carrying the remaining fields as attributes
for entity in entities:
    node_name = entity['name']
    node_attrs = {field: entity[field] for field in ('type', 'description', 'original_text')}
    G.add_node(node_name, **node_attrs)

# One edge per filtered relationship. Every edge gets the same weight of 1.0;
# the relationship's own 'strength' value is currently not used.
for relationship in filtered_relationships:
    G.add_edge(relationship['source'], relationship['target'], weight=1.0)


In [69]:
from graspologic.utils import largest_connected_component
from typing import Any, cast, Optional, List, Dict

def stable_largest_connected_component(graph: nx.Graph) -> nx.Graph:
    """Extract the largest connected component and return it in a stable node/edge order.

    A copy of ``graph`` is handed to ``largest_connected_component``; the
    resulting component is then rebuilt by ``_stabilize_graph``.
    """
    working_copy = graph.copy()
    component = cast(nx.Graph, largest_connected_component(working_copy))
    return _stabilize_graph(component)

def _stabilize_graph(graph: nx.Graph) -> nx.Graph:
    """Rebuild ``graph`` so that equal graphs always list nodes and edges in the same order.

    Nodes are inserted sorted by node id. Edges are inserted sorted by the
    string ``"<source> -> <target>"``. For undirected graphs the two endpoints
    of each edge are first ordered so the smaller one is the source.
    """
    directed = graph.is_directed()
    stable = nx.DiGraph() if directed else nx.Graph()

    # Insert nodes (with their attribute dicts) in sorted-id order.
    stable.add_nodes_from(sorted(graph.nodes(data=True), key=lambda entry: entry[0]))

    edge_list = list(graph.edges(data=True))

    # In an undirected graph A-B and B-A are the same edge, but which endpoint
    # is reported first can vary. Downstream consumers that depend on node
    # order would then see different results for the same graph, so each edge
    # is normalised to (smaller, larger) before sorting.
    if not directed:
        edge_list = [
            (tgt, src, attrs) if src > tgt else (src, tgt, attrs)
            for src, tgt, attrs in edge_list
        ]

    # Order edges by a textual key built from their endpoints.
    edge_list.sort(key=lambda edge: f"{edge[0]} -> {edge[1]}")

    stable.add_edges_from(edge_list)
    return stable

In [70]:
import logging
from graspologic.partition import hierarchical_leiden
log = logging.getLogger(__name__)

def run(graph: nx.Graph, max_cluster_size: int, levels: Optional[List[int]], seed=42) -> Dict[int, Dict[str, List[str]]]:
    """Cluster ``graph`` with hierarchical Leiden and group node ids by community.

    Args:
        graph: Graph to cluster.
        max_cluster_size: Forwarded to the Leiden computation as the maximum
            cluster size.
        levels: Hierarchy levels to include in the result. ``None`` selects
            every level that was computed, in ascending order.
        seed: Random seed forwarded to the Leiden computation.

    Returns:
        ``{level: {community_id: [node_id, ...]}}`` where ``community_id`` is the
        string form of the raw community id.

    Raises:
        KeyError: If ``levels`` names a level that was not computed.
    """
    # Only reported in the log line below; _compute_leiden_communities reduces
    # the graph to its largest connected component regardless of this value.
    use_lcc = True

    # Both placeholders need an argument, otherwise logging reports a format
    # error whenever INFO records are actually emitted.
    log.info("Running leiden with max_cluster_size=%s, lcc=%s", max_cluster_size, use_lcc)

    node_id_to_community_map = _compute_leiden_communities(
        graph=graph,
        max_cluster_size=max_cluster_size,
        seed=seed,
    )

    # If they don't pass in levels, use them all
    if levels is None:
        levels = sorted(node_id_to_community_map.keys())

    results_by_level: Dict[int, Dict[str, List[str]]] = {}
    for level in levels:
        result: Dict[str, List[str]] = {}
        results_by_level[level] = result
        for node_id, raw_community_id in node_id_to_community_map[level].items():
            result.setdefault(str(raw_community_id), []).append(node_id)
    return results_by_level

def _compute_leiden_communities(
    graph: nx.Graph,
    max_cluster_size: int,
    seed: int,
) -> Dict[int, Dict[str, int]]:
    """Run hierarchical Leiden on the stabilised largest connected component.

    Returns a mapping ``{level: {node: cluster}}`` built from the entries that
    ``hierarchical_leiden`` yields.
    """
    stable_graph = stable_largest_connected_component(graph)

    assignments = hierarchical_leiden(
        stable_graph, max_cluster_size=max_cluster_size, random_seed=seed
    )

    results: Dict[int, Dict[str, int]] = {}
    for assignment in assignments:
        # Create the per-level dict on first sight, then record the node's cluster.
        results.setdefault(assignment.level, {})[assignment.node] = assignment.cluster

    return results


In [84]:
# Run the clustering
# Positional arguments: max_cluster_size=20 and levels=None (keep every computed
# level); the seed keeps its default of 42.
clustering_result = run(G, 20, None)

In [89]:
def print_clusters_with_text(graph, clustering_result):
    """Print every community of every level, showing each original text once per community.

    Levels are printed in ascending order. Within a community, a node's
    ``original_text`` is printed only the first time that text is encountered;
    later nodes sharing the same text are listed by name alone.
    """
    for level in sorted(clustering_result):
        print(f"\nLevel {level}:")

        for community_id, nodes in clustering_result[level].items():
            print(f"\nCommunity {community_id}:")
            seen_texts = set()
            for node in nodes:
                text = graph.nodes[node]['original_text']
                if text in seen_texts:
                    # Text already shown for an earlier node of this community.
                    print(f"Node: {node}")
                    continue
                seen_texts.add(text)
                print(f"Node: {node}, Original Text: {text}")

# Print the clusters with original text for each level
print_clusters_with_text(G, clustering_result)



Level 0:

Community 0:
Node: 5G ADVANCED, Original Text: 5G Advanced will offer intelligent network solutions for a broader range of use cases, including autonomous and advanced remote driving in metropolitan areas.
Node: INTELLIGENT NETWORK SOLUTIONS
Node: ACCESSIBILITY, Original Text: First, AV would influence urban transportation and human mobility by reducing vehicle ownership, public and active travel, traffic delay and congestion, travel costs, and by increasing accessibility, mobility, Vehicle Miles Traveled, and revenue generation for commercial operators.
Node: AUTONOMOUS VEHICLES, Original Text: A key challenge is making VRU data available in real time to nearby AVs for incorporating into trajectory prediction logic and enhancing VRU safety.
Node: ACTIVE TRAVEL, Original Text: AVs are considered a disruptive technology in urban transportation, potentially weakening public transit ridership and active transportation.
Node: ADAPTIVE CRUISE CONTROL (ACC), Original Text: AVs hav

In [88]:
def save_clusters_to_jsonl(graph, clustering_result, output_file):
    """Write one JSONL record per community containing its distinct original texts.

    Args:
        graph: Object whose ``nodes[node]['original_text']`` gives the text for
            each node id.
        clustering_result: ``{level: {community_id: [node_id, ...]}}``.
        output_file: Path of the JSONL file to write (opened in ``'w'`` mode).

    Each record has the single key ``level_<level>_community_<community_id>``
    mapped to the list of distinct texts of that community. Levels are written
    in ascending order. Texts are listed in the order their nodes first appear,
    so the file content is identical from run to run for the same input.
    """
    with jsonlines.open(output_file, mode='w') as writer:
        for level in sorted(clustering_result.keys()):
            communities = clustering_result[level]
            for community_id, nodes in communities.items():
                key = f"level_{level}_community_{community_id}"
                # dict.fromkeys de-duplicates while keeping first-seen order;
                # going through a set would make the order vary between runs.
                value = list(dict.fromkeys(
                    graph.nodes[node]['original_text'] for node in nodes
                ))
                writer.write({key: value})

# Define the output file
# NOTE(review): this is a relative path, so it resolves against the kernel's
# working directory, whereas the input files are loaded from absolute paths —
# confirm the 'jsonl/' directory exists wherever the notebook is run.
output_file = 'jsonl/hierarchical_cluster_result.jsonl'

# Save the clusters with original text to JSONL
save_clusters_to_jsonl(G, clustering_result, output_file)