In [1]:
import jsonlines
input_file = '/root/Xiangpeng/RAG/paperag/ask-question/jsonl/raw_entity.jsonl'
# Read the whole JSON Lines file into memory: one parsed object per line.
with jsonlines.open(input_file, mode='r') as reader:
    documents = list(reader)

In [2]:
# Peek at the raw 'entity' text of the first loaded record to see the line format.
print(documents[0]['entity'])

entity{tuple_delimiter}STUDIES{tuple_delimiter}RESEARCH, ANALYSIS{tuple_delimiter}STUDIES refer to investigations and examinations conducted to gather information and draw conclusions about a particular topic or phenomenon. In this case, the studies are focused on the impact of autonomous vehicles on urban parking demand. 
entity{tuple_delimiter}PARKING DEMAND{tuple_delimiter}SOCIAL PHENOMENON, TRANSPORTATION{tuple_delimiter}PARKING DEMAND refers to the amount of space required for parking vehicles in a given area, typically within urban cities. It is influenced by factors such as population density, vehicle ownership rates, and land use patterns.
entity{tuple_delimiter}URBAN CITIES{tuple_delimiter}GEOGRAPHICAL LOCATION, POPULATION CENTER{tuple_delimiter}URBAN CITIES are densely populated areas characterized by high concentrations of buildings, infrastructure, and residents. They often face challenges related to transportation, parking, and urban planning.
entity{tuple_delimiter}AUTONO

In [3]:
# Function to parse the input data
import re

# Literal token that separates the fields of one extracted line; parse_input
# splits each line on this exact string.
tuple_delimiter = "{tuple_delimiter}"

def standardize_delimiter(input_string, standard_delimiter="{tuple_delimiter}"):
    """Normalise every case variant of ``{tuple_delimiter}`` to one spelling.

    Args:
        input_string: Text that may contain the delimiter token in any mix of
            upper and lower case (for example ``{TUPLE_DELIMITER}``).
        standard_delimiter: The exact text each match is replaced with.

    Returns:
        A copy of ``input_string`` with every match replaced by
        ``standard_delimiter``.
    """
    # Match the token case-insensitively; the braces are escaped so they are
    # treated as literal characters rather than a repetition quantifier.
    pattern = re.compile(r'\{tuple_delimiter\}', re.IGNORECASE)

    # A callable replacement is inserted verbatim. Passing the string itself
    # would make ``re`` interpret it as a replacement template, so a delimiter
    # containing a backslash would be mangled or rejected with ``re.error``.
    return pattern.sub(lambda _match: standard_delimiter, input_string)
def parse_input(data):
    """Split raw extraction records into entity and relationship tuples.

    Each record's ``'entity'`` text is processed line by line. After the
    delimiter spelling is normalised, a line is split on ``tuple_delimiter``
    and dispatched on its first field:

    * ``entity`` lines yield ``(field1, field2, field3, document)``; later
      cells treat the first three fields as name, type and description.
    * ``relationship`` lines yield
      ``(field1, field2, field3, int(field4), document)``
      (presumably source, target, description and a numeric strength --
      TODO confirm against the extraction prompt).

    Lines with any other first field, including blank lines, are ignored.
    A line that starts with a known tag but has too few fields, or whose
    relationship value is not an integer, is skipped with a warning so one
    truncated line cannot abort the whole parse.

    Args:
        data: Iterable of mappings that each provide an ``'entity'`` string
            and a ``'document'`` value.

    Returns:
        A ``(entities, relationships)`` pair of lists of tuples.
    """
    import warnings

    entities = []
    relationships = []

    for record in data:
        for line in record['entity'].split('\n'):
            parts = standardize_delimiter(line).split(tuple_delimiter)
            record_kind = parts[0]

            if record_kind == "entity":
                # Tag plus three payload fields are required.
                if len(parts) < 4:
                    warnings.warn(f"Skipping malformed entity line: {line!r}")
                    continue
                entities.append((parts[1], parts[2], parts[3], record['document']))
            elif record_kind == "relationship":
                try:
                    strength = int(parts[4])
                except (IndexError, ValueError):
                    # Either the line was cut short or the value is not an integer.
                    warnings.warn(f"Skipping malformed relationship line: {line!r}")
                    continue
                relationships.append((parts[1], parts[2], parts[3], strength, record['document']))

    return entities, relationships

In [4]:
# Parse every loaded record into flat lists of entity and relationship tuples.
entities, relationships = parse_input(documents)

In [28]:
# Remove duplicate entities by name only (type is not compared); the first occurrence of each name is kept
def remove_duplicates(tuples_list):
    """Return the tuples with a distinct first element, in first-seen order.

    Only element 0 of each tuple is compared. When several tuples share it,
    the earliest one in ``tuples_list`` is the one retained.
    """
    first_seen = {}
    for record in tuples_list:
        # setdefault stores the record only if its key is not already present.
        first_seen.setdefault(record[0], record)
    return list(first_seen.values())

# Keep a single tuple per entity name (the first occurrence) for the steps below.
unique_entities = remove_duplicates(entities)

In [43]:
# add embeddings to entities
import ollama

# Function to calculate embedding for each point
def calculate_embedding(point):
    """Return the ``"embedding"`` entry ollama reports for the text ``point``.

    The request is sent to the ``mxbai-embed-large`` model with ``point`` as
    the prompt.
    """
    response = ollama.embeddings(model='mxbai-embed-large', prompt=point)
    return response["embedding"]

# Rebuild each entity tuple as its first four fields plus an embedding of its
# name, type and description.
for position, record in enumerate(unique_entities):
    prompt = 'name: ' + record[0] + 'type: ' + record[1] + 'description: ' + record[2]
    vector = calculate_embedding(prompt)
    unique_entities[position] = (record[0], record[1], record[2], record[3], vector)

In [50]:
import igraph as ig
import leidenalg as la
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Collect the embedding stored at index 4 of every entity tuple into one array
# (one row per entity; assumes all embeddings have the same length -- TODO confirm).
embeddings = np.array([entity[4] for entity in unique_entities])

# Construct similarity graph using cosine similarity
similarity_matrix = cosine_similarity(embeddings)
np.fill_diagonal(similarity_matrix, 0)  # Remove self-similarity
# Every (row, col) pair with strictly positive similarity becomes an edge.
# NOTE(review): argwhere scans the full matrix, so a symmetric matrix yields
# both (i, j) and (j, i) for each pair -- see the note on the graph below.
edges = np.argwhere(similarity_matrix > 0)
# Look up the similarity value of each selected pair, in the same order as `edges`.
weights = similarity_matrix[edges[:, 0], edges[:, 1]]

# Create igraph graph with one vertex per entity; vertex index == position in unique_entities.
# NOTE(review): ig.Graph() is presumably undirected by default, in which case
# each pair is stored as two parallel edges -- confirm, because
# resolution_parameter below was presumably tuned against that doubling.
g = ig.Graph()
g.add_vertices(len(unique_entities))
g.add_edges(edges)
g.es['weight'] = weights

# Apply Leiden Algorithm (CPM quality function, weighted by cosine similarity).
# NOTE(review): no seed is passed, so the partition may differ between runs --
# confirm whether a fixed seed is wanted for reproducibility.
partition = la.find_partition(g, la.CPMVertexPartition, weights='weight', resolution_parameter=1.5)


# Group the entity tuples by the cluster id assigned to their vertex.
clusters = {}
for idx, cluster_id in enumerate(partition.membership):
    # setdefault creates the bucket the first time a cluster id is seen.
    clusters.setdefault(cluster_id, []).append(unique_entities[idx])

In [59]:
# List the entity names of every cluster under a "Group <id>:" heading.
for label in clusters:
    print(f"Group {label}:")
    for member in clusters[label]:
        print(member[0])

Group 43:
STUDIES
RESEARCH
SURVEY
Group 32:
PARKING DEMAND
PARKING FACILITIES
PARKING SPACES
PARKING
Group 2:
URBAN CITIES
URBAN AREAS
FUTURE URBAN AREAS
URBAN ENVIRONMENTS
CITY
CITY METROPOLITAN AREAS
METROPOLITAN AREAS
URBAN REGIONS
METROPOLITAN CITIES
DEVELOPED CITIES
URBAN AREA
CITIES
CROWDED CITIES
Group 0:
AUTONOMOUS VEHICLES
ACs
AVS
AUTONOMOUS MOBILITY
FULLY AUTOMATED AVS
AV TECHNOLOGY
AUTONOMOUS CARS
SELF-DRIVING CARS
AV
AUTONOMOUS VEHICLE ADOPTION
SELF-DRIVING TAXI
AUTONOMOUS VEHICLE TECHNOLOGY
SAV
DRIVERLESS VEHICLES
LEVEL 2 AUTONOMOUS VEHICLES
LEVEL 3 AUTONOMOUS VEHICLES
SELF-DRIVING VEHICLES
SAVS
AV TECHNOLOGIES
VEHICLE AUTOMATION
FULLY AUTONOMOUS
CONNECTED AUTONOMOUS CARS
INTERCONNECTED AUTONOMOUS VEHICLES
SELF-DRIVING AUTOMOBILES
AUTONOMOUS DRIVING
AUTONOMOUS VEHICLE
CONTROL
SELF-DRIVING FEATURES
DRIVING AUTONOMOUSLY
AUTONOMOUS ELECTRIC VEHICLES (AVs)
AUTOMOBILE TECHNOLOGY (AVS)
ELECTRIC AUTONOMOUS VEHICLES
AUTONOMOUS ELECTRIC VEHICLES
MOTION CONTROL
AUTONOMOUS CAR
AUTONO

In [62]:
from collections import Counter
# entity resolution mapping
# in each group, find the most common name as the entity name
def most_common_or_first(my_list):
    """Return the value that occurs most often in ``my_list``.

    Args:
        my_list: Sequence of hashable values.

    Returns:
        ``None`` when ``my_list`` is empty; otherwise the most frequent
        value. When several values share the highest count, the one that
        appears first in ``my_list`` is returned, because ``Counter`` keeps
        equal counts in first-encountered order. A list of all-distinct
        values therefore yields its first element.
    """
    if not my_list:
        return None

    # The list is non-empty here, so most_common(1) always holds one pair;
    # no separate "first element" fallback is needed.
    (value, _count), = Counter(my_list).most_common(1)
    return value

# Map every entity name in a cluster to that cluster's representative name.
entity_mapping = {}
for key, group in clusters.items():
    names = [item[0] for item in group]
    common_value = most_common_or_first(names)
    entity_mapping.update({name: common_value for name in names})

In [63]:
# Display the alias -> representative-name mapping built in the previous cell.
entity_mapping

{'STUDIES': 'STUDIES',
 'RESEARCH': 'STUDIES',
 'SURVEY': 'STUDIES',
 'PARKING DEMAND': 'PARKING DEMAND',
 'PARKING FACILITIES': 'PARKING DEMAND',
 'PARKING SPACES': 'PARKING DEMAND',
 'PARKING': 'PARKING DEMAND',
 'URBAN CITIES': 'URBAN CITIES',
 'URBAN AREAS': 'URBAN CITIES',
 'FUTURE URBAN AREAS': 'URBAN CITIES',
 'URBAN ENVIRONMENTS': 'URBAN CITIES',
 'CITY': 'URBAN CITIES',
 'CITY METROPOLITAN AREAS': 'URBAN CITIES',
 'METROPOLITAN AREAS': 'URBAN CITIES',
 'URBAN REGIONS': 'URBAN CITIES',
 'METROPOLITAN CITIES': 'URBAN CITIES',
 'DEVELOPED CITIES': 'URBAN CITIES',
 'URBAN AREA': 'URBAN CITIES',
 'CITIES': 'URBAN CITIES',
 'CROWDED CITIES': 'URBAN CITIES',
 'AUTONOMOUS VEHICLES': 'AUTONOMOUS VEHICLES',
 'ACs': 'AUTONOMOUS VEHICLES',
 'AVS': 'AUTONOMOUS VEHICLES',
 'AUTONOMOUS MOBILITY': 'AUTONOMOUS VEHICLES',
 'FULLY AUTOMATED AVS': 'AUTONOMOUS VEHICLES',
 'AV TECHNOLOGY': 'AUTONOMOUS VEHICLES',
 'AUTONOMOUS CARS': 'AUTONOMOUS VEHICLES',
 'SELF-DRIVING CARS': 'AUTONOMOUS VEHICLES',

In [61]:
# NOTE(review): `names` is only assigned inside the entity-mapping loop, so this
# cell depends on leftover kernel state and fails on a fresh kernel if run
# before that loop -- looks like a debugging peek; consider removing it.
names

['STUDIES', 'RESEARCH', 'SURVEY']