This notebook walks through four different entity-mapping methods for deciding whether columns from different sources represent the same identifier.

In [1]:
import warnings
warnings.filterwarnings("ignore")
import os
import pandas as pd
import textwrap

from langchain_community.graphs import Neo4jGraph
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate,SystemMessagePromptTemplate, PromptTemplate
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from langchain import LLMChain, OpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from dotenv import load_dotenv
load_dotenv('.env', override=True)

False

In [2]:
# Load connection settings from the process environment.
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE')

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# os.getenv returns None for an unset variable, and None + str raises
# TypeError, so only build the endpoint when a base URL is configured.
_openai_base_url = os.getenv('OPENAI_BASE_URL')
OPENAI_ENDPOINT = None if _openai_base_url is None else _openai_base_url + '/embeddings'

# Enable LangChain tracing only when an API key is available. Copying the key
# back into os.environ is unnecessary (os.getenv already reads os.environ) and
# assigning None to os.environ raises TypeError when the key is missing.
if os.getenv("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_TRACING_V2"] = "true"

In [3]:
# Graph connection shared by the Cypher queries in this notebook.
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

## Method 1
Here we simply compute the cosine similarity between the embedding vectors stored on the column nodes.
Columns that represent similar things will have a high similarity score.

~~~sql
  MATCH (n1:column), (n2:column)
  WHERE n1.source <> n2.source
  AND (n1.col_type='reference' and n2.col_type='reference')
  RETURN n1.source, n1.name , n2.source, n2.name, 
  vector.similarity.cosine(n1.embedding, n2.embedding) as similarity
~~~


In [4]:
# Score every cross-source pair of 'reference' columns by the cosine
# similarity of their stored embeddings.
query = """
  MATCH (n1:column), (n2:column)
  WHERE n1.source <> n2.source
  AND (n1.col_type='reference' and n2.col_type='reference')
  RETURN n1.source, n1.name , n2.source, n2.name, vector.similarity.cosine(n1.embedding, n2.embedding) as similarity
"""
result = kg.query(query)
df = pd.DataFrame(result)

# The MATCH yields each pair in both orders, (a, b) and (b, a). Build an
# order-independent key per row so one of the two can be dropped.
# frozenset (not set) is used because duplicate detection hashes the values
# and plain sets are unhashable.
s1 = df['n1.source']+'.'+df['n1.name']
s2 = df['n2.source']+'.'+df['n2.name']
df['unique'] = [frozenset(pair) for pair in zip(s1, s2)]

# Highest similarity first; chained instead of inplace so the cell yields the
# same frame every time it is re-run.
df = (
    df.sort_values('similarity', ascending=False)
    .drop_duplicates('unique')
    .reset_index(drop=True)
)
df.head(10)

Unnamed: 0,n1.source,n1.name,n2.source,n2.name,similarity,unique
0,foot_traffic,country,weather,country,0.983184,"{foot_traffic.country, weather.country}"
1,foot_traffic,symbol,sec_master,ticker,0.960115,"{sec_master.ticker, foot_traffic.symbol}"
2,foot_traffic,post_code,weather,zip_code,0.954602,"{weather.zip_code, foot_traffic.post_code}"
3,foot_traffic,symbol,sec_master,entity_name,0.936529,"{sec_master.entity_name, foot_traffic.symbol}"
4,web_traffic,website_owner,social_media,page_owner,0.935326,"{web_traffic.website_owner, social_media.page_..."
5,weather,zip_code,foot_traffic,country,0.935087,"{foot_traffic.country, weather.zip_code}"
6,foot_traffic,symbol,sec_master,security_name,0.93466,"{foot_traffic.symbol, sec_master.security_name}"
7,web_traffic,website_brand,social_media,page_owner,0.927484,"{web_traffic.website_brand, social_media.page_..."
8,web_traffic,website_owner,sec_master,entity_name,0.926716,"{web_traffic.website_owner, sec_master.entity_..."
9,foot_traffic,location_name,weather,zip_code,0.921447,"{weather.zip_code, foot_traffic.location_name}"


## Method 2
Non-RAG: we put the descriptions of two columns directly into the prompt and ask the LLM whether they represent the same identifier.

In [5]:
def get_column_details(col_name,col_source,graph=None):
    """Return a text summary (name, description, values) of one column node.

    Args:
        col_name: Value compared with the ``name`` property of a ``column`` node.
        col_source: Value compared with the ``source`` property of that node.
        graph: Object exposing ``query(cypher, params=...)``. Defaults to the
            notebook-level ``kg`` connection.

    Returns:
        A string with one ``<field>: <value>,`` line per returned field, each
        terminated by a newline. Only the first matching row is used.

    Raises:
        IndexError: If no column node matches the given name and source.
    """
    if graph is None:
        graph = kg

    # Values are passed as query parameters rather than interpolated into the
    # Cypher text, so quotes in a name cannot break or alter the query.
    query = """
    match (c:column)
    where c.name = $col_name and c.source = $col_source
    return c.name, c.description, c.values
    """
    result = graph.query(query, params={"col_name": col_name, "col_source": col_source})

    if not result:
        # Same exception type as indexing the empty result, with a usable message.
        raise IndexError(
            f"no column node found with name={col_name!r} and source={col_source!r}"
        )

    lines = []
    for key, value in result[0].items():
        # Result keys look like 'c.name'; drop the 'c.' alias prefix.
        label = key[2:] if key.startswith('c.') else key
        lines.append(f"{label}: {value},\n")
    return "".join(lines)

In [7]:
# Text of the pairwise comparison prompt: the two column summaries are filled
# into {col1} and {col2}, followed by two questions for the model.
_pairwise_template_text = """
You are given two columns from different tables that need to be compared to determine if they represent the same identifiers. 
Here are the details of the columns:

Column 1:
{col1}

Column 2:
{col2}

Compare these columns and answer the following questions:
1.Do these columns seem to represent the same type of identifier based on name and the description?
2.Based on the sample values, can we infer that these values are similar enough that they are taken from the same identifier ?
"""
prompt_template = PromptTemplate.from_template(_pairwise_template_text)

# Chain that pairs the prompt with an OpenAI completion model (temperature 0.5).
_pairwise_llm = OpenAI(temperature=0.5)
prompt_chain = LLMChain(llm=_pairwise_llm, prompt=prompt_template)

  warn_deprecated(
  warn_deprecated(


In [8]:
# Candidate pair at position 2 of the similarity-sorted frame.
row = df.iloc[2]

# The n2 side of the pair is looked up and printed first.
col_name, col_source = row['n2.name'], row['n2.source']
col2 = get_column_details(col_name, col_source)
print(col2)

# Then the n1 side.
col_name, col_source = row['n1.name'], row['n1.source']
col1 = get_column_details(col_name, col_source)
print(col1)
print()

# Ask the LLM whether the two columns represent the same identifier.
result = prompt_chain.run(col1=col1, col2=col2)
print(result)

name: zip_code,
description: ZIP code of the location where the data was collected.,
values: 10001,

name: post_code,
description: Postal code where the location is situated.,
values: 10001,73070,




  warn_deprecated(



1. Yes, based on the name and description, it appears that both columns represent some type of geographic identifier related to a location.
2. Based on the sample values, it is likely that these values are taken from the same identifier. Both values are numerical and have the same number of digits, and the values in column 2 are a subset of the values in column 1. However, without more context or information it is not possible to definitively say that they are from the same identifier. 


In [161]:
# Candidate pair at position 3 of the similarity-sorted frame.
row = df.iloc[3]

# The n2 side of the pair is looked up and printed first.
col_name, col_source = row['n2.name'], row['n2.source']
col2 = get_column_details(col_name, col_source)
print(col2)

# Then the n1 side.
col_name, col_source = row['n1.name'], row['n1.source']
col1 = get_column_details(col_name, col_source)
print(col1)
print()

# Ask the LLM whether the two columns represent the same identifier.
result = prompt_chain.run(col1=col1, col2=col2)
print(result)

name: entity_name,
description: Name of the company or entity underlying the security.,
values: Starbucks Corp,Tesla Inc,Alphabet Inc,Nvidia Corp,Nike Inc,Costco Wholesale Corp,Apple Inc,

name: symbol,
description: Stock symbol or identifier for the business.,
values: SBUX,CVS,


1. Yes, these columns seem to represent the same type of identifier based on the name and description. Both columns are related to identifying businesses or entities, with column 1 specifically mentioning stock symbols and column 2 mentioning company or entity names.

2. No, we cannot infer that these values are similar enough to be taken from the same identifier. While some of the values in both columns may match (e.g. SBUX and Starbucks Corp), there are also values in column 2 that do not have a corresponding match in column 1 (e.g. Tesla Inc, Alphabet Inc). Additionally, the values in column 2 are not limited to just stock symbols, as there are also company names listed. Therefore, we cannot assume that th

In [159]:
# Candidate pair at position 6 of the similarity-sorted frame.
row = df.iloc[6]

# The n2 side of the pair is looked up and printed first.
col_name, col_source = row['n2.name'], row['n2.source']
col2 = get_column_details(col_name, col_source)
print(col2)

# Then the n1 side.
col_name, col_source = row['n1.name'], row['n1.source']
col1 = get_column_details(col_name, col_source)
print(col1)
print()

# Ask the LLM whether the two columns represent the same identifier.
result = prompt_chain.run(col1=col1, col2=col2)
print(result)

name: security_name,
description: Name of the security.,
values: Starbucks Corp,Tesla Inc,Alphabet Inc-Cl A,Nvidia Corp,Nike Inc -Cl B,Costco Wholesale Corp,Apple Inc,

name: symbol,
description: Stock symbol or identifier for the business.,
values: SBUX,CVS,



1. Yes, these columns appear to represent the same type of identifier. Both columns contain names or symbols that represent a specific business or company.

2. No, based on the sample values, we cannot infer that these values are similar enough to be taken from the same identifier. While some values in column 2 (security_name) may match with values in column 1 (symbol), there are also values in column 2 that do not have a direct match in column 1. Therefore, we cannot assume that all values in column 2 are taken from the same identifiers as those in column 1.


In [158]:
# Candidate pair at position 4 of the similarity-sorted frame.
row = df.iloc[4]

# The n2 side of the pair is looked up and printed first.
col_name, col_source = row['n2.name'], row['n2.source']
col2 = get_column_details(col_name, col_source)
print(col2)

# Then the n1 side.
col_name, col_source = row['n1.name'], row['n1.source']
col1 = get_column_details(col_name, col_source)
print(col1)
print()

# Ask the LLM whether the two columns represent the same identifier.
result = prompt_chain.run(col1=col1, col2=col2)
print(result)

name: page_owner,
description: Name of the owner or brand of the Instagram page.,
values: Nike,Adidas,

name: website_owner,
description: Corporation or entity that owns the website.,
values: Costco Wholesale Corporation,Nike Inc,


1. Based on the name and description, it appears that these columns represent the same type of identifier, as they both pertain to ownership of a brand or entity.
2. It is possible that the values in these columns are taken from the same identifier, as both columns contain values from well-known corporations and brands in the retail industry. However, without further context or information, it is not possible to definitively infer that these values are similar enough to be from the same identifier. 


In [164]:
# Ground truth: column pairs that are known to be the same identifier.
# Each entry is [first_column, second_column], identified by name and source.
int_pairs = [
    [
        {'name': 'post_code', 'source': 'foot_traffic'},
        {'name': 'zip_code', 'source': 'weather'},
    ],
    [
        {'name': 'symbol', 'source': 'foot_traffic'},
        {'name': 'ticker', 'source': 'sec_master'},
    ],
    [
        {'name': 'website_owner', 'source': 'web_traffic'},
        {'name': 'entity_name', 'source': 'sec_master'},
    ],
    [
        {'name': 'website_brand', 'source': 'web_traffic'},
        {'name': 'page_owner', 'source': 'social_media'},
    ],
]

In [166]:
# Run the pairwise prompt over every known-matching pair.
for first, second in int_pairs:
    col1 = get_column_details(first['name'], first['source'])
    col2 = get_column_details(second['name'], second['source'])
    # Both summaries followed by an empty line, each on its own print line.
    print(col1, col2, "", sep="\n")
    result = prompt_chain.run(col1=col1, col2=col2)
    print(result)
    print("--------------------------------------------------")

name: post_code,
description: Postal code where the location is situated.,
values: 10001,73070,

name: zip_code,
description: ZIP code of the location where the data was collected.,
values: 10001,



1. Yes, these columns seem to represent the same type of identifier based on their names and descriptions. Both columns are related to location and have codes that identify specific areas.

2. Based on the sample values, it is possible that these values are similar enough that they are taken from the same identifier. The value "10001" appears in both columns, which suggests that they could be referring to the same location. However, without more information it is not possible to definitively say that they are from the same identifier. It is also possible that they could be referring to different locations with the same code.
--------------------------------------------------
name: symbol,
description: Stock symbol or identifier for the business.,
values: SBUX,CVS,

name: ticker,
descriptio

## Method 3
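RAG: given the details of one column, retrieve candidate columns from the vector index and ask the LLM which of them represent the same identifier.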

In [317]:
# Vector store over the nodes labelled 'embeddable': the listed text
# properties are what gets embedded, and the vector is stored in 'embedding'.
neo4j_vector_store = Neo4jVector.from_existing_graph(
    embedding=OpenAIEmbeddings(),
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    # Target the same database as the `kg` connection; without this the store
    # falls back to the library default instead of NEO4J_DATABASE.
    database=NEO4J_DATABASE,
    index_name='embedded_index',
    node_label='embeddable',
    text_node_properties=['name','type','description','values'],
    embedding_node_property='embedding',
)



In [319]:
# Prompt for the retrieval-based check. {col} receives the details of the
# column being looked up and {context} receives the retrieved documents.
# The text is sent to the model verbatim, so it is left exactly as written.
template = """
You are given the below details about a column, find columns that represent the same identifier. There could be more than one match.
Use only the information from the given context
Compare these columns based on the flowing criterias and provide an explanation.
1.Do these columns seem to represent the same type of identifier based on name and the description?
2.Compare the sample values to infer if these values are similar enough that they are taken from the same identifier, they don't have to be the same.

Column details:
{col}

Retrieved information:
{context}
"""

# Chat prompt built from the template string above.
prompt = ChatPromptTemplate.from_template(template)

In [320]:
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Retriever returning the 4 nearest entries from the vector store.
_column_retriever = neo4j_vector_store.as_retriever(search_kwargs={'k': 4})

# The chain input is sent to the retriever (as "context") and also passed
# through unchanged (as "col"); the filled prompt goes to the chat model and
# the reply is reduced to a plain string.
rag_chain = (
    {
        "context": _column_retriever,
        "col": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [321]:
# Ground truth: column pairs that are known to be the same identifier.
# Each entry is [first_column, second_column], identified by name and source.
int_pairs = [
    [
        {'name': 'post_code', 'source': 'foot_traffic'},
        {'name': 'zip_code', 'source': 'weather'},
    ],
    [
        {'name': 'symbol', 'source': 'foot_traffic'},
        {'name': 'ticker', 'source': 'sec_master'},
    ],
    [
        {'name': 'website_owner', 'source': 'web_traffic'},
        {'name': 'entity_name', 'source': 'sec_master'},
    ],
    [
        {'name': 'website_brand', 'source': 'web_traffic'},
        {'name': 'page_owner', 'source': 'social_media'},
    ],
]

In [323]:
# For each known pair, look up only the first column and let the RAG chain
# find its matches. The second column's details were fetched but never used
# here, so that extra database query has been dropped.
for p in int_pairs:
    col1 = get_column_details(p[0]['name'],p[0]['source'])
    print(col1)
    print(rag_chain.invoke(col1))
    print("---------------------------------------------------------------------------------------------------------------------------------------")

name: post_code,
description: Postal code where the location is situated.,
values: 10001,73070,


Based on the information provided, the columns "post_code" and "zip_code" seem to represent the same type of identifier. Both columns refer to codes related to the location where the data was collected, with "post_code" specifically mentioning postal code and "zip_code" mentioning ZIP code. The values in both columns also include the same value "10001", which further supports the idea that they represent the same type of identifier.

Therefore, based on the name, description, and sample values, it can be inferred that the columns "post_code" and "zip_code" likely represent the same type of identifier.
---------------------------------------------
name: symbol,
description: Stock symbol or identifier for the business.,
values: SBUX,CVS,


Based on the information provided, the columns "symbol" and "ticker" seem to represent the same type of identifier based on name and description. Both col

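## Method 4
RAG with a custom retrieval query: each hit from the vector index is returned together with the columns linked to it by a `RELATED_TO` relationship, so the LLM sees the matched node and its related columns as context.
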
In [301]:
# Cypher fragment passed to Neo4jVector as `retrieval_query`.
# `node` and `score` are not defined inside this fragment; presumably they are
# bound by the vector index search that runs before it (verify against the
# Neo4jVector documentation).
# What the fragment does:
#   - matches every `column` node connected to `node` by RELATED_TO, in either direction;
#   - builds `self`, a text block with the node's name, type, description and values;
#   - builds `c_name`, the concatenated name/description/values of the related columns;
#   - returns the combined text, the score, and metadata with a blank `source`.
# NOTE(review): `limit 1` keeps a single row here, while the retriever built on
# this store asks for k=4; confirm that one result is intended.
# NOTE(review): the text is built with `+`; if any of these properties can be
# missing on a node, check how the concatenation behaves for nulls.
contextualize_query = """
match (node)-[:RELATED_TO]-(c:column)
with ('column:'+'\n'+'name:'+ node.name +'\n'+'type:'+node.type+'\n'+'description:'+node.description +'\n'+'values:'+node.values) as self,
reduce(s="", item in collect(c) | s + "\n\n" + 'column:'+'\n' + 'name:'+item.name +'\n'+ 'description:'+item.description +'\n'+ 'values:'+item.values ) as c_name,
score, {source: ' '} as metadata limit 1
return (self +'\n'+ c_name) as text, score, metadata  
"""

In [302]:
# Re-open the existing 'embedded_index' vector index, this time shaping the
# retrieved documents with the custom retrieval query.
neo4j_vector_store = Neo4jVector.from_existing_index(
    embedding=OpenAIEmbeddings(),
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    # Target the same database as the `kg` connection; without this the store
    # falls back to the library default instead of NEO4J_DATABASE.
    database=NEO4J_DATABASE,
    index_name='embedded_index',
    retrieval_query=contextualize_query,
)

In [309]:
# Prompt for the retrieval-based check. {col} receives the details of the
# column being looked up and {context} receives the retrieved documents.
# The text is sent to the model verbatim, so it is left exactly as written.
template = """
You are given the below details about a column, find columns that represent the same identifier. There could be more than one match.
Use only the information from the given context
Compare these columns based on the flowing criterias and provide an explanation.
1.Do these columns seem to represent the same type of identifier based on name and the description?
2.Compare the sample values to infer if these values are similar enough that they are taken from the same identifier, they don't have to be the same.

Column details:
{col}

Retrieved information:
{context}
"""

# Chat prompt built from the template string above.
prompt = ChatPromptTemplate.from_template(template)

In [310]:
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Retriever returning up to 4 entries from the vector store.
_column_retriever = neo4j_vector_store.as_retriever(search_kwargs={'k': 4})

# The chain input is sent to the retriever (as "context") and also passed
# through unchanged (as "col"); the filled prompt goes to the chat model and
# the reply is reduced to a plain string.
rag_chain = (
    {
        "context": _column_retriever,
        "col": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [315]:
# Hand-written column description used as the chain input. It follows the
# same "name / description / values" layout that get_column_details produces.
col = """
name: ZIPCODE,
description: location zip code where the data was collected
values: 07304, 10004,
"""

In [316]:
# Run the retrieval chain on the hand-written column and print the model's answer.
print(rag_chain.invoke(col))

Based on the information provided, the columns that could potentially represent the same identifier as "ZIPCODE" are "zip_code" and "post_code".

1. Comparing the names and descriptions:
- "zip_code" and "ZIPCODE" both refer to a code related to the location where the data was collected. They seem to represent the same type of identifier based on the similarity in name and description.
- "post_code" also seems to represent a similar identifier as it refers to a postal code where the location is situated. While the name is slightly different, the description indicates a similar concept.

2. Comparing the sample values:
- The sample value "10001" in the "zip_code" column matches one of the values in the "ZIPCODE" column, indicating a potential match.
- The sample value "10001" in the "post_code" column also matches one of the values in the "ZIPCODE" column, further suggesting a potential match.

Based on the above analysis, both "zip_code" and "post_code" columns seem to represent the sa