In [None]:
!pip install python-dotenv
!pip install langchain_community
!pip install langchain_openai
!pip install langchain.text_splitter
!pip install neo4j

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
Collecting langchain_community
  Downloading langchain_community-0.2.10-py3-none-any.whl.metadata (2.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langchain<0.3.0,>=0.2.9 (from langchain_community)
  Downloading langchain-0.2.12-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.3.0,>=0.2.23 (from langchain_community)
  Downloading langchain_core-0.2.27-py3-none-any.whl.metadata (6.2 kB)
Collecting langsmith<0.2.0,>=0.1.0 (from langchain_community)
  Downloading langsmith-0.1.96-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain_community)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata

In [None]:
from dotenv import load_dotenv
import os

# Common data processing
import json
import textwrap

# Langchain
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI


# Warning control
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load connection settings and API credentials from the environment file.
load_dotenv('neo4j_rag.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE') or 'neo4j'
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# Embeddings endpoint passed to genai.vector.encode() in the Cypher queries.
# OPENAI_BASE_URL is specific to the course environment; when it is unset, fall
# back to the public OpenAI API instead of failing on `None + '/embeddings'`.
# A trailing slash on the base URL is dropped so the result never has `//`.
OPENAI_ENDPOINT = (
    os.getenv('OPENAI_BASE_URL') or 'https://api.openai.com/v1'
).rstrip('/') + '/embeddings'

# Global constants
VECTOR_INDEX_NAME = 'form_10k_chunks'
VECTOR_NODE_LABEL = 'Chunk'
VECTOR_SOURCE_PROPERTY = 'text'
VECTOR_EMBEDDING_PROPERTY = 'textEmbedding'

In [None]:
# Graph connection used by the `kg.query(...)` calls in this notebook.
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

In [None]:
# Load one Form 10-K JSON file to inspect its structure.
first_file_name = "/content/data/form10k/0000037472-23-000024.json"
# A context manager closes the file handle as soon as parsing finishes.
with open(first_file_name, encoding="utf-8") as first_file:
    first_file_as_object = json.load(first_file)
type(first_file_as_object)
# files = os.listdir('/content/data')
# print(files)

dict

In [None]:
# Show each top-level key of the loaded form with the type of its value.
for k,v in first_file_as_object.items():
    print(k, type(v))

# Keep the 'item1' text and preview its first 1500 characters.
item1_text = first_file_as_object['item1']
item1_text[0:1500]

item1 <class 'str'>
item1a <class 'str'>
item7 <class 'str'>
item7a <class 'str'>
cik <class 'str'>
cusip6 <class 'str'>
cusip <class 'list'>
names <class 'list'>
source <class 'str'>


'>Item 1.\xa0\xa0Business\n \nGeneral\n \nFlexsteel Industries, Inc., and Subsidiaries (the “Company”) is one of the largest manufacturers, importers, and marketers of residential furniture products in the United States. Product offerings include a wide variety of furniture such as sofas, loveseats, chairs, reclining rocking chairs, swivel rockers, sofa beds, convertible bedding units, occasional tables, desks, dining tables and chairs, kitchen storage, bedroom furniture, and outdoor furniture. A featured component in most of the upholstered furniture is a unique steel drop-in seat spring from which the name “Flexsteel” is derived. The Company distributes its products throughout the United States through its e-commerce channel and direct sales force.\n \nThe Company operates in one reportable segment, furniture products.  The Company’s furniture products business involves the distribution of manufactured and imported products consisting of a broad line of furniture for the residential 

In [None]:
# Splitter shared by the rest of the notebook: target chunk size 1024 with an
# overlap of 20, both measured by `len` (i.e. in characters); separators are
# treated as literal strings rather than regular expressions.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)
item1_text_chunks = text_splitter.split_text(item1_text)

In [None]:
def split_form10k_data_from_file(file, max_chunks_per_item=20):
    """Split the text sections of one Form 10-K JSON file into chunk records.

    Args:
        file: Path to a JSON file containing the keys 'item1', 'item1a',
            'item7', 'item7a', 'names', 'cik', 'cusip6' and 'source'.
        max_chunks_per_item: Maximum number of chunks kept per section
            (default 20). Pass None to keep every chunk.

    Returns:
        A list of dicts, one per kept chunk, holding the chunk text plus
        metadata: 'f10kItem', 'chunkSeqId', 'formId', 'chunkId', 'names',
        'cik', 'cusip6' and 'source'.

    Raises:
        KeyError: If one of the expected keys is missing from the JSON.
    """
    print(file)
    # Context manager so the handle is closed even when parsing fails.
    with open(file, encoding='utf-8') as form_file:
        file_as_object = json.load(form_file)

    # The form id is the file name without directory and extension. It is the
    # same for every chunk, so compute it once; os.path also copes with paths
    # that contain no directory separator.
    form_id = os.path.splitext(os.path.basename(file))[0]

    chunks_with_metadata = []  # accumulates the chunk records
    for item in ['item1', 'item1a', 'item7', 'item7a']:  # sections pulled from the json
        print(f'Processing {item} from {file}')
        item_text_chunks = text_splitter.split_text(file_as_object[item])
        # A slice bound of None keeps the whole list.
        kept_chunks = item_text_chunks[:max_chunks_per_item]
        for chunk_seq_id, chunk in enumerate(kept_chunks):
            chunks_with_metadata.append({
                'text': chunk,
                # metadata from looping...
                'f10kItem': item,
                'chunkSeqId': chunk_seq_id,
                # constructed metadata...
                'formId': f'{form_id}',  # pulled from the filename
                'chunkId': f'{form_id}-{item}-chunk{chunk_seq_id:04d}',
                # metadata from file...
                'names': file_as_object['names'],
                'cik': file_as_object['cik'],
                'cusip6': file_as_object['cusip6'],
                'source': file_as_object['source'],
            })
        print(f'\tSplit into {len(kept_chunks)} chunks')
    return chunks_with_metadata

In [None]:
# Chunk the first form and display the first chunk record.
first_file_chunks = split_form10k_data_from_file(first_file_name)
first_file_chunks[0]

/content/data/form10k/0000037472-23-000024.json
Processing item1 from /content/data/form10k/0000037472-23-000024.json
	Split into 9 chunks
Processing item1a from /content/data/form10k/0000037472-23-000024.json
	Split into 17 chunks
Processing item7 from /content/data/form10k/0000037472-23-000024.json
	Split into 17 chunks
Processing item7a from /content/data/form10k/0000037472-23-000024.json
	Split into 1 chunks


{'text': '>Item 1.\xa0\xa0Business\n \nGeneral\n \nFlexsteel Industries, Inc., and Subsidiaries (the “Company”) is one of the largest manufacturers, importers, and marketers of residential furniture products in the United States. Product offerings include a wide variety of furniture such as sofas, loveseats, chairs, reclining rocking chairs, swivel rockers, sofa beds, convertible bedding units, occasional tables, desks, dining tables and chairs, kitchen storage, bedroom furniture, and outdoor furniture. A featured component in most of the upholstered furniture is a unique steel drop-in seat spring from which the name “Flexsteel” is derived. The Company distributes its products throughout the United States through its e-commerce channel and direct sales force.\n \nThe Company operates in one reportable segment, furniture products.  The Company’s furniture products business involves the distribution of manufactured and imported products consisting of a broad line of furniture for the res

In [None]:
# Cypher that MERGEs a :Chunk node keyed by chunkId. The remaining properties
# are written only when the node is newly created (ON CREATE SET); the node is
# returned either way. Expects a `chunkParam` map with the keys used below.
merge_chunk_node_query = """
MERGE(mergedChunk:Chunk {chunkId: $chunkParam.chunkId})
    ON CREATE SET
        mergedChunk.names = $chunkParam.names,
        mergedChunk.formId = $chunkParam.formId,
        mergedChunk.cik = $chunkParam.cik,
        mergedChunk.cusip6 = $chunkParam.cusip6,
        mergedChunk.source = $chunkParam.source,
        mergedChunk.f10kItem = $chunkParam.f10kItem,
        mergedChunk.chunkSeqId = $chunkParam.chunkSeqId,
        mergedChunk.text = $chunkParam.text
RETURN mergedChunk
"""

In [None]:
# Try the merge query on a single chunk record first.
kg.query(merge_chunk_node_query,
         params={'chunkParam':first_file_chunks[0]})

[{'mergedChunk': {'formId': '0000037472-23-000024',
   'f10kItem': 'item1',
   'names': ['FLEXSTEEL INDS INC'],
   'cik': '37472',
   'textEmbedding': [0.019772157073020935,
    -0.002064920263364911,
    -0.02809232473373413,
    -0.021559851244091988,
    -0.009140086360275745,
    0.01266170758754015,
    -0.031156940385699272,
    -0.0024530377704650164,
    0.014718227088451385,
    -0.029597749933600426,
    0.006545914802700281,
    0.00809838529676199,
    -0.01895223744213581,
    -0.012863327749073505,
    -0.007285186555236578,
    0.016048915684223175,
    0.001388654694892466,
    -0.001809535431675613,
    0.01819952391088009,
    -0.03782382607460022,
    0.0030175726860761642,
    -0.005944416392594576,
    0.0044255489483475685,
    0.004371784161776304,
    -0.022500742226839066,
    0.014099927619099617,
    0.021546408534049988,
    0.007916927337646484,
    0.004318018909543753,
    0.016479037702083588,
    0.005332837346941233,
    -0.012634824961423874,
    -0.0

In [None]:
# Enforce uniqueness of :Chunk.chunkId; IF NOT EXISTS makes re-running this
# cell a no-op once the constraint is in place.
kg.query("""
CREATE CONSTRAINT unique_chunk IF NOT EXISTS
    FOR (c:Chunk) REQUIRE c.chunkId IS UNIQUE
""")

[]

In [None]:
# Run the merge query once per chunk record of the first form.
# NOTE: node_count counts the queries issued. Because the query uses MERGE, a
# chunk whose node already exists is matched rather than created.
node_count = 0
for chunk in first_file_chunks:
    print(f"Creating `:Chunk` node for chunk ID {chunk['chunkId']}")
    kg.query(merge_chunk_node_query,
            params={
                'chunkParam': chunk
            })
    node_count += 1
print(f"Created {node_count} nodes")

Creating `:Chunk` node for chunk ID 0000037472-23-000024-item1-chunk0000
Creating `:Chunk` node for chunk ID 0000037472-23-000024-item1-chunk0001
Creating `:Chunk` node for chunk ID 0000037472-23-000024-item1-chunk0002
Creating `:Chunk` node for chunk ID 0000037472-23-000024-item1-chunk0003
Creating `:Chunk` node for chunk ID 0000037472-23-000024-item1-chunk0004
Creating `:Chunk` node for chunk ID 0000037472-23-000024-item1-chunk0005
Creating `:Chunk` node for chunk ID 0000037472-23-000024-item1-chunk0006
Creating `:Chunk` node for chunk ID 0000037472-23-000024-item1-chunk0007
Creating `:Chunk` node for chunk ID 0000037472-23-000024-item1-chunk0008
Creating `:Chunk` node for chunk ID 0000037472-23-000024-item1a-chunk0000
Creating `:Chunk` node for chunk ID 0000037472-23-000024-item1a-chunk0001
Creating `:Chunk` node for chunk ID 0000037472-23-000024-item1a-chunk0002
Creating `:Chunk` node for chunk ID 0000037472-23-000024-item1a-chunk0003
Creating `:Chunk` node for chunk ID 0000037472-

In [None]:
# Create the vector index over :Chunk.textEmbedding (1536 dimensions, cosine
# similarity). The index name is hard-coded here and must stay in sync with
# VECTOR_INDEX_NAME, which the search code uses to look it up.
kg.query("""
         CREATE VECTOR INDEX `form_10k_chunks` IF NOT EXISTS
          FOR (c:Chunk) ON (c.textEmbedding)
          OPTIONS { indexConfig: {
            `vector.dimensions`: 1536,
            `vector.similarity_function`: 'cosine'
         }}
""")

[]

In [None]:
# For every :Chunk that has no textEmbedding yet, encode its text with
# genai.vector.encode (OpenAI provider, using the configured key and endpoint)
# and store the result in the node's "textEmbedding" property. Chunks that
# already have an embedding are skipped by the WHERE clause.
kg.query("""
    MATCH (chunk:Chunk) WHERE chunk.textEmbedding IS NULL
    WITH chunk, genai.vector.encode(
      chunk.text,
      "OpenAI",
      {
        token: $openAiApiKey,
        endpoint: $openAiEndpoint
      }) AS vector
    CALL db.create.setNodeVectorProperty(chunk, "textEmbedding", vector)
    """,
    params={"openAiApiKey":OPENAI_API_KEY, "openAiEndpoint": OPENAI_ENDPOINT} )

[]

In [None]:
# Re-read the graph schema from the database and print it.
kg.refresh_schema()
print(kg.schema)

Node properties:
Chunk {chunkId: STRING, names: LIST, formId: STRING, cik: STRING, cusip6: STRING, source: STRING, f10kItem: STRING, chunkSeqId: INTEGER, text: STRING, textEmbedding: LIST}
Form {names: LIST, formId: STRING, cik: STRING, cusip6: STRING, source: STRING}
Relationship properties:
SECTION {f10kItem: STRING}
The relationships:
(:Chunk)-[:NEXT]->(:Chunk)
(:Chunk)-[:PART_OF]->(:Form)
(:Form)-[:SECTION]->(:Chunk)


In [None]:
def neo4j_vector_search(question, top_k=10):
  """Search for similar nodes using the Neo4j vector index.

  The question is embedded inside the database with genai.vector.encode and
  the resulting vector is used to query the index named VECTOR_INDEX_NAME.

  Args:
    question: Text to search for.
    top_k: Number of nearest neighbours requested from the index (default 10).

  Returns:
    The rows returned by `kg.query`: one mapping per hit with the keys
    'score', 'text' (the node's text) and 'source' (the node's chunkId).
  """
  vector_search_query = """
    WITH genai.vector.encode(
      $question,
      "OpenAI",
      {
        token: $openAiApiKey,
        endpoint: $openAiEndpoint
      }) AS question_embedding
    CALL db.index.vector.queryNodes($index_name, $top_k, question_embedding) yield node, score
    RETURN score, node.text AS text, node.chunkId AS source
  """
  similar = kg.query(vector_search_query,
                     params={
                      'question': question,
                      'openAiApiKey':OPENAI_API_KEY,
                      'openAiEndpoint': OPENAI_ENDPOINT,
                      'index_name':VECTOR_INDEX_NAME,
                      'top_k': top_k})
  return similar

In [None]:
# Run a sample question through the raw vector search and show the hits.
search_results = neo4j_vector_search(
    'What was the net sales attributable to the Contract segment in fiscal 2022?'
)
search_results

[{'score': 0.9324605464935303,
  'text': 'Fiscal 2022 Compared to Fiscal 2021\n \nNet sales were $544.3 million for the year ended June 30, 2022, compared to net sales of $478.9 million in the prior year, an increase of $65.4 million or 13.6%. Sales of products sold through retailers grew by $73.4 million or 17.8% primarily driven by pricing and a strong order backlog at the start of the year.  Sales of products sold through e-commerce channels decreased by ($8.0) million, or (12%) due to a decrease in consumer demand.\n \nGross margin as a percent of net sales for the year ended June 30, 2022, was 13.4%, compared to 20.2% for the prior year period, a decrease of 680-bps. The 680-bps decrease was primarily driven by \na 450-bps decrease related to ancillary charges caused by domestic supply chain disruptions and higher per diem charges, a decrease of 200-bps due to pricing promotions and inventory write-downs, a decrease of 110-bps related to capacity growth \ninvestments\n in a third 

In [None]:

# Expose the :Chunk nodes to LangChain as a vector store, reusing the index,
# node label, text property and embedding property defined in the constants.
# NOTE(review): no `database` argument is passed here, unlike `kg` — confirm
# the store resolves to the same database.
neo4j_vector_store = Neo4jVector.from_existing_graph(
    embedding=OpenAIEmbeddings(),
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name=VECTOR_INDEX_NAME,
    node_label=VECTOR_NODE_LABEL,
    text_node_properties=[VECTOR_SOURCE_PROPERTY],
    embedding_node_property=VECTOR_EMBEDDING_PROPERTY,
)


In [None]:
# retriever = neo4j_vector_store.as_retriever()
# chain = RetrievalQAWithSourcesChain.from_chain_type(
#     ChatOpenAI(temperature=0),
#     chain_type="stuff",
#     retriever=retriever
# )
from langchain_core.retrievers import BaseRetriever
from typing import List
from langchain_core.documents import Document

class CustomRetriever(BaseRetriever):
    """Retriever that answers queries through `neo4j_vector_search`."""

    def _get_relevant_documents(self, query: str):
        """Return one Document per vector-search hit for `query`.

        Each hit's 'text' becomes the page content; its 'source' value (or
        'Unknown' when the key is absent) is stored under metadata['source'].
        """
        documents = []
        for hit in neo4j_vector_search(query):
            documents.append(
                Document(
                    page_content=hit['text'],
                    metadata={'source': hit.get('source', 'Unknown')},
                )
            )
        return documents



# Question-answering chain over the custom retriever. "stuff" places the
# retrieved documents into a single prompt; the source documents are returned
# alongside the answer.
retriever = CustomRetriever()

chain = RetrievalQAWithSourcesChain.from_chain_type(
    ChatOpenAI(temperature=0),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)

In [None]:
def prettychain(question: str) -> None:
    """Ask the module-level `chain` a question and print its response.

    Prints the raw response mapping first, then the 'answer' text wrapped to
    60 columns.

    Args:
        question: The question passed to the chain under the "question" key.

    Returns:
        None. The function only prints, so the return annotation is `None`
        rather than `str`.
    """
    response = chain({"question": question},
        return_only_outputs=True,)
    print("response:", response)
    print(textwrap.fill(response['answer'], 60))

In [None]:
# Ask a sample question through the chain and print the response.
question = "Where are Flexsteel's manufacturing facilities located?"
prettychain(question)

response: {'answer': "Flexsteel's manufacturing facilities are located in Dublin, Georgia, and Juarez, Mexico.\n", 'sources': '0000037472-23-000024-item1-chunk0001', 'source_documents': [Document(metadata={'source': '0000037472-23-000024-item1-chunk0000'}, page_content='>Item 1.\xa0\xa0Business\n \nGeneral\n \nFlexsteel Industries, Inc., and Subsidiaries (the “Company”) is one of the largest manufacturers, importers, and marketers of residential furniture products in the United States. Product offerings include a wide variety of furniture such as sofas, loveseats, chairs, reclining rocking chairs, swivel rockers, sofa beds, convertible bedding units, occasional tables, desks, dining tables and chairs, kitchen storage, bedroom furniture, and outdoor furniture. A featured component in most of the upholstered furniture is a unique steel drop-in seat spring from which the name “Flexsteel” is derived. The Company distributes its products throughout the United States through its e-commerce c

In [None]:
# Read the form-level metadata (names, source, formId, cik, cusip6) off a
# single, arbitrarily chosen :Chunk node.
cypher = """
  MATCH (anyChunk:Chunk)
  WITH anyChunk LIMIT 1
  RETURN anyChunk { .names, .source, .formId, .cik, .cusip6 } as formInfo
"""
form_info_list = kg.query(cypher)

form_info_list


[{'formInfo': {'cik': '37472',
   'source': 'https://www.sec.gov/Archives/edgar/data/37472/000003747223000024/0000037472-23-000024-index.htm',
   'formId': '0000037472-23-000024',
   'names': ['FLEXSTEEL INDS INC'],
   'cusip6': '339382'}}]

In [None]:
# Unwrap the single result row into a plain dict of form metadata.
form_info = form_info_list[0]['formInfo']
form_info

{'cik': '37472',
 'source': 'https://www.sec.gov/Archives/edgar/data/37472/000003747223000024/0000037472-23-000024-index.htm',
 'formId': '0000037472-23-000024',
 'names': ['FLEXSTEEL INDS INC'],
 'cusip6': '339382'}

In [None]:
# MERGE the :Form node keyed by formId and set its descriptive properties only
# when the node is created. All four assignments sit in one comma-separated
# ON CREATE SET list: with separate SET clauses only the first belongs to
# ON CREATE and the others also run when an existing node is matched.
cypher = """
    MERGE (f:Form {formId: $formInfoParam.formId })
      ON CREATE SET
        f.names = $formInfoParam.names,
        f.source = $formInfoParam.source,
        f.cik = $formInfoParam.cik,
        f.cusip6 = $formInfoParam.cusip6
"""

kg.query(cypher, params={'formInfoParam': form_info})

[]

In [None]:
# Link the chunks of one section of one form into a chain of NEXT
# relationships: the chunks are collected in ascending chunkSeqId order and
# passed to apoc.nodes.link with avoidDuplicates set. Run here for 'item1'
# only; the query returns the number of chunks in the section.
cypher = """
  MATCH (from_same_section:Chunk)
  WHERE from_same_section.formId = $formIdParam
    AND from_same_section.f10kItem = $f10kItemParam
  WITH from_same_section
    ORDER BY from_same_section.chunkSeqId ASC
  WITH collect(from_same_section) as section_chunk_list
    CALL apoc.nodes.link(
        section_chunk_list,
        "NEXT",
        {avoidDuplicates: true}
    )
  RETURN size(section_chunk_list)
"""

kg.query(cypher, params={'formIdParam': form_info['formId'],
                         'f10kItemParam': 'item1'})


[{'size(section_chunk_list)': 9}]

In [None]:
# Refresh and print the schema again after the graph changes made so far.
kg.refresh_schema()
print(kg.schema)

Node properties:
Chunk {chunkId: STRING, names: LIST, formId: STRING, cik: STRING, cusip6: STRING, source: STRING, f10kItem: STRING, chunkSeqId: INTEGER, text: STRING, textEmbedding: LIST}
Form {names: LIST, formId: STRING, cik: STRING, cusip6: STRING, source: STRING}
Relationship properties:
SECTION {f10kItem: STRING}
The relationships:
(:Chunk)-[:NEXT]->(:Chunk)
(:Chunk)-[:PART_OF]->(:Form)
(:Form)-[:SECTION]->(:Chunk)


In [None]:
# Same section-linking query as before, now run for every section of the form
# so each section gets its own NEXT chain ordered by chunkSeqId.
cypher = """
  MATCH (from_same_section:Chunk)
  WHERE from_same_section.formId = $formIdParam
    AND from_same_section.f10kItem = $f10kItemParam
  WITH from_same_section
    ORDER BY from_same_section.chunkSeqId ASC
  WITH collect(from_same_section) as section_chunk_list
    CALL apoc.nodes.link(
        section_chunk_list,
        "NEXT",
        {avoidDuplicates: true}
    )
  RETURN size(section_chunk_list)
"""
for form10kItemName in ['item1', 'item1a', 'item7', 'item7a']:
  kg.query(cypher, params={'formIdParam':form_info['formId'],
                           'f10kItemParam': form10kItemName})


In [None]:
# Connect every chunk to the form with the same formId through a PART_OF
# relationship (MERGE, so re-running adds no duplicates) and return the number
# of relationships matched or created.
cypher = """
  MATCH (c:Chunk), (f:Form)
    WHERE c.formId = f.formId
  MERGE (c)-[newRelationship:PART_OF]->(f)
  RETURN count(newRelationship)
"""

kg.query(cypher)

[{'count(newRelationship)': 44}]

In [None]:
# Connect each form to the first chunk (chunkSeqId = 0) of each of its
# sections with a SECTION relationship that carries the section name in its
# f10kItem property.
cypher = """
  MATCH (first:Chunk), (f:Form)
  WHERE first.formId = f.formId
    AND first.chunkSeqId = 0
  WITH first, f
    MERGE (f)-[r:SECTION {f10kItem: first.f10kItem}]->(first)
  RETURN count(r)
"""

kg.query(cypher)

[{'count(r)': 4}]

In [None]:
# Follow the form's SECTION relationship for 'item1' to fetch the first chunk
# of that section. Indexing with [0] assumes the query returns at least one
# row; it raises IndexError otherwise.
cypher = """
  MATCH (f:Form)-[r:SECTION]->(first:Chunk)
    WHERE f.formId = $formIdParam
        AND r.f10kItem = $f10kItemParam
  RETURN first.chunkId as chunkId, first.text as text
"""

first_chunk_info = kg.query(cypher, params={
    'formIdParam': form_info['formId'],
    'f10kItemParam': 'item1'
})[0]

first_chunk_info


{'chunkId': '0000037472-23-000024-item1-chunk0000',
 'text': '>Item 1.\xa0\xa0Business\n \nGeneral\n \nFlexsteel Industries, Inc., and Subsidiaries (the “Company”) is one of the largest manufacturers, importers, and marketers of residential furniture products in the United States. Product offerings include a wide variety of furniture such as sofas, loveseats, chairs, reclining rocking chairs, swivel rockers, sofa beds, convertible bedding units, occasional tables, desks, dining tables and chairs, kitchen storage, bedroom furniture, and outdoor furniture. A featured component in most of the upholstered furniture is a unique steel drop-in seat spring from which the name “Flexsteel” is derived. The Company distributes its products throughout the United States through its e-commerce channel and direct sales force.\n \nThe Company operates in one reportable segment, furniture products.  The Company’s furniture products business involves the distribution of manufactured and imported products

In [None]:
# Follow the NEXT relationship from the first chunk to the chunk after it.
# Indexing with [0] assumes such a chunk exists.
cypher = """
  MATCH (first:Chunk)-[:NEXT]->(nextChunk:Chunk)
    WHERE first.chunkId = $chunkIdParam
  RETURN nextChunk.chunkId as chunkId, nextChunk.text as text
"""

next_chunk_info = kg.query(cypher, params={
    'chunkIdParam': first_chunk_info['chunkId']
})[0]

next_chunk_info


{'chunkId': '0000037472-23-000024-item1-chunk0001',
 'text': '2023\n \n2022\n \n2021\nResidential  \n \n$\n 393,692\n \n$\n 543,447\n \n$\n 476,519\nContract  \n \n \n —\n \n \n 835\n \n \n 2,406\n \n \n$\n 393,692\n \n$\n 544,282\n \n$\n 478,925\n \n \n \nManufacturing and Offshore Sourcing\n \nDuring the fiscal year ended June 30, 2023, the Company operated manufacturing facilities located in Dublin, Georgia, and Juarez, Mexico.  These ongoing manufacturing operations are integral to the Company’s product offerings and distribution strategy by offering smaller and more frequent product runs of a wider product selection. The Company identifies and eliminates manufacturing inefficiencies and adjusts manufacturing schedules on a daily basis to meet customer requirements.  The Company has established relationships with key suppliers to ensure prompt delivery of quality component parts.  The Company’s production includes the use of selected component parts sourced offshore to enhance valu

In [None]:
# Show the two chunk ids side by side to confirm they are consecutive.
print(first_chunk_info['chunkId'], next_chunk_info['chunkId'])

0000037472-23-000024-item1-chunk0000 0000037472-23-000024-item1-chunk0001


In [None]:
# Match a window of three consecutive chunks with the given chunk in the
# middle (c2) and return the three chunk ids.
cypher = """
    MATCH (c1:Chunk)-[:NEXT]->(c2:Chunk)-[:NEXT]->(c3:Chunk)
        WHERE c2.chunkId = $chunkIdParam
    RETURN c1.chunkId, c2.chunkId, c3.chunkId
    """

kg.query(cypher,
         params={'chunkIdParam': next_chunk_info['chunkId']})

[{'c1.chunkId': '0000037472-23-000024-item1-chunk0000',
  'c2.chunkId': '0000037472-23-000024-item1-chunk0001',
  'c3.chunkId': '0000037472-23-000024-item1-chunk0002'}]

In [None]:
# Bind the three-chunk pattern to a path variable and return its length.
# Here the given chunk is the start of the window (c1), not the middle.
cypher = """
    MATCH window = (c1:Chunk)-[:NEXT]->(c2:Chunk)-[:NEXT]->(c3:Chunk)
        WHERE c1.chunkId = $chunkIdParam
    RETURN length(window) as windowPathLength
    """

kg.query(cypher,
         params={'chunkIdParam': next_chunk_info['chunkId']})

In [None]:
# Same fixed three-chunk window, returning the nodes on the path.
cypher = """
    MATCH window=(c1:Chunk)-[:NEXT]->(c2:Chunk)-[:NEXT]->(c3:Chunk)
        WHERE c2.chunkId = $chunkIdParam
    RETURN nodes(window) as chunkList
    """
# The middle chunk (c2) is anchored to the FIRST chunk of the section, so the
# pattern also requires a chunk before it.
# NOTE(review): presumably this returns no rows, since the first chunk should
# have no incoming NEXT relationship — confirm.
kg.query(cypher,
         params={'chunkIdParam': first_chunk_info['chunkId']})


In [None]:
# Variable-length window: zero or one NEXT hop on each side of the given
# chunk, so the pattern still matches when a neighbour is missing. Returns the
# length of every matching path.
cypher = """
  MATCH window=
      (:Chunk)-[:NEXT*0..1]->(c:Chunk)-[:NEXT*0..1]->(:Chunk)
    WHERE c.chunkId = $chunkIdParam
  RETURN length(window)
  """

kg.query(cypher,
         params={'chunkIdParam': first_chunk_info['chunkId']})

In [None]:
# Same variable-length window, keeping only the longest matching path: the
# paths are ordered by length (descending) and limited to one row.
cypher = """
  MATCH window=
      (:Chunk)-[:NEXT*0..1]->(c:Chunk)-[:NEXT*0..1]->(:Chunk)
    WHERE c.chunkId = $chunkIdParam
  WITH window as longestChunkWindow
      ORDER BY length(window) DESC LIMIT 1
  RETURN length(longestChunkWindow)
  """

kg.query(cypher,
         params={'chunkIdParam': first_chunk_info['chunkId']})

In [None]:
# retrieval_query_extra_text = """
# WITH node as extraText
# RETURN extraText + "\n" + node.text as text,
#     node {.text} AS metadata
# """

In [None]:
# Vector store over the existing :Chunk nodes, built without a custom
# retrieval query.
neo4j_vector_store = Neo4jVector.from_existing_graph(
    embedding=OpenAIEmbeddings(),
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name=VECTOR_INDEX_NAME,
    node_label=VECTOR_NODE_LABEL,
    text_node_properties=[VECTOR_SOURCE_PROPERTY],
    embedding_node_property=VECTOR_EMBEDDING_PROPERTY,
    # retrieval_query=retrieval_query_extra_text,
)
# NOTE: the retriever used below is CustomRetriever (which calls
# neo4j_vector_search), not a retriever derived from `neo4j_vector_store`.
retriever = CustomRetriever()
# windowless_retriever = neo4j_vector_store.as_retriever()

# Create a chatbot Question & Answer chain from the retriever
windowless_chain = RetrievalQAWithSourcesChain.from_chain_type(
    ChatOpenAI(temperature=0),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)

In [None]:
# Retrieval query appended to the vector-index lookup, which supplies `node`
# and `score` for every hit. For each hit it finds the longest window of up to
# one NEXT neighbour on either side and joins the window's chunk texts.
#
# The longest window is picked PER HIT: paths are ordered by length and the
# first one collected for each (node, score) group is kept. A plain
# `ORDER BY ... LIMIT 1` would apply to the whole result and leave a single row
# no matter how many hits the index returned. Rows are returned best score
# first.
retrieval_query_window = """
MATCH window=
    (:Chunk)-[:NEXT*0..1]->(node)-[:NEXT*0..1]->(:Chunk)
WITH node, score, window
  ORDER BY length(window) DESC
WITH node, score, head(collect(window)) as longestWindow
WITH nodes(longestWindow) as chunkList, node, score
  UNWIND chunkList as chunkRows
WITH collect(chunkRows.text) as textList, node, score
RETURN apoc.text.join(textList, " \n ") as text,
    score,
    node {.source},
    node {.source} AS metadata
  ORDER BY score DESC
"""

In [None]:

# Vector store that expands each hit with its neighbouring chunks through
# `retrieval_query_window`.
vector_store_window = Neo4jVector.from_existing_index(
    embedding=OpenAIEmbeddings(),
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    # Use the configured database, the same one `kg` connects to, instead of a
    # hard-coded name.
    database=NEO4J_DATABASE,
    index_name=VECTOR_INDEX_NAME,
    text_node_property=VECTOR_SOURCE_PROPERTY,
    retrieval_query=retrieval_query_window, # NEW!!!
)

# Create a retriever from the vector store
retriever_window = vector_store_window.as_retriever()

# Create a chatbot Question & Answer chain from the retriever
chain_window = RetrievalQAWithSourcesChain.from_chain_type(
    ChatOpenAI(temperature=0),
    chain_type="stuff",
    retriever=retriever_window,
    return_source_documents=True
)

In [None]:
# question = "What are some of the furniture products offered by Flexsteel Industries, Inc.?"
# question = "What is the featured component in most of Flexsteel's upholstered furniture?"
# question = "In which market segment does Flexsteel operate?"
# question = "What restructuring activities did Flexsteel complete during fiscal 2021?"
# question = "Where are Flexsteel's manufacturing facilities located?"
# question = "How does Flexsteel's manufacturing strategy contribute to its distribution strategy?"
# question = "How does Flexsteel ensure prompt delivery of quality component parts?"
# question = "What was the net sales attributable to the Residential segment in fiscal 2023?"
# question = "What was the net sales attributable to the Contract segment in fiscal 2022?"
# question = "How has Flexsteel's product offerings evolved over the past three fiscal years?"

# answer = windowless_chain(
#     {"question": question},
#     return_only_outputs=True,
# )
# # print("Full response:", answer)
# print(textwrap.fill(answer["answer"]))

# answer = chain_window(
#     {"question": question},
#     return_only_outputs=False,
# )
# print("Full response:", answer)
# print("Answer:", textwrap.fill(answer["answer"]))



Q1: What are some of the furniture products offered by Flexsteel Industries, Inc.?

A: Flexsteel Industries, Inc. offers a wide variety of furniture products
such as sofas, loveseats, chairs, reclining rocking chairs, swivel
rockers, sofa beds, convertible bedding units, occasional tables,
desks, dining tables and chairs, kitchen storage, bedroom furniture,
and outdoor furniture.

Q2: What is the featured component in most of Flexsteel's upholstered furniture?

A: A featured component in most of Flexsteel's upholstered furniture is a
unique steel drop-in seat spring.

Q3: In which market segment does Flexsteel operate?

A: Flexsteel operates in the residential furniture market segment.

Q4: What restructuring activities did Flexsteel complete during fiscal 2021?

A: During fiscal 2021, Flexsteel Industries, Inc. substantially completed
its restructuring activities related to the exit of its Vehicle
Seating and the remainder of its Hospitality product lines.

Q5: Where are Flexsteel's manufacturing facilities located?

A: Flexsteel's manufacturing facilities are located in Dublin, Georgia,
and Juarez, Mexico.

Q6: How does Flexsteel's manufacturing strategy contribute to its distribution strategy?

A: Flexsteel's manufacturing strategy contributes to its distribution
strategy by integrating manufactured products with finished products
acquired from offshore suppliers, allowing the company to offer a wide
range of price points, styles, and product categories to satisfy
customer requirements. This blended focus on products enhances the
company's competitive advantage in the furniture industry.

Q7: How does Flexsteel ensure prompt delivery of quality component parts?

A: Flexsteel ensures prompt delivery of quality component parts by
establishing relationships with key suppliers.

Q8: What was the net sales attributable to the Residential segment in fiscal 2023?

A: The net sales attributable to the Residential segment in fiscal 2023
were $393,692.

Q9: What was the net sales attributable to the Contract segment in fiscal 2022?

A: The net sales attributable to the Contract segment in fiscal 2022 were
$835 million.

Q10: How has Flexsteel's product offerings evolved over the past three fiscal years?

A: Flexsteel's product offerings have evolved over the past three fiscal
years. In fiscal 2020, the company substantially completed its exit
from the Commercial Office and custom design Hospitality product
lines. During fiscal 2021, the company substantially completed its
restructuring activities related to the exit of its Vehicle Seating
and the remainder of its Hospitality product lines. The net sales
attributable to each area of application for the past three fiscal
years are as follows:  - 2023: Residential - $393,692; Contract - $0 -
2022: Residential - $543,447; Contract - $835 - 2021: Residential -
$476,519; Contract - $2,406


In [None]:
# NLTK provides the BLEU and METEOR metrics used in the evaluation cell; the
# 'popular' collection is downloaded for the data those metrics presumably need.
!pip install nltk
!python -m nltk.downloader popular

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

In [None]:
# import pandas as pd
# from sklearn.metrics import f1_score, precision_score, recall_score
# import textwrap

# csv_path = "/content/queries.csv"
# data = pd.read_csv(csv_path)

# def process_queries_and_evaluate(data):
#     predictions = []
#     correct_answers = data['Correct Answer'].tolist()

#     for question in data['Query']:
#         response = windowless_chain({"question": question}, return_only_outputs=True)
#         predictions.append(response['answer'])
#         print('answer', response['answer'])

#     # Calculate metrics
#     precision = precision_score(correct_answers, predictions, average='micro', zero_division=0)
#     recall = recall_score(correct_answers, predictions, average='micro', zero_division=0)
#     f1 = f1_score(correct_answers, predictions, average='micro', zero_division=0)

#     return precision, recall, f1


In [None]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
import textwrap

# Evaluation set; the code below reads its 'Query' and 'Correct Answer' columns.
csv_path = "/content/queries.csv"
data = pd.read_csv(csv_path)

def calculate_bleu_score(reference, candidate):
    """Return the sentence-level BLEU score of `candidate` against `reference`.

    Both strings are tokenized by splitting on whitespace; the single
    reference is wrapped in a list because `sentence_bleu` takes a list of
    tokenized references.
    """
    return sentence_bleu([reference.split()], candidate.split())

def calculate_meteor_score(reference, candidate):
    """
    Compute the METEOR score of a candidate sentence against one reference.

    Args:
    reference (str): Reference sentence.
    candidate (str): Candidate sentence.

    Returns:
    The value returned by `meteor_score` for the whitespace-tokenized inputs.
    """
    # `meteor_score` is called with a list of tokenized references (just one
    # here) and a single tokenized candidate.
    return meteor_score([reference.split()], candidate.split())

def process_queries_and_evaluate(data, qa_chain=None):
    """Run every query through a QA chain and score the answers with BLEU and METEOR.

    Args:
        data: Table with a 'Query' column (questions) and a 'Correct Answer'
            column (reference answers), e.g. a pandas DataFrame.
        qa_chain: Callable invoked as
            `qa_chain({"question": q}, return_only_outputs=True)` that returns
            a mapping with an 'answer' key. Defaults to the module-level
            `windowless_chain`.

    Returns:
        A tuple `(bleu_scores, meteor_scores, average_bleu, average_meteor)`.
        The two lists hold one score per evaluated row; the averages are 0
        when no row was evaluated.
    """
    if qa_chain is None:
        qa_chain = windowless_chain

    bleu_scores = []
    meteor_scores = []
    correct_answers = data['Correct Answer'].tolist()

    for index, (question, correct_answer) in enumerate(zip(data['Query'], correct_answers)):
        # Skip anything that is not a non-blank string. This covers the NaN
        # floats pandas uses for empty cells as well as other non-text values,
        # which have no `.strip()`.
        if not isinstance(question, str) or not question.strip():
            print(f"Skipping invalid or empty question at index {index}")
            continue

        # Without a text reference there is nothing to score against, so skip
        # the row before spending a model call on it.
        if not isinstance(correct_answer, str):
            print(f"Skipping question at index {index}: missing reference answer")
            continue

        response = qa_chain({"question": question}, return_only_outputs=True)
        generated_answer = response['answer']
        print('Question:', question)
        print('Generated Answer:', generated_answer)
        print('Correct Answer:', correct_answer)

        # Calculate BLEU and METEOR scores. The locals are named bleu/meteor so
        # they do not shadow the imported `meteor_score` function.
        bleu = calculate_bleu_score(correct_answer, generated_answer)
        bleu_scores.append(bleu)

        meteor = calculate_meteor_score(correct_answer, generated_answer)
        meteor_scores.append(meteor)

        print('BLEU Score:', bleu)
        print('METEOR Score:', meteor)
        print()  # Add a blank line for readability

    # Calculate the average scores
    average_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
    average_meteor = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0

    print('Average BLEU Score:', average_bleu)
    print('Average METEOR Score:', average_meteor)

    return bleu_scores, meteor_scores, average_bleu, average_meteor


# Evaluate every query in the CSV and keep the per-question and average scores.
bleu_scores, meteor_scores, average_bleu, average_meteor = process_queries_and_evaluate(data)


Question: What are some of the furniture products offered by Flexsteel Industries, Inc.?
Generated Answer: Flexsteel Industries, Inc. offers a wide variety of furniture products such as sofas, loveseats, chairs, reclining rocking chairs, swivel rockers, sofa beds, convertible bedding units, occasional tables, desks, dining tables and chairs, kitchen storage, bedroom furniture, and outdoor furniture.

Correct Answer: Product offerings include a wide variety of furniture such as sofas, loveseats, chairs, reclining rocking chairs, swivel rockers, sofa beds, convertible bedding units, occasional tables, desks, dining tables and chairs, kitchen storage, bedroom furniture, and outdoor furniture.
BLEU Score: 0.8248765135255685
METEOR Score: 0.9670781893004116

Question: What is the featured component in most of Flexsteel's upholstered furniture?
Generated Answer: A featured component in most of Flexsteel's upholstered furniture is a unique steel drop-in seat spring.

Correct Answer: A feature