In [1]:
# System instruction for the model. It is sent as systemInstruction.parts[0].text
# in every request payload built in this notebook.
sys_prom = """
You are a documentation and language semantics expert. Your job is to breakdown process and data documents furnished by a user and re-organize their knowledge in an easily intelligible way. You start by breaking down documents into chunks of knowledge. Each one of the chunks is semantically atomic, i.e., its knowledge is limited to one and only one concept. A chunk is shorter than or as long as one sentence. The concept captured by a chunk is simple. A complex concept is broken down into multiple chunks. Chunks can overlap too. They are not always exhaustive and only important information is captured. They fall completely within the bounds of the information contained in the user provided documents. The user may also ask you to find semantically related chunks, i.e., top few most relevant chunks, for each chunk. You look at other chunks in the same paragraph and the neighbouring few sentences to find relevant chunks in short range. You also look many lines and pages back and ahead to find more relevant chunks in the long range. If the user furnishes multiple documents, you look for relevant chunks in the other documents too. You scan the entire document and all other documents to find relevant chunks for each chunk and order them by their relevance from most relevant to least relevant.
"""

In [2]:
# User prompt of the first (chunking) turn. The same text is also placed at the
# start of the follow-up payloads, ahead of the two attached files.
prom = """
Please break down the knowledge contained in the 2 documents (files) - a process document - optimization.txt - and a data document - schema_limited.txt - into chunks.
"""

In [3]:
# Cloud Storage URIs of the two text documents attached to every request
# (both live under the same bucket folder).
_kg_folder = "gs://p0s0a31/sao_chatbot/kg"
file1 = f"{_kg_folder}/optimization.txt"
file2 = f"{_kg_folder}/schema_limited.txt"

In [4]:
# Settings shared by the requests in this notebook.
rel_chnks = "10"  # used as "maxItems" of the "rel" array in the relation schemas
out_tok = 8192    # sent as generationConfig.maxOutputTokens
topP = 0.95       # sent as generationConfig.topP
temp = 0          # sent as generationConfig.temperature

In [5]:
# Response schema for the chunking request: an object whose "resp" member is an
# array of {"id": INTEGER, "txt": STRING} items, both fields required.
resp_sch = dict(
    type="OBJECT",
    properties=dict(
        resp=dict(
            type="ARRAY",
            description="a list of chunks",
            items=dict(
                description="a chunk characterized by an id and text",
                type="OBJECT",
                properties=dict(
                    id=dict(type="INTEGER", description="a unique id"),
                    txt=dict(
                        type="STRING",
                        description="the text of the chunk as in the user provided documents",
                    ),
                ),
                required=["id", "txt"],
            ),
        ),
    ),
    required=["resp"],
)

In [6]:
# Request body for the first (chunking) call: one user turn carrying the prompt
# and the two attached files, the system instruction, and a generation config
# that asks for JSON output conforming to resp_sch.
payload = {
    "contents": [
        {
            "role": "user",
            "parts": [
                {"text": prom},
                *(
                    {"fileData": {"mimeType": "text/plain", "fileUri": uri}}
                    for uri in (file1, file2)
                ),
            ],
        },
    ],
    "systemInstruction": {"parts": [{"text": sys_prom}]},
    "generationConfig": {
        "responseModalities": ["TEXT"],
        "temperature": temp,
        "maxOutputTokens": out_tok,
        "topP": topP,
        "responseMimeType": "application/json",
        "responseSchema": resp_sch,
    },
}

In [7]:
import google.auth
import google.auth.transport.requests

In [8]:
def load_gcloud_oauth_token():
    """Build an HTTP ``Authorization`` header from Application Default Credentials.

    Loads the default Google credentials, refreshes them to obtain an access
    token, and wraps that token in a bearer header.

    Returns:
        dict: ``{"Authorization": "Bearer <access token>"}``.
    """
    # Ask for the cloud-platform scope explicitly. Credential types that
    # require scopes cannot mint a usable token without one; credentials that
    # already carry their scopes are not affected by this argument.
    cred, _ = google.auth.default(
        scopes=["https://www.googleapis.com/auth/cloud-platform"]
    )
    # Freshly loaded credentials are not valid yet and have no token;
    # refresh() performs the token request and populates cred.token.
    cred.refresh(google.auth.transport.requests.Request())
    return {"Authorization": f"Bearer {cred.token}"}

In [9]:
from requests import request, models

In [10]:
# Endpoint of the generateContent method for the chosen publisher model, plus
# the bearer header used by every request cell below.
# NOTE(review): the "-exp" suffix suggests an experimental model id - confirm it is still served.
model = "gemini-2.0-flash-exp"
project = "wmt-mtech-assortment-ml-prod"
_api_root = "https://us-central1-aiplatform.googleapis.com/v1"
_model_path = f"projects/{project}/locations/us-central1/publishers/google/models/{model}"
gateway_url = f"{_api_root}/{_model_path}:generateContent"
header = load_gcloud_oauth_token()

In [11]:
# Send the chunking request.
# timeout=(connect, read) in seconds: without it `requests` waits forever on a
# stalled connection. The read timeout is generous because generation is slow.
response: models.Response = request(
    "POST",
    gateway_url,
    headers=header,
    json=payload,
    timeout=(10, 600),
)
# Fail here, with the HTTP status, instead of with a KeyError on "candidates"
# in the next cell when the service returned an error body.
response.raise_for_status()

In [12]:
# Keep the whole "content" object of the first candidate; it is appended
# unchanged to the "contents" of the follow-up requests and its
# parts[0].text is what gets parsed as JSON when writing the chunk file.
_chunk_candidates = response.json()["candidates"]
chunks = _chunk_candidates[0]["content"]

## second iteration

In [14]:
# Instruction text for the second turn: ask for the related chunks of each
# chunk produced by the first turn. The next cell re-binds `sec_prom` to a
# user-turn dict that wraps this string.
sec_prom = """
Consider the chunks generated in the previous step and find out the relevant chunks for each chunk.
"""

In [15]:
# Wrap the second-turn instruction text into a user turn, keeping the same
# variable name because later cells append `sec_prom` to payload["contents"].
# The isinstance guard makes the cell safe to re-run: without it a second
# execution would nest the already-built dict inside "text", which is not a
# valid text part.
if isinstance(sec_prom, str):
    sec_prom = {
        "role": "user",
        "parts": [{"text": sec_prom}],
    }

In [16]:
# Response schema for the relation request: "resp" is an array of
# {"id": INTEGER, "rel": [INTEGER, ...]} items. Only "id" is required, so an
# item may come back without "rel"; "rel" holds between "0" and rel_chnks ids.
resp_sch = dict(
    type="OBJECT",
    properties=dict(
        resp=dict(
            type="ARRAY",
            description="a list of chunk ids",
            items=dict(
                description="a chunk and relevant chunks ids",
                type="OBJECT",
                properties=dict(
                    id=dict(type="INTEGER", description="chunk id"),
                    rel=dict(
                        type="ARRAY",
                        description="a list of ids of semantically related chunks",
                        items=dict(
                            type="INTEGER",
                            description="id of a related chunk",
                        ),
                        minItems="0",
                        maxItems=rel_chnks,
                    ),
                ),
                required=["id"],
            ),
        ),
    ),
    required=["resp"],
)

In [17]:
# Request body for the second call. The conversation replays the first user
# turn (prompt + both files), then the first response content (`chunks`), then
# the second-turn instruction (`sec_prom`); output must conform to the
# relation schema currently bound to resp_sch.
payload = {
    "contents": [
        {
            "role": "user",
            "parts": [
                {"text": prom},
                *(
                    {"fileData": {"mimeType": "text/plain", "fileUri": uri}}
                    for uri in (file1, file2)
                ),
            ],
        },
        chunks,
        sec_prom,
    ],
    "systemInstruction": {"parts": [{"text": sys_prom}]},
    "generationConfig": {
        "responseModalities": ["TEXT"],
        "temperature": temp,
        "maxOutputTokens": out_tok,
        "topP": topP,
        "responseMimeType": "application/json",
        "responseSchema": resp_sch,
    },
}

In [18]:
# Send the relation (second-turn) request.
# timeout=(connect, read) in seconds: without it `requests` waits forever on a
# stalled connection. The read timeout is generous because generation is slow.
response: models.Response = request(
    "POST",
    gateway_url,
    headers=header,
    json=payload,
    timeout=(10, 600),
)
# Fail here, with the HTTP status, instead of with a KeyError on "candidates"
# in the next cell when the service returned an error body.
response.raise_for_status()

In [19]:
# Keep the whole "content" object of the first candidate of the relation
# response; its parts[0].text is parsed as JSON when the relation file is
# written, and the object itself is replayed in the third request.
_relation_candidates = response.json()["candidates"]
relevant = _relation_candidates[0]["content"]

## write json

In [21]:
import json

In [20]:
# response = """
# {\n  "resp": [\n    {\n      "id": 1,\n      "txt": "Store Assortment Optimization is an optimization and recommendation system.",\n       "rel": [2, 3]\n    },\n    {\n      "id": 2,\n      "txt": "It recommends which roll-ups and how many facings of each should be stocked in a particular fixture at any store.",\n       "rel": [1, 3, 4, 5, 9, 10, 11, 12, 13, 14]\n    },\n    {\n      "id": 3,\n      "txt": "A roll-up is equivalent to an item.",\n       "rel": [1, 2, 4, 7, 10, 11, 12, 13, 14, 15]\n    },\n    {\n      "id": 4,\n      "txt": "A facing is the space on a fixture occupied by an item, displayed with the label facing the customer.",\n       "rel": [2, 3, 5, 10, 11, 12, 13, 14]\n    },\n    {\n      "id": 5,\n      "txt": "A fixture is most commonly a shelf or other similar stock displays.",\n       "rel": [2, 4, 6, 10, 11, 12, 13, 14]\n    },\n    {\n      "id": 6,\n      "txt": "The objective of the optimization is to recommend items such that the productivity of fixtures across many different stores is maximized.",\n       "rel": [5, 7, 8, 9, 10, 11, 12, 13, 14]\n    },\n    {\n      "id": 7,\n      "txt": "Productivity is defined as the demand per inch.",\n       "rel": [3, 6, 8, 10, 11, 12, 13, 14, 38]\n    },\n    {\n      "id": 8,\n      "txt": "Each fixture has a limited width and maximizing the productivity translates to maximizing the demand, and consequently, the sales.",\n       "rel": [6, 7, 9, 10, 11, 12, 13, 14]\n    },\n    {\n      "id": 9,\n      "txt": "Each roll-up too has a specific width and demand.",\n       "rel": [2, 6, 10, 11, 12, 13, 14]\n    },\n    {\n      "id": 10,\n      "txt": "The higher the demand and smaller the width, the more productive it is considered.",\n       "rel": [2, 3, 4, 5, 6, 7, 8, 9, 11, 12]\n    },\n    {\n      "id": 11,\n      "txt": "The optimizer tries to recommend high-performing PoDs, i.e., those with higher productivity.",\n       "rel": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12]\n    },\n    {\n      
"id": 12,\n      "txt": "It considers the productivity of all possible substitutes for a roll-up in a store on a fixture and recommends the one with the highest productivity.",\n       "rel": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]\n    },\n    {\n      "id": 13,\n      "txt": "A PoD is an item-store pair.",\n       "rel": [2, 3, 4, 5, 6, 14, 15, 16, 17, 18]\n    },\n    {\n      "id": 14,\n      "txt": "However, the highest performing PoDs may not always make the list of recommendations.",\n       "rel": [2, 3, 4, 5, 6, 13, 15, 16, 17, 18]\n    },\n    {\n      "id": 15,\n      "txt": "This may happen because of certain rules.",\n       "rel": [3, 13, 14, 16, 17, 18, 19, 20, 21, 22]\n    },\n    {\n      "id": 16,\n      "txt": "The optimization is constrained by various rules - hard and soft.",\n       "rel": [13, 14, 15, 17, 18, 19, 20, 21, 22]\n    },\n    {\n      "id": 17,\n      "txt": "A hard rule is one which when met forces the system to ignore all other rules.",\n       "rel": [13, 14, 15, 16, 18, 19, 20, 21, 22]\n    },\n    {\n      "id": 18,\n      "txt": "A soft rule is one which can be satisfied along with other soft rules.",\n       "rel": [13, 14, 15, 16, 17, 19, 20, 21, 22]\n    },\n    {\n      "id": 19,\n      "txt": "Such rules urge the optimizer to explore lower-performing PoDs too, over ones with the highest performance, in many cases.",\n       "rel": [15, 16, 17, 18, 20, 21, 22]\n    },\n    {\n      "id": 20,\n      "txt": "An example of a soft rule would be the Pack-and-Half (PnH) constraint.",\n       "rel": [15, 16, 17, 18, 19, 21, 22, 23, 24, 25]\n    },\n    {\n      "id": 21,\n      "txt": "The rule penalizes any roll-ups at any store for which the recommended stock quantity falls short of the number of units in one and a half of its packs.",\n       "rel": [15, 16, 17, 18, 19, 20, 22, 23, 24, 25]\n    },\n    {\n      "id": 22,\n      "txt": "The rule doesn\'t apply to PoDs which do not fulfil the PnH constraint on the current mod.",\n    
   "rel": [15, 16, 17, 18, 19, 20, 21, 23, 24, 25]\n    },\n    {\n      "id": 23,\n      "txt": "The stock quantity can be calculated as the facings times the capacity of a facing.",\n       "rel": [20, 21, 22, 24, 25, 26]\n    },\n    {\n      "id": 24,\n      "txt": "The pack size for each roll-up can be different.",\n       "rel": [20, 21, 22, 23, 25]\n    },\n    {\n      "id": 25,\n      "txt": "Another example of a soft rule is the Days-of-Supply (DoS) constraint.",\n       "rel": [20, 21, 22, 23, 24, 26, 27, 28]\n    },\n    {\n      "id": 26,\n      "txt": "The rule favours recommendations with a reasonable number of days of supply.",\n       "rel": [23, 25, 27, 28]\n    },\n    {\n      "id": 27,\n      "txt": "Too small or too large a number demotes a PoD in the recommendations list.",\n       "rel": [25, 26, 28]\n    },\n    {\n      "id": 28,\n      "txt": "Days of Supply is calculated as the stock quantity by demand per day for the roll-up at the store.",\n       "rel": [25, 26, 27]\n    },\n    {\n      "id": 29,\n      "txt": "A hard rule is one of the global constraints which renders a roll-up ineligible or a protect strategy which mandates a roll-up stay on mod.",\n       "rel": [17, 30, 31, 32, 33, 34, 35, 36, 37]\n    },\n    {\n      "id": 30,\n      "txt": "The recommendation, also called status, for a roll-up at a store can be an \'add\', \'delete\', \'maintain\' or \'not added\'.",\n       "rel": [29, 31, 32, 33, 34, 35, 36, 37]\n    },\n    {\n      "id": 31,\n      "txt": "An optimzation system run is usually carried out for a scenario, also called a relay.",\n       "rel": [29, 30, 32, 33, 34, 35, 36, 37]\n    },\n    {\n      "id": 32,\n      "txt": "Merchants decide the universe, i.e., the roll-ups and stores for a relay.",\n       "rel": [29, 30, 31, 33, 34, 35, 36, 37]\n    },\n    {\n      "id": 33,\n      "txt": "A set of PoDs, a subset of this universe is called a mod or modular.",\n       "rel": [29, 30, 31, 32, 34, 35, 36, 37]\n  
  },\n    {\n      "id": 34,\n      "txt": "The set of PoDs which are currently stocked is called the current mode while the set of PoDs, recommended by the optimizer is the recommended mod.",\n       "rel": [29, 30, 31, 32, 33, 35, 36, 37]\n    },\n    {\n      "id": 35,\n      "txt": "An \'add\' recommendation for a PoD means that its new, not present in the current mod, and it should be added to the mod.",\n       "rel": [29, 30, 31, 32, 33, 34, 36, 37]\n    },\n    {\n      "id": 36,\n      "txt": "A \'delete\' recommendation for a PoD means it should be removed from the mod.",\n       "rel": [29, 30, 31, 32, 33, 34, 35, 37]\n    },\n    {\n      "id": 37,\n      "txt": "A \'maintain\' recommendation for a PoD simply means that the PoD is not new, i.e., it is not added to the mod and that it should not be removed from the mod, i.e., not be deleted.",\n       "rel": [29, 30, 31, 32, 33, 34, 35, 36]\n    },\n    {\n      "id": 38,\n      "txt": "A \'maintain\' recommendation can mean a change in the number of facings for PoD.",\n       "rel": [7, 39, 40, 41]\n    },\n    {\n      "id": 39,\n      "txt": "The number of facings can increase, and the item is said to have expanded in the store, while a decrease in the number of facings is referred to as a reduction or contraction of the item in the store.",\n       "rel": [38, 40, 41]\n    },\n    {\n      "id": 40,\n      "txt": "It is not necessary for the number of facings to change for a  maintained PoD.",\n       "rel": [38, 39, 41]\n    },\n    {\n      "id": 41,\n      "txt": "A \'not added\' recommendation for a PoD means that while it was considered it could not make the list of added PoDs, because it failed to satisfy one or more rules.",\n       "rel": [38, 39, 40]\n    },\n    {\n      "id": 42,\n      "txt": "All the recommendations - add, not added, delete, maintain and expand or contract - are governed by the optimization system.",\n       "rel": [43, 44, 45, 46, 47, 48, 49, 50, 51, 52]\n    },\n    
{\n      "id": 43,\n      "txt": "The optimizer is a complex system and it can give some recommendations which are intuitive to the merchants and other recommendations which may need explanation.",\n       "rel": [42, 44, 45, 46, 47, 48, 49, 50, 51, 52]\n    },\n    {\n      "id": 44,\n      "txt": "The intuitive recommendations are the \'add\' and \'delete\' PoD recommendations for the PoDs which meet a criterion in each case.",\n       "rel": [42, 43, 45, 46, 47, 48, 49, 50, 51, 52]\n    },\n    {\n      "id": 45,\n      "txt": "If the PoD falls in the bottom 5% productivity bracket and its deletion is recommended, it is considered intuitive.",\n       "rel": [42, 43, 44, 46, 47, 48, 49, 50, 51, 52]\n    },\n    {\n      "id": 46,\n      "txt": "Similarly, if the PoD falls in the top 5% productivity bracket and its addition is recommended, it is considered intuitive too.",\n       "rel": [42, 43, 44, 45, 47, 48, 49, 50, 51, 52]\n    },\n    {\n      "id": 47,\n      "txt": "The reasons for recommendations which need explanation are listed as follows.",\n       "rel": [42, 43, 44, 45, 46, 48, 49, 50, 51, 52]\n    },\n    {\n      "id": 48,\n      "txt": "A PoD deletion might be recommended because: 1. It is part of a global constraint (\'do not carry list\' or \'global deletes\') that renders the item ineligible.",\n       "rel": [42, 43, 44, 45, 46, 47, 49, 50, 51, 52]\n    },\n    {\n      "id": 49,\n      "txt": "A PoD deletion might be recommended because: 2. It has a violation in DoS or PnH constraints - \'DoS / PnH Violation\'.",\n       "rel": [42, 43, 44, 45, 46, 47, 48, 50, 51, 52]\n    },\n    {\n      "id": 50,\n      "txt": "A PoD deletion might be recommended because: 3. At least one higher-performing substitute is recommended - \'Better substitute recommended\'.",\n       "rel": [42, 43, 44, 45, 46, 47, 48, 49, 51, 52]\n    },\n    {\n      "id": 51,\n      "txt": "A PoD deletion might be recommended because: 4. 
Its minimum facing width is greater than total allocated space for the shelf - \'Item cannot fit on the shelf\'.",\n       "rel": [42, 43, 44, 45, 46, 47, 48, 49, 50, 52]\n    },\n    {\n      "id": 52,\n      "txt": "A PoD deletion might be recommended because: 5. It does not satisfy 95% space utilization, when the shelf fits only one average-width item - \'Width violates the utilization\'.",\n       "rel": [42, 43, 44, 45, 46, 47, 48, 49, 50, 51]\n    },\n    {\n      "id": 53,\n      "txt": "A PoD deletion might be recommended because: 6. There are no adds or maintains below (in rank) this delete - \'More productive items recommended / expanded\'.",\n       "rel": [42, 43, 44, 45, 46, 47, 48, 49, 50, 51]\n    },\n    {\n      "id": 54,\n      "txt": "A PoD deletion might be recommended because: 7. Meeting pack and half constraint would require a high facing count so that this item becomes less productive than items added/ maintained below (in rank) - \'Not productive when required to meet PnH\'.",\n       "rel": [42, 43, 44, 45, 46, 47, 48, 49, 50, 51]\n    },\n    {\n      "id": 55,\n      "txt": "A PoD deletion might be recommended because: 8. The addition of its width is insufficient for the rest of on-mod recommendations to utilize 95% of the space - \'Width not sufficient to adhere utilization\'.",\n       "rel": [42, 43, 44, 45, 46, 47, 48, 49, 50, 51]\n    },\n    {\n      "id": 56,\n      "txt": "A PoD addition might be recommended because: 1. It is part of a protect strategy (\'protect list\' or \'top %ile rule\').",\n       "rel": [42, 43, 44, 45, 46, 47, 48, 49, 50, 51]\n    },\n    {\n      "id": 57,\n      "txt": "A PoD addition might be recommended because: 2. It has no substitutes on the mod - \'Covers a need state\'.",\n       "rel": [42, 43, 44, 45, 46, 47, 48, 49, 50, 51]\n    },\n    {\n      "id": 58,\n      "txt": "A PoD addition might be recommended because: 3. 
It is in a shelf where all higher-performing items max out on facing - \'No better item add/ expand\'.",\n       "rel": [42, 43, 44, 45, 46, 47, 48, 49, 50, 51]\n    },\n    {\n      "id": 59,\n      "txt": "\'Facing max out\' is determined in two ways: i. already reaching the optimal facing in terms of nominal demand increments, or ii. reaching the maximum facing without incurring DoS or PnH penalties.",\n       "rel": [42, 43, 44, 45, 46, 47, 48, 49, 50, 51]\n    },\n    {\n      "id": 60,\n      "txt": "This also includes items with a \'not added\' status, where adding any number of facings would lead to violations in DoS or PnH.",\n       "rel": [42, 43, 44, 45, 46, 47, 48, 49, 50, 51]\n    },\n    {\n      "id": 61,\n      "txt": "A PoD addition might be recommended because: 4. In a second-iteration calculation, with the additional space provide by the intuitive deletes, there is extra space to fill and this item now falls within the newly calculated top percentage items that the shelf can fit - \'Space available for additional productive items\'.",\n       "rel": [42, 43, 44, 45, 46, 47, 48, 49, 50, 51]\n    },\n    {\n      "id": 62,\n      "txt": "There are explanations for expansions and contractions, i.e., facings change too.",\n       "rel": [38, 39, 40, 41]\n    },\n    {\n      "id": 63,\n      "txt": "There are few key metrics used to measure the effectiveness of a recommendation - \'Units lift\', \'sales lift\', \'quality\', \'DoS\', \'outlier add/ delete ratio\', \'space utilization\', \'facings change\', \'PoD change\'.",\n       "rel": [64, 65, 66, 67, 68, 69, 70, 71, 72, 73]\n    },\n    {\n      "id": 64,\n      "txt": "Units lift is defined as the percentage change from units base forecast to recommended units.",\n       "rel": [63, 65, 66, 67, 68, 69, 70, 71, 72, 73]\n    },\n    {\n      "id": 65,\n      "txt": "Sales lift is percentage change from sales base forecast to recommended dollar sales.",\n       "rel": [63, 64, 66, 67, 68, 69, 70, 
71, 72, 73]\n    },\n    {\n      "id": 66,\n      "txt": "The units lift and sales lift are collectively referred to as the incrementality of a recommendation.",\n       "rel": [63, 64, 65, 67, 68, 69, 70, 71, 72, 73]\n    },\n    {\n      "id": 67,\n      "txt": "Quality is the number of the intuitive recommendations as a percentage of all the PoDs/ recommendations.",\n       "rel": [63, 64, 65, 66, 68, 69, 70, 71, 72, 73]\n    },\n    {\n      "id": 68,\n      "txt": "DoS is Days of Supply already defined earlier.",\n       "rel": [63, 64, 65, 66, 67, 69, 70, 71, 72, 73]\n    },\n    {\n      "id": 69,\n      "txt": "Outliers are defined as the added PoDs which have a demand contribution outside the top 80% or the deleted PoDs which have a demand contribution within the top 80%.",\n       "rel": [63, 64, 65, 66, 67, 68, 70, 71, 72, 73]\n    },\n    {\n      "id": 70,\n      "txt": "Outlier add/ delete ratio is defined as the ratio number of outliers to total number of add or delete recommendations.",\n       "rel": [63, 64, 65, 66, 67, 68, 69, 71, 72, 73]\n    },\n    {\n      "id": 71,\n      "txt": "Space utilization is defined as the total linear inches of shelves occupied by a set of PoDs.",\n       "rel": [63, 64, 65, 66, 67, 68, 69, 70, 72, 73]\n    },\n    {\n      "id": 72,\n      "txt": "Facings change is only defined for PoDs with a maintain status.",\n       "rel": [63, 64, 65, 66, 67, 68, 69, 70, 71, 73]\n    },\n    {\n      "id": 73,\n      "txt": "It is simply the difference between the number of facings in the current mod and the recommended number of facings.",\n       "rel": [63, 64, 65, 66, 67, 68, 69, 70, 71, 72]\n    },\n    {\n      "id": 74,\n      "txt": "PoD change is the difference between the number of PoDs in the current mod and the recommended number of PoDs.",\n       "rel": [63, 75]\n    },\n    {\n      "id": 75,\n      "txt": "It can only be computed at levels above PoD, e.g., roll-up, store, brand, category, relay, etc.",\n      
 "rel": [74]\n    },\n    {\n      "id": 76,\n      "txt": "Among all the other metrics, quality and outlier add/ delete ratio also can only be computed at levels higher than PoD.",\n       "rel": [67, 70]\n    },\n    {\n      "id": 77,\n      "txt": "Units and sales lift or incementality, DoS, space utilization and facings change can be computed for each PoD, as well as higher levels, i.e., aggregated.",\n       "rel": [64, 65, 66, 68, 71, 73]\n    },\n    {\n      "id": 78,\n      "txt": "Relay_Scenario_Name is optimization run unique ID",\n      "rel": []\n    },\n    {\n      "id": 79,\n      "txt": "modlr_dept_nbr is mod department number",\n      "rel": []\n    },\n    {\n      "id": 80,\n      "txt": "modlr_catg_nbr is mod category number",\n      "rel": []\n    },\n    {\n      "id": 81,\n      "txt": "planogram_or_store is planogram unique ID (in case of a planogram optimization run) or store number",\n      "rel": []\n    },\n    {\n      "id": 82,\n      "txt": "fixture_type is roll-up grouping (a physical display case, e.g., shelf, etc.)",\n      "rel": [5]\n    },\n    {\n      "id": 83,\n      "txt": "total_comp_shelf_width is total shelf width in inches",\n      "rel": [8]\n    },\n    {\n      "id": 84,\n      "txt": "rollup_id is roll-up unique ID",\n      "rel": [3]\n    },\n    {\n      "id": 85,\n      "txt": "status_name is recommendation - can be \'add\', \'delete\', \'maintain\' or \'not added\'",\n      "rel": [30]\n    },\n    {\n      "id": 86,\n      "txt": "rec_facings is recommended number of facings for the roll-up at a store in the proposed mod",\n      "rel": [2, 4, 38, 39, 40, 41]\n    },\n    {\n      "id": 87,\n      "txt": "cur_facings is number of facings for the roll-up in the current mod",\n      "rel": [2, 4, 38, 39, 40, 41]\n    },\n    {\n      "id": 88,\n      "txt": "facing_width is facing width in inches",\n      "rel": [4, 9]\n    },\n    {\n      "id": 89,\n      "txt": "on_mod_subs_count is number of substitutes of 
the roll-up present on the current mod",\n      "rel": [12]\n    },\n    {\n      "id": 90,\n      "txt": "demand_transferred_to_subs_on_mod is fractional demand transferred to substitutes in case of a \'delete\' recommendation",\n      "rel": [36]\n    },\n    {\n      "id": 91,\n      "txt": "total_subs_count is overall total number of substitutes of the roll-up",\n      "rel": [12]\n    },\n    {\n      "id": 92,\n      "txt": "linear_nominal_dem is demand (units)",\n      "rel": [7]\n    },\n    {\n      "id": 93,\n      "txt": "do_not_carry_list is indicator of a global constraint",\n      "rel": [29, 48]\n    },\n    {\n      "id": 94,\n      "txt": "global_deletes is indicator of a global constraint",\n      "rel": [29, 48]\n    },\n    {\n      "id": 95,\n      "txt": "protect_list is indicator of a protect strategy",\n      "rel": [29, 56]\n    },\n    {\n      "id": 96,\n      "txt": "max_on_max_dos_constraint_75p is second level ceiling constraint for DoS",\n      "rel": [25, 26, 27, 28]\n    },\n    {\n      "id": 97,\n      "txt": "max_dos_constraint_50p is first level ceiling constraint for DoS",\n      "rel": [25, 26, 27, 28]\n    },\n    {\n      "id": 98,\n      "txt": "min_dos_constraint_1p is floor constraint for DoS",\n      "rel": [25, 26, 27, 28]\n    },\n    {\n      "id": 99,\n      "txt": "dos_rec_facings is calculated DoS at recommended facings",\n      "rel": [25, 26, 27, 28]\n    },\n    {\n      "id": 100,\n      "txt": "dos_facings_horizontal is  ",\n      "rel": [25, 26, 27, 28]\n    },\n    {\n      "id": 101,\n      "txt": "facings_needed_for_pnh is facings needed to satisfy PnH constraint",\n      "rel": [20, 21, 22]\n    },\n    {\n      "id": 102,\n      "txt": "dos_pnh_facings is calculated DoS at facings equal to facings_needed_for_pnh",\n      "rel": [20, 21, 22]\n    },\n    {\n      "id": 103,\n      "txt": "pnh_rec_facings is stock quantity at recommended facings expressed as a multiple of PnH",\n      "rel": [20, 21, 22]\n 
   },\n    {\n      "id": 104,\n      "txt": "pnh_cur_facings is stock quantity at current facings expressed as a multiple of PnH",\n      "rel": [20, 21, 22]\n    },\n    {\n      "id": 105,\n      "txt": "forecasted_units is units forecast",\n      "rel": [64]\n    },\n    {\n      "id": 106,\n      "txt": "forecast_revenue is reveneue forecast",\n      "rel": [65]\n    },\n    {\n      "id": 107,\n      "txt": "lower_bound_facing_count is minimum number of allowed facings",\n      "rel": [4]\n    },\n    {\n      "id": 108,\n      "txt": "upper_bound_facing_count is maximum number of allowed facings",\n      "rel": [4]\n    },\n    {\n      "id": 109,\n      "txt": "store_costs_pct is  ",\n      "rel": []\n    },\n    {\n      "id": 110,\n      "txt": "lost_sales_pct is  ",\n      "rel": []\n    },\n    {\n      "id": 111,\n      "txt": "sum_nominal_dem is  ",\n      "rel": []\n    },\n    {\n      "id": 112,\n      "txt": "nominal_demand_facing is ? this is the cutting point for the nominal demand curve",\n      "rel": []\n    },\n    {\n      "id": 113,\n      "txt": "nominal_demand_per_inch is demand by width (also called productivity)",\n      "rel": [7]\n    }]}
# """

In [22]:
# Save the chunk list from the first response as pretty-printed JSON.
# Mode "w", not "a": appending on a re-run leaves two JSON documents in the
# file, and reading the file back as a single JSON document then fails.
with open("../resources/knowledge/chunks_v5.json", "w") as f:
    json.dump(json.loads(chunks["parts"][0]["text"]), f, indent=4)

In [23]:
# Save the relation list from the second response as pretty-printed JSON.
# Mode "w", not "a": appending on a re-run leaves two JSON documents in the
# file, and reading the file back as a single JSON document then fails.
with open("../resources/knowledge/relevant_v5.json", "w") as f:
    json.dump(json.loads(relevant["parts"][0]["text"]), f, indent=4)

## combine chunks and relations

In [24]:
# Load the saved chunk document back from disk.
with open("../resources/knowledge/chunks_v5.json", "r") as chunks_file:
    chunks_ = json.load(chunks_file)

In [25]:
# Load the saved relation document back from disk.
with open("../resources/knowledge/relevant_v5.json", "r") as relevant_file:
    relevant_ = json.load(relevant_file)

In [26]:
# Index the relations by chunk id, then attach them to the chunk texts.
# The relation schema marks only "id" as required, so an entry may have no
# "rel" key, and the model may omit a chunk altogether; both cases fall back
# to an empty relation list instead of raising KeyError.
# Note: `relevant_` is re-bound from the parsed document to an id -> ids map,
# so this cell can only run once per load of the relation file.
relevant_ = {
    entry["id"]: entry.get("rel", []) for entry in relevant_["resp"]
}
nodes = {
    chunk["id"]: {"txt": chunk["txt"], "rel": relevant_.get(chunk["id"], [])}
    for chunk in chunks_["resp"]
}

In [27]:
# Save the combined graph (chunk id -> text and related ids) as JSON.
# Mode "w", not "a": appending on a re-run would leave several concatenated
# JSON documents in the file, which is not valid JSON.
with open("../resources/knowledge/graph_v5.json", "w") as f:
    json.dump(nodes, f, indent=4)

#### check if there are any one-way relations

In [28]:
# Print every relation a -> b for which b does not list a in return.
for id_, chunk in nodes.items():
    for id__ in chunk["rel"]:
        target = nodes.get(id__)
        if target is None:
            # Related ids come from the model and are not validated against
            # the chunk ids, so one may not exist as a node; report it rather
            # than abort the whole check with a KeyError.
            print(id_, id__, "(unknown chunk id)")
        elif id_ not in target["rel"]:
            print(id_, id__)

42 30
45 48
50 21
50 28
66 39
83 31
89 3
139 66


## third iteration

In [35]:
# Instruction text for the third turn: ask the model to extend the relation
# lists of the previous turn while keeping the existing entries. The next
# cell re-binds `thrd_prom` to a user-turn dict that wraps this string.
thrd_prom = """
Reconsider the list of related chunks generated in the previous step. Find out if any more chunks are relevant for each chunk, in short range, long range or in other documents. Keep all the related ones generated in the previous step.
"""

In [36]:
# Wrap the third-turn instruction text into a user turn, keeping the same
# variable name because a later cell appends `thrd_prom` to payload["contents"].
# The isinstance guard makes the cell safe to re-run: without it a second
# execution would nest the already-built dict inside "text", which is not a
# valid text part.
if isinstance(thrd_prom, str):
    thrd_prom = {
        "role": "user",
        "parts": [{"text": thrd_prom}],
    }

In [37]:
# Upper bound on related ids per chunk for the third pass; used as "maxItems"
# of the "rel" array in the schema built in the next cell. Same value as the
# one assigned in the settings cell near the top of the notebook.
rel_chnks = "10"

In [38]:
# Response schema for the third (relation refinement) request: "resp" is an
# array of {"id": INTEGER, "rel": [INTEGER, ...]} items. Only "id" is
# required, and "rel" holds at most rel_chnks ids. The chunk text is not part
# of this schema.
resp_sch = dict(
    type="OBJECT",
    properties=dict(
        resp=dict(
            type="ARRAY",
            description="a list of chunk ids",
            items=dict(
                description="a chunk and relevant chunks ids",
                type="OBJECT",
                properties=dict(
                    id=dict(type="INTEGER", description="chunk id"),
                    rel=dict(
                        type="ARRAY",
                        description="a list of ids of semantically related chunks",
                        items=dict(
                            type="INTEGER",
                            description="id of a related chunk",
                        ),
                        maxItems=rel_chnks,
                    ),
                ),
                required=["id"],
            ),
        ),
    ),
    required=["resp"],
)

In [39]:
# Request body for the third call. The conversation replays, in order: the
# first user turn (prompt + both files), the first response content
# (`chunks`), the second-turn instruction (`sec_prom`), the second response
# content (`relevant`) and finally the third-turn instruction (`thrd_prom`).
payload = {
    "contents": [
        {
            "role": "user",
            "parts": [
                {"text": prom},
                *(
                    {"fileData": {"mimeType": "text/plain", "fileUri": uri}}
                    for uri in (file1, file2)
                ),
            ],
        },
        chunks,
        sec_prom,
        relevant,
        thrd_prom,
    ],
    "systemInstruction": {"parts": [{"text": sys_prom}]},
    "generationConfig": {
        "responseModalities": ["TEXT"],
        "temperature": temp,
        "maxOutputTokens": out_tok,
        "topP": topP,
        "responseMimeType": "application/json",
        "responseSchema": resp_sch,
    },
}

In [41]:
# Send the relation refinement (third-turn) request.
# timeout=(connect, read) in seconds: without it `requests` waits forever on a
# stalled connection. The read timeout is generous because generation is slow.
response: models.Response = request(
    "POST",
    gateway_url,
    headers=header,
    json=payload,
    timeout=(10, 600),
)
# Fail here, with the HTTP status, instead of with a KeyError on "candidates"
# in the next cell when the service returned an error body.
response.raise_for_status()

In [42]:
# Keep the whole "content" object of the first candidate of the third
# response; its parts[0].text is parsed as JSON when the file is written.
_refined_candidates = response.json()["candidates"]
relevant_2 = _refined_candidates[0]["content"]

## write json

In [46]:
# Save the relation list from the third response as pretty-printed JSON.
# Mode "w", not "a": appending on a re-run leaves two JSON documents in the
# file, and reading the file back as a single JSON document then fails.
with open("../resources/relevant2_lim_v4.json", "w") as f:
    json.dump(json.loads(relevant_2["parts"][0]["text"]), f, indent=4)

In [47]:
# Load the saved third-pass relation document back from disk.
with open("../resources/relevant2_lim_v4.json", "r") as relevant2_file:
    relevant2_ = json.load(relevant2_file)

In [53]:
# Index the third-pass relations by chunk id. The relation schema marks only
# "id" as required, so an entry without "rel" is treated as having no
# relations instead of raising KeyError.
# Note: `relevant2_` is re-bound from the parsed document to an id -> ids map,
# so this cell can only run once per load of the file.
relevant2_ = {
    entry["id"]: entry.get("rel", []) for entry in relevant2_["resp"]
}

#### check if anything was added to relevant chunks list

In [57]:
# Compare the relations of the previous pass (`relevant_`, id -> ids) with
# those of the third pass (`relevant2_`). The baseline has to be `relevant_`:
# iterating `relevant2_` compares every list with itself, so nothing is ever
# reported as added or deleted.
for id_, rel in relevant_.items():
    # A chunk missing from the third pass counts as having no relations.
    rel2 = relevant2_.get(id_, [])
    deleted = list(set(rel) - set(rel2))
    if deleted:
        # The third prompt asks to keep all earlier relations; say which chunk
        # lost some before stopping, rather than ending the loop silently.
        print("id:", id_, "deleted:", deleted, sep=" ")
        break
    print("id:", id_, "added:", list(set(rel2) - set(rel)), sep=" ")

id: 1 added: []
id: 2 added: [86, 87]
id: 3 added: [84]
id: 4 added: [88]
id: 5 added: [82]
id: 6 added: [10, 95]
id: 7 added: [95]
id: 8 added: [95]
id: 9 added: [88]
id: 10 added: [95]
id: 11 added: [95]
id: 12 added: [50, 122]
id: 13 added: []
id: 14 added: []
id: 15 added: []
id: 16 added: [29]
id: 17 added: [90, 91]
id: 18 added: [29]
id: 19 added: []
id: 20 added: []
id: 21 added: [100]
id: 22 added: []
id: 23 added: [109]
id: 24 added: []
id: 25 added: [100]
id: 26 added: []
id: 27 added: []
id: 28 added: [68]
id: 29 added: [16, 92]
id: 30 added: [85]
id: 31 added: [78]
id: 32 added: []
id: 33 added: []
id: 34 added: []
id: 35 added: []
id: 36 added: []
id: 37 added: []
id: 38 added: [72, 73]
id: 39 added: [134, 62]
id: 40 added: []
id: 41 added: []
id: 42 added: []
id: 43 added: []
id: 44 added: []
id: 45 added: [98]
id: 46 added: [98]
id: 47 added: []
id: 48 added: [90, 91]
id: 49 added: [100]
id: 50 added: [122, 12]
id: 51 added: []
id: 52 added: []
id: 53 added: []
id: 54 ad

## combine chunks and relations

In [60]:
# Rebuild the graph with the third-pass relations. A chunk that the third
# pass did not return gets an empty relation list instead of raising KeyError.
# NOTE(review): chunks_ is loaded from chunks_v5.json while relevant2_ comes
# from relevant2_lim_v4.json - confirm both describe the same chunk ids.
nodes = {
    chunk["id"]: {"txt": chunk["txt"], "rel": relevant2_.get(chunk["id"], [])}
    for chunk in chunks_["resp"]
}

In [62]:
# Save the third-pass graph (chunk id -> text and related ids) as JSON.
# Mode "w", not "a": appending on a re-run would leave several concatenated
# JSON documents in the file, which is not valid JSON.
with open("../resources/graph_lim_v4.json", "w") as f:
    json.dump(nodes, f, indent=4)

#### check if there are any one-way relations

In [63]:
# Print every relation a -> b for which b does not list a in return.
for id_, chunk in nodes.items():
    for id__ in chunk["rel"]:
        target = nodes.get(id__)
        if target is None:
            # Related ids come from the model and are not validated against
            # the chunk ids, so one may not exist as a node; report it rather
            # than abort the whole check with a KeyError.
            print(id_, id__, "(unknown chunk id)")
        elif id_ not in target["rel"]:
            print(id_, id__)

6 10
18 29
104 84


#### check if there are self-relations

In [64]:
# Print each occurrence of a chunk listing its own id among its relations.
for node_id, node in nodes.items():
    self_refs = [rel_id for rel_id in node["rel"] if rel_id == node_id]
    for rel_id in self_refs:
        print(node_id, rel_id)