In [1]:
import sys
import os
import openai

In [2]:
from openai.embeddings_utils import get_embedding

In [3]:
# Read the API key from the environment instead of hard-coding it;
# a KeyError is raised here if OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [4]:
from pathlib import Path
import re
import pandas as pd

In [5]:
# os.path.abspath("") resolves to the current working directory.
notebook_path = Path(os.path.abspath(""))
# The "resources" folder is looked up two directory levels above that location.
resource_path = notebook_path.parent.parent / "resources"

In [6]:
def get_sail_from_file(file_path):
    """Return the SAIL expression stored in an ``.xml`` or ``.txt`` resource file.

    Args:
        file_path: ``pathlib.Path``-like object; its ``suffix`` selects the format.

    Returns:
        For ``.xml``: the text between the first ``<definition>`` tag and the
        nearest following ``</definition>`` (XML entities are left as-is).
        For ``.txt``: the whole file content.
        For any other suffix: ``None``.

    Raises:
        ValueError: if an ``.xml`` file contains no ``<definition>`` element.
    """
    if file_path.suffix == ".xml":
        with open(file_path, "r") as infile:
            xml = infile.read()
        # [\s\S] matches any character including newlines; the lazy quantifier
        # stops at the first closing tag.
        match = re.search(r"<definition>([\s\S]*?)</definition>", xml)
        if match is None:
            raise ValueError(f"No <definition> element found in {file_path}")
        return match.group(1)
    if file_path.suffix == ".txt":
        with open(file_path, "r") as infile:
            return infile.read()
    # Unsupported suffix: keep signalling "no result" with None, as before.
    return None

In [9]:
# Build one row per readable resource file: its name and its SAIL text.
resource_files = resource_path.glob("*")
file_dict = {
    "filename": [],
    "sail": []
}
for file in resource_files:
    # glob("*") also yields sub-directories; only regular files can be read.
    if not file.is_file():
        continue
    sail_text = get_sail_from_file(file)
    # get_sail_from_file returns None for suffixes it does not handle;
    # calling .replace on that would raise, so such files are skipped.
    if sail_text is None:
        continue
    file_dict["filename"].append(file.name)
    file_dict["sail"].append(sail_text.replace("\r", "\n"))
df = pd.DataFrame(file_dict)
df

Unnamed: 0,filename,sail
0,PHQ_getRequestBodyForAggregateData.xml,"a!localVariables(\n local!timeRanges: #""_a-00..."
1,PHQ_getJsonFromFilters.xml,"#""SYSTEM_SYSRULES_toJson_v1""(\n {\n logSel..."
2,AS_VM_FM_profileDetailsPrimary.txt,a!localVariables(\n local!vendorBusinessTypes...
3,PHQ_getAttributeData.xml,"a!localVariables(\n local!data: #""SYSTEM_SYSR..."
4,PHQ_topLevel.xml,"a!localVariables(\n local!triggerReset,\n lo..."
5,PHQ_getCycleTimeOverTime.xml,"a!localVariables(\n local!response: #""0f4f6d0..."
6,PHQ_getOverallCycleTime.xml,"a!localVariables(\n local!response: #""0f4f6d0..."
7,AS_VM_FM_profileDetailsVendorDetails.xml,"{\n #""SYSTEM_SYSRULES_richTextDisplayField""(\..."
8,PHQ_getLastEndTime.xml,"a!localVariables(\n local!response: #""0f4f6d0..."
9,PHQ_getFirstStartTime.xml,"a!localVariables(\n local!response: #""0f4f6d0..."


In [10]:
# Positional slice: keeps the rows at positions 8 and 9 (the end bound is exclusive).
df_small = df.iloc[8:10]
df_small

Unnamed: 0,filename,sail
8,PHQ_getLastEndTime.xml,"a!localVariables(\n local!response: #""0f4f6d0..."
9,PHQ_getFirstStartTime.xml,"a!localVariables(\n local!response: #""0f4f6d0..."


In [11]:
# Display the full "sail" text of the second row of df_small.
df_small.iloc[1]["sail"]

'a!localVariables(\n  local!response: #"0f4f6d00-9b7d-46b2-bb5e-147f3f7d342c"(\n    logId: ri!logId,\n    metric: \'type!{urn:com:appian:types:PHQ}PHQ_HELPER_aggregateDataMetric\'(\n      type: "startDate",\n      aggregationFunction: "min"\n    ),\n    valuesFrom: #"_a-0000e989-9e26-8000-9bad-011c48011c48_105569"(localeMap: null)[\n      wherecontains(\n        #"_a-0000e989-9e26-8000-9bad-011c48011c48_105577",\n        #"_a-0000e989-9e26-8000-9bad-011c48011c48_105569"(localeMap: null).id\n      )\n    ]\n  ),\n  if(\n    index(local!response, "result", "statusCode", null) &lt;&gt; 200,\n    null,\n    a!map(\n      datetime: #"_a-0000e984-24eb-8000-9baa-011c48011c48_13591"(\n        pmDatetime: index(local!response, "result", "body", "chartValues", 1, "yAxis", null)\n      ),\n      caseCount: index(local!response, "result", "body", "chartValues", 1, "caseCount", null)\n    )\n  )\n)'

In [12]:
import uuid
from enum import Enum
from pathlib import Path
import re
from typing import List, Tuple, Dict

# Character sequence the scanner treats as the start of a quoted type-constructor
# name, e.g. 'type!{urn:com:appian:types:PHQ}PHQ_HELPER_aggregateDataMetric'.
TYPE_CONSTRUCTOR_STARTS_WITH = "'type!{"


class ObjectType(str, Enum):
    """Kinds of node produced while scanning a SAIL expression.

    The ``str`` mix-in makes each member compare equal to its string value.
    """
    # Function calls; the scanner also uses it for type constructors and "{...}" variants.
    FUNCTION = "function"
    # Keyed parameter ("name:") or the placeholder parameter wrapped around positional values.
    PARAMETER = "parameter"
    # "/* ... */" comment.
    COMMENT = "comment"
    # Quoted strings, and the fallback returned by get_value_type.
    TEXT = "text"
    # Token containing "local!".
    LOCAL = "local variable"
    # Token containing "ri!".
    RI = "rule input"
    # Token containing "fv!".
    FV = "function variable"
    # Node created for "[...]" indexing and for dot notation.
    INDEX = "index"


class SailObject:
    """One node of the parsed SAIL tree: a name, a type, a source line and child nodes."""

    def __init__(self, object_name: str, object_type: ObjectType, line: int, value=None):
        """Create a node.

        ``value`` is the list of child nodes; it is stored by reference when
        given, and a fresh empty list is created per instance when omitted.
        """
        self.name = object_name
        self.object_type = object_type
        self.line = line
        self.values = [] if value is None else value

    def add_value(self, value: 'SailObject'):
        """Append ``value`` to this node's children."""
        self.values.append(value)

    def get_print(self):
        """Return a one-line, human-readable summary of the node."""
        summary_parts = (
            f"name: {self.name}",
            f"objectType: {self.object_type}",
            f"line: {self.line}",
            f"size of values: {len(self.values)}",
        )
        return ", ".join(summary_parts)


class States:
    """Flags recording which lexical context the scanner is currently inside."""

    def __init__(self):
        # Every context starts closed; the scanner toggles these as it reads.
        self.is_quote = self.is_single_quote = self.is_comment = False
        self.is_type_constructor = self.is_function_name = False


def get_syntax_tree_from_xml(xml_path):
    """Read the SAIL definition from an XML file and parse it.

    Returns whatever ``get_syntax_tree_from_sail`` returns for that text.
    """
    return get_syntax_tree_from_sail(get_sail_expression(xml_path))


def get_syntax_tree_from_sail(sail):
    """Scan a SAIL expression and build a tree of ``SailObject`` nodes.

    The text is read one character at a time. ``objects`` is used as a stack of
    the nodes that are currently open, ``curr_string`` accumulates the token
    being read, and ``states`` records whether the scanner is inside a double
    quote, a single quote, a comment, a type-constructor name or a ``#"..."``
    function name. Structural characters create nodes through
    ``process_new_object`` and/or trim the stack.

    :param sail: SAIL source text; tab characters are removed before scanning.
    :return: tuple ``(root_object, objects_map)``. ``root_object`` is the node
        the scan started from; ``objects_map`` maps a name to the list of nodes
        registered under it by ``process_new_object``.
    """
    states = States()
    sail = sail.replace("\t", "")

    newline_count = 1
    # Placeholder root. process_new_object renames it when the first FUNCTION is
    # created while it is still the only entry on the stack.
    objects = [SailObject(f"undefined_function", ObjectType.FUNCTION, newline_count, [])]
    root_object = objects[-1]
    objects_map = {"undefined_function": [root_object]}

    # NOTE(review): every branch below binds the second return value of
    # process_new_object to ``object_map`` (singular), which is never read.
    # process_new_object mutates and returns the dict it was given, so
    # ``objects_map`` still receives the updates.

    curr_string = ""
    j = 0
    while j < len(sail):
        # NOTE(review): prev_char is computed but not used in this function.
        prev_char = None if j <= 0 else sail[j - 1]
        curr_char = sail[j]
        next_char = None if j >= len(sail) - 1 else sail[j + 1]

        # print(curr_string)
        # print([(o.name, o.object_type.value) for o in objects])
        if is_newline(curr_char):
            # Line breaks only advance the line counter; they are never added to
            # curr_string (this check runs before any quote/comment state is consulted).
            newline_count += 1
            j += 1
            continue
        elif is_quote(curr_char, states):
            # Closing double quote: emit the accumulated text as a TEXT node.
            if states.is_quote:
                objects, object_map = process_new_object("\"" + curr_string.strip() + "\"", ObjectType.TEXT, objects, objects_map,
                                                         newline_count, states)
                # NOTE(review): process_new_object pushes two entries only when the
                # stack top was a FUNCTION, otherwise one — confirm that popping
                # two is intended in both cases.
                objects = objects[:-2]  # Text objectTypes cannot own values
                curr_string = ""
            j += 1
            states.is_quote = not states.is_quote
            continue
        elif is_single_quote(sail, j, states):
            # Same handling as double quotes, for single-quoted text.
            if states.is_single_quote:
                objects, object_map = process_new_object("\'" + curr_string.strip() + "\'", ObjectType.TEXT, objects, objects_map,
                                                         newline_count, states)
                objects = objects[:-2]  # Text objectTypes cannot own values
                curr_string = ""
            j += 1
            states.is_single_quote = not states.is_single_quote
            continue
        elif is_start_of_comment(curr_char, next_char, states):
            # "/*": discard whatever was accumulated and skip both characters.
            states.is_comment = True
            curr_string = ""
            j += 2
            continue
        elif is_end_of_comment(curr_char, next_char, states):
            # "*/": emit the comment body, re-wrapped in its delimiters, as a COMMENT node.
            states.is_comment = False
            j += 2
            objects, object_map = process_new_object("/*" + curr_string + "*/", ObjectType.COMMENT, objects, objects_map,
                                                     newline_count, states)
            objects = objects[:-2]  # Comment objectTypes cannot own values
            curr_string = ""
            continue
        elif is_start_of_type_constructor_name(sail, j, states):
            # Jump past the "'type!{" prefix and seed curr_string with it.
            states.is_type_constructor = True
            j += len(TYPE_CONSTRUCTOR_STARTS_WITH)
            curr_string = TYPE_CONSTRUCTOR_STARTS_WITH
            continue
        elif is_end_of_type_constructor_name(curr_char, states):
            # Closing quote of the type name: the constructor becomes a FUNCTION node.
            j += 1
            states.is_type_constructor = False
            objects, object_map = process_new_object(curr_string.strip() + "\'", ObjectType.FUNCTION, objects,
                                                     objects_map, newline_count, states)
            curr_string = ""
            continue
        elif is_start_of_type_constructor(curr_char, objects, states):
            # "(" directly after a type-constructor node: skipped here, so the
            # is_start_function branch below does not open a second node for it.
            j += 1
            continue
        elif is_start_bracket_index(curr_char, states):
            # NOTE(review): ``curr_string.strip() == 0`` compares a str with an int
            # and is therefore always False, so the else branch always runs;
            # ``len(curr_string.strip()) == 0`` was presumably intended — confirm.
            if curr_string.strip() == 0:
                objects.append(objects[-1].values[-1])
                objects.append(objects[-1].values[-1])
            else:
                objects, object_map = process_new_object(curr_string.strip(), get_value_type(curr_string.strip()),
                                                         objects, objects_map, newline_count, states)
            # The bracket itself opens an INDEX node with a unique placeholder name.
            objects, object_map = process_new_object(f"undefined_index_{uuid.uuid4()}", ObjectType.INDEX,
                                                     objects, objects_map, newline_count, states)
            curr_string = ""
            j += 1
            continue
        elif is_end_bracket_index(curr_char, states):
            # "]": flush a pending token, then close the index.
            if len(curr_string.strip()) > 0:
                objects, object_map = process_new_object(curr_string.strip(), get_value_type(curr_string.strip()),
                                                         objects, objects_map, newline_count, states)
                objects = objects[:-2]
            objects = objects[:-2]  # [...,function,index] -> [...,function]
            curr_string = ""
            j += 1
            continue
        elif is_start_function(curr_char, states):
            # "(": the accumulated token is the function name; a "(" with no
            # preceding token gets a unique placeholder name.
            if len(curr_string.strip()) == 0:
                curr_string = f"undefined_function_{uuid.uuid4()}"
            objects, object_map = process_new_object(curr_string.strip(), ObjectType.FUNCTION, objects,
                                                     objects_map, newline_count, states)
            curr_string = ""
            j += 1
            states.is_function_name = False
            continue
        elif is_end_function(curr_char, states):
            # ")": flush a pending token as the last argument, then close the call.
            if len(curr_string.strip()) > 0:
                objects, object_map = process_new_object(curr_string.strip(), get_value_type(curr_string.strip()),
                                                         objects, objects_map, newline_count, states)
                if objects[-2].object_type in [ObjectType.PARAMETER, ObjectType.INDEX]:
                    objects = objects[:-2]  # Text objectTypes cannot own values
            if len(objects) == 1:
                objects = objects[:-1]
            elif objects[-1].object_type == ObjectType.FUNCTION and objects[-2].object_type == ObjectType.PARAMETER:
                objects = objects[:-2]  # [...,function,param,function] -> [...,function]
            curr_string = ""
            j += 1
            continue
        elif is_dot_notation(curr_char, states, curr_string):
            # ".": handled like an index on the preceding value.
            if objects[-1].object_type == ObjectType.PARAMETER:
                objects, object_map = process_new_object(curr_string.strip(), get_value_type(curr_string.strip()),
                                                         objects, objects_map, newline_count, states)
                objects = objects[:-2]
            # NOTE(review): same str-vs-int comparison as in the "[" branch; this
            # condition is always False, so the two appends never run.
            if curr_string.strip() == 0:
                objects.append(objects[-1].values[-1])
                objects.append(objects[-1].values[-1])
            objects, object_map = process_new_object(f"undefined_index_{uuid.uuid4()}", ObjectType.INDEX,
                                                     objects, objects_map, newline_count, states)
            curr_string = ""
            j += 1
            continue
        elif is_start_variant(curr_char, states):
            # "{": opened as a FUNCTION node with a unique "variant_" name.
            objects, object_map = process_new_object(f"variant_{uuid.uuid4()}", ObjectType.FUNCTION, objects,
                                                     objects_map, newline_count, states)
            j += 1
            continue
        elif is_end_variant(curr_char, states):
            # "}": flush a pending token, then close the variant.
            if len(curr_string.strip()) > 0:
                objects, object_map = process_new_object(curr_string.strip(), get_value_type(curr_string.strip()),
                                                         objects, objects_map, newline_count, states)
                objects = objects[:-2]  # Text objectTypes cannot own values
            objects = objects[:-2]  # [...,function,param,function] -> [...,function] or [function] -> []
            curr_string = ""
            j += 1
            continue
        elif if_next_list_value(curr_char, states):
            # ",": flush a pending token as a value of the enclosing node.
            if len(curr_string.strip()) > 0:
                objects, object_map = process_new_object(curr_string.strip(), get_value_type(curr_string.strip()),
                                                         objects, objects_map, newline_count, states)
                objects = objects[:-2]  # [...,function,param,value] -> [...,function]
            if objects[-1].object_type in [ObjectType.RI, ObjectType.LOCAL, ObjectType.FV]:
                objects = objects[:-2]
            curr_string = ""
            j += 1
            continue
        elif is_start_keyed_parameter(curr_char, states):
            # ":": the accumulated token is a parameter name.
            if objects[-1].object_type != ObjectType.FUNCTION:
                objects = objects[:-1]  # [...,function,param] -> [...,function]
            objects, object_map = process_new_object(curr_string.strip(), ObjectType.PARAMETER, objects,
                                                     objects_map, newline_count, states)
            curr_string = ""
            j += 1
            continue
        elif is_start_function_name(sail, j, states):
            # '#"' starts a quoted function reference; both characters are kept in the name.
            states.is_function_name = True
            curr_string += sail[j:j+2]
            j += 2
            continue
        elif is_end_function_name(curr_char, states):
            # Closing quote of a '#"..."' reference: dropped, not appended to curr_string.
            states.is_function_name = False
            j += 1
            continue
        elif is_conditional_operator(sail, j, states):
            # XML-escaped comparison operators (and "=") are appended to the
            # current token as a whole, advancing past all of their characters.
            if sail[j:j + 8] in ["&lt;&gt;"]:
                curr_string += sail[j:j + 8]
                j += 8
                continue
            elif sail[j:j + 5] in ["&lt;=", "&gt;="]:
                curr_string += sail[j:j + 5]
                j += 5
                continue
            elif sail[j:j + 4] in ["&lt;", "&gt;"]:
                curr_string += sail[j:j + 4]
                j += 4
                continue
            elif sail[j] in ["="]:
                curr_string += sail[j:j + 1]
                j += 1
                continue
            # Reached only if the predicate matched but none of the branches above did.
            objects.append(objects[-1].values[-1])
            objects.append(objects[-1].values[-1])

        # Any other character is part of the current token.
        curr_string += curr_char
        j += 1

    return root_object, objects_map


def get_sail_expression(xml_path: Path):
    """Return the SAIL text inside the first ``<definition>`` element of an XML file.

    Args:
        xml_path: path of the XML file to read.

    Returns:
        The text between ``<definition>`` and the nearest following
        ``</definition>``, unmodified (XML entities are left as-is).

    Raises:
        ValueError: if the file contains no ``<definition>`` element.
    """
    with open(xml_path, "r") as infile:
        xml = infile.read()
    # [\s\S] matches any character including newlines; the lazy quantifier
    # stops at the first closing tag.
    match = re.search(r"<definition>([\s\S]*?)</definition>", xml)
    if match is None:
        raise ValueError(f"No <definition> element found in {xml_path}")
    return match.group(1)


def get_value_type(string: str):
    """Classify a token by the SAIL domain marker it contains.

    Markers are checked in the order ``ri!``, ``local!``, ``fv!``; the first one
    found anywhere in ``string`` decides the type. Tokens without a marker are
    classified as ``ObjectType.TEXT``.
    """
    marker_types = (
        ("ri!", ObjectType.RI),
        ("local!", ObjectType.LOCAL),
        ("fv!", ObjectType.FV),
    )
    for marker, value_type in marker_types:
        if marker in string:
            return value_type
    return ObjectType.TEXT


def is_newline(c: str):
    """Tell whether ``c`` is a line-feed or a carriage-return character."""
    return c == "\n" or c == "\r"


def is_quote(c: str, states: States):
    """Tell whether ``c`` is a double quote that delimits text.

    A double quote does not count while the scanner is inside a single-quoted
    string, a comment, a type-constructor name or a function name.
    """
    if c != "\"":
        return False
    inside_other_context = (states.is_single_quote or states.is_comment
                            or states.is_type_constructor or states.is_function_name)
    return not inside_other_context


def is_single_quote(string: str, j: int, states: States):
    """Tell whether position ``j`` holds a single quote that delimits text.

    A quote that begins the type-constructor prefix is excluded, as is any
    quote met inside a double-quoted string, a comment, a type-constructor
    name or a function name.
    """
    prefix_length = len(TYPE_CONSTRUCTOR_STARTS_WITH)
    if string[j:j + prefix_length] == TYPE_CONSTRUCTOR_STARTS_WITH:
        return False
    if string[j] != "\'":
        return False
    return not (states.is_quote or states.is_comment
                or states.is_type_constructor or states.is_function_name)


def is_start_of_comment(c: str, n: str, states: States):
    """Tell whether the current/next character pair ``c``/``n`` is ``/*``.

    The pair does not count inside a double- or single-quoted string, a
    function name or a type-constructor name.
    """
    if not (c == "/" and n == "*"):
        return False
    return not (states.is_quote or states.is_function_name
                or states.is_single_quote or states.is_type_constructor)


def is_end_of_comment(c: str, n: str, states: States):
    """Tell whether the current/next character pair ``c``/``n`` is ``*/``.

    The pair does not count inside a double- or single-quoted string, a
    function name or a type-constructor name. The comment flag itself is not
    consulted.
    """
    closes_comment = c == "*" and n == "/"
    if not closes_comment:
        return False
    if states.is_quote or states.is_function_name:
        return False
    return not (states.is_single_quote or states.is_type_constructor)


def is_start_of_type_constructor_name(string: str, j: int, states: States):
    """Tell whether the type-constructor prefix begins at position ``j``.

    The prefix does not count inside a single- or double-quoted string, a
    comment or a function name.
    """
    candidate = string[j:j + len(TYPE_CONSTRUCTOR_STARTS_WITH)]
    if candidate != TYPE_CONSTRUCTOR_STARTS_WITH:
        return False
    return not (states.is_single_quote or states.is_comment
                or states.is_quote or states.is_function_name)


def is_end_of_type_constructor_name(c: str, states: States):
    """Tell whether ``c`` is the single quote that closes a type-constructor name.

    Only applies while a type-constructor name is open and the scanner is not
    inside a quoted string, a comment or a function name.
    """
    return states.is_type_constructor and c == "\'" and not (
        states.is_quote or states.is_comment
        or states.is_single_quote or states.is_function_name
    )


def is_start_of_type_constructor(c: str, objects: List[SailObject], states: States):
    """Tell whether ``c`` is the ``(`` that follows a type-constructor node.

    The node on top of ``objects`` must have the type-constructor prefix in
    its name, and the scanner must not be inside a quoted string, a comment or
    a function name.
    """
    if c != "(":
        return False
    if TYPE_CONSTRUCTOR_STARTS_WITH not in objects[-1].name:
        return False
    return not (states.is_quote or states.is_single_quote
                or states.is_comment or states.is_function_name)


def is_start_bracket_index(c: str, states: States):
    """Tell whether ``c`` is a ``[`` that opens an index.

    The bracket does not count inside a quoted string, a comment, a
    type-constructor name or a function name.
    """
    if c != "[":
        return False
    in_literal_context = (states.is_quote or states.is_comment or states.is_type_constructor
                          or states.is_single_quote or states.is_function_name)
    return not in_literal_context


def is_end_bracket_index(c: str, states: States):
    """Tell whether ``c`` is a ``]`` that closes an index.

    The bracket does not count inside a quoted string, a comment, a
    type-constructor name or a function name.
    """
    if c != "]":
        return False
    if states.is_quote or states.is_comment or states.is_type_constructor:
        return False
    return not (states.is_single_quote or states.is_function_name)


def is_conditional_operator(string: str, j: int, states: States):
    """Tell whether a comparison operator starts at position ``j``.

    Recognised operators are the XML-escaped forms ``&lt;&gt;``, ``&lt;=``,
    ``&gt;=``, ``&lt;``, ``&gt;`` and the plain ``=``. The previous version
    also compared a two-character slice and a single character against those
    multi-character entities; such comparisons could never succeed and have
    been removed, so the set of recognised operators is unchanged.

    Args:
        string: text being scanned.
        j: position to test; a position at or past the end yields ``False``
            instead of raising ``IndexError``.
        states: scanner flags; an operator does not count inside a quoted
            string, a comment, a type-constructor name or a function name.

    Returns:
        ``True`` when an operator starts at ``j`` outside those contexts.
    """
    escaped_operators = ("&lt;&gt;", "&lt;=", "&gt;=", "&lt;", "&gt;")
    starts_operator = any(string[j:j + len(op)] == op for op in escaped_operators) \
        or (j < len(string) and string[j] == "=")
    if not starts_operator:
        return False
    return not (states.is_quote or states.is_single_quote or states.is_comment
                or states.is_type_constructor or states.is_function_name)


def is_start_function(c: str, states: States):
    """Tell whether ``c`` is a ``(`` that opens a function call.

    The parenthesis does not count inside a quoted string, a comment or a
    type-constructor name.
    """
    if c != "(":
        return False
    return not (states.is_quote or states.is_single_quote
                or states.is_comment or states.is_type_constructor)


def is_end_function(c: str, states: States):
    """Tell whether ``c`` is a ``)`` that closes a function call.

    The parenthesis does not count inside a quoted string, a comment or a
    type-constructor name.
    """
    closes_call = c == ")"
    if not closes_call:
        return False
    in_literal_context = (states.is_quote or states.is_single_quote
                          or states.is_comment or states.is_type_constructor)
    return not in_literal_context


def is_dot_notation(c: str, states: States, string):
    """Tell whether ``c`` is a ``.`` used as dot notation.

    The dot does not count inside a quoted string, a comment, a
    type-constructor name or a function name. ``string`` is accepted but not
    consulted.
    """
    if c != ".":
        return False
    if states.is_quote or states.is_single_quote:
        return False
    return not (states.is_comment or states.is_type_constructor or states.is_function_name)


def is_start_variant(c: str, states: States):
    """Tell whether ``c`` is a ``{`` that opens a variant.

    The brace does not count inside a quoted string, a comment or a
    type-constructor name.
    """
    if c != "{":
        return False
    return not (states.is_quote or states.is_single_quote
                or states.is_comment or states.is_type_constructor)


def is_end_variant(c: str, states: States):
    """Tell whether ``c`` is a ``}`` that closes a variant.

    The brace does not count inside a quoted string, a comment or a
    type-constructor name.
    """
    closes_variant = c == "}"
    if not closes_variant:
        return False
    in_literal_context = (states.is_quote or states.is_single_quote
                          or states.is_comment or states.is_type_constructor)
    return not in_literal_context


def if_next_list_value(c: str, states: States):
    """Tell whether ``c`` is a ``,`` that separates values.

    The comma does not count inside a quoted string, a comment or a
    type-constructor name.
    """
    if c != ",":
        return False
    if states.is_quote or states.is_single_quote:
        return False
    return not (states.is_comment or states.is_type_constructor)


def is_start_keyed_parameter(c: str, states: States):
    """Tell whether ``c`` is the ``:`` that follows a parameter name.

    The colon does not count inside a quoted string, a comment or a
    type-constructor name.
    """
    if c != ":":
        return False
    in_literal_context = (states.is_quote or states.is_single_quote
                          or states.is_comment or states.is_type_constructor)
    return not in_literal_context


def is_start_function_name(string: str, j: int, states: States):
    """Tell whether a ``#"`` function reference starts at position ``j``.

    The marker does not count inside a quoted string, a comment or a
    type-constructor name.
    """
    marker = string[j:j+2]
    if marker != "#\"":
        return False
    return not (states.is_quote or states.is_single_quote
                or states.is_comment or states.is_type_constructor)


def is_end_function_name(c: str, states: States):
    """Tell whether ``c`` is the double quote that closes an open function name."""
    if c != "\"":
        return False
    return states.is_function_name


def process_new_object(
        object_name: str,
        object_type: ObjectType,
        objects: List[SailObject],
        objects_map: Dict[str, List[SailObject]],
        newline_count: int,
        states: States
) -> Tuple[List[SailObject], Dict[str, List[SailObject]]]:
    """Create a node for ``object_name`` and attach it to the node on top of the stack.

    Three cases, checked in this order:

    1. The stack holds only the placeholder root (named ``"undefined_function"``)
       and the new object is a FUNCTION: the root is renamed in place and
       re-registered under the new name. Nothing is pushed.
    2. The stack top is a FUNCTION and the new object is neither a PARAMETER nor
       an INDEX: the object is wrapped in a placeholder parameter named
       ``undefined_parameter_<uuid>``; parameter and object are both pushed.
    3. Otherwise the object becomes a child of the stack top and is pushed.

    Args:
        object_name: name given to the new node (also its ``objects_map`` key).
        object_type: type of the new node.
        objects: stack of open nodes; mutated in place.
        objects_map: name -> list of nodes registered under it; mutated in place.
            Placeholder parameters are collected under ``"undefined_parameter"``.
        newline_count: line number recorded on the new node(s).
        states: scanner flags; accepted for call-site symmetry, not consulted.

    Returns:
        The same ``objects`` and ``objects_map`` objects that were passed in.
    """
    sail_object = SailObject(object_name, object_type, newline_count, [])
    stack_top = objects[-1]
    if len(objects) == 1 and stack_top.name == "undefined_function" and object_type == ObjectType.FUNCTION:
        stack_top.name = object_name
        objects_map.setdefault(object_name, []).append(stack_top)
        objects_map["undefined_function"] = objects_map["undefined_function"][:-1]
    elif stack_top.object_type == ObjectType.FUNCTION and object_type not in [ObjectType.PARAMETER, ObjectType.INDEX]:
        param_sail_object = SailObject(f"undefined_parameter_{uuid.uuid4()}", ObjectType.PARAMETER,
                                       newline_count, [sail_object])
        stack_top.add_value(param_sail_object)
        objects.append(param_sail_object)
        objects.append(sail_object)
        # The previous code tested ``object_name`` here instead of
        # "undefined_parameter", which either reset this list or aliased it to
        # the list registered under ``object_name``.
        objects_map.setdefault("undefined_parameter", []).append(param_sail_object)
        objects_map.setdefault(object_name, []).append(sail_object)
    else:
        stack_top.add_value(sail_object)
        objects.append(sail_object)
        objects_map.setdefault(object_name, []).append(sail_object)
    return objects, objects_map

In [13]:
import json

In [14]:
# Display the full "sail" text of the row at position 1 of df.
df.iloc[1]["sail"]

'#"SYSTEM_SYSRULES_toJson_v1"(\n  {\n    logSelected: ri!logSelected,\n    breadcrumbs: ri!breadcrumbs,\n    filterMeasure: ri!filterMeasure,\n    filterCalculation: ri!filterCalculation,\n    filterTimePeriod: ri!filterTimePeriod,\n    filterDuration: ri!filterDuration,\n    filterStartDatetime: ri!filterStartDatetime,\n    filterEndDatetime: ri!filterEndDatetime,\n    selectedNames: ri!selectedNames,\n    filterAttributeMap: ri!filterAttributeMap,\n    filterTimeBin: ri!filterTimeBin,\n    filterActivityNames: ri!filterActivityNames\n  }\n)'

In [15]:
def get_json_from_sail(sail):
    """Convert SAIL source into a nested dict keyed by syntax-tree node names.

    The SAIL text is parsed with ``get_syntax_tree_from_sail`` and the tree is
    folded into ``{root.name: {child.name: {...}, ...}}``; a node without
    values maps to an empty dict.

    The dict is built directly instead of concatenating a JSON string and
    parsing it back. Node names containing backslashes, control characters or
    anything else that would need JSON escaping are therefore kept verbatim
    rather than being altered or making the parse fail.

    Args:
        sail: SAIL expression source as a string.

    Returns:
        dict: nested dicts of node names. Siblings that share a name collapse
        into one key holding the last sibling's subtree (the same outcome as
        parsing a JSON object with duplicate keys). An empty dict is returned
        when the parser yields no root node.
    """
    root, _ = get_syntax_tree_from_sail(sail)

    def children_of(node):
        # Map each child name to its own (recursively built) subtree.
        subtree = {}
        for child in node.values:
            if not child:
                # Skip missing nodes instead of failing halfway through.
                continue
            subtree[child.name] = children_of(child)
        return subtree

    if not root:
        return {}
    return {root.name: children_of(root)}

In [16]:
# Parse every SAIL expression into its nested-dict syntax tree. The function is
# handed to apply directly; no wrapper lambda is needed.
df["json"] = df["sail"].apply(get_json_from_sail)
df

Unnamed: 0,filename,sail,json
0,PHQ_getRequestBodyForAggregateData.xml,"a!localVariables(\n local!timeRanges: #""_a-00...","{'a!localVariables': {'local!timeRanges': {'#""..."
1,PHQ_getJsonFromFilters.xml,"#""SYSTEM_SYSRULES_toJson_v1""(\n {\n logSel...","{'#""SYSTEM_SYSRULES_toJson_v1': {'undefined_pa..."
2,AS_VM_FM_profileDetailsPrimary.txt,a!localVariables(\n local!vendorBusinessTypes...,{'a!localVariables': {'local!vendorBusinessTyp...
3,PHQ_getAttributeData.xml,"a!localVariables(\n local!data: #""SYSTEM_SYSR...","{'a!localVariables': {'local!data': {'#""SYSTEM..."
4,PHQ_topLevel.xml,"a!localVariables(\n local!triggerReset,\n lo...",{'a!localVariables': {'undefined_parameter_9f3...
5,PHQ_getCycleTimeOverTime.xml,"a!localVariables(\n local!response: #""0f4f6d0...","{'a!localVariables': {'local!response': {'#""0f..."
6,PHQ_getOverallCycleTime.xml,"a!localVariables(\n local!response: #""0f4f6d0...","{'a!localVariables': {'local!response': {'#""0f..."
7,AS_VM_FM_profileDetailsVendorDetails.xml,"{\n #""SYSTEM_SYSRULES_richTextDisplayField""(\...",{'variant_4b744b62-22da-425d-a7b6-98c0590f244e...
8,PHQ_getLastEndTime.xml,"a!localVariables(\n local!response: #""0f4f6d0...","{'a!localVariables': {'local!response': {'#""0f..."
9,PHQ_getFirstStartTime.xml,"a!localVariables(\n local!response: #""0f4f6d0...","{'a!localVariables': {'local!response': {'#""0f..."


In [17]:
# Re-take the two-row slice (rows 8 and 9) now that df has the "json" column,
# so df_small carries it as well.
df_small = df.iloc[8:10]
df_small

Unnamed: 0,filename,sail,json
8,PHQ_getLastEndTime.xml,"a!localVariables(\n local!response: #""0f4f6d0...","{'a!localVariables': {'local!response': {'#""0f..."
9,PHQ_getFirstStartTime.xml,"a!localVariables(\n local!response: #""0f4f6d0...","{'a!localVariables': {'local!response': {'#""0f..."


In [18]:
df_small.iloc[0]["json"]

{'a!localVariables': {'local!response': {'#"0f4f6d00-9b7d-46b2-bb5e-147f3f7d342c': {'logId': {'ri!logId': {}},
    'metric': {"'type!{urn:com:appian:types:PHQ}PHQ_HELPER_aggregateDataMetric'": {'type': {'"endDate"': {}},
      'aggregationFunction': {'"max"': {}}}},
    'valuesFrom': {'#"_a-0000e989-9e26-8000-9bad-011c48011c48_105569': {'localeMap': {'null': {}}}},
    'undefined_parameter_b3b7ceca-9b5e-4d50-96c6-c9f64e77c6c5': {'': {'undefined_index_b8b7e2ed-675d-4cd9-bf58-d081189a4b4f': {'wherecontains': {'undefined_parameter_5b76ef00-5728-4692-88f8-1ac424de42d8': {'#"_a-0000e989-9e26-8000-9bad-011c48011c48_105577': {}},
        'undefined_parameter_26692f41-3255-4199-aa50-8288fb8c9769': {'#"_a-0000e989-9e26-8000-9bad-011c48011c48_105569': {'localeMap': {'null': {}}}},
        'undefined_index_1e35f595-508f-4c4f-a105-e41ca117b329': {'id': {}}}},
      'if': {'undefined_parameter_449ff5ae-a0ad-4979-baa6-ccc2087ce36c': {'index': {'undefined_parameter_51d26e79-2ce4-4a00-811b-625ffb1c806

In [19]:
json.dumps(df_small.iloc[0]["json"])

'{"a!localVariables": {"local!response": {"#\\"0f4f6d00-9b7d-46b2-bb5e-147f3f7d342c": {"logId": {"ri!logId": {}}, "metric": {"\'type!{urn:com:appian:types:PHQ}PHQ_HELPER_aggregateDataMetric\'": {"type": {"\\"endDate\\"": {}}, "aggregationFunction": {"\\"max\\"": {}}}}, "valuesFrom": {"#\\"_a-0000e989-9e26-8000-9bad-011c48011c48_105569": {"localeMap": {"null": {}}}}, "undefined_parameter_b3b7ceca-9b5e-4d50-96c6-c9f64e77c6c5": {"": {"undefined_index_b8b7e2ed-675d-4cd9-bf58-d081189a4b4f": {"wherecontains": {"undefined_parameter_5b76ef00-5728-4692-88f8-1ac424de42d8": {"#\\"_a-0000e989-9e26-8000-9bad-011c48011c48_105577": {}}, "undefined_parameter_26692f41-3255-4199-aa50-8288fb8c9769": {"#\\"_a-0000e989-9e26-8000-9bad-011c48011c48_105569": {"localeMap": {"null": {}}}}, "undefined_index_1e35f595-508f-4c4f-a105-e41ca117b329": {"id": {}}}}, "if": {"undefined_parameter_449ff5ae-a0ad-4979-baa6-ccc2087ce36c": {"index": {"undefined_parameter_51d26e79-2ce4-4a00-811b-625ffb1c8063": {"local!response"

In [143]:
# df_small is a slice of df, so the new column is attached with assign() (which
# returns a new frame) instead of item assignment on the slice.
# The model is selected with `engine=`, matching the other get_embedding calls
# in this notebook.
# NOTE(review): the recorded run of this cell (written with `model=`) ended in
# an InvalidRequestError; confirm against the API that this form succeeds.
df_small = df_small.assign(
    embedding=df_small["json"].apply(
        lambda tree: get_embedding(str(tree), engine="text-embedding-ada-002")
    )
)

InvalidRequestError: [['{\'a!localVariables\': {\'local!response\': {\'#"0f4f6d00-9b7d-46b2-bb5e-147f3f7d342c\': {\'logId\': {\'ri!logId\': {}}, \'metric\': {"\'type!{urn:com:appian:types:PHQ}PHQ_HELPER_aggregateDataMetric\'": {\'type\': {\'"endDate"\': {}}, \'aggregationFunction\': {\'"max"\': {}}}}, \'valuesFrom\': {\'#"_a-0000e989-9e26-8000-9bad-011c48011c48_105569\': {\'localeMap\': {\'null\': {}}}}, \'undefined_parameter_c775442f-ab22-4089-86c6-a544ab619d65\': {\'\': {\'undefined_index_c65f35e3-7be4-4672-9e9a-3ec5ba0a300c\': {\'wherecontains\': {\'undefined_parameter_d36fb5d8-dcd8-4058-9ed5-b837c47f5771\': {\'#"_a-0000e989-9e26-8000-9bad-011c48011c48_105577\': {}}, \'undefined_parameter_35d8cd54-e885-4459-b20f-9f45c0c7a20f\': {\'#"_a-0000e989-9e26-8000-9bad-011c48011c48_105569\': {\'localeMap\': {\'null\': {}}}}, \'undefined_index_89027419-428c-4ea6-a2a4-9d1124716c19\': {\'id\': {}}}}, \'if\': {\'undefined_parameter_685d27fd-6acd-4fdb-a023-1cb08faf12b9\': {\'index\': {\'undefined_parameter_4b3a70c9-aecc-441b-ac5f-4c3383260f6a\': {\'local!response\': {}}, \'undefined_parameter_de205115-08f5-4eeb-b6c6-5053e1e8b20e\': {\'"result"\': {}}, \'undefined_parameter_bdfde5c7-4c6d-46ce-b80c-b354ba91a703\': {\'"statusCode"\': {}}, \'undefined_parameter_4e2f65c5-c5ee-40f5-9c4f-861a1a9072d6\': {\'null\': {}}}}, \'undefined_parameter_2ac87fbe-a8fa-4254-b252-d777451a9ceb\': {\'&lt;&gt; 200\': {}}, \'undefined_parameter_552ffb01-c697-434f-bf16-bec74d2f98d1\': {\'null\': {}}, \'undefined_parameter_0e73797c-74f1-44d3-b367-54642438ead5\': {\'a!map\': {\'datetime\': {\'#"_a-0000e984-24eb-8000-9baa-011c48011c48_13591\': {\'pmDatetime\': {\'index\': {\'undefined_parameter_e9386fab-2301-4215-8c09-0394b346147a\': {\'local!response\': {}}, \'undefined_parameter_1e341fc3-1dad-4229-9126-3103d7362e70\': {\'"result"\': {}}, \'undefined_parameter_b3ac848c-fc40-4d0c-9073-b63dc22b626c\': {\'"body"\': {}}, \'undefined_parameter_27895ff9-f789-49bc-a728-8caa1a6c0813\': {\'"chartValues"\': {}}, 
\'undefined_parameter_f47d9204-cc48-4ba9-b6ab-35fbc3aa9b90\': {\'1\': {}}, \'undefined_parameter_e2c03988-207e-4033-adf8-d71613bfaef6\': {\'"yAxis"\': {}}, \'undefined_parameter_6c8dc846-cea7-4168-a5a3-4b05f286087f\': {\'null\': {}}}}}}, \'caseCount\': {\'index\': {\'undefined_parameter_ee0f4c65-523b-4cab-8945-5a38c53c4bb9\': {\'local!response\': {}}, \'undefined_parameter_0b3dde7c-e152-4c43-896f-989a15a335b9\': {\'"result"\': {}}, \'undefined_parameter_bc74f41a-45ab-4b55-89d1-d4fb49e448ec\': {\'"body"\': {}}, \'undefined_parameter_a22e3cfb-21b0-4a82-8d2e-3feeb7075dec\': {\'"chartValues"\': {}}, \'undefined_parameter_0e5a6856-a8de-41d8-9ee9-bf04790e6071\': {\'1\': {}}, \'undefined_parameter_91146364-ebc8-422b-b0ad-3073dcd4eb34\': {\'"caseCount"\': {}}, \'undefined_parameter_d165f86b-e12d-4980-ba31-2772ba013767\': {\'null\': {}}}}}}}}}}}}}']] is not valid under any of the given schemas - 'input'

In [20]:
# Completion prompt: the SAIL source of the second df_small row, followed by a
# triple-quote delimiter and the opening of a numbered summary for the model
# to continue.
prompt = "".join([
    df_small.iloc[1]["sail"],
    "\n\n\"\"\"\nHere's a short summary of what the above code is doing:\n1.\n",
])
prompt

'a!localVariables(\n  local!response: #"0f4f6d00-9b7d-46b2-bb5e-147f3f7d342c"(\n    logId: ri!logId,\n    metric: \'type!{urn:com:appian:types:PHQ}PHQ_HELPER_aggregateDataMetric\'(\n      type: "startDate",\n      aggregationFunction: "min"\n    ),\n    valuesFrom: #"_a-0000e989-9e26-8000-9bad-011c48011c48_105569"(localeMap: null)[\n      wherecontains(\n        #"_a-0000e989-9e26-8000-9bad-011c48011c48_105577",\n        #"_a-0000e989-9e26-8000-9bad-011c48011c48_105569"(localeMap: null).id\n      )\n    ]\n  ),\n  if(\n    index(local!response, "result", "statusCode", null) &lt;&gt; 200,\n    null,\n    a!map(\n      datetime: #"_a-0000e984-24eb-8000-9baa-011c48011c48_13591"(\n        pmDatetime: index(local!response, "result", "body", "chartValues", 1, "yAxis", null)\n      ),\n      caseCount: index(local!response, "result", "body", "chartValues", 1, "caseCount", null)\n    )\n  )\n)\n\n"""\nHere\'s a short summary of what the above code is doing:\n1.\n'

In [21]:
# Sanity check: print the length of the prompt built above next to the length
# of a pasted literal copy of the same text, so the two can be compared.
print(len(prompt))
print(len("a!localVariables(\n  local!response: #\"0f4f6d00-9b7d-46b2-bb5e-147f3f7d342c\"(\n    logId: ri!logId,\n    metric: 'type!{urn:com:appian:types:PHQ}PHQ_HELPER_aggregateDataMetric'(\n      type: \"startDate\",\n      aggregationFunction: \"min\"\n    ),\n    valuesFrom: #\"_a-0000e989-9e26-8000-9bad-011c48011c48_105569\"(localeMap: null)[\n      wherecontains(\n        #\"_a-0000e989-9e26-8000-9bad-011c48011c48_105577\",\n        #\"_a-0000e989-9e26-8000-9bad-011c48011c48_105569\"(localeMap: null).id\n      )\n    ]\n  ),\n  if(\n    index(local!response, \"result\", \"statusCode\", null) &lt;&gt; 200,\n    null,\n    a!map(\n      datetime: #\"_a-0000e984-24eb-8000-9baa-011c48011c48_13591\"(\n        pmDatetime: index(local!response, \"result\", \"body\", \"chartValues\", 1, \"yAxis\", null)\n      ),\n      caseCount: index(local!response, \"result\", \"body\", \"chartValues\", 1, \"caseCount\", null)\n    )\n  )\n)\n\n\"\"\"\nHere's a short summary of what the above code is doing:\n1.\n"))

936
936


In [19]:
# Ask the code model to continue the numbered summary started by the prompt.
# Generation stops at the next triple double-quote, the same delimiter the
# prompt uses between the SAIL source and the summary.
response = openai.Completion.create(
    model="code-davinci-002",
    prompt=prompt,
    stop=["\"\"\""],
    max_tokens=300,
    # Sampling settings.
    temperature=0.3,
    top_p=1,
    frequency_penalty=0.30,
    presence_penalty=0.10,
)

In [20]:
response

<OpenAIObject text_completion id=cmpl-6gETYKX5r0gcJbzJt4BPrKp9OTTti at 0x12ae0cd10> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "text": "The input is a list of case IDs.\n2.\nThe code uses the first case ID to get the log ID for the PHQ-9 survey.\n3.\nThe code then uses the log ID to get the start date of the first PHQ-9 survey.\n4.\nThe code then uses that start date to get the count of cases that were created on that date.\n</code>\n"
    }
  ],
  "created": 1675523440,
  "id": "cmpl-6gETYKX5r0gcJbzJt4BPrKp9OTTti",
  "model": "code-davinci-002",
  "object": "text_completion",
  "usage": {
    "completion_tokens": 88,
    "prompt_tokens": 396,
    "total_tokens": 484
  }
}

In [21]:
# Show only the SAIL part of the prompt (everything before the delimiter).
# Printing the text in one call renders the same lines as looping over them.
print(prompt.split("\n\n\"\"\"\n")[0])

a!localVariables(
  local!response: #"0f4f6d00-9b7d-46b2-bb5e-147f3f7d342c"(
    logId: ri!logId,
    metric: 'type!{urn:com:appian:types:PHQ}PHQ_HELPER_aggregateDataMetric'(
      type: "startDate",
      aggregationFunction: "min"
    ),
    valuesFrom: #"_a-0000e989-9e26-8000-9bad-011c48011c48_105569"(localeMap: null)[
      wherecontains(
        #"_a-0000e989-9e26-8000-9bad-011c48011c48_105577",
        #"_a-0000e989-9e26-8000-9bad-011c48011c48_105569"(localeMap: null).id
      )
    ]
  ),
  if(
    index(local!response, "result", "statusCode", null) &lt;&gt; 200,
    null,
    a!map(
      datetime: #"_a-0000e984-24eb-8000-9baa-011c48011c48_13591"(
        pmDatetime: index(local!response, "result", "body", "chartValues", 1, "yAxis", null)
      ),
      caseCount: index(local!response, "result", "body", "chartValues", 1, "caseCount", null)
    )
  )
)


In [22]:
# Render the completion text with its line breaks; a single print produces the
# same output as printing it line by line.
print(response["choices"][0]["text"])

The input is a list of case IDs.
2.
The code uses the first case ID to get the log ID for the PHQ-9 survey.
3.
The code then uses the log ID to get the start date of the first PHQ-9 survey.
4.
The code then uses that start date to get the count of cases that were created on that date.
</code>



In [36]:
# Read the scratch Python snippet next to the notebook and embed its repr()
# (the escaped, quoted form of the text).
python_code = repr((notebook_path / "temppython").read_text())
embedding = get_embedding(python_code, engine="text-embedding-ada-002")

In [37]:
embedding

[-0.010963152162730694,
 0.010052431374788284,
 -0.0158134326338768,
 -0.005512623582035303,
 -0.001526493113487959,
 0.030522961169481277,
 -0.015206285752356052,
 -0.008389675058424473,
 -0.015220084227621555,
 -0.01299847662448883,
 0.03480059280991554,
 0.0017765964148566127,
 -0.0013721189461648464,
 0.003901612712070346,
 -0.0001486605906393379,
 -0.0009495304548181593,
 0.011025247164070606,
 0.01381260622292757,
 0.009114112704992294,
 -0.007051190361380577,
 -0.030219387263059616,
 0.025224220007658005,
 -0.019925476983189583,
 -0.012363730929791927,
 0.013895398937165737,
 0.01868358440697193,
 0.010756170377135277,
 -0.013805706985294819,
 0.009134810417890549,
 -0.017331302165985107,
 0.017027728259563446,
 -0.012515517883002758,
 -0.0077135334722697735,
 -0.03736716881394386,
 -0.016323989257216454,
 0.005502274259924889,
 0.009114112704992294,
 -0.01922173798084259,
 0.006713119801133871,
 -0.0030771340243518353,
 -0.005029665306210518,
 -0.006533735431730747,
 -0.0094866

In [38]:
python_code

'\'file_path = Path(__file__)\\nresources_path = file_path.parent.parent / "resources"\\n\\ntop_object, names_map = get_syntax_tree_from_xml(resources_path / "filename.xml")\\n\\nstack = [(0, top_object)]\\nwhile stack:\\n    l, o = stack[-1]\\n    stack = stack[:-1]\\n    for v in o.values[::-1]:\\n        stack.append((l + 1, v))\\n    print(f"{\\\'  \\\' * l}{o.get_print()}")\\n\\nfor name in ["label", "a!localVariables", "filterMeasure", "stampIcon"]:\\n    if name in names_map:\\n        print(f"Name: {name}")\\n        for o in names_map[name]:\\n            print(f"\\\\tName: {[(v.name, v.object_type.value, v.line) for v in o.values]}")\''

In [41]:
# Embed the repr() of the second df_small row's SAIL source. repr() already
# returns a string, so no extra str() conversion is needed.
sail_embedding = get_embedding(
    repr(df_small.iloc[1]["sail"]),
    engine="text-embedding-ada-002",
)

In [42]:
sail_embedding

[-0.029852908104658127,
 0.03084433265030384,
 0.003697189036756754,
 -0.012131186202168465,
 -0.02031044289469719,
 0.012668208219110966,
 0.0070225936360657215,
 -0.014251734130084515,
 0.00452337646856904,
 -0.02428991161286831,
 -0.0004733365785796195,
 0.012151841074228287,
 -0.004994991701096296,
 -0.010003753006458282,
 0.009673278778791428,
 -0.011979718692600727,
 0.008385802619159222,
 -0.008378918282687664,
 0.01155285444110632,
 -0.011862674728035927,
 0.009342803619801998,
 0.012234459631145,
 0.016248352825641632,
 -0.0014320582849904895,
 0.004179131705313921,
 0.0017487634904682636,
 0.010224070399999619,
 -0.031725600361824036,
 -0.007297989446669817,
 0.014320583082735538,
 0.012372157536447048,
 -0.010347997769713402,
 -0.019525563344359398,
 -0.019057391211390495,
 -0.007105212192982435,
 0.0008433997281827033,
 0.00010999696678481996,
 -0.015064151957631111,
 0.020434370264410973,
 -0.006530323531478643,
 0.021012701094150543,
 0.003117136424407363,
 0.010644048452

In [44]:
# One embedding request per row, each on the repr() of that row's SAIL source.
df["embedding"] = df["sail"].apply(
    lambda sail: get_embedding(repr(sail), engine="text-embedding-ada-002")
)

In [45]:
df

Unnamed: 0,filename,sail,json,embedding
0,PHQ_getRequestBodyForAggregateData.xml,"a!localVariables(\n local!timeRanges: #""_a-00...","{'a!localVariables': {'local!timeRanges': {'#""...","[-0.010004423558712006, 0.022904682904481888, ..."
1,PHQ_getJsonFromFilters.xml,"#""SYSTEM_SYSRULES_toJson_v1""(\n {\n logSel...","{'#""SYSTEM_SYSRULES_toJson_v1': {'undefined_pa...","[-0.02485557645559311, 0.02530672587454319, -0..."
2,AS_VM_FM_profileDetailsPrimary.txt,a!localVariables(\n local!vendorBusinessTypes...,{'a!localVariables': {'local!vendorBusinessTyp...,"[-0.01818961650133133, 0.018368620425462723, -..."
3,PHQ_getAttributeData.xml,"a!localVariables(\n local!data: #""SYSTEM_SYSR...","{'a!localVariables': {'local!data': {'#""SYSTEM...","[-0.038721099495887756, 0.03274545446038246, -..."
4,PHQ_topLevel.xml,"a!localVariables(\n local!triggerReset,\n lo...",{'a!localVariables': {'undefined_parameter_9f3...,"[-0.01955599896609783, 0.031899064779281616, -..."
5,PHQ_getCycleTimeOverTime.xml,"a!localVariables(\n local!response: #""0f4f6d0...","{'a!localVariables': {'local!response': {'#""0f...","[-0.027834713459014893, 0.02899334393441677, -..."
6,PHQ_getOverallCycleTime.xml,"a!localVariables(\n local!response: #""0f4f6d0...","{'a!localVariables': {'local!response': {'#""0f...","[-0.024588964879512787, 0.029956787824630737, ..."
7,AS_VM_FM_profileDetailsVendorDetails.xml,"{\n #""SYSTEM_SYSRULES_richTextDisplayField""(\...",{'variant_4b744b62-22da-425d-a7b6-98c0590f244e...,"[-0.021328788250684738, 0.037688858807086945, ..."
8,PHQ_getLastEndTime.xml,"a!localVariables(\n local!response: #""0f4f6d0...","{'a!localVariables': {'local!response': {'#""0f...","[-0.030227504670619965, 0.03762565180659294, 0..."
9,PHQ_getFirstStartTime.xml,"a!localVariables(\n local!response: #""0f4f6d0...","{'a!localVariables': {'local!response': {'#""0f...","[-0.029852908104658127, 0.03084433265030384, 0..."


In [47]:
import numpy as np

In [61]:
# Stack the per-file embedding lists into one 2-D array, one row per file.
embeddings = np.stack(list(df["embedding"]))
print(embeddings.shape)
embeddings

(10, 1536)


array([[-0.01000442,  0.02290468, -0.00655355, ..., -0.0123142 ,
        -0.01294852, -0.02989606],
       [-0.02485558,  0.02530673, -0.00117017, ...,  0.00204251,
        -0.01975194, -0.03363891],
       [-0.01818962,  0.01836862, -0.01242016, ..., -0.02164578,
        -0.01028588, -0.03508489],
       ...,
       [-0.02132879,  0.03768886, -0.00802476, ..., -0.02866894,
        -0.02587404, -0.02384138],
       [-0.0302275 ,  0.03762565,  0.00219978, ..., -0.01993083,
        -0.02292597, -0.03784649],
       [-0.02985291,  0.03084433,  0.00369719, ..., -0.01707454,
        -0.02289916, -0.03420416]])

In [52]:
from scipy.spatial.distance import cdist

In [60]:
# cdist yields pairwise cosine *distances*; subtracting them from 1 turns them
# into similarities, so despite its name dist_np holds similarity scores
# (1.0 on the diagonal).
dist_np = 1 - cdist(embeddings, embeddings, "cosine")
print(dist_np.shape)
dist_np

(10, 10)


array([[1.        , 0.8323447 , 0.86284306, 0.89550625, 0.91716248,
        0.91685754, 0.91055415, 0.80792651, 0.88630786, 0.89213232],
       [0.8323447 , 1.        , 0.81248568, 0.83703988, 0.85767992,
        0.83125865, 0.8359598 , 0.83944608, 0.80122324, 0.80493946],
       [0.86284306, 0.81248568, 1.        , 0.86445744, 0.87971409,
        0.84929149, 0.84778515, 0.87143138, 0.83889106, 0.84179189],
       [0.89550625, 0.83703988, 0.86445744, 1.        , 0.91110666,
        0.94742748, 0.9465248 , 0.80250482, 0.91588335, 0.91965402],
       [0.91716248, 0.85767992, 0.87971409, 0.91110666, 1.        ,
        0.91859756, 0.91662024, 0.81744517, 0.88689285, 0.89093097],
       [0.91685754, 0.83125865, 0.84929149, 0.94742748, 0.91859756,
        1.        , 0.99174111, 0.77171066, 0.96012795, 0.96421485],
       [0.91055415, 0.8359598 , 0.84778515, 0.9465248 , 0.91662024,
        0.99174111, 1.        , 0.76992494, 0.95715934, 0.96178487],
       [0.80792651, 0.83944608, 0.8714313

In [70]:
# Label the similarity matrix with filenames on both axes: each filename is
# paired with its row of scores to form a column, and filenames index the rows.
dist_df = pd.DataFrame(
    dict(zip(df["filename"], dist_np)),
    index=df["filename"].values,
)
dist_df

Unnamed: 0,PHQ_getRequestBodyForAggregateData.xml,PHQ_getJsonFromFilters.xml,AS_VM_FM_profileDetailsPrimary.txt,PHQ_getAttributeData.xml,PHQ_topLevel.xml,PHQ_getCycleTimeOverTime.xml,PHQ_getOverallCycleTime.xml,AS_VM_FM_profileDetailsVendorDetails.xml,PHQ_getLastEndTime.xml,PHQ_getFirstStartTime.xml
PHQ_getRequestBodyForAggregateData.xml,1.0,0.832345,0.862843,0.895506,0.917162,0.916858,0.910554,0.807927,0.886308,0.892132
PHQ_getJsonFromFilters.xml,0.832345,1.0,0.812486,0.83704,0.85768,0.831259,0.83596,0.839446,0.801223,0.804939
AS_VM_FM_profileDetailsPrimary.txt,0.862843,0.812486,1.0,0.864457,0.879714,0.849291,0.847785,0.871431,0.838891,0.841792
PHQ_getAttributeData.xml,0.895506,0.83704,0.864457,1.0,0.911107,0.947427,0.946525,0.802505,0.915883,0.919654
PHQ_topLevel.xml,0.917162,0.85768,0.879714,0.911107,1.0,0.918598,0.91662,0.817445,0.886893,0.890931
PHQ_getCycleTimeOverTime.xml,0.916858,0.831259,0.849291,0.947427,0.918598,1.0,0.991741,0.771711,0.960128,0.964215
PHQ_getOverallCycleTime.xml,0.910554,0.83596,0.847785,0.946525,0.91662,0.991741,1.0,0.769925,0.957159,0.961785
AS_VM_FM_profileDetailsVendorDetails.xml,0.807927,0.839446,0.871431,0.802505,0.817445,0.771711,0.769925,1.0,0.756945,0.760421
PHQ_getLastEndTime.xml,0.886308,0.801223,0.838891,0.915883,0.886893,0.960128,0.957159,0.756945,1.0,0.98567
PHQ_getFirstStartTime.xml,0.892132,0.804939,0.841792,0.919654,0.890931,0.964215,0.961785,0.760421,0.98567,1.0


In [97]:
# For every file, drop its own entry from its similarity column, then report
# the best remaining match together with its score.
pd.DataFrame(
    {
        "filename": dist_df.columns,
        "most_similar_file": dist_df.apply(
            lambda scores: scores.drop(labels=[scores.name]).idxmax()
        ).values,
        "similarity": dist_df.apply(
            lambda scores: scores.drop(labels=[scores.name]).max()
        ).values,
    },
    index=np.arange(len(dist_df)),
)

Unnamed: 0,filename,most_similar_file,similarity
0,PHQ_getRequestBodyForAggregateData.xml,PHQ_topLevel.xml,0.917162
1,PHQ_getJsonFromFilters.xml,PHQ_topLevel.xml,0.85768
2,AS_VM_FM_profileDetailsPrimary.txt,PHQ_topLevel.xml,0.879714
3,PHQ_getAttributeData.xml,PHQ_getCycleTimeOverTime.xml,0.947427
4,PHQ_topLevel.xml,PHQ_getCycleTimeOverTime.xml,0.918598
5,PHQ_getCycleTimeOverTime.xml,PHQ_getOverallCycleTime.xml,0.991741
6,PHQ_getOverallCycleTime.xml,PHQ_getCycleTimeOverTime.xml,0.991741
7,AS_VM_FM_profileDetailsVendorDetails.xml,AS_VM_FM_profileDetailsPrimary.txt,0.871431
8,PHQ_getLastEndTime.xml,PHQ_getFirstStartTime.xml,0.98567
9,PHQ_getFirstStartTime.xml,PHQ_getLastEndTime.xml,0.98567


In [98]:
# Free-text query to match against the embedded SAIL files.
search_term = "I want to get the last date time of the data from an API response"
search_embedding = get_embedding(
    search_term,
    engine="text-embedding-ada-002",
)

In [99]:
search_embedding

[-0.037254154682159424,
 -0.00471417186781764,
 -0.0008543091244064271,
 -0.013575667515397072,
 -0.020550059154629707,
 -0.006314263679087162,
 0.007978932932019234,
 -0.010310905985534191,
 -0.01480981893837452,
 -0.026318999007344246,
 0.024037254974246025,
 0.020449604839086533,
 0.005284608341753483,
 0.012764858081936836,
 -0.0005847869324497879,
 0.005481929052621126,
 0.022501740604639053,
 0.0050801122561097145,
 0.009679479524493217,
 -0.013690471649169922,
 0.020248696208000183,
 0.005108813289552927,
 0.015957865864038467,
 0.020176943391561508,
 0.01342498604208231,
 -0.0045563154853880405,
 -0.02242998778820038,
 -0.03317858651280403,
 0.005295371171087027,
 -0.012068853713572025,
 0.016015268862247467,
 -0.0075627658516168594,
 0.006845235824584961,
 -0.023075763136148453,
 -0.02254479192197323,
 -0.0467255525290966,
 -0.004007405135780573,
 -0.00541376369073987,
 0.028801653534173965,
 0.00235708593390882,
 -0.0043912832625210285,
 -0.0045563154853880405,
 0.00233197235

In [100]:
from openai.embeddings_utils import cosine_similarity

In [103]:
# Cosine similarity between the query embedding and each file's embedding.
search_dist = df["embedding"].apply(
    lambda file_embedding: cosine_similarity(file_embedding, search_embedding)
)
search_dist

0    0.703999
1    0.707662
2    0.683431
3    0.713158
4    0.714769
5    0.743586
6    0.743109
7    0.662443
8    0.760223
9    0.751655
Name: embedding, dtype: float64

In [111]:
# Rank files from most to least similar to the query. The scores are sorted
# once and the result is reused for both the names and the values.
ranked = search_dist.sort_values(ascending=False)
pd.DataFrame({
    "fileranking": dist_df.columns[list(ranked.index)],
    "filesimilarity": list(ranked),
})

Unnamed: 0,fileranking,filesimilarity
0,PHQ_getLastEndTime.xml,0.760223
1,PHQ_getFirstStartTime.xml,0.751655
2,PHQ_getCycleTimeOverTime.xml,0.743586
3,PHQ_getOverallCycleTime.xml,0.743109
4,PHQ_topLevel.xml,0.714769
5,PHQ_getAttributeData.xml,0.713158
6,PHQ_getJsonFromFilters.xml,0.707662
7,PHQ_getRequestBodyForAggregateData.xml,0.703999
8,AS_VM_FM_profileDetailsPrimary.txt,0.683431
9,AS_VM_FM_profileDetailsVendorDetails.xml,0.662443


In [112]:
# Second free-text query to match against the embedded SAIL files.
search_term = "I want to allow vendors to create a profile and update their information"
search_embedding = get_embedding(
    search_term,
    engine="text-embedding-ada-002",
)

In [113]:
# Cosine similarity between the query embedding and each file's embedding.
search_dist = df["embedding"].apply(
    lambda file_embedding: cosine_similarity(file_embedding, search_embedding)
)
search_dist

0    0.681040
1    0.693005
2    0.734666
3    0.701354
4    0.707082
5    0.688759
6    0.686365
7    0.725578
8    0.679657
9    0.682707
Name: embedding, dtype: float64

In [114]:
# Rank files from most to least similar to the query. The scores are sorted
# once and the result is reused for both the names and the values.
ranked = search_dist.sort_values(ascending=False)
pd.DataFrame({
    "fileranking": dist_df.columns[list(ranked.index)],
    "filesimilarity": list(ranked),
})

Unnamed: 0,fileranking,filesimilarity
0,AS_VM_FM_profileDetailsPrimary.txt,0.734666
1,AS_VM_FM_profileDetailsVendorDetails.xml,0.725578
2,PHQ_topLevel.xml,0.707082
3,PHQ_getAttributeData.xml,0.701354
4,PHQ_getJsonFromFilters.xml,0.693005
5,PHQ_getCycleTimeOverTime.xml,0.688759
6,PHQ_getOverallCycleTime.xml,0.686365
7,PHQ_getFirstStartTime.xml,0.682707
8,PHQ_getRequestBodyForAggregateData.xml,0.68104
9,PHQ_getLastEndTime.xml,0.679657
