In [2]:
import asyncio
import itertools
import os
import re
from typing import List

import pandas as pd
import tqdm
from openai import AzureOpenAI


def clean_up(df: pd.DataFrame) -> None:
    """Parse the ``SegmentMix`` and ``KeyDriverSegments`` columns of ``df`` in place.

    Both columns are split on ``;`` or ``,``.

    * ``SegmentMix``: every cell is stringified, split, and each token is
      stripped, has ``%`` removed and is divided by 100 (``"50%"`` -> ``0.5``).
      Tokens that are empty after stripping are kept as ``""``. Because the
      cell is stringified first, a missing value becomes ``[nan]``.
    * ``KeyDriverSegments``: string cells become a list of stripped,
      lower-cased names; cells that do not split into a list (e.g. missing
      values) are left unchanged.

    Args:
        df: Frame with ``SegmentMix`` and ``KeyDriverSegments`` columns.

    Raises:
        ValueError: If a non-empty ``SegmentMix`` token is not a number.
    """
    # Raw string: the previous "[;\,]" literal relied on an invalid escape sequence.
    separator = r"[;,]"

    def _parse_mix(cell) -> list:
        # Strip *before* testing for emptiness so that a whitespace-only token
        # (e.g. the tail of "60%; ") is not passed to float().
        tokens = (tok.strip() for tok in re.split(separator, str(cell)))
        return [float(tok.replace("%", "")) / 100 if tok else tok for tok in tokens]

    def _parse_segments(cell):
        if isinstance(cell, list):
            return [str(tok).strip().lower() for tok in cell]
        return cell

    df["SegmentMix"] = df.SegmentMix.apply(_parse_mix)
    df["KeyDriverSegments"] = df.KeyDriverSegments.str.split(separator).apply(_parse_segments)
    
def get_prompt(segment_list: List[str]) -> str:
    """Build the prompt that asks the model for the ultimate parent of each segment.

    Args:
        segment_list: Segment names. They are joined with ``", "`` and embedded
            between ``<segments>`` and ``</segments>`` at the end of the prompt.

    Returns:
        The complete prompt text.
    """
    segment_list_str = ", ".join(segment_list)
    # The tag announced in the instructions must match the tag that actually
    # wraps the data (<segments>), and every sentence needs its terminator so
    # the implicit string concatenation does not fuse two sentences together.
    return (
        "As an experienced investment analyst, you will be given a list of segments. "
        "The segments will be indicated by <segments> </segments>. "
        "Your task is to get the ultimate parent of each segment, and return "
        "that ultimate parent. "
        "For example, you should reply `automotive` "
        "for both `global automotive` and `china auto`, "
        "as they are both part of automotive. "
        "Only reply with the parent segments, in a list, separated by comma. "
        "For example, `industrial, general consumer, bikes` is a good reply. "
        "Try to retain the specific nuances like `semiconductor equipment`. "
        f"<segments> {segment_list_str}. </segments>"
    )

# Azure OpenAI client for the GPT-4 resource; used by get_parent_segments.
# SECURITY: the API key was previously hard-coded in this cell. Treat that key
# as leaked and rotate it. The key is now read from the
# AZURE_OPENAI_GPT4_API_KEY environment variable; os.environ[...] is used on
# purpose so that a missing variable fails immediately with a KeyError.
client = AzureOpenAI(
    api_key=os.environ["AZURE_OPENAI_GPT4_API_KEY"],
    azure_endpoint="https://impax-gpt-4.openai.azure.com/",
    api_version="2023-07-01-preview",
)

# Second Azure OpenAI client, configured entirely from environment variables
# (AZURE_OPENAI_ENDPOINT, OPENAI_API_VERSION, AZURE_OPENAI_API_KEY).
# get_segment_parent sends its per-segment requests through this client.
client_gpt35 = AzureOpenAI(
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
    api_version=os.environ.get("OPENAI_API_VERSION"),
    api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
)

def get_parent_segments(segment_list: List[str]):
    """Ask the ``ipx-gpt-4`` deployment for the parent segments of ``segment_list``.

    The user message is the text produced by ``get_prompt(segment_list)``.

    Returns:
        The chat-completion response object exactly as returned by the client;
        callers read the reply from ``response.choices[0].message.content``.
    """
    conversation = [
        {"role": "system", "content": "You are a experienced investment analyst, proficient at analyzing companies."},
        {"role": "user", "content": get_prompt(segment_list)},
    ]
    # The deployment name is fixed here rather than read from AZURE_OPENAI_DEPLOYMENT.
    return client.chat.completions.create(model="ipx-gpt-4", messages=conversation)

async def get_segment_parent(segment: str, parent_list: List[str]):
    """Ask the model which of ``parent_list`` is the parent of ``segment``.

    The request goes through ``client_gpt35`` using the deployment named by the
    ``AZURE_OPENAI_DEPLOYMENT`` environment variable.

    Args:
        segment: Segment name to classify.
        parent_list: Candidate parent segment names, joined with ``", "`` in the prompt.

    Returns:
        The content of the first choice of the reply, unprocessed.
    """
    parent_list_str = ", ".join(parent_list)
    prompt = (f"We have the following parent segments: {parent_list_str}. "
    f"Based on these parent, what is the parent segment for: {segment}? "
    "Only return the parent name.")
    messages = [
        {"role": "system", "content": "You are a experienced investment analyst, proficient at analyzing companies."},
        {"role": "user", "content": prompt},
    ]
    # client_gpt35 is a synchronous client: calling it directly inside a
    # coroutine blocks the event loop for the whole HTTP round trip. Run the
    # call in the loop's default executor so that awaiting this coroutine
    # actually yields control while the request is in flight.
    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(
        None,
        lambda: client_gpt35.chat.completions.create(
            model=os.getenv("AZURE_OPENAI_DEPLOYMENT"),
            messages=messages,
        ),
    )
    return response.choices[0].message.content

async def process_segments(segments):
    """Map every segment to a parent segment chosen by the model.

    First asks ``get_parent_segments`` for the list of parent segments covering
    all of ``segments``, then asks ``get_segment_parent`` once per segment
    (sequentially, with a tqdm progress bar) which of those parents it belongs to.

    Args:
        segments: Iterable of segment names.

    Returns:
        dict mapping each segment name to the raw model reply naming its parent.
    """
    parent_res = get_parent_segments(segments)
    reply = parent_res.choices[0].message.content
    # Split on the comma alone and trim each item: splitting on ", " silently
    # produced a single merged "parent" whenever the model used "a,b", a
    # newline after the comma, or extra spaces. Empty items are dropped.
    parents = [name.strip() for name in reply.split(",") if name.strip()]
    segment_parents = {}
    for segment in tqdm.tqdm(segments):
        segment_parents[segment] = await get_segment_parent(segment, parents)
    return segment_parents

def clean_up_response(response: str) -> str:
    """Reduce a free-text model reply to a normalised parent-segment name.

    Steps:
      1. Keep only the text after the last `` is ``, `` would be `` or
         ``would most likely be `` lead-in, if any is present.
      2. Remove double quotes, single quotes and periods; lower-case; trim
         surrounding whitespace so that e.g. ``"automotive\\n"`` and
         ``"automotive"`` do not end up as two different parents.
      3. Collapse anything containing ``fmcg`` to ``"fmcg"`` and anything
         containing ``industrial`` to ``"industrial"`` (checked in that order).

    Args:
        response: Raw reply text.

    Returns:
        The normalised name.
    """
    # Raw strings and no capturing groups: the old pattern relied on an
    # invalid "\." escape and made re.split emit the separators / None as
    # extra items (harmless only because the last item was taken).
    res = re.split(r" is | would be |would most likely be ", response)[-1]
    res = re.sub(r"""["'.]""", "", res).lower().strip()
    if "fmcg" in res:
        return "fmcg"
    if "industrial" in res:
        return "industrial"
    return res

In [3]:
df = pd.read_clipboard(header=[0], index_col=None)
clean_up(df)

segments = set(itertools.chain(*df.KeyDriverSegments.dropna()))
segment_parents = await process_segments(segments)

for k, v in segment_parents.items():
    segment_parents[k] = clean_up_response(v)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [00:48<00:00,  2.10it/s]


In [None]:
import utils

# Create one ParentSegment node per distinct parent name found for the segments.
parent_segments = set(segment_parents.values())
for parent_label in parent_segments:
    utils.Neo4jDriver.create_node(
        database="aeCompanyDrivers", node_type="ParentSegment", parent_name=parent_label
    )

# Create one Segment node for every distinct segment name.
for segment_name in segments:
    utils.Neo4jDriver.create_node(
        database="aeCompanyDrivers", node_type="Segment", name=segment_name
    )

# Link each Segment to its ParentSegment with a "part_of" relationship.
# A segment whose parent has the very same name gets no link.
for child, parent in segment_parents.items():
    if child == parent:
        continue
    utils.Neo4jDriver.create_relationship(
        database="aeCompanyDrivers",
        source_node_type="Segment",
        source_node_properties={"name": child},
        target_node_type="ParentSegment",
        target_node_properties={"parent_name": parent},
        relationship_type="part_of",
        percentage=1.0,
    )

# Write one Company node per row of the company table.
# NOTE(review): the extra keyword arguments are presumably stored as node
# properties by utils.Neo4jDriver.create_node; confirm against that helper.
for ind, row in df.iterrows():
    utils.Neo4jDriver.create_node(
        database="aeCompanyDrivers",
        node_type="Company",
        figi=row.FIGI,
        # Fixed keyword: this was misspelled `tidcker`. Nodes written before
        # the fix may still carry the misspelled key and need a migration.
        ticker=row.Ticker,
        name=row.Name,
        isin=row.ISIN,
    )

# Link every company to its segments with a "has_segment" relationship whose
# percentage is the matching entry of the parsed segment mix.
for ind, row in df.iterrows():
    has_both_lists = isinstance(row.KeyDriverSegments, list) and isinstance(row.SegmentMix, list)
    if not has_both_lists:
        continue
    # dict(zip(...)) pairs names with weights positionally, stops at the
    # shorter list and keeps the last weight for a repeated segment name.
    segment_dict = dict(zip(row.KeyDriverSegments, row.SegmentMix))
    for segment, weight in segment_dict.items():
        utils.Neo4jDriver.create_relationship(
            database="aeCompanyDrivers",
            source_node_type="Company",
            source_node_properties={"figi": row.FIGI},
            target_node_type="Segment",
            target_node_properties={"name": segment},
            relationship_type="has_segment",
            percentage=weight,
        )