In [6]:
# resource https://learn.deeplearning.ai/langchain/lesson/2/models,-prompts-and-parsers

In [1]:
# !pip install --upgrade pip
# !pip install openai
# !pip install --upgrade langchain

In [2]:
from langchain.chat_models import ChatOpenAI

In [5]:
import os
openai_api_key = os.getenv('OPENAI_API_KEY')

In [112]:
# To control the randomness and creativity of the generated
# text by an LLM, use temperature = 0.0
chat = ChatOpenAI(
    temperature=0.0,
    model_name='gpt-3.5-turbo'
    # model_name='GPT-4'
    )
# IMPORTANT: delete output if you preview it, as it contains the api key

### Prompt template

In [262]:
from langchain.prompts import ChatPromptTemplate

schema = """ {'DIM_ADDRESS': [{'column_name': 'ADDRESS_CITY_NAME', 'data_type': 'TEXT'}, {'column_name': 'ADDRESS_COUNTRY_NAME', 'data_type': 'TEXT'}, {'column_name': 'ADDRESS_AGE', 'data_type': 'NUMBER'}, {'column_name': 'ADDRESS_PK', 'data_type': 'TEXT'}, {'column_name': 'ADDRESS_STATE_NAME', 'data_type': 'TEXT'}], 'DIM_CREDIT_CARD': [{'column_name': 'CREDITCARD_ID', 'data_type': 'NUMBER'}, {'column_name': 'CREDITCARD_PK', 'data_type': 'TEXT'}, {'column_name': 'CREDITCARD_TYPE', 'data_type': 'TEXT'}], 'DIM_CUSTOMER': [{'column_name': 'BUSINESS_ENTITY_ID', 'data_type': 'NUMBER'}, {'column_name': 'CUSTOMER_FULL_NAME', 'data_type': 'TEXT'}, {'column_name': 'CUSTOMER_ID', 'data_type': 'NUMBER'}, {'column_name': 'CUSTOMER_PK', 'data_type': 'TEXT'}, {'column_name': 'CUSTOMER_STORE_BUSINESS_ENTITY_ID', 'data_type': 'NUMBER'}, {'column_name': 'CUSTOMER_STORE_NAME', 'data_type': 'TEXT'}], 'DIM_DATE': [{'column_name': 'DATE_DAY', 'data_type': 'DATE'}, {'column_name': 'DATE_PK', 'data_type': 'TEXT'}, {'column_name': 'DAY_OF_MONTH', 'data_type': 'NUMBER'}, {'column_name': 'DAY_OF_WEEK', 'data_type': 'NUMBER'}, {'column_name': 'DAY_OF_WEEK_NAME', 'data_type': 'TEXT'}, {'column_name': 'DAY_OF_YEAR', 'data_type': 'NUMBER'}, {'column_name': 'NEXT_DATE_DAY', 'data_type': 'DATE'}, {'column_name': 'PRIOR_DATE_DAY', 'data_type': 'DATE'}, {'column_name': 'PRIOR_YEAR_DATE_DAY', 'data_type': 'DATE'}, {'column_name': 'PRIOR_YEAR_OVER_YEAR_DATE_DAY', 'data_type': 'DATE'}], 'DIM_ORDER_STATUS': [{'column_name': 'ORDER_STATUS', 'data_type': 'NUMBER'}, {'column_name': 'ORDER_STATUS_NAME', 'data_type': 'TEXT'}, {'column_name': 'ORDER_STATUS_PK', 'data_type': 'TEXT'}], 'DIM_PRODUCT': [{'column_name': 'PRODUCT_CATEGORY_NAME', 'data_type': 'TEXT'}, {'column_name': 'PRODUCT_CLASS', 'data_type': 'TEXT'}, {'column_name': 'PRODUCT_COLOR', 'data_type': 'TEXT'}, {'column_name': 'PRODUCT_ID', 'data_type': 'NUMBER'}, {'column_name': 'PRODUCT_NAME', 'data_type': 'TEXT'}, {'column_name': 'PRODUCT_NUMBER', 'data_type': 'TEXT'}, {'column_name': 'PRODUCT_PK', 'data_type': 'TEXT'}, {'column_name': 'PRODUCT_SUBCATEGORY_NAME', 'data_type': 'TEXT'}], 'FCT_SALES': [{'column_name': 'CREDITCARD_FK', 'data_type': 'TEXT'}, {'column_name': 'CUSTOMER_FK', 'data_type': 'TEXT'}, {'column_name': 'ORDER_DATE_FK', 'data_type': 'TEXT'}, {'column_name': 'ORDER_STATUS_FK', 'data_type': 'TEXT'}, {'column_name': 'PRODUCT_FK', 'data_type': 'TEXT'}, {'column_name': 'SALE_ORDER_DETAIL_ID', 'data_type': 'NUMBER'}, {'column_name': 'SALE_ORDER_ID', 'data_type': 'NUMBER'}, {'column_name': 'SALE_ORDER_QUANTITY', 'data_type': 'NUMBER'}, {'column_name': 'SALE_PK', 'data_type': 'TEXT'}, {'column_name': 'SALE_REVENUE', 'data_type': 'NUMBER'}, {'column_name': 'SALE_UNIT_PRICE', 'data_type': 'NUMBER'}, {'column_name': 'SHIP_ADDRESS_FK', 'data_type': 'TEXT'}]} """

table_name = "DIM_ADDRESS"
# column_name = "ADDRESS_CITY_NAME"
column_name = "ADDRESS_ID"
# column_name = "ADDRESS_AGE"

# table_name = "FCT_SALES"
# column_name = "SALE_ORDER_QUANTITY"
# column_name = "CREDITCARD_FK"

# table_name = "DIM_CUSTOMER"
# column_name = "CUSTOMER_PK"

format_instructions = """\

Format the output as JSON with the following keys:
* is_key_column: is the column likely to be a primary, foreign, natural, or surrogate key column? \
    Value should be "true" or "false" (enclosed in double quotes). \
    Look for substrings like 'id', 'pk', 'fk', 'key', 'primary', 'foreign', etc. \
* dimension_type: the type of the dimension. \
    The dimension type should be determined by considering: 
        1. the data_type of the column.
        2. the column_name.
* measure_1_type: the type of the first measure. \
    This should be created if end users (analysts) might create useful \
        analyses through aggregating the values in this column by a certain \
        aggregation (measure type). \
    If a measure made from this column is not appropriate then this field \
        should be "null" (enclosed in double quotes).
* measure_2_type: the type of the second measure. \
    Use the same logic as for measure_1_type, but, only if an additional \
        measure would be useful. \
    If a second measure made from this column is not appropriate then this \
        field should be "null" (enclosed in double quotes). \

"""

cube_objects_from_schema_template = """\
The dictionary below details a snowflake schema for a list of tables and their columns. \
Act as a developer creating a semantic layer in cube.js on top of a data warehouse. \
I want you to, for the colum_name '{column_name}' of table '{table_name}' (the key of \
the dictionary) decide:
    2. which dimension type this column should create
    3. whether it would be useful to create up to two measures from this column.

Follow these rules:
    1. you have choose a dimension type for every column
    2. you can choose between 0 and 2 measure types for every column
    3. the dimension type should be one of the following:
        * string
        * number
        * time
        * boolean
    4. the measure type should be one of the following (listed in order of preference):
        * sum
        * avg
        * count_distinct
        * max
        * min
    5. you should aim to make measures whenever appropriate and preferrably 2.

schema: {schema}

{format_instructions}
```
"""

In [263]:
prompt = ChatPromptTemplate.from_template(template=cube_objects_from_schema_template)
prompt

ChatPromptTemplate(input_variables=['column_name', 'table_name', 'format_instructions', 'schema'], output_parser=None, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['column_name', 'format_instructions', 'schema', 'table_name'], output_parser=None, partial_variables={}, template="The dictionary below details a snowflake schema for a list of tables and their columns. Act as a developer creating a semantic layer in cube.js on top of a data warehouse. I want you to, for the colum_name '{column_name}' of table '{table_name}' (the key of the dictionary) decide:\n    2. which dimension type this column should create\n    3. whether it would be useful to create up to two measures from this column.\n\nFollow these rules:\n    1. you have choose a dimension type for every column\n    2. you can choose between 0 and 2 measure types for every column\n    3. the dimension type should be one of the following:\n        * string\n        * number\n  

In [264]:
messages = prompt.format_messages(
    table_name=table_name,
    column_name=column_name,
    schema=schema, 
    format_instructions=format_instructions,
)
print(messages[0].content)

The dictionary below details a snowflake schema for a list of tables and their columns. Act as a developer creating a semantic layer in cube.js on top of a data warehouse. I want you to, for the colum_name 'ADDRESS_ID' of table 'DIM_ADDRESS' (the key of the dictionary) decide:
    2. which dimension type this column should create
    3. whether it would be useful to create up to two measures from this column.

Follow these rules:
    1. you have choose a dimension type for every column
    2. you can choose between 0 and 2 measure types for every column
    3. the dimension type should be one of the following:
        * string
        * number
        * time
        * boolean
    4. the measure type should be one of the following (listed in order of preference):
        * sum
        * avg
        * count_distinct
        * max
        * min
    5. you should aim to make measures whenever appropriate and preferrably 2.

schema:  {'DIM_ADDRESS': [{'column_name': 'ADDRESS_CITY_NAME', 'da

In [265]:
response = chat(messages)

In [266]:
print(response.content)

{
  "is_key_column": "true",
  "dimension_type": "number",
  "measure_1_type": "count_distinct",
  "measure_2_type": "null"
}


# todos:

fix
* figure out how to avoid exposing api keys when committing to github
* provide list of measure and dimension types as input
* specify how to name dimensions and measures

change
* enable the use of `output_parser`
    * make a separate call for each column, so the schema is known?
    * how to deal with multiple measures? Limit to 3?

add 
 * foreign key: whether the column is a foreign key of the table

In [12]:
output_dict = output_parser.parse(response.content)

NameError: name 'output_parser' is not defined

In [None]:
output_dict

In [None]:
type(output_dict)