In [1]:
import pandas as pd
from langchain.schema import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
import json
import os
from langchain.chains.summarize import load_summarize_chain
from tqdm import tqdm
from langchain import PromptTemplate


# Register tqdm with pandas so that `progress_apply` (used further down for the
# row-wise passes) is available on DataFrames.
tqdm.pandas()

In [2]:
# Load every file in ./beer-database and pool the records stored under each
# file's top-level "data" key into one flat list.
beer_jsons = []
# sorted(): os.listdir returns entries in arbitrary order, which would make the
# record order (and so the DataFrame index built from it) vary between runs.
for f in sorted(os.listdir("./beer-database")):
    # Explicit UTF-8: the beer descriptions contain non-ASCII characters, and
    # the platform default encoding is not UTF-8 on every system.
    with open(os.path.join("./beer-database", f), "r", encoding="utf-8") as fp:
        beer_jsons.extend(json.load(fp)["data"])

In [3]:
# Flatten the nested beer records into one wide table and preview it.
beer_df = pd.json_normalize(
    data=beer_jsons,
    max_level=None,
    record_prefix=True,
)
beer_df.head()

Unnamed: 0,id,name,nameDisplay,description,abv,availableId,styleId,isOrganic,isRetired,status,...,beerVariation.style.srmMin,beerVariation.style.srmMax,beerVariation.style.ogMin,beerVariation.style.fgMin,beerVariation.style.fgMax,beerVariation.style.createDate,beerVariation.style.updateDate,beerVariation.ibu,beerVariation.year,beerVariation.style.ogMax
0,nKMJPZ,Nova Vert,Nova Vert,We can build it. We have the technology. Make ...,6.8,2.0,25,N,N,verified,...,,,,,,,,,,
1,W7NmQw,November Gale Pale Ale,November Gale Pale Ale,"A golden pale ale, this beer has medium malt c...",5.0,,25,N,N,verified,...,,,,,,,,,,
2,sFzINW,"November IPA of the Month - ""NovAmber""","November IPA of the Month - ""NovAmber"" (2014)",In the blustery month that brings us Thanksgiv...,6.9,3.0,30,N,Y,verified,...,,,,,,,,,,
3,dB3Jip,Now & Then,Now & Then,"A beer brewed to commemorate a fully evolved, ...",7.0,,30,N,N,verified,...,,,,,,,,,,
4,703UnG,Nox Atra Imperial Stout,Nox Atra Imperial Stout,In Latin ‘Nox Atra’ means ‘Dark as Night’ – an...,8.5,,43,N,N,verified,...,,,,,,,,,,


In [4]:
# Missing-value count per column; display only the columns that have gaps.
beer_na = beer_df.isna().sum()
beer_na.loc[beer_na.gt(0)]

description                        7626
abv                                2604
availableId                       16057
labels.contentAwareIcon             458
labels.contentAwareMedium           458
                                  ...  
beerVariation.style.createDate    30000
beerVariation.style.updateDate    30000
beerVariation.ibu                 30061
beerVariation.year                30134
beerVariation.style.ogMax         30139
Length: 70, dtype: int64

In [5]:
# Keep only beers that have a description, an availability id and an ABV, and
# whose availability is "Year Round".
#
# All four conditions are evaluated on `beer_df` and combined into ONE mask that
# is applied to `beer_df` itself, rather than applying masks built from the
# full frame to an already-filtered frame (which depends on implicit index
# alignment inside `.loc`).
keep_mask = (
    beer_df["description"].notna()
    & beer_df["availableId"].notna()
    & beer_df["abv"].notna()
    & (beer_df["available.name"] == "Year Round")
)
beer_df_filter = beer_df.loc[keep_mask]

# Per-column missing-value counts of the filtered frame (rebinds `beer_na`).
beer_na = beer_df_filter.isna().sum(axis=0)
beer_df_filter.head()

Unnamed: 0,id,name,nameDisplay,description,abv,availableId,styleId,isOrganic,isRetired,status,...,beerVariation.style.srmMin,beerVariation.style.srmMax,beerVariation.style.ogMin,beerVariation.style.fgMin,beerVariation.style.fgMax,beerVariation.style.createDate,beerVariation.style.updateDate,beerVariation.ibu,beerVariation.year,beerVariation.style.ogMax
16,9gPqS1,Numazu Lager,Numazu Lager,"Clean, soft Numazu water combines with German ...",5.5,1.0,94,N,N,verified,...,,,,,,,,,,
32,HCmqoo,Nut Brown,Nut Brown,AleSmith Nut Brown is a tribute to one of our ...,5.0,1.0,12,N,N,verified,...,,,,,,,,,,
33,UsMdAB,Nut Brown Ale,Nut Brown Ale,Don't be intimidated by the rich walnut color....,4.2,1.0,10,N,N,verified,...,,,,,,,,,,
35,H8FoBz,Nut Brown Ale,Nut Brown Ale,BBC Nut Brown Ale has a diverse grain bill whi...,4.9,1.0,10,N,N,verified,...,,,,,,,,,,
36,blq7Dn,Nut Brown Ale,Nut Brown Ale,"Like the rich, dark brews of southern England ...",5.0,1.0,12,N,N,verified,...,,,,,,,,,,


In [6]:
# Number of rows in the filtered frame.
beer_df_filter.shape[0]

5531

In [7]:
# Columns carried forward into the summarisation / retrieval steps.
fields_of_interests = [
    "name", "description", "abv", "ibu", "srm.name",
    "style.category.name", "style.name", "style.description",
]

beer_select_df = beer_df_filter[fields_of_interests].copy()

# ABV and IBU: mark missing values with the sentinel -1, then coerce to numbers.
for numeric_col in ("abv", "ibu"):
    is_missing = beer_select_df[numeric_col].isna()
    beer_select_df.loc[is_missing, numeric_col] = -1
    beer_select_df[numeric_col] = pd.to_numeric(beer_select_df[numeric_col])

# SRM: the label "Over 40" is mapped to "41", missing values to "-1", and the
# column is then parsed as numbers.
is_over_40 = beer_select_df["srm.name"] == "Over 40"
beer_select_df.loc[is_over_40, "srm.name"] = "41"
srm_filled = beer_select_df["srm.name"].fillna("-1")
beer_select_df["srm.name"] = pd.to_numeric(srm_filled)

# Text columns: missing values become the literal "Unknown".
for text_col in ("name", "style.category.name", "style.name", "style.description"):
    is_missing = beer_select_df[text_col].isna()
    beer_select_df.loc[is_missing, text_col] = "Unknown"
beer_select_df.head()

Unnamed: 0,name,description,abv,ibu,srm.name,style.category.name,style.name,style.description
16,Numazu Lager,"Clean, soft Numazu water combines with German ...",5.5,-1.0,-1,North American Lager,American-Style Light (Low Calorie) Lager,"These beers are extremely light colored, light..."
32,Nut Brown,AleSmith Nut Brown is a tribute to one of our ...,5.0,17.0,25,British Origin Ales,English-Style Brown Ale,English brown ales range from copper to brown ...
33,Nut Brown Ale,Don't be intimidated by the rich walnut color....,4.2,-1.0,-1,British Origin Ales,English-Style Pale Mild Ale,English pale mild ales range from golden to am...
35,Nut Brown Ale,BBC Nut Brown Ale has a diverse grain bill whi...,4.9,29.0,-1,British Origin Ales,English-Style Pale Mild Ale,English pale mild ales range from golden to am...
36,Nut Brown Ale,"Like the rich, dark brews of southern England ...",5.0,23.0,-1,British Origin Ales,English-Style Brown Ale,English brown ales range from copper to brown ...


In [8]:
# ABV summary statistics, leaving out the -1 "not available" sentinel.
beer_select_df["abv"].loc[lambda col: col >= 0].describe()

count    5531.000000
mean        6.002440
std         1.533078
min         0.000000
25%         5.000000
50%         5.600000
75%         6.800000
max        22.000000
Name: abv, dtype: float64

In [9]:
# IBU summary statistics, leaving out the -1 "not available" sentinel.
beer_select_df["ibu"].loc[lambda col: col >= 0].describe()

count    3642.000000
mean       39.686826
std        24.142273
min         0.000000
25%        21.000000
50%        33.000000
75%        55.000000
max       200.000000
Name: ibu, dtype: float64

In [10]:
# SRM summary statistics, leaving out the -1 "not available" sentinel.
beer_select_df["srm.name"].loc[lambda col: col >= 0].describe()

count    1500.000000
mean       14.444667
std        12.395754
min         1.000000
25%         5.000000
50%         9.000000
75%        20.000000
max        41.000000
Name: srm.name, dtype: float64

In [11]:
# NOTE: despite its name, API_KEY holds the PATH of the file containing the key;
# the key text itself ends up in `key_content`, stripped of surrounding
# whitespace.
API_KEY = "./api_key"
with open(API_KEY, mode="r") as key_file:
    key_content = key_file.read().strip()

In [12]:
# %-style template that lays out one beer's fields as the text to summarise.
# Placeholders, in order: name, category, style, style description, description.
DESC_TEMPLATE = """Name of beer: %s


Category of Beer: %s


Style of Beer: %s


Description of Style: %s


Description of Beer: %s
"""

# Prompt for the summarisation chain; `{text}` is the single input variable.
summary_prompt_template = (
    "Write a concise summary of the following beer. "
    "The summary should contain sufficient information to let a consumer decide if he likes the beer."
    "\n\n{text}\n\nSUMMARY:"
)
PROMPT = PromptTemplate(input_variables=["text"], template=summary_prompt_template)


# LLM client (key read from the key file, temperature fixed at 0) and the
# "stuff"-type summarisation chain that uses the custom prompt.
llm = OpenAI(temperature=0, openai_api_key=key_content)
summary_chain = load_summarize_chain(llm, chain_type="stuff", prompt=PROMPT)


def generate_beer_summary(row):
    """Summarise one beer with the summarisation chain.

    The row's name, style category, style name, style description and
    description are rendered through ``DESC_TEMPLATE``, wrapped in a single
    ``Document`` and passed to ``summary_chain``.

    :param row: mapping-like record (e.g. a DataFrame row) providing the keys
        "name", "style.category.name", "style.name", "style.description"
        and "description".
    :return: the chain's "output_text" with surrounding whitespace stripped.
    """
    template_values = (
        row["name"],
        row["style.category.name"],
        row["style.name"],
        row["style.description"],
        row["description"],
    )
    beer_document = Document(page_content=DESC_TEMPLATE % template_values)
    chain_output = summary_chain(
        {"input_documents": [beer_document]},
        return_only_outputs=True,
    )
    return chain_output["output_text"].strip()


# The row-wise LLM summarisation pass is left disabled; the frame is loaded from
# ./beer.json instead, which rebinds `beer_select_df`.
#beer_select_df["summary"] = beer_select_df.progress_apply(generate_beer_summary, axis=1)
beer_select_df = pd.read_json(path_or_buf="./beer.json")

In [13]:
beer_select_df.head()

Unnamed: 0,name,description,abv,ibu,srm.name,style.category.name,style.name,style.description,summary
16,Numazu Lager,"Clean, soft Numazu water combines with German ...",5.5,-1.0,-1,North American Lager,American-Style Light (Low Calorie) Lager,"These beers are extremely light colored, light...",Numazu Lager is an American-Style Light (Low C...
32,Nut Brown,AleSmith Nut Brown is a tribute to one of our ...,5.0,17.0,25,British Origin Ales,English-Style Brown Ale,English brown ales range from copper to brown ...,AleSmith Nut Brown is a British-style brown al...
33,Nut Brown Ale,Don't be intimidated by the rich walnut color....,4.2,-1.0,-1,British Origin Ales,English-Style Pale Mild Ale,English pale mild ales range from golden to am...,Nut Brown Ale is a British Origin Ale with an ...
35,Nut Brown Ale,BBC Nut Brown Ale has a diverse grain bill whi...,4.9,29.0,-1,British Origin Ales,English-Style Pale Mild Ale,English pale mild ales range from golden to am...,BBC Nut Brown Ale is a traditional English-sty...
36,Nut Brown Ale,"Like the rich, dark brews of southern England ...",5.0,23.0,-1,British Origin Ales,English-Style Brown Ale,English brown ales range from copper to brown ...,"Nut Brown Ale is a British Origin Ale, specifi..."


In [14]:
def _describe_measure(value):
    """Return ``value`` unchanged, or "unknown" when it is missing or negative."""
    # Missing values are checked first: `None < 0` raises TypeError and
    # `NaN < 0` is False, so neither is caught by the sentinel comparison.
    if pd.isna(value) or value < 0:
        return "unknown"
    return value


def append_beer_info(row):
    """Append an ABV / IBU / SRM sentence to a beer's summary.

    Negative values (the -1 "not available" sentinel) as well as ``None`` and
    NaN are rendered as the word "unknown".

    :param row: mapping-like record (e.g. a DataFrame row) providing the keys
        "summary", "name", "abv", "ibu" and "srm.name".
    :return: ``row["summary"]`` followed by the measurement sentence.
    """
    info_desc = " The ABV of %s is %s %%. The IBU is %s. The SRM is %s."
    abv = _describe_measure(row["abv"])
    ibu = _describe_measure(row["ibu"])
    srm = _describe_measure(row["srm.name"])
    return row["summary"] + info_desc % (row["name"], abv, ibu, srm)

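# Left disabled: the summaries loaded from ./beer.json appear to already end
# with this ABV/IBU/SRM sentence (see the retrieved documents further down), so
# re-running would append it a second time. TODO confirm before re-enabling.
#beer_select_df["summary"] = beer_select_df.progress_apply(append_beer_info, axis=1)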

100%|██████████| 5531/5531 [00:00<00:00, 74439.73it/s]


In [15]:
# Metadata schema handed to the self-query retriever. These descriptions are
# the text given for each filterable attribute, so every entry states the
# attribute's distribution and its own -1 "not available" sentinel.
# The statistics are copied from the describe() outputs of the cleaned frame.
metadata_field_info = [
    AttributeInfo(
        name="name",
        description="The name of the beer",
        type="string",
    ),
    AttributeInfo(
        name="abv",
        description="The alcohol content of the beer in ABV percent. The mean of ABV is 6, and the standard deviation of ABV is 1.53. "
                    "The min, 25 percentile, 50 percentile, 75 percentile, and max of ABV are 0, 5, 5.6, 6.8, and 22. "
                    "The value is set to -1 if the ABV is not available.",
        type='float',
    ),
    AttributeInfo(
        name="ibu",
        description="The bitterness of the beer measured in IBU (International Bittering Unit). The mean of IBU is 39.69, "
                    "and the standard deviation of IBU is 24.14. The min, 25 percentile, 50 percentile, 75 percentile, and max of IBU are "
                    "0, 21, 33, 55, and 200. "
                    # Sentinel sentence names IBU (it previously said ABV).
                    "The value is set to -1 if the IBU is not available.",
        type='float',
    ),
    AttributeInfo(
        name="srm", description="The Standard Reference Method (SRM) value of the beer. The mean of SRM is 14.44, "
                                "and the standard deviation is 12.4. "
                                "The min, 25 percentile, 50 percentile, 75 percentile, and max of SRM are "
                                # Median is 9 per the describe() output of the SRM column.
                                "1, 5, 9, 20, and 41. "
                                "The value is set to -1 if the SRM is not available.",
        type='integer'
    )
]

# One Document per beer: the summary is the page content, while name / abv /
# ibu / srm are attached as metadata.
docs = [
    Document(
        page_content=r["summary"],
        metadata={
            "name": r["name"],
            "abv": r["abv"],
            "ibu": r["ibu"],
            "srm": r["srm.name"],
        },
    )
    for _, r in beer_select_df.iterrows()
]

document_content_description = "Description of a beer"
len(docs)

5531

In [16]:
embeddings = OpenAIEmbeddings(openai_api_key=key_content)

# Reuse an existing on-disk collection instead of calling from_documents again:
# from_documents ADDS the documents to whatever the persist directory already
# holds, so every re-run of this cell would embed and store each beer once more
# and the retriever would start returning duplicate hits.
# To rebuild the index after `docs` has changed, delete ./beer_vectors first.
persist_dir = "./beer_vectors"
if os.path.isdir(persist_dir) and os.listdir(persist_dir):
    vectorstore = Chroma(persist_directory=persist_dir, embedding_function=embeddings)
else:
    vectorstore = Chroma.from_documents(docs, embeddings, persist_directory=persist_dir)
retriever = SelfQueryRetriever.from_llm(
    llm, vectorstore, document_content_description, metadata_field_info, verbose=True
)

In [17]:
# Example natural-language question; with verbose=True the structured query the
# retriever derives from it is printed ahead of the returned documents.
retriever.get_relevant_documents(query='Which light beers have high alcohol contents?')

query='light beer' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.GT: 'gt'>, attribute='abv', value=6.8), Comparison(comparator=<Comparator.LT: 'lt'>, attribute='srm', value=8)]) limit=None


[Document(page_content='Beer Camp Hoppy Lager is a North American Lager with a light body and straw color. It is clean and crisp with subtle and complex flavors, light malt sweetness, and very light hop bitterness, flavor, and aroma. It is a collaboration between Beer Camp and Ballast Point, and is perfect for hop lovers. The ABV of Beer Camp Hoppy Lager is 7.0 %. The IBU is 55.0. The SRM is -1.', metadata={'name': 'Beer Camp Hoppy Lager', 'abv': 7.0, 'ibu': 55.0, 'srm': -1}),
 Document(page_content='Beer Camp Hoppy Lager is a North American Lager with a light body and straw color. It is clean and crisp with subtle and complex flavors, light malt sweetness, and very light hop bitterness, flavor, and aroma. It is a collaboration between Beer Camp and Ballast Point, and is perfect for hop lovers. The ABV of Beer Camp Hoppy Lager is 7.0 %. The IBU is 55.0. The SRM is unknown.', metadata={'name': 'Beer Camp Hoppy Lager', 'abv': 7.0, 'ibu': 55.0, 'srm': -1}),
 Document(page_content='TESTBEE

In [18]:
#beer_select_df.to_json("beer.json")

In [19]:
#vectorstore.persist()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))