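# Evaporate Demo

This notebook walks through the experimental `EvaporateExtractor` in LlamaIndex: identifying fields in Wikipedia city articles, generating a Python extraction function for each field, and running those functions over the text.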

In [1]:
from llama_index import (
    SimpleDirectoryReader,
    ServiceContext,
    LLMPredictor
)
from llama_index.experimental.evaporate import EvaporateExtractor
from langchain.llms.openai import OpenAIChat, OpenAI
import requests

  from .autonotebook import tqdm as notebook_tqdm


### Load data

In [2]:
# Wikipedia article titles to download. Each title is also used as the local
# file name (data/<title>.txt) and as the key of the per-city dictionaries.
wiki_titles = ["Toronto", "Seattle", "Chicago", "Boston", "Houston"]

In [3]:
from pathlib import Path

import requests

# Create the output directory once, up front. exist_ok makes the cell safe to
# re-run and removes the need for a separate existence check per article.
data_path = Path('data')
data_path.mkdir(exist_ok=True)

for title in wiki_titles:
    # Ask the MediaWiki API for the plain-text extract of the article.
    http_response = requests.get(
        'https://en.wikipedia.org/w/api.php',
        params={
            'action': 'query',
            'format': 'json',
            'titles': title,
            'prop': 'extracts',
            # 'exintro': True,  # uncomment to return only content before the first section
            'explaintext': True,
        },
        # Without a timeout a stalled connection would hang the notebook forever.
        timeout=30,
    )
    # Fail loudly on an HTTP error instead of hitting a confusing KeyError below.
    http_response.raise_for_status()
    response = http_response.json()

    # The 'pages' mapping is keyed by page id; a single title was requested,
    # so the first (only) entry is taken.
    page = next(iter(response['query']['pages'].values()))
    wiki_text = page['extract']

    # Article text contains non-ASCII characters; write UTF-8 explicitly so the
    # result does not depend on the platform's default encoding.
    with open(data_path / f"{title}.txt", 'w', encoding='utf-8') as fp:
        fp.write(wiki_text)


In [4]:
# Read each downloaded article back in, keyed by its wiki title.
city_docs = {
    wiki_title: SimpleDirectoryReader(
        input_files=[f"data/{wiki_title}.txt"]
    ).load_data()
    for wiki_title in wiki_titles
}

In [5]:
# LLM used by the extractor. To try GPT-4 instead, change model_name to "gpt-4".
llm_predictor = LLMPredictor(
    llm=OpenAIChat(model_name="gpt-3.5-turbo", temperature=0)
)
# The service context bundles the predictor with the chunk size used when
# documents are split into nodes.
service_context = ServiceContext.from_defaults(
    chunk_size=512,
    llm_predictor=llm_predictor,
)



In [6]:
# Split every city's documents into nodes with the service context's node parser.
city_nodes = {
    wiki_title: service_context.node_parser.get_nodes_from_documents(
        city_docs[wiki_title]
    )
    for wiki_title in wiki_titles
}

In [7]:
# The first node of the Toronto and Seattle articles (expected to be each
# article's intro paragraph); used below to look for a field the two share.
city_pop_nodes = [city_nodes["Toronto"][0], city_nodes["Seattle"][0]]

### Evaporate Extractor Demo

Here we demonstrate each function of the `EvaporateExtractor`.

In [8]:
# Single extractor instance for the whole notebook; the helper functions
# get_fn_str_dict and get_result_dict read it as a global.
extractor = EvaporateExtractor(service_context)

#### Extract Fields

We demonstrate how to identify common fields across different chunks.

In [9]:
# Try with just Boston: identify candidate fields from the first Boston node only
boston_fields = extractor.identify_fields(city_nodes["Boston"][:1], topic="Boston")

In [10]:
boston_fields

['colleges and universities', 'area', 'population', 'firsts', 'state']

In [11]:
# Try with the Toronto and Seattle nodes together (should extract "population").
# NOTE(review): fields_top_k=1 presumably limits the result to a single field — confirm.
existing_fields = extractor.identify_fields(city_pop_nodes, topic="city", fields_top_k=1)

In [12]:
existing_fields

['population']

#### Extract Functions from Fields

In [11]:
def get_fn_str_dict(nodes: list, existing_fields: set) -> dict:
    """Generate an extraction function for every field.

    Uses the notebook-level ``extractor`` and echoes each field name together
    with what was generated for it, so progress shows up in the cell output.

    Args:
        nodes: Nodes passed to ``extractor.extract_fn_from_nodes`` for each field.
        existing_fields: Field names to process. The argument is only iterated
            over, so the lists passed elsewhere in this notebook work as well.

    Returns:
        Mapping of field name to the value ``extract_fn_from_nodes`` returned
        for that field.
    """

    def _generate(field_name):
        # One extractor call per field, echoed immediately after it returns.
        generated = extractor.extract_fn_from_nodes(nodes, field_name)
        print(f"Field: {field_name}")
        print(generated)
        return generated

    return {field_name: _generate(field_name) for field_name in existing_fields}

In [15]:
# Generate an extraction function for each Boston field, from the first Boston node.
boston_fn_str_dict = get_fn_str_dict(city_nodes["Boston"][:1], boston_fields)

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 814 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens


Field: colleges and universities
def get_colleges_and_universities_field(text: str):
    """
    Function to extract colleges and universities. 
    """
    
    # Use regex to find the colleges and universities field
    pattern = r"colleges and universities, notably (.*?),"
    colleges_and_universities_field = re.findall(pattern, text)
    
    # Return the result as a list
    return colleges_and_universities_field[0].split(" and ")


INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 591 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens


Field: area
def get_area_field(text: str):
    """
    Function to extract area. 
    """
    
    # Use regex to find the area field
    area_field = re.findall(r'area of about (.*?) sq mi', text)
    
    # Return the result as a list
    return area_field


INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 589 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens


Field: population
def get_population_field(text: str):
    """
    Function to extract population. 
    """
    
    # Use regex to find the population field
    population_field = re.findall(r'population of (.*?) as', text)
    
    # Return the result as a list
    return population_field


INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 684 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens


Field: firsts
def get_firsts_field(text: str):
    """
    Function to extract firsts. 
    """
    
    # Use regex to find the "firsts" field
    pattern = r"firsts\s*include\s*(.*?)\."
    firsts_field = re.findall(pattern, text, re.DOTALL)
    
    # Split the field into a list
    firsts_list = firsts_field[0].split(',')
    
    # Strip whitespace from each item in the list
    firsts_list = [item.strip() for item in firsts_list]
    
    return firsts_list


INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 665 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens


Field: state
def get_state_field(text: str):
    """
    Function to extract state. 
    """
    
    # Use regex to find the state field
    pattern = r"\b(US:)\b"
    matches = re.findall(pattern, text)
    
    # Return the result as a list
    return list(matches)


#### Run Function for each Field

In [27]:
def get_result_dict(nodes: list, fn_str_dict: dict) -> dict:
    """Run each field's generated function over the given nodes.

    Uses the notebook-level ``extractor``.

    Args:
        nodes: Nodes passed to ``extractor.run_fn_on_nodes``.
        fn_str_dict: Mapping of field name to the generated function for that
            field, as produced by ``get_fn_str_dict``.

    Returns:
        Mapping of field name to the value ``run_fn_on_nodes`` returned for it.
    """
    return {
        field_name: extractor.run_fn_on_nodes(nodes, fn_source, field_name)
        for field_name, fn_source in fn_str_dict.items()
    }

In [28]:
# Run every generated function against the first Boston node.
boston_result_dict = get_result_dict(city_nodes["Boston"][:1], boston_fn_str_dict)

In [29]:
boston_result_dict

{'colleges and universities': [['Harvard', 'MIT']],
 'area': [['48.4']],
 'population': [['675,647']],
 'firsts': [["the United States' first public park (Boston Common",
   '1634)',
   'first public or state school (Boston Latin School',
   '1635) first subway system (Tremont Street subway',
   '1897)',
   'and first large public library (Boston Public Library',
   '1848)']],
 'state': [[]]}

### Try Running E2E

In [9]:
# Single call on the first Boston node. Presumably this chains the steps shown
# one by one earlier (field identification, function generation, execution) — confirm.
extractor.extract_datapoints_with_fn(city_nodes["Boston"][:1], topic="Boston")

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 631 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 597 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 t

[{'colleges and universities': ['Harvard and MIT'],
  'state': [],
  'key events': ['the Boston Massacre',
   'the Boston Tea Party',
   'the Battle of Bunker Hill',
   'and the siege of Boston.'],
  'area': ['48.4'],
  'firsts': ["the United States' first public park (Boston Common",
   ' 1634)',
   ' first public or state school (Boston Latin School',
   ' 1635) first subway system (Tremont Street subway',
   ' 1897)',
   ' and first large public library (Boston Public Library',
   ' 1848)']}]