# Metadata Entity Extraction w/ Marvin

This notebook walks through using [`Marvin`](https://github.com/PrefectHQ/marvin) to extract entities from text. Marvin uses an LLM to identify and extract the entities, whose fields are described by a Pydantic model.

## Setup

In [None]:
# Uncomment the next line to install Marvin if it is not already available.
# Inside Jupyter, `%pip install marvin` is the variant that targets the
# running kernel's environment.
# !pip install marvin

In [2]:
from llama_index import SimpleDirectoryReader
from llama_index.indices.service_context import ServiceContext
from llama_index.llms import OpenAI
from llama_index.node_parser import SimpleNodeParser
from llama_index.node_parser.extractors import (
    MetadataExtractor,
)
from llama_index.text_splitter import TokenTextSplitter
from llama_index.node_parser.extractors.marvin_entity_extractor import (
    MarvinEntityExtractor,
)

In [3]:
import os
from getpass import getpass

import openai

# Prefer a key that is already exported in the environment; only prompt when
# none is set. This keeps a real key from being overwritten by a placeholder
# and means the secret never has to be typed into the notebook source.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Hand the same key to the `openai` client module.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [9]:
# Read everything under the local "data" directory into document objects.
reader = SimpleDirectoryReader("data")
documents = reader.load_data()

# Truncate the first document to its first 10,000 characters to limit the
# amount of text that gets processed.
first_document = documents[0]
first_document.text = first_document.text[:10000]

In [10]:
import marvin
from marvin import ai_model

try:
    from pydantic.v1 import BaseModel, Field
except ImportError:
    from pydantic import BaseModel, Field

# Give Marvin the same OpenAI key that was stored in the environment earlier.
marvin.settings.openai.api_key = os.environ["OPENAI_API_KEY"]


# Schema of the entity to pull out of each chunk of text: a sports supplement
# with a name, a description, and its pros and cons (all plain strings).
# NOTE(review): the `description=` strings are presumably forwarded to the LLM
# as extraction guidance by `@ai_model` -- confirm against the Marvin docs
# before rewording them. `#` comments are used here rather than a class
# docstring because a docstring may also end up in the model's schema.
@ai_model
class SportsSupplement(BaseModel):
    # Short name of the supplement.
    name: str = Field(..., description="The name of the sports supplement")
    # Free-text description of the supplement.
    description: str = Field(..., description="A description of the sports supplement")
    # Pros and cons, kept together in a single string field.
    pros_cons: str = Field(
        ..., description="The pros and cons of the sports supplement"
    )

In [11]:
# One model identifier shared by the LlamaIndex LLM and the Marvin extractor.
llm_model = "gpt-3.5-turbo"

# LlamaIndex's `OpenAI` takes the model identifier as `model`; the previous
# `model_name=` keyword is not one of its parameters, so the requested model
# was not actually being applied.
llm = OpenAI(temperature=0.1, model=llm_model, max_tokens=512)
service_context = ServiceContext.from_defaults(llm=llm)

# construct text splitter to split texts into chunks for processing
# this takes a while to process; a larger chunk_size produces fewer chunks,
# which should shorten processing time
# file size is a factor too of course
text_splitter = TokenTextSplitter(separator=" ", chunk_size=512, chunk_overlap=128)

# set the global service context object, avoiding passing service_context when building the index
from llama_index import set_global_service_context

set_global_service_context(service_context)

# create metadata extractor
metadata_extractor = MetadataExtractor(
    extractors=[
        MarvinEntityExtractor(
            marvin_model=SportsSupplement, llm_model_string=llm_model
        ),  # let's extract custom entities for each node.
    ],
)

# create node parser to parse nodes from document
node_parser = SimpleNodeParser(
    text_splitter=text_splitter,
    metadata_extractor=metadata_extractor,
)

# use node_parser to get nodes from the documents
nodes = node_parser.get_nodes_from_documents(documents)

In [12]:
from pprint import pprint

# Show the extracted metadata for the first five nodes. Slicing (rather than
# indexing 0..4) keeps the cell from raising IndexError when the documents
# produce fewer than five nodes.
for node in nodes[:5]:
    pprint(node.metadata)

{'marvin_entities': {'description': 'AAKG, also known as L-arginine '
                                    'alpha-ketoglutarate, is a sports '
                                    'supplement that is used to improve peak '
                                    'power output and strength in weight '
                                    'training. It has been found to enhance '
                                    'maximum effort bench press and Wingate '
                                    'peak power performance.',
                     'name': 'AAKG',
                     'pros_cons': 'Pros: Improved maximum effort bench press '
                                  'and Wingate peak power performance. Cons: '
                                  'None mentioned.'}}
{'marvin_entities': {'description': 'Gulping down baking soda (sodium '
                                    'bicarbonate) makes the blood more '
                                    'alkaline, improving performance in '
                 