# Building structured data extraction with Tecton

## Objective

**We want to extract structured auto-claim information from the transcript of a call between a customer and Progressive Insurance.**

## Assumptions

1. Customer conversations with Progressive Insurance are sometimes recorded
2. These conversations are transcribed and saved



In [None]:
import dotenv
from tecton_gen_ai.testing import set_dev_mode

# Load environment variables from a local .env file before anything else runs.
# NOTE(review): presumably this supplies the API credentials for the OpenAI
# models used further down — confirm which variables are required.
dotenv.load_dotenv()
# Switch tecton_gen_ai into its dev/testing mode.
# NOTE(review): presumably this lets the notebook run locally without a
# deployed Tecton workspace — confirm against the tecton_gen_ai docs.
set_dev_mode()

## Transcripts

In [None]:
from tecton_gen_ai.testing import make_local_source
from tecton.types import Field as TectonField, String
from tecton import Entity
import pandas as pd
from datetime import datetime

# Read the call transcripts (JSON Lines: one record per line) and parse the
# `ts` column into real timestamps so it can serve as the event-time column.
df = pd.read_json('transcripts.jsonl', lines=True).assign(
    ts=lambda frame: pd.to_datetime(frame["ts"])
)

# Chronological order, with a fresh 0..n-1 index.
transcripts = df.sort_values(by="ts").reset_index(drop=True)

# Expose the frame as a local source named "src", timestamped by `ts`.
src = make_local_source("src", transcripts, timestamp_field="ts")

# Entity keyed on `policy_holder_id`. The Python variable is called `user_id`
# and is what later cells pass as `entities=[user_id]`.
user_id = Entity(
    name="policy_holder_id",
    description="Policy holder id, starting with `user_`",
    join_keys=[
        TectonField(name="policy_holder_id", dtype=String),
    ],
)

# Show the loaded transcripts as the cell output.
transcripts

Unnamed: 0,transcript,ts,policy_holder_id,transcript_id
0,"Agent: Good morning, this is Alex from Progres...",2024-10-10,user_example_policy_holder,814a82db
1,"Agent: Hello, this is Jamie from Progressive I...",2024-10-11,user_example_policy_holder_2,9b7c3f1e
2,"Agent: Good afternoon, this is Taylor from Pro...",2024-10-12,user_example_policy_holder_3,c3d4e5f6
3,"Agent: Good morning, this is Morgan from Progr...",2024-10-13,user_example_policy_holder_4,d7e8f9g0
4,"Agent: Hello, this is Jordan from Progressive ...",2024-10-14,user_example_policy_holder_5,h1i2j3k4
5,"Agent: Good afternoon, this is Casey from Prog...",2024-10-15,user_example_policy_holder_6,l5m6n7o8
6,"Agent: Hello, this is Riley from Progressive I...",2024-10-16,user_example_policy_holder_7,p9q0r1s2
7,"Agent: Good morning, this is Avery from Progre...",2024-10-17,user_example_policy_holder_8,t3u4v5w6


## Using an LLM to extract structured auto-claim information

In [None]:
from pydantic import BaseModel, Field
from tecton_gen_ai.extraction import llm_extraction


class ClaimDetails(BaseModel):
    # Schema of the structured auto-claim facts extracted from one call
    # transcript. Each `Field(description=...)` is part of the schema and is
    # presumably what guides the LLM, so the wording matters.
    #
    # Documented with `#` comments rather than a class docstring on purpose:
    # pydantic emits a model's docstring as the schema-level description, so
    # adding one would change the schema itself.

    # --- Vehicles -----------------------------------------------------------
    number_of_vehicles: int = Field(
        description="Number of vehicles involved in the claim",
    )
    vehicle_details: list[str] = Field(
        description="Details of the vehicles involved",
    )

    # --- Circumstances ------------------------------------------------------
    weather_impact: bool = Field(
        description="Whether weather impacted the incident",
    )
    # Typed as `datetime`, so extracted values carry a time component.
    # NOTE(review): the transcripts are dated October 2024, yet most extracted
    # incident dates in the recorded output fall in 2023. Presumably the model
    # guesses the year when the caller does not state it — confirm, and
    # consider giving the model the call date as context.
    incident_date: datetime = Field(
        description="Date of the incident",
    )
    location: str = Field(
        description="Location of the incident",
    )

    # --- Injuries and reporting ---------------------------------------------
    injury_reported: bool = Field(
        description="Whether any injuries were reported",
    )
    # Fixed wording: was "Descriptions of any injuries were reported".
    injury_description: str = Field(
        description="Description of any injuries that were reported",
    )
    police_report: bool = Field(
        description="Whether a police report was filed",
    )

def _claim_extraction(name: str, model: str):
    """Build an ``llm_extraction`` of ``ClaimDetails`` from the transcripts.

    Both extractions in this notebook read the same ``transcript`` column of
    ``src``, use the same ``ClaimDetails`` schema and the same ``user_id``
    entity. Only the name and the model differ, so those are the only
    parameters; everything else is defined once and cannot drift apart.

    Args:
        name: Name given to the extraction (passed as ``name=``).
        model: Model identifier placed in the extraction config,
            e.g. ``"openai/gpt-4o-mini"``.

    Returns:
        Whatever ``llm_extraction`` returns for this configuration.
    """
    return llm_extraction(
        src,
        name=name,
        extraction_config=[
            {
                "model": model,
                "column": "transcript",
                "schema": ClaimDetails,
            },
        ],
        entities=[user_id],
    )


# Primary extraction, using the smaller model.
transcript_claims = _claim_extraction("claim_info", "openai/gpt-4o-mini")

# Identical extraction with the larger model.
# NOTE(review): presumably a reference ("judge") to compare the primary
# extraction against — confirm the intended use.
transcript_claims_judge = _claim_extraction("claim_info_judge", "openai/gpt-4o")

In [None]:
# Fetch the gpt-4o-mini extraction as a pandas frame. The window
# (2024-10-08 to 2024-10-30) brackets every transcript timestamp in the
# data shown earlier (2024-10-10 through 2024-10-17).
(
    transcript_claims
    .get_features_in_range(
        start_time=datetime(2024, 10, 8),
        end_time=datetime(2024, 10, 30),
    )
    .to_pandas()
)


Unnamed: 0,policy_holder_id,number_of_vehicles,vehicle_details,weather_impact,incident_date,location,injury_reported,injury_description,police_report,_valid_from,_valid_to
0,user_example_policy_holder_3,2,"[2021 Subaru Outback, 2017 Chevrolet Malibu]",False,2023-10-05 00:00:00+00:00,at a stoplight,False,,False,2024-10-13 00:00:00+00:00,2024-10-30 00:00:00+00:00
1,user_example_policy_holder_2,1,"[2019 Honda Accord, blue]",False,2023-10-16 23:00:00+00:00,Driveway,False,,True,2024-10-12 00:00:00+00:00,2024-10-30 00:00:00+00:00
2,user_example_policy_holder_7,1,[deer],False,2023-10-09 00:00:00+00:00,Highway 12,False,,True,2024-10-17 00:00:00+00:00,2024-10-30 00:00:00+00:00
3,user_example_policy_holder_5,1,[Flood damaged car],True,2023-10-15 00:00:00+00:00,Claimant's neighborhood driveway,False,,False,2024-10-15 00:00:00+00:00,2024-10-30 00:00:00+00:00
4,user_example_policy_holder,2,"[2018 Toyota Camry, 2020 Ford F-150]",True,2024-10-09 00:00:00+00:00,intersection of Main Street and 5th Avenue,True,minor injury to arm,True,2024-10-11 00:00:00+00:00,2024-10-30 00:00:00+00:00
5,user_example_policy_holder_4,2,"[Claimant's parked vehicle, Unknown vehicle in...",False,2023-10-28 00:00:00+00:00,Street where the claimant's vehicle was parked,False,,True,2024-10-14 00:00:00+00:00,2024-10-30 00:00:00+00:00
6,user_example_policy_holder_8,1,[Car],False,2023-10-21 00:00:00+00:00,Garage,False,,False,2024-10-18 00:00:00+00:00,2024-10-30 00:00:00+00:00
7,user_example_policy_holder_6,1,[Car],False,2023-10-13 00:00:00+00:00,Parked overnight,False,,True,2024-10-16 00:00:00+00:00,2024-10-30 00:00:00+00:00


In [None]:
# Fetch the gpt-4o ("judge") extraction as a pandas frame over the window
# 2024-10-08 to 2024-10-30, which brackets every transcript timestamp in
# the data shown earlier (2024-10-10 through 2024-10-17).
(
    transcript_claims_judge
    .get_features_in_range(
        start_time=datetime(2024, 10, 8),
        end_time=datetime(2024, 10, 30),
    )
    .to_pandas()
)


Unnamed: 0,policy_holder_id,number_of_vehicles,vehicle_details,weather_impact,incident_date,location,injury_reported,injury_description,police_report,_valid_from,_valid_to
0,user_example_policy_holder_7,1,[deer],False,2023-10-09 00:00:00+00:00,Highway 12,False,,True,2024-10-17 00:00:00+00:00,2024-10-30 00:00:00+00:00
1,user_example_policy_holder_2,1,"[2019 Honda Accord, blue]",False,2023-10-16 23:00:00+00:00,Driveway,False,,True,2024-10-12 00:00:00+00:00,2024-10-30 00:00:00+00:00
2,user_example_policy_holder_3,2,"[2021 Subaru Outback, 2017 Chevrolet Malibu]",False,2023-10-05 00:00:00+00:00,at a stoplight,False,,False,2024-10-13 00:00:00+00:00,2024-10-30 00:00:00+00:00
3,user_example_policy_holder,2,"[2018 Toyota Camry, 2020 Ford F-150]",True,2024-10-09 00:00:00+00:00,intersection of Main Street and 5th Avenue,True,minor injury to arm,True,2024-10-11 00:00:00+00:00,2024-10-30 00:00:00+00:00
4,user_example_policy_holder_5,1,[Flood damaged car],True,2023-10-15 00:00:00+00:00,Claimant's neighborhood driveway,False,,False,2024-10-15 00:00:00+00:00,2024-10-30 00:00:00+00:00
5,user_example_policy_holder_8,1,[Car],False,2023-10-21 00:00:00+00:00,Garage,False,,False,2024-10-18 00:00:00+00:00,2024-10-30 00:00:00+00:00
6,user_example_policy_holder_6,1,[Car],False,2023-10-13 00:00:00+00:00,Parked overnight,False,,True,2024-10-16 00:00:00+00:00,2024-10-30 00:00:00+00:00
7,user_example_policy_holder_4,2,"[Claimant's parked vehicle, Unknown vehicle in...",False,2023-10-28 00:00:00+00:00,Street where the claimant's vehicle was parked,False,,True,2024-10-14 00:00:00+00:00,2024-10-30 00:00:00+00:00
