# Data Processing Demo

In [None]:
from pathlib import Path
import llm
import json
from pydantic import BaseModel
from typing import Literal

## Load and Parse the Datafile

In [2]:
# Location of the raw transcript; stop right away with a readable message if
# the file is not where the notebook expects it.
data_file = Path("./data/Copy_of_2306_SDA_Story.txt")
assert data_file.exists(), f"Data file {data_file} does not exist."

# Read the entire transcript into memory as UTF-8 text.
with data_file.open("r", encoding="utf-8") as transcript:
    data = transcript.read()

In [3]:
# Split the transcript into chunks. A line beginning with "06_SDA_" is treated
# as a header that opens a new chunk; every other line belongs to the chunk
# currently being built. Text before the first header forms its own chunk.
lines = data.splitlines()

grouped_lines = [[]]
for line in lines:
    if line.startswith("06_SDA_"):
        # Header line: start a fresh group that begins with the header itself.
        grouped_lines.append([line])
    else:
        grouped_lines[-1].append(line)

# Re-join each group with newlines, trim surrounding whitespace, and discard
# groups that end up empty (e.g. when nothing precedes the first header).
joined_groups = ("\n".join(group).strip() for group in grouped_lines)
chunks = [text for text in joined_groups if text]

In [21]:
# Sanity check: report how many chunks the split produced.
chunk_count = len(chunks)
print(f"Number of chunks: {chunk_count}\n")

Number of chunks: 16



## Run the Chunks through a Model

Question: How does the sentiment of the participant’s diary entries change over the course of the nine weeks?

In [22]:
class Sentiment(BaseModel):
    # Structured result for one chunk of text; passed as `schema=` to the
    # model call below so the response can be parsed as JSON.
    # NOTE(review): documented with comments rather than a docstring on
    # purpose — pydantic may copy a class docstring into the generated schema,
    # which would change what is sent to the model; confirm before converting.

    # The only field: restricted to exactly "hopeful" or "pessimistic".
    sentiment: Literal["hopeful", "pessimistic"]


# Classify every chunk with the same model and the same system prompt.
# `schema=Sentiment` is passed so the response text can be parsed as JSON and
# the label read from its "sentiment" key.
model = llm.get_model("gpt-4o-mini")

# Keep the labels in chunk order. Previously they were only printed, so later
# cells could not examine how sentiment changes across the diary without
# repeating every model call.
sentiments = []
for i, chunk in enumerate(chunks):
    response = model.prompt(
        chunk,
        system="What is the sentiment of the following text? Is it hopeful or pessimistic?",
        schema=Sentiment,
    )
    # The response body is a JSON object shaped like `Sentiment`.
    sentiment = json.loads(response.text())
    sentiments.append(sentiment["sentiment"])
    print(f"Sentiment for chunk {i}: {sentiment['sentiment']}")

Sentiment for chunk 0: hopeful
Sentiment for chunk 1: hopeful
Sentiment for chunk 2: hopeful
Sentiment for chunk 3: hopeful
Sentiment for chunk 4: hopeful
Sentiment for chunk 5: pessimistic
Sentiment for chunk 6: hopeful
Sentiment for chunk 7: hopeful
Sentiment for chunk 8: hopeful
Sentiment for chunk 9: hopeful
Sentiment for chunk 10: hopeful
Sentiment for chunk 11: hopeful
Sentiment for chunk 12: hopeful
Sentiment for chunk 13: hopeful
Sentiment for chunk 14: pessimistic
Sentiment for chunk 15: hopeful


In [24]:
# Peek at the first 100 characters of chunk 5, one of the chunks labelled
# "pessimistic" in the recorded run above.
chunk_preview = chunks[5][:100]
print(chunk_preview)

06_SDA_07102023.m4a [Week 4A] [00:00:00 - 00:02:34]
Speaker 1: So this weekend I lost a lot of time 
