In [38]:
import duckdb

con = duckdb.connect("../../data/DIS/DIS_transcripts.duckdb")
df = con.execute("SELECT * FROM transcripts").fetchdf()
con.close()

df = df[
    df["content"].notna() &                                       # not NaN
    df["content"].str.strip().ne("") &                      # not empty string
    df["content"].str.strip().str.lower().ne("executives")  # not "executives"
]

df.head()

Unnamed: 0,paragraph_number,speaker,content,fiscal_year,fiscal_quarter
12,1,Executives,"Bob Iger – President, CEO Tom Staggs - CFO ...",2007,3
13,2,Analysts,Anthony Noto - Goldman Sachs Imran Khan - JP...,2007,3
14,3,Operator,"Good day, ladies and gentlemen. Thank you very...",2007,3
15,4,Lowell Singer,"Thanks, operator. Good afternoon, everyone. We...",2007,3
16,5,Bob Iger,"Thanks, Lowell. Look forward to those importan...",2007,3


In [39]:
# speaker_df = df.groupby(
#     ["fiscal_year", "fiscal_quarter", "speaker", "paragraph_number"]
# )["content"].apply(lambda x: " ".join(x.dropna())).reset_index()

# speaker_df.tail()

In [40]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np


tokenizer = AutoTokenizer.from_pretrained("ahmedrachid/FinancialBERT-Sentiment-Analysis")
model = AutoModelForSequenceClassification.from_pretrained("ahmedrachid/FinancialBERT-Sentiment-Analysis")
model.eval()

print(model.config.id2label)

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)

def chunk_text_tokenwise(text, max_tokens=510):
    input_ids = tokenizer.encode(text, add_special_tokens=False)
    return [input_ids[i:i + max_tokens] for i in range(0, len(input_ids), max_tokens)]


def classify_sentiment(text):
    if not text.strip():
        return None, None

    # Tokenize and chunk
    input_ids = tokenizer.encode(text, add_special_tokens=False)
    chunks = [input_ids[i:i + 510] for i in range(0, len(input_ids), 510)]

    print(f"Processing {len(chunks)} chunks for tokenized input of length {len(input_ids)} - Max chunk length: {max(len(c) for c in chunks)}")

    scores = []

    for chunk_ids in chunks:
        input_ids_tensor = torch.tensor(
            [tokenizer.build_inputs_with_special_tokens(chunk_ids)],
            dtype=torch.long
        ).to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids_tensor)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
            scores.append(probs.cpu().numpy())

    scores = np.vstack(scores)
    avg_probs = scores.mean(axis=0)

    label_idx = int(avg_probs.argmax())
    label = model.config.id2label[label_idx]
    confidence = float(avg_probs[label_idx])

    return label, confidence

df["sentiment"] = df["content"].apply(classify_sentiment)
df["sentiment_label"] = df["sentiment"].apply(lambda x: x[0] if x else None)
df["sentiment_confidence"] = df["sentiment"].apply(lambda x: x[1] if x else None)
df = df.drop(columns=["sentiment"])

{0: 'negative', 1: 'neutral', 2: 'positive'}
Processing 1 chunks for tokenized input of length 27 - Max chunk length: 27
Processing 1 chunks for tokenized input of length 140 - Max chunk length: 140
Processing 1 chunks for tokenized input of length 66 - Max chunk length: 66


Token indices sequence length is longer than the specified maximum sequence length for this model (1527 > 512). Running this sequence through the model will result in indexing errors


Processing 1 chunks for tokenized input of length 156 - Max chunk length: 156
Processing 3 chunks for tokenized input of length 1527 - Max chunk length: 510
Processing 4 chunks for tokenized input of length 1726 - Max chunk length: 510
Processing 1 chunks for tokenized input of length 212 - Max chunk length: 212
Processing 1 chunks for tokenized input of length 95 - Max chunk length: 95
Processing 1 chunks for tokenized input of length 222 - Max chunk length: 222
Processing 1 chunks for tokenized input of length 113 - Max chunk length: 113
Processing 1 chunks for tokenized input of length 164 - Max chunk length: 164
Processing 1 chunks for tokenized input of length 83 - Max chunk length: 83
Processing 1 chunks for tokenized input of length 119 - Max chunk length: 119
Processing 1 chunks for tokenized input of length 399 - Max chunk length: 399
Processing 1 chunks for tokenized input of length 25 - Max chunk length: 25
Processing 1 chunks for tokenized input of length 87 - Max chunk len

In [41]:
from transformers import pipeline

summarizer = pipeline("summarization", model="google/pegasus-xsum", tokenizer="google/pegasus-xsum")

def chunk_text_for_summarization(text, tokenizer, max_tokens=512):
    input_ids = tokenizer.encode(text, add_special_tokens=False)
    chunks = [input_ids[i:i + max_tokens] for i in range(0, len(input_ids), max_tokens)]
    return [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunks]

def summarize_long_text(text):
    if not text.strip():
        return None

    try:
        chunks = chunk_text_for_summarization(text, tokenizer)
        summaries = []

        for chunk in chunks:
            input_len = len(tokenizer.encode(chunk, add_special_tokens=False))
            max_len = max(10, min(60, int(input_len * 0.6)))  # cap at 60, floor at 20

            summary = summarizer(
                chunk,
                max_length=max_len,
                min_length=max(10, int(max_len * 0.5)),
                do_sample=False
            )[0]["summary_text"]

            summaries.append(summary)

        return " ".join(summaries)

    except Exception as e:
        return f"[summary error: {e}]"

df["summary"] = df["content"].apply(summarize_long_text)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use mps:0
Your max_length is set to 10, but your input_length is only 9. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)
Your max_length is set to 10, but your input_length is only 3. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=1)
Your max_length is set to 10, but your input_length is only 8. Since this is a summarization task, where outputs shorter than the input are typical

In [42]:
conn = duckdb.connect("../../data/DIS/DIS_transcripts_sentiment.duckdb")
conn.execute("DROP TABLE IF EXISTS transcripts_sentiment")
conn.register("df", df)
conn.execute("CREATE TABLE transcripts_sentiment AS SELECT * FROM df")
conn.close()

In [43]:
conn = duckdb.connect("../../data/DIS/DIS_transcripts_sentiment.duckdb")
df_duck = conn.execute("SELECT * FROM transcripts_sentiment").fetchdf()
conn.close()

df_duck.tail(n=25)

Unnamed: 0,paragraph_number,speaker,content,fiscal_year,fiscal_quarter,sentiment_label,sentiment_confidence,summary
4505,25,Operator,Absolutely. And our next question today comes ...,2025,2,neutral,0.995668,BBC Radio 4's Today programme has been asking ...
4506,26,David Karnovsky,"Hi. Thank you. Bob, just on ESPN flagship, as ...",2025,2,neutral,0.995596,In our series of letters from African journali...
4507,27,Bob Iger,"First of all to the last point, if you are a s...",2025,2,neutral,0.996973,Here is the full transcript of Disney's earnin...
4508,28,Carlos Gomez,"Thanks, David. Operator, next question, please.",2025,2,neutral,0.999333,What is the best way to tell if
4509,29,Operator,Thank you. And our next question today comes f...,2025,2,neutral,0.99743,Each day we bring you a question from a BBC News
4510,30,Michael Morris,Thank you. Good morning. I wanted to ask a cou...,2025,2,neutral,0.976996,Here is the full transcript of Walt Disney Com...
4511,31,Hugh Johnston,"Okay. Hey, Michael. Hugh here . That's -- I th...",2025,2,neutral,0.882891,"Michael Eisner, chairman and chief executive o..."
4512,32,Bob Iger,That's China.,2025,2,neutral,0.999476,China's President Xi Jinping has been
4513,33,Hugh Johnston,"In China, correct. Because the Chinese consume...",2025,2,positive,0.995498,What are the challenges you are seeing in Chin...
4514,34,Carlos Gomez,"Thanks, Mike. Operator, next question, please.",2025,2,neutral,0.999259,Is there a way for me to
