In [2]:
from typing import Any, Dict, List, Optional

# Free-text deal search criteria. The text is attached to every prompt input as the
# `business_request` key and is also bound to the prompt when the chain is built.
business_request = """
### Deals search criteria
* Completed date (last 10 years)
* Geography (Europe)

* Deal technique (Divestment)
* Sector (Financial services)
* Size (TBD)
"""

def df_to_prompt_inputs(df: pd.DataFrame, request: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Convert an articles DataFrame into a list of prompt-input dicts, one per row.

    Required input columns:
      'an', 'source_name', 'title', 'body', 'company_names', 'company_codes',
      'modification_date'

    Keys of each returned dict:
      - index              (former row label; the column is called 'index' for an unnamed index)
      - an
      - news_source        (from 'source_name')
      - article_title      (from 'title')
      - article_body       (from 'body')
      - companies          (from 'company_names')
      - company_codes
      - modification_date
      - business_request

    Args:
        df: Source articles. It is not modified; the function works on a renamed copy.
        request: Text stored under 'business_request'. When omitted, the module-level
            `business_request` string is used.

    Returns:
        A list of record dicts in the row order of `df`.

    Raises:
        KeyError: If any required column is missing from `df`.
    """
    required_columns = [
        'an', 'source_name', 'title', 'body',
        'company_names', 'company_codes', 'modification_date',
    ]
    # Report every missing column at once instead of failing on the first one.
    missing = [column for column in required_columns if column not in df.columns]
    if missing:
        raise KeyError(f"df is missing required column(s): {missing}")

    if request is None:
        request = business_request

    # Only the columns whose prompt key differs from the DataFrame name need renaming.
    df_renamed = df.rename(columns={
        'source_name': 'news_source',
        'title': 'article_title',
        'body': 'article_body',
        'company_names': 'companies',
    })
    df_renamed['business_request'] = request

    output_columns = [
        'an', 'news_source', 'article_title', 'article_body',
        'companies', 'company_codes', 'modification_date', 'business_request',
    ]
    # reset_index() is kept so every record still carries its original row label.
    return df_renamed[output_columns].reset_index().to_dict(orient='records')
    
def chunk_list(items: List[Any], batch_size: int) -> List[List[Any]]:
    """
    Split a list into consecutive sublists (batches) of at most `batch_size` items.

    Args:
        items: The list to split. An empty list yields an empty result.
        batch_size: Maximum length of each batch; must be at least 1.

    Returns:
        The batches in original order; only the last one may be shorter.

    Raises:
        ValueError: If `batch_size` is smaller than 1. Without this check a negative
            value would silently return no batches at all.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    return [items[start:start + batch_size] for start in range(0, len(items), batch_size)]

# Turn the article rows of `df` into prompt inputs and group them into batches of 20.
# NOTE(review): `df` is not defined in the visible cells; presumably it is loaded in an earlier cell. Confirm.
chunks = chunk_list(df_to_prompt_inputs(df), 20)

In [3]:
from langchain_openai import AzureChatOpenAI

# Chat model client shared by the chains built in this notebook.
# NOTE(review): no endpoint or API key is passed here; presumably they come from the environment. Confirm.
llm = AzureChatOpenAI(
    azure_deployment="gpt-4.1-nano",
    api_version="2025-03-01-preview")

In [4]:
from pydantic import BaseModel,Field

# Structured per-article verdict that the classification chain parses the LLM reply into.
# The Field descriptions are runtime text handed to the output parser, so they are left untouched.
class CarveOutAssessment(BaseModel):
    # Predicted carve-out flag; evaluated later against the `is_about_carve_out` label.
    is_co: bool = Field(description="Is the article about a future corporate carve-out?")
    # Code of the company the model considers the divestment candidate.
    target_company_code: str = Field(description="Code of the company that may be open to divest/carve-out")
    # Predicted relevance to the business request; appears as `is_relevant_new` after the merge with `df`.
    is_relevant: bool = Field(description="Is the article relevant to the conditions of business request?")
    # Short free-text justification for both flags.
    short_reasoning: str = Field(description="Justification for the answers provided; not more than 2 sentences")

In [5]:
# Prompt for the carve-out classification chain.
# Placeholders filled at run time: {business_request}, {news_source}, {article_title},
# {article_body}, {companies}, {company_codes}.
# NOTE(review): the prompt inputs also carry `modification_date`, but this template has no
# placeholder for it. Confirm whether the article date should be shown to the model.
template = """
You are an expert financial analyst specializing in identifying potential corporate carve-out opportunities in the financial services sector.
Your task is to analyze news articles to determine whether they indicate a company is considering divesting part of its business in the future.
Focus on European financial services companies across ALL segments including banking, wealth management, insurance, financial data,
payment processing, fund administration, brokers, and other financial services.
    
CRITICAL EVALUATION CRITERIA:
1. MUST IDENTIFY AS CARVE-OUT PROSPECT:
   - Company signals intention to divest specific business units or segments
   - Company announces strategic refocusing that implies non-core units could be sold
   - New strategic plans mentioning focus on specific segments (implying others might be divested)
   - Language about simplifying corporate structure or streamlining operations
   - References to non-core assets, underperforming units, or subscale operations
   - Management discussing 'evaluating options' for specific business segments
    
2. MUST NOT BE INCORRECTLY CLASSIFIED AS CARVE-OUT:
   - Simple stake sales without business unit separation 
   - Private equity firms exiting investments (not corporate carve-outs)
   - IPO plans without specific business unit divestiture
   - Already completed carve-outs (though mention these in reasoning as potentially interesting)
   - General M&A activity without specific divestiture signals
   - Opinion pieces or columns without factual business announcements
    
3. TIMING RELEVANCE:
   - FOCUS ON FUTURE OPPORTUNITIES: Companies currently considering or likely to consider divestitures
   - DE-PRIORITIZE: Already completed transactions (though note these in reasoning)
    
4. KEY SIGNALS OF POTENTIAL CARVE-OUTS:
   - New CEO appointments or management changes
   - Announcements of strategic reviews or new strategic plans
   - Explicit focus on 'core businesses' or 'key segments'
   - Financial pressure (debt issues, dividend concerns, performance challenges)
   - Regulatory challenges that might prompt divestiture
   - Simplification of corporate structure
   - Discontinuation of operations in certain areas
    
When selecting target_company_code:
- If multiple companies might be considering divestitures, select the primary/most likely one
- For corporate groups where a parent may divest subsidiaries, consider both parent and subsidiary codes
- If a company is refocusing on specific segments, this implies other segments could be divested
    
Business request: {business_request}

Analyze this article and provide a structured assessment of potential carve-out opportunities:
news source: {news_source}
article title: {article_title}
article body: {article_body}
companies: {companies}
company codes: {company_codes}
"""

In [6]:
from langchain.output_parsers import (
    PydanticOutputParser,
    OutputFixingParser,
    StructuredOutputParser,
)
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate, load_prompt

def build_simple_prompt(llm, template: str):
    """
    Build a plain-text chain: prompt template -> model -> string output.

    Args:
        llm: The chat model the rendered prompt is piped into.
        template: Prompt template text with `{placeholder}` variables.

    Returns:
        A runnable composed of the prompt, the model and a `StrOutputParser`.
    """
    prompt = PromptTemplate.from_template(template)
    prompt_to_model = prompt | llm
    return prompt_to_model | StrOutputParser()


def build_runnable_with_pydantic(template, pydantic_model, llm, **kwargs):
    """
    Build a chain: prompt -> model -> parser producing a `pydantic_model` instance.

    The parser's format instructions (the JSON schema the reply must follow) are bound
    to the prompt as the `format_instructions` partial variable. When the template has
    no `{format_instructions}` placeholder, one is appended at the end. Without this the
    model is never told the expected output format and the fixing parser has to repair
    replies with additional model calls.

    Args:
        template: Prompt template text with `{placeholder}` variables.
        pydantic_model: Pydantic class describing the structured output.
        llm: Chat model used both for the answer and for repairing malformed output.
        **kwargs: Extra partial variables bound to the prompt. A caller-supplied
            `format_instructions` overrides the default taken from the parser.

    Returns:
        A runnable whose output is an instance of `pydantic_model`.
    """
    output_parser = PydanticOutputParser(pydantic_object=pydantic_model)

    # Make sure the schema actually reaches the model.
    if "{format_instructions}" not in template:
        template = template.rstrip() + "\n\n{format_instructions}\n"

    # Caller-provided values win over the default format instructions.
    partial_variables = {
        "format_instructions": output_parser.get_format_instructions(),
        **kwargs,
    }
    prompt = PromptTemplate.from_template(template, partial_variables=partial_variables)

    # Wraps the parser so a malformed reply is sent back to the model for correction.
    output_fixing_parser = OutputFixingParser.from_llm(
        llm=llm, parser=output_parser
    )
    return prompt | llm | output_fixing_parser

In [7]:
# Classification chain: renders `template`, calls `llm`, and parses the reply into a CarveOutAssessment.
# `business_request` is bound here as a partial variable; each prompt input dict also carries the same key.
chain = build_runnable_with_pydantic(
    template=template,
    pydantic_model=CarveOutAssessment,
    llm=llm,
    business_request=business_request
)

In [8]:
import time
from typing import List, Dict, Any
from pathlib import Path
from datetime import datetime
import pandas as pd
from tqdm.auto import tqdm

def _batch_with_retries(chain, batch, chunk_idx, max_retries, initial_delay):
    """Run `chain.batch(batch)` with exponential backoff; return None when every attempt fails."""
    for attempt in range(1, max_retries + 1):
        try:
            return chain.batch(batch)
        except Exception as e:  # deliberate best-effort: one bad chunk must not stop the run
            print(f"[Chunk {chunk_idx}][Attempt {attempt}] Error: {e}")
            if attempt < max_retries:
                delay = initial_delay * (2 ** (attempt - 1))
                print(f"  Retrying in {delay:.1f}s...")
                time.sleep(delay)
            else:
                print(f"  Chunk {chunk_idx} failed after {max_retries} attempts, skipping.")
    return None


def process_and_save(
    chunks: List[List[Dict[str, Any]]],
    chain,
    partial_every: int = 10,
    partial_dir: str = "partials",
    final_path: str = "classification_results.csv",
    max_retries: int = 5,
    initial_delay: float = 5.0,
    inter_chunk_delay: float = 5.0,
) -> pd.DataFrame:
    """
    Run the chain over every chunk of inputs with retry logic and save the results.

      - Shows a progress bar over chunks.
      - Retries a failed chunk with exponential backoff; a chunk that keeps failing is
        skipped and reported at the end.
      - Writes a cumulative partial CSV after every `partial_every` chunks into a
        timestamped copy of `partial_dir`, even when the chunk at that position failed.
      - Writes the final CSV next to `final_path`, with a timestamp inserted before the
        file extension (e.g. `classification_results_<timestamp>.csv`).
      - Copies company codes, company names and the `an` id from the input records
        instead of trusting the LLM response.

    Args:
        chunks: List of batches of prompt-input dicts.
        chain: Runnable exposing `.batch()`; each result must offer `.model_dump()` or `.dict()`.
        partial_every: Save a partial CSV every N chunks; 0 or less disables partial saves.
        partial_dir: Base name of the directory for partial CSVs (a timestamp is appended).
        final_path: Base file path for the final CSV (a timestamp is inserted).
        max_retries: How many times to attempt each chunk.
        initial_delay: Base delay (in seconds) for exponential backoff.
        inter_chunk_delay: Pause (in seconds) after each chunk; 0 disables the pause.

    Returns:
        DataFrame of all successful results.
    """
    stamped_dir = f"{partial_dir}_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
    Path(stamped_dir).mkdir(parents=True, exist_ok=True)
    all_records = []
    failed_chunks = []

    for idx, batch in enumerate(tqdm(chunks, desc="Processing chunks")):
        assessments = _batch_with_retries(chain, batch, idx, max_retries, initial_delay)

        if assessments is None:
            failed_chunks.append(idx)
        else:
            for source, assessment in zip(batch, assessments):
                # Prefer the pydantic v2 API; fall back to the v1 name for older models.
                dump = getattr(assessment, "model_dump", None)
                record = dump() if callable(dump) else assessment.dict()

                # Enrich with identifiers taken from the input rather than the model output.
                record['original_company_codes'] = source.get('company_codes', '')
                record['original_companies'] = source.get('companies', '')
                record['an'] = source.get('an', '')
                all_records.append(record)

        # Checkpoint on the chunk count, regardless of whether this chunk succeeded.
        if partial_every > 0 and (idx + 1) % partial_every == 0:
            df_partial = pd.DataFrame(all_records)
            df_partial.to_csv(Path(stamped_dir) / f"partial_{idx+1}.csv", index=False)
            print(f"Saved partial results to {stamped_dir}/partial_{idx+1}.csv")

        if inter_chunk_delay > 0:
            time.sleep(inter_chunk_delay)

    # Save final results; the timestamp goes before the extension, not after it.
    df_final = pd.DataFrame(all_records)
    target = Path(final_path)
    stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    final_file = target.parent / f"{target.stem}_{stamp}{target.suffix or '.csv'}"
    final_file.parent.mkdir(parents=True, exist_ok=True)
    df_final.to_csv(final_file, index=False)

    if failed_chunks:
        print(f"Warning: the following chunks failed and were skipped: {failed_chunks}")

    return df_final

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Classify every chunk (`chunks[:]` is a shallow copy of the full list), checkpointing every
# 10 chunks and attempting each chunk up to 4 times with a 5-second base backoff.
df_results = process_and_save(
    chunks[:],
    chain,
    partial_every=10,
    partial_dir="partial_outputs",
    final_path="classification_results.csv",
    max_retries=4,
    initial_delay=5.0)

Processing chunks:   0%|          | 0/26 [00:00<?, ?it/s]

Processing chunks:  35%|███▍      | 9/26 [10:00<19:46, 69.81s/it]

Saved partial results to partial_outputs_2025-05-07_17-14-09/partial_10.csv


Processing chunks:  73%|███████▎  | 19/26 [18:09<05:38, 48.34s/it]

Saved partial results to partial_outputs_2025-05-07_17-14-09/partial_20.csv


Processing chunks: 100%|██████████| 26/26 [23:26<00:00, 54.09s/it]


In [10]:
# Join the labelled articles with the model output on the article id `an`.
# Inner join: articles without a prediction are left out of the evaluation.
# Columns present on both sides get the `_original` / `_new` suffixes.
merged_df = df.merge(
    df_results,
    how='inner',
    on=['an'],  # Adjust these keys as needed
    suffixes=('_original', '_new'),
)

In [11]:
# Cast the predicted flags to int before they are compared with the label columns.
# NOTE(review): presumably the labels are stored as 0/1 integers; confirm against `df`.
merged_df["is_co"] = merged_df["is_co"].astype(int)
merged_df["is_relevant_new"] = merged_df["is_relevant_new"].astype(int)

In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def _binary_metrics(y_true, y_pred):
    """Return accuracy, precision, recall, F1 and the confusion matrix for one label/prediction pair."""
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
        'confusion_matrix': confusion_matrix(y_true, y_pred)
    }


def _print_metrics(heading, metrics):
    """Print the heading, each scalar metric with four decimals, then the confusion matrix."""
    print(heading)
    scalar_items = [(name, score) for name, score in metrics.items() if name != 'confusion_matrix']
    for name, score in scalar_items:
        print(f"{name}: {score:.4f}")
    print(f"Confusion Matrix:\n{metrics['confusion_matrix']}")


# Carve-out detection: ground-truth label vs. model prediction
y_true_co = merged_df['is_about_carve_out']
y_pred_co = merged_df['is_co']
co_metrics = _binary_metrics(y_true_co, y_pred_co)

# Relevance detection: ground-truth label vs. model prediction
y_true_rel = merged_df['is_relevant_original']
y_pred_rel = merged_df['is_relevant_new']
rel_metrics = _binary_metrics(y_true_rel, y_pred_rel)

# Print results
_print_metrics("Carve-out Detection Metrics:", co_metrics)
_print_metrics("\nRelevance Detection Metrics:", rel_metrics)

Carve-out Detection Metrics:
accuracy: 0.9143
precision: 0.8433
recall: 0.8370
f1: 0.8401
Confusion Matrix:
[[346  21]
 [ 22 113]]

Relevance Detection Metrics:
accuracy: 0.7948
precision: 0.2966
recall: 0.9773
f1: 0.4550
Confusion Matrix:
[[356 102]
 [  1  43]]


In [27]:
from pydantic import BaseModel, Field
from typing import List, Optional
from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser


# Output schema for the phrase-extraction chain. It has a single field; nothing else
# requested in the prompt can be captured by this model.
class CarveOutPhrases(BaseModel):
    """Structured output for identifying intent and action phrases in financial news articles."""
    
    # Extracted phrases; defaults to an empty list when the reply provides none.
    search_phrases: List[str] = Field(
        description="List of concise 3-6 word phrases that combine intent and action related to divestitures",
        default_factory=list
    )


def build_runnable_with_pydantic(template, pydantic_model, llm, **kwargs):
    """
    Build a chain: prompt -> model -> parser producing a `pydantic_model` instance.

    NOTE: a function with this name is also defined in an earlier cell; this definition
    replaces it from here on, so the two should be kept in sync.

    The parser's format instructions (the JSON schema the reply must follow) are bound
    to the prompt as the `format_instructions` partial variable. When the template has
    no `{format_instructions}` placeholder, one is appended at the end. Without this the
    model is never told the expected output format and the fixing parser has to repair
    replies with additional model calls.

    Args:
        template: Prompt template text with `{placeholder}` variables.
        pydantic_model: Pydantic class describing the structured output.
        llm: Chat model used both for the answer and for repairing malformed output.
        **kwargs: Extra partial variables bound to the prompt. A caller-supplied
            `format_instructions` overrides the default taken from the parser.

    Returns:
        A runnable whose output is an instance of `pydantic_model`.
    """
    output_parser = PydanticOutputParser(pydantic_object=pydantic_model)

    # Make sure the schema actually reaches the model.
    if "{format_instructions}" not in template:
        template = template.rstrip() + "\n\n{format_instructions}\n"

    # Caller-provided values win over the default format instructions.
    partial_variables = {
        "format_instructions": output_parser.get_format_instructions(),
        **kwargs,
    }
    prompt = PromptTemplate.from_template(template, partial_variables=partial_variables)

    # Wraps the parser so a malformed reply is sent back to the model for correction.
    output_fixing_parser = OutputFixingParser.from_llm(
        llm=llm, parser=output_parser
    )
    return prompt | llm | output_fixing_parser


# Prompt for the phrase-extraction chain.
# Placeholders: {title}, {body} (supplied per article) and {format_instructions}
# (the output schema, bound when the chain is created). The last placeholder was missing,
# so the format instructions passed by the chain factory never reached the model.
carve_out_template = """
You are a financial analyst with expertise in corporate divestitures, carve-outs, and spin-offs.

Your task is to analyze the following financial news article and extract concise 3-6 word phrases that capture both the intent and action related to potential carve-outs or divestitures.

# ARTICLE TITLE:
{title}

# ARTICLE TEXT:
{body}

# EXAMPLES OF GOOD PHRASES:
Intent-focused phrases:
- "plan to divest"
- "intend to sell"
- "looking to dispose"
- "considering strategic options"
- "mulling asset sale"
- "weighing divestiture options"
- "strategic review of assets"
- "evaluating options for division"
- "exploring alternatives for subsidiary"
- "assessing future of unit"
- "not core to strategy"
- "exploring strategic options to"

Action-focused phrases:
- "divest non-core assets"
- "sell business unit"
- "spin-off division"
- "dispose of subsidiary"
- "carve-out regional operations"
- "exit non-strategic markets"
- "separate underperforming businesses"
- "monetize legacy assets"
- "offload international operations"
- "part ways with division"

# INSTRUCTIONS:
- Create concise 3-6 word phrases that combine both intent and action elements
- Focus on the exact language used in this specific article
- Extract phrases that would be useful for finding similar articles
- Determine if this article discusses a potential/actual carve-out or divestiture
- Rate confidence that this describes a real carve-out opportunity (1-5)

# OUTPUT FORMAT:
Extract phrases that are concise (3-6 words) but still capture the key aspects of the potential transaction.
Each phrase should be useful as a search term for finding similar articles about carve-outs.

{format_instructions}
"""

def create_carve_out_extraction_chain(llm):
    """
    Build the phrase-extraction chain for the given model.

    The chain renders `carve_out_template`, calls `llm` and parses the reply into a
    `CarveOutPhrases` instance. The parser's format instructions are passed along as
    the `format_instructions` partial variable.

    Args:
        llm: Chat model to run the extraction with.

    Returns:
        The runnable produced by `build_runnable_with_pydantic`.
    """
    schema_parser = PydanticOutputParser(pydantic_object=CarveOutPhrases)
    chain_options = {
        "template": carve_out_template,
        "pydantic_model": CarveOutPhrases,
        "llm": llm,
        "format_instructions": schema_parser.get_format_instructions(),
    }
    return build_runnable_with_pydantic(**chain_options)

In [29]:
from tqdm.auto import tqdm
# Example usage
def process_articles(df, llm):
    """
    Collect search phrases from every article labelled as a carve-out.

    Only rows with `is_about_carve_out == 1` are processed. Each one is sent, title
    and body, through the phrase-extraction chain, one article at a time, with a
    progress bar.

    Args:
        df: DataFrame with `title`, `body` and `is_about_carve_out` columns.
        llm: The language model used to build the extraction chain.

    Returns:
        A single set with the unique phrases gathered across all processed articles.
    """
    chain = create_carve_out_extraction_chain(llm)
    carve_out_rows = df[df['is_about_carve_out'] == 1]

    collected = set()
    progress = tqdm(carve_out_rows.iterrows(), total=len(carve_out_rows), desc="Extracting phrases")
    for _, article in progress:
        payload = {"title": article['title'], "body": article['body']}
        extraction = chain.invoke(payload)
        collected.update(extraction.search_phrases)

    return collected

In [30]:
# Extract the set of unique search phrases from the articles labelled as carve-outs in `df`.
a = process_articles(df, llm)

Extracting phrases: 100%|██████████| 135/135 [05:48<00:00,  2.58s/it]


In [31]:
a

{'Altice must sell Portuguese operator',
 'BASF sells Brazilian coatings business',
 'BP mulls Castrol sale',
 'BP to offload Castrol',
 'Barclays exploring payments unit sale',
 'Barclays to sell payments arm',
 'Barclays to sell payments stake',
 'BlackRock trims AIB ownership',
 'Brookfield may acquire payments unit',
 'De La Rue takeover agreement',
 'HSBC considering exiting international markets',
 'HSBC reviewing retail banking operations',
 'IPO of 25-30% stake',
 'KKR may exit Northumbrian',
 'KKR must divest Northumbrian stake',
 'KKR reviewing water asset overlap',
 'Kantar to be broken up',
 'Macquarie faces Exolum divestment',
 'NAB divests MLC stake',
 'NAB strategic exit from MLC',
 'NAB to sell MLC stake',
 'NatWest full government exit plan',
 'Nexi mulls unit sale',
 'Nippon Life expansion via acquisition',
 'Nippon Life to acquire MLC',
 'Santander plans business spinoff',
 'Schroders looking to sell business',
 'Schroders weighs exiting sub-scale markets',
 'SocGen 