In [None]:
!pip install OpenAI python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.1-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.1


In [None]:
import pandas as pd
import openai
import json
import time
import re
import os
from dotenv import load_dotenv


# Shape check for Yahoo Finance style symbols: plain US tickers (AAPL), share
# classes (BRK-B), exchange suffixes (0700.HK), indices (^GSPC), FX (EURUSD=X).
_TICKER_RE = re.compile(r'[A-Z0-9^][A-Z0-9.\-=]{0,11}')

# Last-resort pattern used when the model reply contains no parseable JSON.
_FALLBACK_TICKER_PATTERN = r'\b[A-Z]{1,5}\b'


def _cell_text(value):
    """Return a dataframe cell as stripped text, mapping None/NaN/NA to ''."""
    if value is None:
        return ''
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ''
    return str(value).strip()


def _as_list(value):
    """Coerce a JSON field to a list; a bare string becomes a one-item list."""
    if isinstance(value, str):
        return [value]
    return value if isinstance(value, list) else []


def _clean_tickers(raw_tickers):
    """Upper-case ticker candidates and keep only those shaped like a symbol."""
    cleaned = []
    for item in _as_list(raw_tickers):
        if not isinstance(item, str):
            continue
        symbol = item.strip().upper()
        # Require at least one letter so bare numbers are not taken as tickers.
        if _TICKER_RE.fullmatch(symbol) and any(ch.isalpha() for ch in symbol):
            cleaned.append(symbol)
    return cleaned


def _parse_extraction(raw_reply):
    """Turn the model's reply text into a (tickers, companies) pair of lists."""
    reply = (raw_reply or "").strip()

    try:
        payload = json.loads(reply)
    except json.JSONDecodeError:
        payload = None
        # Models often wrap JSON in markdown fences or a sentence; try the
        # outermost {...} span before giving up on structured output.
        start, end = reply.find('{'), reply.rfind('}')
        if start != -1 and end > start:
            try:
                payload = json.loads(reply[start:end + 1])
            except json.JSONDecodeError:
                payload = None
        if payload is None:
            # Best-effort fallback: pick up ticker-like upper-case tokens.
            return re.findall(_FALLBACK_TICKER_PATTERN, reply), []

    if not (isinstance(payload, dict) and 'tickers' in payload and 'companies' in payload):
        return [], []

    companies = [
        name.strip()
        for name in _as_list(payload['companies'])
        if isinstance(name, str) and name.strip()
    ]
    return _clean_tickers(payload['tickers']), companies


def extract_tickers_from_news(df, api_key, title_col='title', content_col='summary',
                              model="gpt-3.5-turbo", delay=0.5, client=None):
    """
    Extract Yahoo Finance ticker symbols and company names from news dataframe using OpenAI.

    Parameters:
    df (pd.DataFrame): News dataframe
    api_key (str): OpenAI API key. Used to build the client when `client` is
        not supplied. An empty or None key is passed on as None so the OpenAI
        SDK can fall back to the OPENAI_API_KEY environment variable.
    title_col (str): Column name for titles
    content_col (str): Column name for content/summary
    model (str): Chat completion model name
    delay (float): Seconds to pause between API calls (simple rate limiting);
        0 disables the pause
    client: Optional pre-built client exposing `chat.completions.create`.
        When given, `api_key` is not used.

    Returns:
    pd.DataFrame: Copy of the original dataframe with 'tickers' and 'companies'
        columns added (one list per row). Rows with no text, and rows whose
        API call fails, get empty lists; failures are printed, not raised.
    """

    # Initialize OpenAI client with the caller's key (never a hard-coded one)
    if client is None:
        client = openai.OpenAI(api_key=api_key or None)

    # Copy dataframe so the caller's frame is left untouched
    result_df = df.copy()
    total = len(df)
    ticker_lists = []
    company_lists = []

    print(f"Processing {total} news items...")

    # Count by position: index labels may be strings or non-contiguous, so
    # they cannot be used for progress arithmetic.
    for position, (label, row) in enumerate(df.iterrows(), start=1):
        # Combine title and content, treating missing values as empty text
        parts = (_cell_text(row.get(title_col, '')), _cell_text(row.get(content_col, '')))
        text = " ".join(part for part in parts if part)

        tickers, companies = [], []
        if text:
            try:
                # Call OpenAI API
                response = client.chat.completions.create(
                    model=model,
                    messages=[
                        {
                            "role": "system",
                            "content": "Extract publicly traded companies from news text. Return JSON with 'tickers' and 'companies' arrays. Example: {\"tickers\": [\"AAPL\", \"MSFT\"], \"companies\": [\"Apple Inc\", \"Microsoft Corporation\"]}. If no companies found, return {\"tickers\": [], \"companies\": []}."
                        },
                        {
                            "role": "user",
                            "content": f"Extract company tickers and names from this news text: {text[:1000]}"  # Limit text length
                        }
                    ],
                    max_tokens=100,
                    temperature=0
                )
                # message.content may be None; _parse_extraction handles that
                tickers, companies = _parse_extraction(response.choices[0].message.content)
            except Exception as e:
                # Deliberate best-effort: one failing row must not abort the batch
                print(f"Error processing row {label}: {e}")

        ticker_lists.append(tickers)
        company_lists.append(companies)

        # Progress update
        if position % 10 == 0:
            print(f"Processed {position}/{total} items")

        # Rate limiting: only after a real API call, and not after the last row
        if text and delay and delay > 0 and position < total:
            time.sleep(delay)

    # Add tickers and companies to dataframe
    result_df['tickers'] = ticker_lists
    result_df['companies'] = company_lists

    print(f"Extraction complete! Found companies in {sum(1 for t in ticker_lists if t)} out of {len(df)} items")
    return result_df


In [None]:
# Three hand-written headlines, enough to exercise the extractor end to end.
sample_data = {
    'title': [
        'Exxon Mobil Reports Strong Q3 Earnings',
        'Tesla Expands Supercharger Network',
        'Apple Announces New iPhone',
    ],
    'summary': [
        'Exxon Mobil Corporation exceeded expectations...',
        'Tesla Inc. continues infrastructure expansion...',
        'Apple Inc. unveiled its latest smartphone...',
    ],
}

df = pd.DataFrame.from_dict(sample_data)

# Pull settings from a local .env file into the process environment.
load_dotenv()

# The key is read from the environment rather than written into the notebook.
API_KEY = os.environ.get("API_KEY")
result = extract_tickers_from_news(df, api_key=API_KEY)

# Show one block per headline, separated by a rule.
print("\nResults:")
for _, news_row in result.iterrows():
    print(
        f"Title: {news_row['title']}",
        f"Tickers: {news_row['tickers']}",
        f"Companies: {news_row['companies']}",
        "-" * 40,
        sep="\n",
    )

Processing 3 news items...
Extraction complete! Found companies in 3 out of 3 items

Results:
Title: Exxon Mobil Reports Strong Q3 Earnings
Tickers: ['XOM']
Companies: ['Exxon Mobil Corporation']
----------------------------------------
Title: Tesla Expands Supercharger Network
Tickers: ['TSLA']
Companies: ['Tesla Inc.']
----------------------------------------
Title: Apple Announces New iPhone
Tickers: ['AAPL']
Companies: ['Apple Inc']
----------------------------------------
