In [50]:
%load_ext autoreload

%autoreload 2
from openai import OpenAI
import feedparser
import time
import os
import pandas as pd
from dotenv import load_dotenv
import yfinance as yf
from datetime import datetime, timedelta
import numpy as np
from tqdm import tqdm
import requests
import urllib.request

# Get OpenAI API key from environment variables.
# Load the .env file first: a key that lives only in .env is invisible to
# os.getenv() until load_dotenv() has run, so without this call the check
# below fails on a fresh kernel even though the key is configured.
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY not found in environment variables")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [51]:
feed_url = 'https://pythoninvest.com/rss-feed-612566707351.xml'
feed_content = None  # Filled in by the fetch cells; stays None if nothing could be retrieved

# Set up headers to mimic a browser request.
# 'br' is deliberately not advertised in Accept-Encoding: requests decodes
# Brotli only when an optional brotli/brotlicffi package is installed, and
# urllib performs no content decoding at all, so a Brotli-compressed reply
# could reach the parser as undecoded bytes.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
}

# Create a urllib opener that carries the same headers, and make feedparser
# identify itself with the same User-Agent string.
opener = urllib.request.build_opener()
opener.addheaders = list(headers.items())
feedparser.USER_AGENT = headers['User-Agent']

# First get the raw content
# requests applies no timeout by default, so a stalled server would block this
# cell forever; bound the wait explicitly.
REQUEST_TIMEOUT_SECONDS = 30

try:
    response = requests.get(feed_url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()  # Raise an exception for bad status codes
    print("Feed content retrieved successfully")
    print("Content length:", len(response.text))

    # Now parse with feedparser using the raw content
    feed = feedparser.parse(response.text)

    print("\nFeed parsing results:")
    print(f"Number of entries: {len(feed.entries)}")
    if len(feed.entries) > 0:
        first_entry = feed.entries[0]
        print("\nFirst entry available fields:", first_entry.keys())
        print("\nFirst entry content:")
        # Prefer the richest field available, falling back to the raw feed text
        if 'turbo_content' in first_entry:
            print("\nturbo_content found")
            feed_content = first_entry['turbo_content']
        elif 'content' in first_entry:
            print("\ncontent found")
            feed_content = first_entry['content'][0]['value']
        elif 'description' in first_entry:
            print("\ndescription found")
            feed_content = first_entry['description']
        else:
            print("\nNo content found in expected fields")
            feed_content = response.text

        print("\nContent to process:")
        print(feed_content[:500], "...")
except Exception as e:
    print(f"Error: {str(e)}")
    # Only HTTP-level failures carry a response object; connection errors and
    # timeouts have e.response set to None, so there are no details to show.
    failed_response = getattr(e, 'response', None)
    if isinstance(e, requests.exceptions.RequestException) and failed_response is not None:
        print("\nResponse details:")
        print(f"Status code: {failed_response.status_code}")
        print(f"Headers: {dict(failed_response.headers)}")

Feed content retrieved successfully
Content length: 1002823

Feed parsing results:
Number of entries: 58

First entry available fields: dict_keys(['title', 'title_detail', 'links', 'link', 'published', 'published_parsed', 'authors', 'author', 'author_detail', 'tags', 'summary', 'summary_detail', 'turbo_content'])

First entry content:

turbo_content found

Content to process:
<header><h1>Week 17-24 July 2023</h1></header><figure><img src="https://static.tildacdn.com/tild6131-3163-4135-a565-356661323733/the-broad-industry-o.svg"/></figure><div class="t-redactor__embedcode"><script async src="https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js?client=ca-pub-5845276189467216"
     crossorigin="anonymous"></script></div><div class="t-redactor__text"><strong>DISCLAIMER</strong><br />The content provided below was generated by AI (OpenAI's ChatGPT) using titles  ...


In [52]:
# Try direct feedparser approach with custom headers as fallback
if feed_content is None:
    try:
        print("Trying direct feedparser approach with custom headers...")
        # Install our custom opener. This only changes what plain
        # urllib.request.urlopen() calls use, so the headers are ALSO handed to
        # feedparser explicitly through request_headers below.
        urllib.request.install_opener(opener)
        # Accept-Encoding is left out so that feedparser only advertises
        # compression schemes it can decode itself.
        fallback_headers = {
            name: value
            for name, value in headers.items()
            if name.lower() != 'accept-encoding'
        }
        feed = feedparser.parse(feed_url, request_headers=fallback_headers)

        print(f"Number of entries: {len(feed.entries)}")
        if len(feed.entries) > 0:
            first_entry = feed.entries[0]
            print("\nFirst entry available fields:", first_entry.keys())
            print("\nFirst entry content:")
            if 'turbo_content' in first_entry:
                print("\nturbo_content found")
                feed_content = first_entry['turbo_content']
            elif 'content' in first_entry:
                print("\ncontent found")
                feed_content = first_entry['content'][0]['value']
            elif 'description' in first_entry:
                print("\ndescription found")
                feed_content = first_entry['description']
            else:
                print("\nNo content found in expected fields")
                # Last resort: reuse the raw body from the earlier requests call, if it exists
                feed_content = response.text if 'response' in locals() else None

            if feed_content is not None:
                print("\nContent to process:")
                print(feed_content[:500], "...")
            else:
                print("\nNo content available to process")
    except Exception as e:
        print(f"Error with direct feedparser: {str(e)}")

In [53]:
# Clean and prepare content for processing
if feed_content is not None:
    # Normalize Windows (\r\n) and bare carriage-return (\r) line endings to \n
    feed_content = feed_content.replace('\r\n', '\n').replace('\r', '\n')
    # The former `.replace('"', '\"')` step was dropped: in an ordinary string
    # literal '\"' is the same one-character string as '"', so it replaced every
    # quote with itself. No escaping is needed either, because str.format()
    # does not re-scan the value substituted into the prompt.
    feed_content = feed_content.strip()  # Remove leading/trailing whitespace

    print("Content cleaned and prepared for processing")
    print("Content length after cleaning:", len(feed_content))

Content cleaned and prepared for processing
Content length after cleaning: 16416


In [54]:
# Extraction prompt sent to the chat model. This is a str.format() template:
#   - {content} is the only placeholder; it receives the cleaned feed HTML.
#   - Every literal brace in the JSON examples is doubled ({{ and }}) so that
#     str.format() emits a single brace instead of treating it as a field.
# NOTE(review): the JSON examples carry `//` annotations and a
# `"market_"+period` expression, neither of which is valid JSON, while the
# constraints ask for "valid JSON only". They read as hints to the model rather
# than output to copy -- confirm the model does not echo them.
prompt_template = '''Expert Web Scraper.

HTML Content: {content}

Perform different types of text extraction:

1) Extract individual news text AS IT IS from given HTML.

HTML Content format:
INDIVIDUAL NEWS SUMMARY
Start date for the articles: <start_date>; End date for the articles: <end_date>
NEWS SUMMARY for (<ticker>, <count>), which changed on <growth>% last trading day:
<text>

You need to extract the actual values from the HTML content for each field marked with <>. Do not return placeholder values.
For example, if the HTML contains "Start date for the articles: 2023-07-17; End date for the articles: 2023-07-24", 
use these actual dates in your JSON output, not placeholders like <start_date> or <end_date>.

Required fields to extract:
- Date ranges (in YYYY-MM-DD format)
- Mentioned ticker (actual stock symbol)
- News count (numeric value)
- Growth percentage (numeric value)
- News text (actual news content)

Format:
{{
  "content": [
    {{
      "type": "individual",
      "start_date": "YYYY-MM-DD",  // Use actual date from content
      "end_date": "YYYY-MM-DD",    // Use actual date from content
      "ticker": "SYMBOL",          // Use actual ticker from content
      "count": 123,                // Use actual count from content
      "growth": 12.34,             // Use actual growth % from content
      "text": "actual news text"   // Use actual text from content
    }},
    // repeat for all news items found
  ]
}}

2) Extract market news 1 day or 1 week text AS IT IS from given HTML:
HTML Content format:
[<model_name> <period> summary] MARKET NEWS SUMMARY ('multiple_tickers', <news_count> ) -- i.e. <news_count> news summary for the last 24 hours before <end_date> UTC time:

Extract the actual values for:
- Model name (actual name used)
- Period (actual period mentioned)
- News count (actual numeric value)
- News summary (actual text content)

Output JSON format:
{{
  "content": [
    {{
      "type": "market_"+period,     // Concatenate with actual period value
      "end_date": "YYYY-MM-DD",    // Use actual date from content
      "start_date": "YYYY-MM-DD",  // Calculate 24h before end_date
      "ticker": "multiple_tickers",
      "count": 123,                // Use actual count from content
      "model": "actual_model_name",// Use actual model name from content
      "text": "actual summary"     // Use actual text from content
    }},
  ]
}}

Constraints:
1. Return valid JSON only
2. Use actual values from the content, not placeholders
3. Ensure dates are in YYYY-MM-DD format
4. Ensure numeric values (count, growth) are numbers, not strings
'''

In [55]:
# Retry settings for the OpenAI calls: number of attempts and the pause (in
# seconds) between them. BATCH_SIZE is not referenced in the cells shown here.
MAX_RETRIES, RETRY_DELAY, BATCH_SIZE = 3, 5, 5

# Bring variables from the .env file into the process environment before the
# client is constructed.
load_dotenv()

# OpenAI client built with its default settings; the long request timeout is
# supplied per call inside llm(), not here.
client = OpenAI()

def llm(prompt, model):
    """Send ``prompt`` as a single user message and return the chat completion.

    The request is attempted up to ``MAX_RETRIES`` times (always at least
    once), sleeping ``RETRY_DELAY`` seconds between attempts.

    Args:
        prompt: Full prompt text, sent as the content of one "user" message.
        model: Model name passed through to the API.

    Returns:
        The response object returned by ``client.chat.completions.create``.

    Raises:
        Exception: whatever the final attempt raised, re-raised unchanged once
            every attempt has failed.
    """
    # Guard against a zero/negative MAX_RETRIES, which previously skipped the
    # loop entirely and made the function return None without calling the API.
    attempts = max(1, MAX_RETRIES)
    for attempt in range(1, attempts + 1):
        try:
            return client.chat.completions.create(
                model=model,
                temperature=0.0,
                timeout=5*60,  # seconds; generous because the prompt embeds a whole feed entry
                messages=[{"role": "user", "content": prompt}],
            )
        except Exception as e:
            if attempt == attempts:
                raise  # bare raise keeps the original traceback intact
            # Surface the reason so repeated failures are diagnosable
            print(f"Attempt {attempt} failed ({type(e).__name__}: {e}). Retrying in {RETRY_DELAY} seconds...")
            time.sleep(RETRY_DELAY)

In [56]:
if feed_content is None:
    print("No feed content available to process")
else:
    print("Processing feed content...")
    # Drop the cleaned feed text into the template's {content} slot, then
    # hand the finished prompt to the model.
    prompt = prompt_template.format(content=feed_content)
    extracted = llm(
        prompt=prompt,
        model="gpt-4o-mini",
    )

Processing feed content...


In [57]:
# Only runs when an earlier cell bound `extracted` (i.e. the model was called).
if 'extracted' in locals():
    # Text of the first completion choice; printed raw for inspection
    msg = extracted.choices[0].message.content
    print(msg)

```json
{
  "content": [
    {
      "type": "individual",
      "start_date": "2023-07-17",
      "end_date": "2023-07-24",
      "ticker": "TSLA",
      "count": 42,
      "growth": -2.43,
      "text": "Billionaire investor Chamath Palihapitiya believes that Tesla is experiencing its \"iPhone moment\" and is on the verge of a major breakthrough. Meanwhile, investor David Trainer argues that Tesla is overvalued by over 1,000% due to its \"disconnected\" fundamentals. Tesla's second-quarter earnings report showed a rise in sales but also highlighted concerns about profitability and margins. Tesla's stock price dropped around 9% in response to the report. Despite mixed opinions, some analysts remain bullish on Tesla's future growth potential."
    },
    {
      "type": "individual",
      "start_date": "2023-07-17",
      "end_date": "2023-07-24",
      "ticker": "MSFT",
      "count": 11,
      "growth": 0.06,
      "text": "Indian company Birlasoft has partnered with Microsoft to es

In [58]:
import json  # kept local to this cell, as before


def extract_json_text(reply):
    """Return the JSON object embedded in a model reply.

    Takes the span from the first ``{`` to the last ``}``, which drops
    Markdown code fences and any prose around the object without touching
    backticks that occur inside JSON string values.

    Args:
        reply: Raw text returned by the model.

    Returns:
        The substring holding the outermost JSON object.

    Raises:
        ValueError: if ``reply`` contains no ``{ ... }`` span.
    """
    start = reply.find('{')
    end = reply.rfind('}')
    if start == -1 or end < start:
        raise ValueError("No JSON object found in model reply")
    return reply[start:end + 1]


def content_to_frame(payload):
    """Turn the parsed reply into a DataFrame, one row per ``payload["content"]`` item.

    A ``growth`` column (percent) is replaced by ``growth_last_day`` (fraction).
    Values are coerced with ``pd.to_numeric`` so numbers that arrive as strings
    still convert; anything unparseable becomes NaN.

    Args:
        payload: Dict with a ``"content"`` list of record dicts.

    Returns:
        pandas.DataFrame built from the records.
    """
    frame = pd.DataFrame(payload["content"])
    if 'growth' in frame.columns:
        frame['growth_last_day'] = pd.to_numeric(frame['growth'], errors='coerce') / 100
        frame = frame.drop('growth', axis=1)
    return frame


if 'msg' in locals():
    msg2 = extract_json_text(msg)
    data = json.loads(msg2)

    # Converting to a DataFrame
    df = content_to_frame(data)

    # A bare expression inside an `if` block is not rendered by the notebook,
    # so show the preview explicitly (display is provided by IPython).
    display(df.head(20))

In [59]:
# Function to calculate returns
def calculate_returns(ticker_symbol, end_date, days=5):
    """Return the price change of a ticker over the last ``days`` trading rows.

    Downloads daily history from ``end_date - (days + 5)`` calendar days up to
    and including ``end_date``. The last available close is compared with the
    close ``days`` rows earlier, or with the first row when the download holds
    too few rows.

    Args:
        ticker_symbol: Symbol passed to ``yf.Ticker``.
        end_date: Last date to include; a string is parsed with ``pd.to_datetime``.
        days: Number of rows to look back from the last close.

    Returns:
        ``(return, end_price)`` where return is ``(end - start) / start``, or
        ``(None, None)`` when fewer than two rows are available or the lookup fails.
    """
    try:
        # Convert end_date to datetime if it's string
        if isinstance(end_date, str):
            end_date = pd.to_datetime(end_date)

        # Add one day so end_date itself is included in the download
        download_end = end_date + timedelta(days=1)
        # Pad the window so weekends/holidays still leave enough trading days
        download_start = end_date - timedelta(days=days + 5)

        hist = yf.Ticker(ticker_symbol).history(start=download_start, end=download_end)
        if len(hist) < 2:
            return None, None

        closes = hist['Close']
        end_price = closes.iloc[-1]
        start_price = closes.iloc[-days-1] if len(closes) > days else closes.iloc[0]

        return (end_price - start_price) / start_price, end_price
    except Exception as e:
        # Stay best-effort, but say why the lookup failed instead of hiding it
        print(f"calculate_returns failed for {ticker_symbol} at {end_date}: {type(e).__name__}: {e}")
        return None, None


if 'df' in locals():
    # S&P 500 returns depend only on end_date, so look each date up once
    # instead of repeating the same download for every row.
    market_return_cache = {}

    # Calculate growth metrics for each row
    market_data = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Calculating market data"):
        end_date = pd.to_datetime(row['end_date'])

        # Calculate market (S&P 500) returns
        market_returns = market_return_cache.get(end_date)
        if market_returns is None:
            market_returns = (
                calculate_returns('^GSPC', end_date, days=1)[0],
                calculate_returns('^GSPC', end_date, days=5)[0],
            )
            # Cache only complete results so a failed lookup is retried on the next row
            if all(value is not None for value in market_returns):
                market_return_cache[end_date] = market_returns
        market_daily_return, market_weekly_return = market_returns

        # Initialize weekly return and growth above market
        weekly_return = None
        growth_above_market = None

        # For individual stocks, calculate their specific weekly return
        if row['type'] == 'individual':
            weekly_return, _price = calculate_returns(row['ticker'], end_date, days=5)
            if weekly_return is not None and market_weekly_return is not None:
                growth_above_market = weekly_return - market_weekly_return
        else:  # For market_1_day or market_1_week, use market weekly return
            weekly_return = market_weekly_return
            growth_above_market = 0 if market_weekly_return is not None else None

        market_data.append({
            'weekly_return': weekly_return,
            'market_daily_return': market_daily_return,
            'market_weekly_return': market_weekly_return,
            'growth_above_market': growth_above_market
        })

    # Add market data to DataFrame. Rows are aligned on df's own index, and any
    # market columns left over from a previous run are dropped first so that
    # re-running this cell does not produce duplicate column names.
    market_df = pd.DataFrame(market_data, index=df.index)
    df = pd.concat([df.drop(columns=market_df.columns, errors='ignore'), market_df], axis=1)

    # Keep returns as float values (no percentage string conversion).
    # A bare expression inside an `if` block is not rendered by the notebook,
    # so show the preview explicitly (display is provided by IPython).
    display(df.tail())

Calculating market data: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 36.34it/s]


In [60]:
# Column dtypes and non-null counts of the frame after the market columns were added
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   type                  11 non-null     object 
 1   start_date            11 non-null     object 
 2   end_date              11 non-null     object 
 3   ticker                11 non-null     object 
 4   count                 11 non-null     int64  
 5   text                  11 non-null     object 
 6   model                 2 non-null      object 
 7   growth_last_day       9 non-null      float64
 8   weekly_return         11 non-null     float64
 9   market_daily_return   11 non-null     float64
 10  market_weekly_return  11 non-null     float64
 11  growth_above_market   11 non-null     float64
dtypes: float64(5), int64(1), object(6)
memory usage: 1.2+ KB


In [61]:
# Show the whole frame: a bare last expression is rendered as a table
df

Unnamed: 0,type,start_date,end_date,ticker,count,text,model,growth_last_day,weekly_return,market_daily_return,market_weekly_return,growth_above_market
0,individual,2023-07-17,2023-07-24,TSLA,42,Billionaire investor Chamath Palihapitiya beli...,,-0.0243,-0.073421,0.004034,0.007042,-0.080463
1,individual,2023-07-17,2023-07-24,MSFT,11,Indian company Birlasoft has partnered with Mi...,,0.0006,-0.001793,0.004034,0.007042,-0.008836
2,individual,2023-07-17,2023-07-24,AAPL,11,Warren Buffett's investment in Apple has resul...,,0.0051,-0.006392,0.004034,0.007042,-0.013434
3,individual,2023-07-17,2023-07-24,AMZN,11,Amazon is planning to provide high-speed inter...,,-0.0094,-0.035639,0.004034,0.007042,-0.042682
4,individual,2023-07-17,2023-07-24,NFLX,19,Netflix's stock has been fluctuating after the...,,-0.0023,-0.048172,0.004034,0.007042,-0.055215
5,individual,2023-07-17,2023-07-24,VZ,13,Verizon Business and the VA Palo Alto Health C...,,0.0139,0.080102,0.004034,0.007042,0.07306
6,individual,2023-07-17,2023-07-24,CVNA,21,"Carvana, an online used car dealer, announced ...",,-0.0246,0.265809,0.004034,0.007042,0.258767
7,individual,2023-07-17,2023-07-24,DJIA,8,The stock market rally is expected to continue...,,-0.0058,0.005208,0.004034,0.007042,-0.001834
8,individual,2023-07-17,2023-07-24,SPY,7,The news includes various headlines related to...,,0.0044,0.007453,0.004034,0.007042,0.000411
9,market_1_day,2023-07-23,2023-07-24,multiple_tickers,261,- JPMorgan's strong earnings report is expecte...,market,,0.007042,0.004034,0.007042,0.0


In [62]:
# SAVE TO PARQUET - commented out for now, because the bulk scrape is saved by the 02_get_content_data_flattened.py script

# if 'df' in locals():
#     # Save to File
#     output_file_path = 'data/news_feed.parquet'
#     os.makedirs("data", exist_ok=True)
#     df.to_parquet(output_file_path, compression="brotli")
#     print(f"Data saved to {output_file_path}")