In [25]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
from openai import OpenAI
import feedparser
import time
import os
import pandas as pd

In [27]:
from dotenv import load_dotenv


# Load variables from a local .env file (if present) *before* reading the
# environment, so this check also passes on a fresh kernel (Restart & Run All)
# when the key is only defined in .env.
load_dotenv()

# Get OpenAI API key from environment variables
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY not found in environment variables")

In [28]:
# RSS feed that the news summaries are read from.
feed_url = 'https://pythoninvest.com/rss-feed-612566707351.xml'
# NOTE(review): output_file_path is not referenced in any of the visible cells — confirm it is still needed.
output_file_path = 'data/input_news_feed.json'

# Parse the feed at feed_url; later cells read its items from feed['entries'].
feed = feedparser.parse(feed_url)

In [29]:
# Plain print() arguments instead of an f-string that reuses the outer quote
# character inside `{...}` (a SyntaxError before Python 3.12); the printed
# text is identical.
print('Length of feed entries:', len(feed['entries']))

Length of feed entries: 49


In [30]:
# Plain print() arguments instead of an f-string that reuses the outer quote
# character inside `{...}` (a SyntaxError before Python 3.12); the printed
# text is identical.
print('Keys of one feed entry:', feed['entries'][0].keys())

Keys of one feed entry: dict_keys(['title', 'title_detail', 'links', 'link', 'published', 'published_parsed', 'authors', 'author', 'author_detail', 'tags', 'summary', 'summary_detail', 'turbo_content'])


In [31]:
# # As long as the RSS feed format stays the same, the input to this method is an element of the Python dict `feed`.
# # That is, the input to this method is a single element of type `dict`.
# def extract_turbo_content(entry_element):
#     """
#     Extracts turbo:content from the raw XML
#     """
#     try:
#         if entry_element:  
#             content = entry_element.get('turbo_content')  
#         return content if content else None

#     except (AttributeError, KeyError, TypeError) as e:
#         # AttributeError: If entry_element doesn't support dictionary operations
#         # KeyError: If 'turbo_content' doesn't exist
#         # TypeError: If entry_element is not of the expected type
#         print(f"Error extracting turbo content: {str(e)}")

In [32]:
feed['entries'][0]['link']


'https://pythoninvest.com/tpost/yk09rupzv1-week-17-24-july-2023'

In [33]:
feed['entries'][0]['turbo_content']



In [34]:
# Prompt for the extraction step. `{content}` is the only str.format()
# placeholder; the literal braces of the JSON examples are doubled (`{{` / `}}`)
# so that they survive formatting. The JSON examples are kept well-formed
# (commas between fields, no trailing commas) because the model is asked to
# reply with JSON that is parsed afterwards.
prompt_template = '''Expert Web Scraper.
HTML Content: {content}

Perform different types of text extraction:
1) Extract individual news text AS IT IS from given HTML.
HTML Content format:
INDIVIDUAL NEWS SUMMARY
Start date for the articles: <start_date>; End date for the articles: <end_date>
NEWS SUMMARY for (<ticker>, <count>), which changed on <growth>% last trading day:
<text>

You need to extract all fields in <> :
- Date ranges
- mentioned ticker 
- news count
- growth percentage
- news for the ticker

Format:
{{
  "content": [
    {{
      "type": "individual",
      "start_date": <start date for articles>,
      "end_date": <end date for articles>,
      "ticker": <ticker symbol from news>,
      "count": <articles count from news>,
      "growth": <growth %>,
      "text": <news for the ticker from html>
    }},
    // repeat for all news
  ]
}}

2) Extract market news 1 day or 1 week text AS IT IS from given HTML:
HTML Content format:
[<model_name> <period> summary] MARKET NEWS SUMMARY ('multiple_tickers', <news_count> ) -- i.e. <news_count> news summary for the last 24 hours before <end_date> UTC time:

Extract text AS IT IS from given HTML:
- <model_name>
- <period>
- <news_count>
- <news_summary>


Output JSON format:
{{
  "content": [
    {{
      "type": "market_"+<period>,
      "end_date": <end_date>,
      "start_date": <24 hours before end_date>,
      "ticker": "multiple_tickers",
      "count": <news_count>,
      "model": <model_name>,
      "text": <news_summary>
    }}
  ]
}}


Constraints:
Return JSON only. 
'''

In [35]:
load_dotenv()  # Load variables from .env file


True

In [36]:

# Implement error handling and retries for API calls.
# Increase the timeout for API requests.
# Implement checkpointing to save progress periodically, allowing you to resume from where it left off in case of errors.
# Consider processing the data in smaller batches to reduce the impact of individual timeouts.

# OpenAI client setup with default settings; the 5-minute timeout is applied per request inside llm() below
client = OpenAI()
    # base_url='https://api.openai.com/v1/',
    # api_key=os.getenv("OPENAI_API_KEY"),
    # timeout=5 * 60.0  # Increase timeout to 5 minutes
# )

MAX_RETRIES = 3   # total number of attempts per request (first try included)
RETRY_DELAY = 5   # seconds to sleep between two attempts
BATCH_SIZE = 5    # NOTE(review): not referenced in the visible cells

def llm(prompt, model):
    """Send `prompt` as a single user message to `model` and return the response.

    The request is issued through the module-level `client` with temperature 0
    and a 5-minute per-request timeout. A failing call is retried until
    MAX_RETRIES attempts have been made, sleeping RETRY_DELAY seconds between
    attempts.

    Args:
        prompt: Text sent as the content of the user message.
        model: Model name passed to the chat completions call.

    Returns:
        Whatever `client.chat.completions.create` returns for the first
        successful attempt.

    Raises:
        ValueError: If MAX_RETRIES is less than 1, i.e. no attempt would be
            made (previously this case silently returned None).
        Exception: The error raised by the final attempt, re-raised unchanged.
    """
    if MAX_RETRIES < 1:
        raise ValueError("MAX_RETRIES must be at least 1")

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            return client.chat.completions.create(
                model=model,
                temperature=0.0,
                timeout=5*60,
                messages=[{"role": "user", "content": prompt}]
            )
        except Exception as e:
            if attempt == MAX_RETRIES:
                # Out of attempts: bare `raise` keeps the original traceback.
                raise
            # Include the error itself so repeated failures can be diagnosed.
            print(
                f"Attempt {attempt} failed ({type(e).__name__}: {e}). "
                f"Retrying in {RETRY_DELAY} seconds..."
            )
            time.sleep(RETRY_DELAY)

In [37]:
prompt = prompt_template.format(content = feed['entries'][0]['turbo_content'])


In [38]:
# Result: 
extracted = llm(prompt=prompt, model="gpt-4o-mini")

In [39]:
extracted



In [40]:
msg = extracted.choices[0].message.content

In [41]:
print(msg)

```json
{
  "content": [
    {
      "type": "individual",
      "start_date": "2023-07-17",
      "end_date": "2023-07-24",
      "ticker": "TSLA",
      "count": 42,
      "growth": "2.43%",
      "text": "Billionaire investor Chamath Palihapitiya believes that Tesla is experiencing its \"iPhone moment\" and is on the verge of a major breakthrough. Meanwhile, investor David Trainer argues that Tesla is overvalued by over 1,000% due to its \"disconnected\" fundamentals. Tesla's second-quarter earnings report showed a rise in sales but also highlighted concerns about profitability and margins. Tesla's stock price dropped around 9% in response to the report. Despite mixed opinions, some analysts remain bullish on Tesla's future growth potential."
    },
    {
      "type": "individual",
      "start_date": "2023-07-17",
      "end_date": "2023-07-24",
      "ticker": "MSFT",
      "count": 11,
      "growth": "0.06%",
      "text": "Indian company Birlasoft has partnered with Microsoft 

In [42]:
# Remove only the Markdown code fence that wraps the reply (```json ... ```).
# Triple backticks that occur inside the JSON text itself are left untouched,
# unlike a global str.replace, which would strip them from the content as well.
msg2 = msg.strip().removeprefix("```json").removeprefix("```").removesuffix("```").strip()

In [43]:
import json
data = json.loads(msg2)


In [44]:
# Converting to a DataFrame
df = pd.DataFrame(data["content"])

In [45]:
df.head(20)

Unnamed: 0,type,start_date,end_date,ticker,count,growth,text,model
0,individual,2023-07-17,2023-07-24,TSLA,42,2.43%,Billionaire investor Chamath Palihapitiya beli...,
1,individual,2023-07-17,2023-07-24,MSFT,11,0.06%,Indian company Birlasoft has partnered with Mi...,
2,individual,2023-07-17,2023-07-24,AAPL,11,0.51%,Warren Buffett's investment in Apple has resul...,
3,individual,2023-07-17,2023-07-24,AMZN,11,-0.94%,Amazon is planning to provide high-speed inter...,
4,individual,2023-07-17,2023-07-24,NFLX,19,-0.23%,Netflix's stock has been fluctuating after the...,
5,individual,2023-07-17,2023-07-24,VZ,13,1.39%,Verizon Business and the VA Palo Alto Health C...,
6,individual,2023-07-17,2023-07-24,CVNA,21,-2.46%,"Carvana, an online used car dealer, announced ...",
7,individual,2023-07-17,2023-07-24,DJIA,8,-0.58%,The stock market rally is expected to continue...,
8,individual,2023-07-17,2023-07-24,SPY,7,0.44%,The news includes various headlines related to...,
9,market_1_day,2023-07-23,2023-07-24,multiple_tickers,261,,- JPMorgan's strong earnings report is expecte...,market


In [46]:
# save to File

# import os
# output_file_path = '../data/news_feed.parquet'
# os.makedirs("../data", exist_ok=True)
# df.to_parquet(output_file_path, compression="brotli")