In [2]:
%load_ext autoreload
%autoreload 2

In [3]:

from openai import OpenAI
import feedparser
import time
import json

In [4]:
feed_url = 'https://pythoninvest.com/rss-feed-612566707351.xml'
output_file_path = 'data/input_news_feed.json'

feed = feedparser.parse(feed_url)

In [5]:
type(feed) # feed a dictionary


feedparser.util.FeedParserDict

In [6]:
feed['entries'][0].keys()

dict_keys(['title', 'title_detail', 'links', 'link', 'published', 'published_parsed', 'authors', 'author', 'author_detail', 'tags', 'summary', 'summary_detail', 'turbo_content'])

In [7]:
feed['entries'][0]['link']

# we're mostly interested in turbo_content

'https://pythoninvest.com/tpost/yk09rupzv1-week-17-24-july-2023'

In [8]:
# OpenAI client setup with increased timeout
# Read on Ollama - OpenAI compatibility: https://github.com/ollama/ollama/blob/main/docs/openai.md
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',
    timeout = 5 * 60.0  # Increase timeout to 3*60 seconds
)

MAX_RETRIES = 3
RETRY_DELAY = 5
BATCH_SIZE = 1


In [9]:
def llm(prompt, model):
    for attempt in range(MAX_RETRIES):
        try:
            response = client.chat.completions.create(
                model=model,
                temperature=0.0,
                messages=[{"role": "user", "content": prompt}]
            )
            return response
        except Exception as e:
            if attempt < MAX_RETRIES - 1:
                print(f"Attempt {attempt + 1} failed. Retrying in {RETRY_DELAY} seconds...")
                time.sleep(RETRY_DELAY)
            else:
                raise e


In [10]:
# LIST OF CURRENT LOCAL MODELS
!ollama list

NAME                        ID              SIZE      MODIFIED     
codellama:70b               e59b580dfce7    38 GB     2 days ago      
codellama:latest            8fdf8f752f6e    3.8 GB    2 days ago      
phi3.5:latest               3b387c8dd9b7    2.2 GB    2 months ago    
mixtral:8x22b               e8479ee1cb51    79 GB     2 months ago    
phi3:14b                    cf611a26b048    7.9 GB    2 months ago    
deepseek-coder-v2:latest    8577f96d693e    8.9 GB    2 months ago    
qwen2:72b                   14066dfa503f    41 GB     2 months ago    
mistral-large:latest        0ca7dfa0bf06    69 GB     2 months ago    
gemma2:27b                  53261bc9c192    15 GB     2 months ago    
llama3:70b                  786f3184aec0    39 GB     2 months ago    
llama3.1:latest             91ab477bec9d    4.7 GB    2 months ago    


In [11]:
# example calling a local LLM - is it working at all?
res = llm(prompt = "Tell me a joke",model="llama3.1:latest")
print(res.choices[0].message.content)

Here's one:

What do you call a fake noodle?

An impasta!

Hope that made you laugh! Do you want to hear another one?


* Inspired by this code:
https://github.com/alexeygrigorev/fitness-assistant/blob/main/notebooks/evaluation-data-generation.ipynb

In [12]:
ind_prompt_template = '''Expert Web Scraper.
HTML Content: {content}

Extract text AS IT IS from given HTML:
- Date ranges
- mentioned ticker 
- news count
- growth percentage
- news for the ticker

Format:
{{
  "content": [
    {{
      "type": "individual",
      "start_date": <start date for articles>,
      "end_date": <end date for articles>,
      "ticker": <ticker symbol from news>,
      "count": <articles count from news>,
      "growth": <growth %>
      "text": <news for the ticker from html>,
    }},
    // repeat for all news
  ]
}}

Constraints:
Return JSON only. 
'''

In [13]:
# Extract individual news
def extract_ind_news_json(model, ind_prompt_template, ind_html):
    print(f'')
    ind_news_prompt = ind_prompt_template.format(content=ind_html)
    print(f"----model used {model} \n ------\n prompt \n{ind_news_prompt}------\n ind_html  {ind_html}------")
    
    start = time.time()
    ind_scraped_data = llm(prompt = ind_news_prompt, model=model)
    print(f"time taken for llm is {time.time() - start}")
    
    ind_news_json = json.loads(ind_scraped_data.choices[0].message.content)
    
    # print(ind_news_json)
    return ind_news_json

In [None]:
# Let's try it locally
# scraped_data = llm(prompt = prompt, model="llama3:70b")

# Result: 
ind_news = extract_ind_news_json(model="llama3.1:latest", 
                                 ind_prompt_template=ind_prompt_template, 
                                 ind_html = feed['entries'][0]['turbo_content'])


----model used llama3.1:latest 
 ------
 prompt 
Expert Web Scraper.
HTML Content: <header><h1>Week 17-24 July 2023</h1></header><figure><img src="https://static.tildacdn.com/tild6131-3163-4135-a565-356661323733/the-broad-industry-o.svg"/></figure><div class="t-redactor__embedcode"><script async src="https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js?client=ca-pub-5845276189467216"

Extract text AS IT IS from given HTML:
- Date ranges
- mentioned ticker 
- news count
- growth percentage
- news for the ticker

Format:
{
  "content": [
    {
      "type": "individual",
      "start_date": <start date for articles>,
      "end_date": <end date for articles>,
      "ticker": <ticker symbol from news>,
      "count": <articles count from news>,
      "growth": <growth %>
      "text": <news for the ticker from html>,
    },
    // repeat for all news
  ]
}

Constraints:
Return JSON only. 
------
 ind_html  <header><h1>Week 17-24 July 2023</h1></header><figure><img src="https://

In [54]:
print(ind_news)

Here is the JSON file contents with individual news summaries:

```
[
    {
        "company": "Pentair",
        "summary": "Performance in the second quarter is expected to be affected by inflated costs and supply chain issues."
    },
    {
        "company": "Robert Half",
        "summary": "Second-quarter earnings and revenues are expected to decline sequentially."
    },
    {
        "company": "VeriSign",
        "summary": "Second-quarter performance is expected to benefit from continued growth in domain name registrations."
    },
    {
        "company": "AMC",
        "summary": "The court ruling on preferred equity shares could take the company back to the drawing board."
    },
    {
        "company": "Cloros Mortgage Trust",
        "summary": "Upgraded from underweight to neutral by JP Morgan."
    },
    {
        "company": "Galmed Pharmaceuticals",
        "summary": "Upgraded from hold to buy by Maxim Group."
    },
    {
        "company": "Wells Fargo",
        

In [None]:
# TODO: DEBUG and learn how to read and store content in some array like: 
# replace "content" : null TO
# "content" : [{
#   {"type":'individual',
#    "ticker":'NVDA',
#    "text":<summary_text>}, 
#   ...., <all 1 ticker summaries>
#   {"type":'market_1day',
#    "model" :"gpt3.5 or gpt4"  
#    "ticker":'many',
#    "text":<summary_text>}, 
#  {"type":'market_1week',
#    "model" :"gpt3.5 or gpt4"  
#    "ticker":'many',
#    "text":<summary_text>} 
#  ]
