In [15]:
%load_ext autoreload

%autoreload 2

import os
import warnings
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import yfinance as yf
from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from openai import OpenAI
from tqdm import tqdm


# Load environment variables before anything reads OPENAI_API_KEY
load_dotenv()

# OpenAI client configured with the key taken from the environment
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
# Read the flattened news feed into a DataFrame
news_feed_path = 'data/news_feed_flattened.parquet'
df = pd.read_parquet(news_feed_path)

# Function to calculate returns
def calculate_returns(ticker_symbol, end_date, days=5):
    """Compute a ticker's return over the last ``days`` trading sessions.

    Daily closes are downloaded through yfinance for a calendar window ending
    on ``end_date`` (inclusive). The return compares the last available close
    in that window with the close ``days`` sessions before it; if the window
    holds fewer than ``days + 1`` closes, the earliest available close is used
    as the starting point instead.

    Args:
        ticker_symbol: Symbol passed to ``yf.Ticker`` (e.g. ``'^GSPC'``).
        end_date: Last calendar day of the window. Strings are parsed with
            ``pd.to_datetime``.
        days: Number of trading sessions to look back (default 5).

    Returns:
        ``(return_val, end_price)``: the fractional return and the last
        close. ``(None, None)`` is returned when the date is missing, fewer
        than two valid closes are available, the starting close is zero, or
        the lookup raises (a ``RuntimeWarning`` is emitted in that last case).
    """
    try:
        # Convert end_date to datetime if it's a string
        if isinstance(end_date, str):
            end_date = pd.to_datetime(end_date)
        if pd.isna(end_date):
            return None, None

        # The download end is exclusive, so add one day to include end_date
        download_end = end_date + timedelta(days=1)
        # Calendar lookback generous enough to span weekends plus several
        # market holidays; a tight window (e.g. days + 5) can hold fewer than
        # days + 1 sessions around Christmas/New Year and silently shorten
        # the measured period.
        download_start = end_date - timedelta(days=2 * days + 7)

        hist = yf.Ticker(ticker_symbol).history(start=download_start, end=download_end)
        if len(hist) < 2:
            return None, None

        # Ignore sessions without a close so NaN does not leak into the result
        closes = hist['Close'].dropna()
        if len(closes) < 2:
            return None, None

        end_price = closes.iloc[-1]
        start_price = closes.iloc[-days - 1] if len(closes) > days else closes.iloc[0]
        if start_price == 0:
            # A relative change from zero is undefined
            return None, None

        return (end_price - start_price) / start_price, end_price
    except Exception as exc:
        # Best effort: keep the caller running, but do not hide the failure
        warnings.warn(
            f"calculate_returns failed for {ticker_symbol} at {end_date}: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return None, None

# Calculate growth metrics for each row.
# Many rows share the same end_date, so successful lookups are memoised per
# (ticker, end_date, days) instead of repeating identical yfinance downloads.
return_columns = ['weekly_return', 'market_daily_return', 'market_weekly_return', 'growth_above_market']
returns_cache = {}


def _lookup_return(ticker_symbol, end_date, days):
    """Return the `days`-session return for a ticker, reusing earlier results."""
    key = (ticker_symbol, end_date, days)
    if key in returns_cache:
        return returns_cache[key]
    return_val, _ = calculate_returns(ticker_symbol, end_date, days=days)
    if return_val is not None:
        # Failed lookups are not cached, so a later row retries them
        returns_cache[key] = return_val
    return return_val


market_data = []
for _, row in tqdm(df.iterrows(), total=len(df), desc="Calculating market data"):
    end_date = pd.to_datetime(row['end_date'])

    # Market (S&P 500) returns
    market_daily_return = _lookup_return('^GSPC', end_date, 1)
    market_weekly_return = _lookup_return('^GSPC', end_date, 5)

    weekly_return = None
    growth_above_market = None

    if row['type'] == 'individual':
        # Individual stocks get their own weekly return, compared to the market
        weekly_return = _lookup_return(row['ticker'], end_date, 5)
        if weekly_return is not None and market_weekly_return is not None:
            growth_above_market = weekly_return - market_weekly_return
    else:
        # Market-level rows (market_1_day / market_1_week) reuse the market
        # weekly return, so their excess over the market is zero by definition
        weekly_return = market_weekly_return
        growth_above_market = 0 if market_weekly_return is not None else None

    market_data.append({
        'weekly_return': weekly_return,
        'market_daily_return': market_daily_return,
        'market_weekly_return': market_weekly_return,
        'growth_above_market': growth_above_market
    })

# Add market data to DataFrame. Reusing df's index keeps the rows aligned even
# when df does not carry a default 0..n-1 index, and naming the columns keeps
# them present when df is empty.
market_df = pd.DataFrame(market_data, index=df.index, columns=return_columns)
df = pd.concat([df, market_df], axis=1)

# Convert returns to percentage strings for display (missing values become None).
# Note: after this step the columns hold text, not numbers.
for col in return_columns:
    df[col] = df[col].apply(lambda x: f"{x*100:.2f}%" if pd.notnull(x) else None)

df.tail()

  end_date = pd.to_datetime(row['end_date'])
  end_date = pd.to_datetime(row['end_date'])
  end_date = pd.to_datetime(row['end_date'])
  end_date = pd.to_datetime(row['end_date'])
Calculating market data:  28%|█████████████████████████████                                                                         | 204/717 [00:54<02:19,  3.68it/s]$GPS: possibly delisted; no timezone found
  end_date = pd.to_datetime(row['end_date'])
  end_date = pd.to_datetime(row['end_date'])
  end_date = pd.to_datetime(row['end_date'])
  end_date = pd.to_datetime(row['end_date'])
  end_date = pd.to_datetime(row['end_date'])
Calculating market data:  56%|████████████████████████████████████████████████████████▊                                             | 399/717 [01:51<01:21,  3.91it/s]$NYCB: possibly delisted; no timezone found
  end_date = pd.to_datetime(row['end_date'])
  end_date = pd.to_datetime(row['end_date'])
  end_date = pd.to_datetime(row['end_date'])
Calculating market data: 100%|███████████

Unnamed: 0,type,start_date,end_date,ticker,count,growth,text,link,model,weekly_return,market_daily_return,market_weekly_return,growth_above_market
712,individual,2024-11-17,2024-12-23,CELH,18,-0.96%,Celsius Holdings Inc. (CELH) is facing a decli...,https://pythoninvest.com/tpost/ve9dp9v641-week...,,-13.54%,0.73%,-1.65%,-11.90%
713,individual,2024-11-17,2024-12-23,SMCI,16,2.56%,Super Micro Computer (SMCI) has been experienc...,https://pythoninvest.com/tpost/ve9dp9v641-week...,,-3.11%,0.73%,-1.65%,-1.46%
714,individual,2024-11-17,2024-12-23,CMG,29,-0.47%,"Chipotle Mexican Grill, Inc. is facing a secur...",https://pythoninvest.com/tpost/ve9dp9v641-week...,,-4.97%,0.73%,-1.65%,-3.32%
715,market_1_day,2024-12-22,2024-12-23,multiple_tickers,64,,Recent market updates indicate a mix of positi...,https://pythoninvest.com/tpost/ve9dp9v641-week...,GPT4,-1.65%,0.73%,-1.65%,0.00%
716,market_1_week,2024-11-17,2024-12-23,multiple_tickers,2332,,1. **GLP-1 Analogues Growth**: The global mark...,https://pythoninvest.com/tpost/ve9dp9v641-week...,GPT4,-1.65%,0.73%,-1.65%,0.00%


In [17]:
# Turn every DataFrame row into a Document: a text body for embedding plus
# metadata that is printed later alongside the answers.
def _row_to_document(row):
    """Build one Document from a single DataFrame row."""
    body_lines = [
        f"Type: {row['type']}",
        f"Period: {row['start_date']} to {row['end_date']}",
        f"Ticker: {row['ticker']}",
        f"Growth: {row['growth']}",
        f"Weekly Return: {row['weekly_return']}",
        f"Market Daily Return: {row['market_daily_return']}",
        f"Market Weekly Return: {row['market_weekly_return']}",
        f"Growth Above Market: {row['growth_above_market']}",
        f"Count: {row['count']}",
        f"Content: {row['text']}",
    ]

    # Dates are stored as strings; every other field is copied unchanged
    row_metadata = {name: row[name] for name in ('type', 'ticker', 'link')}
    row_metadata.update({name: str(row[name]) for name in ('start_date', 'end_date')})
    row_metadata.update({
        name: row[name]
        for name in (
            'weekly_return',
            'market_daily_return',
            'market_weekly_return',
            'growth_above_market',
        )
    })

    return Document(page_content="\n".join(body_lines), metadata=row_metadata)


documents = [_row_to_document(row) for _, row in df.iterrows()]

In [18]:
# Top 10 tickers by row count. 11 rows are requested because the
# 'multiple_tickers' placeholder takes one of them (see the output below).
df['ticker'].value_counts().head(11)

ticker
multiple_tickers    134
AMZN                 52
NVDA                 51
TSLA                 50
AAPL                 47
SPY                  39
GOOG                 37
MSFT                 33
DJIA                 26
SMCI                 21
QQQ                  20
Name: count, dtype: int64

In [19]:
# Splitter settings: chunk length and overlap are measured with len()
splitter_settings = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "length_function": len,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Break the documents into overlapping chunks
splits = text_splitter.split_documents(documents)

In [20]:
# Embedding model with default settings
embeddings = OpenAIEmbeddings()

# Build the FAISS vector store from the chunked documents
vectorstore = FAISS.from_documents(documents=splits, embedding=embeddings)

In [21]:
# Prompt given to the "stuff" chain; {question} and {context} are filled in
# by the chain at query time.
analyst_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""You are a financial news analyst assistant. Your task is to provide accurate, 
                well-structured responses based on the provided news articles context. 
                Focus on summarizing insights and trends, with special emphasis on outstanding 
                performance metrics. When analyzing news, prioritize the following:

                1. News about stocks with significant growth above market (Growth Above Market)
                2. News about stocks with high weekly returns (Weekly Return)
                3. Recent market developments with notable trends

                For individual stocks, always mention their performance relative to the market when
                available. Highlight cases where a stock significantly outperforms or underperforms
                the market average.

                Do not exceed the line length of 80 characters to ensure readability. Use concise
                and clear language when summarizing complex details.

                Insert approximate dates where possible (e.g., Nov'24) if the exact date isn't
                available.

                USE ONLY FACTS YOU SEE IN THE NEWS, DO NOT HALLUCINATE. If details are missing,
                omit them or state that the information is not available.

            Question: {question}
            Context: {context}

            Answer: Let's analyze this based on the provided information.""",
)

# Chat model: GPT-4 at temperature 0
llm = ChatOpenAI(model_name="gpt-4", temperature=0)

# Retriever over the vector store, asking for 7 chunks per query
news_retriever = vectorstore.as_retriever(search_kwargs={"k": 7})

# QA chain that also returns the retrieved source documents
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=news_retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": analyst_prompt},
)

In [22]:
def get_response(query):
    """Answer a query with the retrieval QA chain.

    Args:
        query: Question text passed to the chain under the "query" key.

    Returns:
        Tuple of the chain's "result" and its "source_documents".
    """
    # `invoke` is used in place of the deprecated direct call qa_chain({...})
    chain_output = qa_chain.invoke({"query": query})

    answer = chain_output["result"]
    retrieved_docs = chain_output["source_documents"]
    return answer, retrieved_docs

In [23]:
# Pharma sector query.
# NOTE(review): the variable is called `tech_query` although the question is
# about pharma; consider renaming it once no other cell depends on the name.
tech_query = "What are the recent trends in pharma sector news? Focus on growth patterns and significant developments."

response, sources = get_response(tech_query)

print("Generated Response:\n")
print(response)
print("\nSource Documents:")
for position, source_doc in enumerate(sources, start=1):
    meta = source_doc.metadata
    # One print per source; the lines match the original one-print-per-field output
    print("\n".join([
        f"\nSource {position}:",
        f"Type: {meta['type']}",
        f"Ticker: {meta['ticker']}",
        f"Period: {meta['start_date']} to {meta['end_date']}",
        f"Link: {meta['link']}",
        f"Weekly Return: {meta['weekly_return']}",
        f"Market Weekly Return: {meta['market_weekly_return']}",
        f"Growth Above Market: {meta['growth_above_market']}",
    ]))

Generated Response:

In the pharmaceutical sector, there have been significant developments and growth patterns. Novo Nordisk received a positive CHMP opinion for its insulin icodec, indicating a potential market expansion for the product. AstraZeneca plans to acquire Fusion Therapeutics, which could strengthen its market position and portfolio. The FDA approval of Dupixent by Regeneron and Sanofi marks a significant development in treating chronic lung diseases. This reflects ongoing advancements in healthcare and pharmaceuticals. 

The increased focus on personalized and precision medicine showcases growing investment opportunities in healthcare, particularly in drug development and diagnostics. This trend is expected to continue, with growth anticipated in areas such as neuroscience, needle-free injectors, and orthobiologics markets. 

However, not all news is positive. AbbVie faced a setback as its schizophrenia drug emraclidine failed to meet trial endpoints, leading to a negative

In [24]:
# Single-company query (NVDA)
company_query = "What are the latest developments and market sentiment for NVDA (NVIDIA)?"

response, sources = get_response(company_query)

print("Generated Response:\n")
print(response)
print("\nSource Documents:")
for position, source_doc in enumerate(sources, start=1):
    meta = source_doc.metadata
    # One print per source; the lines match the original one-print-per-field output
    print("\n".join([
        f"\nSource {position}:",
        f"Type: {meta['type']}",
        f"Ticker: {meta['ticker']}",
        f"Period: {meta['start_date']} to {meta['end_date']}",
        f"Link: {meta['link']}",
        f"Weekly Return: {meta['weekly_return']}",
        f"Market Weekly Return: {meta['market_weekly_return']}",
        f"Growth Above Market: {meta['growth_above_market']}",
    ]))

Generated Response:

From Nov'24 to Dec'24, NVDA saw a growth of -1.68% and a weekly return of -4.91%, underperforming the market. The news highlighted potential benefits from the AI boom but also concerns about high valuation and competition.

In early Jan'24, NVDA had a slight negative growth of -0.2% but a high weekly return of 11.43%, significantly outperforming the market. The company's dominance in the AI market and positive reviews from analysts were noted.

In mid-Jun'24, despite a slight negative growth of -0.7%, NVDA had a high weekly return of 12.13%, again outperforming the market. The news highlighted debates about the company's market dominance and growth potential.

In late Jan'24, NVDA saw a growth of 2.35% and a weekly return of 4.71%, outperforming the market. The company's potential growth in the AI market was noted, but some investors expressed caution about market valuations.

In mid-Sep'24, NVDA had a negative growth of -1.95% but a high weekly return of 9.69%, ou

In [25]:
# Single-company query (TSLA)
company_query = "What are the latest developments and market sentiment for Tesla (TSLA)?"

response, sources = get_response(company_query)

print("Generated Response:\n")
print(response)
print("\nSource Documents:")
for position, source_doc in enumerate(sources, start=1):
    meta = source_doc.metadata
    # One print per source; the lines match the original one-print-per-field output
    print("\n".join([
        f"\nSource {position}:",
        f"Type: {meta['type']}",
        f"Ticker: {meta['ticker']}",
        f"Period: {meta['start_date']} to {meta['end_date']}",
        f"Link: {meta['link']}",
        f"Weekly Return: {meta['weekly_return']}",
        f"Market Weekly Return: {meta['market_weekly_return']}",
        f"Growth Above Market: {meta['growth_above_market']}",
    ]))

Generated Response:

From Dec'23, Tesla (TSLA) faced significant attention and volatility with discussions 
around the launch of the Tesla robotaxi, investor searches, and debates about the 
company's focus. The stock experienced price declines and increased scrutiny, with 
a weekly return of 1.77% and growth of -1.68%.

In Sep'24, Tesla's stock saw positive movement with a weekly return of 4.86% and 
growth of -1.52%. News included high price targets, increased production in China, 
and the rollout of full self-driving software. However, concerns over Tesla's 
dominance and pricing power were also noted.

In the same month, Tesla's stock surged due to the announcement about full self-driving 
software rollout and robotics ambitions, with a weekly return of 1.01% and growth of 2.63%.

From Nov'24 to Dec'24, Tesla's stock was subject to mixed predictions, with a weekly 
return of -7.00% and growth of 2.27%. Analysts raised concerns about its high valuation 
but were optimistic about its

In [26]:
# Market summary query restricted to December 2024.
# NOTE(review): the saved answer below talks about December 2023, so the date
# restriction in the question does not seem to reach the retriever; consider
# filtering on the `end_date` metadata instead of relying on the prompt.
market_query = "Provide a summary of recent market trends in December 2024. Do not add news before December 2024"

response, sources = get_response(market_query)

print("Generated Response:\n")
print(response)
print("\nSource Documents:")
for position, source_doc in enumerate(sources, start=1):
    meta = source_doc.metadata
    # One print per source; the lines match the original one-print-per-field output
    print("\n".join([
        f"\nSource {position}:",
        f"Type: {meta['type']}",
        f"Ticker: {meta['ticker']}",
        f"Period: {meta['start_date']} to {meta['end_date']}",
        f"Link: {meta['link']}",
        f"Weekly Return: {meta['weekly_return']}",
        f"Market Weekly Return: {meta['market_weekly_return']}",
        f"Growth Above Market: {meta['growth_above_market']}",
    ]))

Generated Response:

In December 2023, the market saw a weekly return of 2.68%, matching the market average. The U.S. economy added 199,000 jobs, surpassing expectations and indicating a stronger labor market. Consumer sentiment surged by 16.1% YoY, with inflation expectations declining. Tech firms, particularly those in AI, and clean tech sector companies were highlighted for potential growth in 2024.

However, the DJIA underperformed the market with a weekly return of -0.11%, a growth of -0.68%, and a growth above market of -0.86%. Despite a rise in consumer confidence and durable-goods orders, the leading economic index fell for the 20th consecutive month, hinting at a potential recession.

In the first week of January 2024, the market weekly return was -0.13%. Jobless claims rose slightly, and wholesale inventories declined by 0.2%. Companies like Cingulate Inc. and Microbot Medical saw significant stock price increases. The tech, energy, and healthcare sectors were expected to dom