In [23]:
# Enable IPython's autoreload extension in mode 2.
# NOTE(review): re-running this cell prints an "already loaded" notice (see the
# captured output); `%reload_ext autoreload` is the form that notice suggests.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
import warnings
# Silence every warning for the rest of the session. No category is given, so
# this also hides deprecation notices from the libraries imported below.
warnings.filterwarnings('ignore')

import os
from collections import Counter

import pandas as pd
from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import OpenAI
from tqdm import tqdm

# Load environment variables (python-dotenv; presumably from a local .env file)
load_dotenv()

# Initialize OpenAI client with the key read from OPENAI_API_KEY.
# os.getenv returns None when the variable is unset.
# NOTE(review): `client` is not referenced by any cell shown in this notebook;
# confirm it is still needed.
client = OpenAI(
    api_key=os.getenv('OPENAI_API_KEY')
)

In [25]:
# Load the news feed enriched with market statistics (path is relative to the
# notebook's working directory) and preview the last rows.
# NOTE(review): the preview shows an `Unnamed: 0` column, which looks like a
# leftover index from an earlier export - confirm whether it should be dropped.
df = pd.read_parquet('data/news_feed_with_market_stats.parquet')
df.tail()

Unnamed: 0,type,start_date,end_date,ticker,count,text,link,model,growth_last_day,weekly_return,market_daily_return,market_weekly_return,growth_above_market
854,individual,2023-08-08,2023-08-14,DJIA,9,U.S. bank lending remained flat in the latest ...,https://pythoninvest.com/tpost/u7h0i2kxy1-week...,,0.004,0.007191,0.00575,-0.006356,0.013547
855,individual,2023-08-08,2023-08-14,RIVN,12,"Rivian, the electric vehicle maker, reported i...",https://pythoninvest.com/tpost/u7h0i2kxy1-week...,,-0.0005,-0.112026,0.00575,-0.006356,-0.10567
856,individual,2023-08-08,2023-08-14,SPY,7,Here's a summary of the news: - Jim Cramer adv...,https://pythoninvest.com/tpost/u7h0i2kxy1-week...,,0.0055,-0.005769,0.00575,-0.006356,0.000587
857,market_1_day,2023-08-12,2023-08-13,multiple_tickers,228,This week's financial news was characterized b...,https://pythoninvest.com/tpost/u7h0i2kxy1-week...,market news,,-0.003122,-0.00107,-0.003122,
858,market_1_week,2023-08-08,2023-08-14,multiple_tickers,2210,This week's financial news was characterized b...,https://pythoninvest.com/tpost/u7h0i2kxy1-week...,market news,,-0.006356,0.00575,-0.006356,


In [26]:
def _iso_date(value):
    """Return `value` as a `YYYY-MM-DD` string, or `str(value)` if it is not a date."""
    try:
        parsed = pd.to_datetime(value)
    except (TypeError, ValueError):
        return str(value)
    # None / NaT cannot be formatted; keep their plain string form.
    if pd.isna(parsed):
        return str(value)
    return parsed.strftime('%Y-%m-%d')


def _pct_or_na(value):
    """Format a fractional return as a two-decimal percentage, or `n/a` if missing."""
    return "n/a" if pd.isna(value) else f"{value:.2%}"


def create_document(row):
    """Create a Document object from a DataFrame row.

    The page content is a plain-text header (type, period, ticker, return
    metrics, count) followed by the news text; a subset of the same fields is
    copied into the metadata.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with the keys `type`,
            `start_date`, `end_date`, `ticker`, `growth_last_day`,
            `weekly_return`, `market_daily_return`, `market_weekly_return`,
            `growth_above_market`, `count`, `text` and `link`.

    Returns:
        Document: the text to embed plus metadata. `start_date` and `end_date`
        are stored as `YYYY-MM-DD` strings; the numeric metadata values are
        passed through unchanged (they may be NaN).
    """
    # Normalise both dates to the same format. The notebook's printed headers
    # show `end_date` rendered with a trailing " 00:00:00" when str() is used.
    start_date = _iso_date(row['start_date'])
    end_date = _iso_date(row['end_date'])

    # Missing metrics are written as "n/a" so that "nan%" never reaches the
    # embedded text or the LLM context.
    content = "\n".join([
        f"Type: {row['type']}",
        f"Period: {start_date} to {end_date}",
        f"Ticker: {row['ticker']}",
        f"Growth (last day): {_pct_or_na(row['growth_last_day'])}",
        f"Weekly Return: {_pct_or_na(row['weekly_return'])}",
        f"Market Daily Return: {_pct_or_na(row['market_daily_return'])}",
        f"Market Weekly Return: {_pct_or_na(row['market_weekly_return'])}",
        f"Growth Above Market: {_pct_or_na(row['growth_above_market'])}",
        f"Count: {row['count']}",
        f"Content: {row['text']}",
    ])

    return Document(
        page_content=content,
        metadata={
            'type': row['type'],
            'ticker': row['ticker'],
            'link': row['link'],
            'start_date': start_date,
            'end_date': end_date,
            'weekly_return': row['weekly_return'],
            'market_daily_return': row['market_daily_return'],
            'market_weekly_return': row['market_weekly_return'],
            'growth_above_market': row['growth_above_market']
        }
    )

# Create one Document per DataFrame row (the index label from iterrows is unused)
documents = [create_document(row) for _, row in df.iterrows()]

In [27]:
# TOP 10 tickers - we will ask questions about them later.
# head(11) rather than head(10): the most frequent value is the
# `multiple_tickers` bucket, which is not an individual ticker.
df.ticker.value_counts().head(11)

ticker
multiple_tickers    268
NVDA                 54
AMZN                 53
TSLA                 51
AAPL                 47
GOOG                 39
SPY                  37
MSFT                 31
DJIA                 24
SMCI                 21
QQQ                  20
Name: count, dtype: int64

In [28]:
def setup_qa_chain(documents, k=7, chunk_size=1000, chunk_overlap=200,
                   model_name="gpt-4", temperature=0):
    """Set up the QA chain with the given documents.

    Splits the documents into overlapping chunks, indexes the chunks in a FAISS
    vector store using OpenAI embeddings, and wires retriever -> prompt -> LLM
    -> string parser into a single chain.

    Args:
        documents: Non-empty sequence of Document objects to index.
        k: Number of chunks the retriever is asked for per question.
        chunk_size: Maximum chunk length, measured with `len`.
        chunk_overlap: Overlap between consecutive chunks.
        model_name: Chat model name passed to ChatOpenAI.
        temperature: Sampling temperature passed to ChatOpenAI.

    Returns:
        tuple: `(chain, retriever)`. `chain` takes a question string; the
        retriever is the same object the chain uses for its context.

    Raises:
        ValueError: If `documents` is empty.
    """
    # Fail early with a clear message instead of an obscure error from the
    # vector store when there is nothing to index.
    if not documents:
        raise ValueError("setup_qa_chain requires at least one document to index")

    # Initialize text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )

    # Split documents
    splits = text_splitter.split_documents(documents)

    # Initialize embeddings and vector store
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(splits, embeddings)

    # One retriever, shared by the chain and returned to the caller, so both
    # are guaranteed to use the same search settings.
    retriever = vectorstore.as_retriever(search_kwargs={"k": k})

    # Initialize LLM
    llm = ChatOpenAI(model_name=model_name, temperature=temperature)

    # Create prompt template
    prompt = PromptTemplate(
        template="""You are a financial news analyst assistant. Your task is to provide accurate, 
                well-structured responses based on the provided news articles context. Present the 
                information in chronological order, from earliest to most recent events.

                Format each section with a concise header showing period and performance:
                [YYYY-MM-DD..YYYY-MM-DD, +/-X.X% vs market]

                For individual stocks:
                1. Start each section with the period and growth header format shown above
                2. Follow with key developments and context during that period
                3. Include weekly returns comparison (stock vs market) if significant
                4. Explain what drove the performance

                For market-wide analysis:
                1. Use the same chronological structure with period headers
                2. Highlight notable sector or stock-specific movements
                3. Include market-wide return metrics when relevant

                Example format:
                [2024-01-01..2024-01-07, +2.3% vs market]
                Key developments and analysis...

                [2024-01-08..2024-01-14, -1.5% vs market]
                Key developments and analysis...

                Keep each section concise and focused. Do not exceed the line length of 80 
                characters to ensure readability.

                Structure your response to tell a compelling story, even without showing sources.
                Focus on chronological progression while maintaining accuracy and including all 
                key metrics.

                USE ONLY FACTS YOU SEE IN THE NEWS, DO NOT HALLUCINATE. If details are missing,
                omit them or state that the information is not available.

                Question: {question}
                Context: {context}

                Answer: Let's analyze this based on the provided information.""",
        input_variables=["context", "question"]
    )

    # Create and return chain. The question is passed both to the retriever
    # (to fetch context) and, unchanged, to the prompt.
    chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

    return chain, retriever

# Setup chain: build the index over all documents. The resulting module-level
# `chain` and `retriever` are read as globals by analyze_news.
chain, retriever = setup_qa_chain(documents)

In [29]:
def _format_pct(value):
    """Format a fractional return as a two-decimal percentage, or `n/a` if missing."""
    return "n/a" if pd.isna(value) else f"{value:.2%}"


def print_sources(sources):
    """Print source documents in a formatted way.

    Args:
        sources: Iterable of objects exposing a `metadata` mapping with the keys
            `type`, `ticker`, `start_date`, `end_date`, `link`, `weekly_return`,
            `market_weekly_return` and `growth_above_market`.

    Missing metrics (None / NaN) are printed as `n/a` rather than `nan%`.
    """
    print("\nSource Documents:")
    for i, doc in enumerate(sources, 1):
        meta = doc.metadata
        print(f"\nSource {i}:")
        print(f"Type: {meta['type']}")
        print(f"Ticker: {meta['ticker']}")
        print(f"Period: {meta['start_date']} to {meta['end_date']}")
        print(f"Link: {meta['link']}")
        print(f"Weekly Return: {_format_pct(meta['weekly_return'])}")
        print(f"Market Weekly Return: {_format_pct(meta['market_weekly_return'])}")
        print(f"Growth Above Market: {_format_pct(meta['growth_above_market'])}")

def analyze_news(question, show_sources=True):
    """Analyze news based on a question.

    Runs the module-level `chain` on the question, fetches the matching source
    documents from the module-level `retriever`, prints a header summarising
    the period they cover, then prints the answer and (optionally) the sources.

    Args:
        question: Natural-language question to answer.
        show_sources: When True, also print the retrieved source documents.

    Returns:
        None. All output is printed.
    """
    # Get response
    response = chain.invoke(question)
    sources = retriever.invoke(question)

    # Without sources there is no period to summarise; min()/max() below would
    # raise on an empty sequence.
    if not sources:
        print("\nAnalysis (no source documents retrieved):\n")
        print(response)
        return

    # Calculate date range from relevant documents. Dates are parsed before
    # comparison so that values such as "2025-01-06 00:00:00" and "2025-01-06"
    # are ordered and printed consistently.
    first_day = min(pd.to_datetime(doc.metadata['start_date']) for doc in sources)
    last_day = max(pd.to_datetime(doc.metadata['end_date']) for doc in sources)
    weeks = round((last_day - first_day).days / 7)
    min_date = first_day.strftime('%Y-%m-%d')
    max_date = last_day.strftime('%Y-%m-%d')

    # Determine if this is a ticker-specific analysis
    tickers = [
        doc.metadata['ticker']
        for doc in sources
        if doc.metadata['type'] == 'individual'
    ]
    if tickers:
        # Most common ticker among individual sources; on a tie the ticker that
        # appears first in `sources` wins, so the header is stable across runs.
        most_common_ticker = Counter(tickers).most_common(1)[0][0]
        print(f"\nLong term news for {most_common_ticker} in the last {weeks} weeks ({min_date}..{max_date}):\n")
    else:
        print(f"\nAnalysis for the last {weeks} weeks ({min_date}..{max_date}):\n")

    print(response)

    if show_sources:
        print_sources(sources)

In [30]:
# Test with a pharma sector query - result is roughly acceptable.
# NOTE(review): the variable is called `tech_query` although the question is
# about the pharma sector.
tech_query = "What are the recent trends in pharma sector news? Focus on growth patterns and significant developments."
analyze_news(tech_query)


Analysis for the last 13 weeks (2024-10-04..2025-01-06 00:00:00):

[2024-10-04..2024-11-04, -1.9% vs market]
The pharma sector saw a shift towards personalized and precision medicine, particularly in drug development and diagnostics. This trend indicates growing investment opportunities in healthcare.

[2024-11-12..2024-12-16, +0.35% vs market]
Regeneron and Sanofi received FDA approval for Dupixent, a significant development in treating chronic lung diseases. This approval reflects ongoing advancements in the healthcare sector.

[2024-11-28..2025-01-06, +0.076% vs market]
The pharma sector saw advancements in drug pipelines, particularly in obesity and diabetes treatments. These advancements are attracting investor interest.

[2024-11-10..2024-11-11, +5.05% vs market]
AbbVie faced a setback as its schizophrenia drug emraclidine failed to meet trial endpoints, leading to a negative stock reaction. However, the healthcare sector is expected to see growth in areas such as neuroscience, 

In [31]:
# Test with a market summary query - the result is wrong: the question asks for
# December 2024, but the output below covers 2023-12..2024-03. The retriever is
# built with only `k` in its search kwargs, so no date filter is applied to the
# retrieved documents.
market_query = "Provide a summary of recent market trends in December 2024. Do not add news before December 2024"
analyze_news(market_query)


Long term news for DJIA in the last 15 weeks (2023-12-11..2024-03-26 00:00:00):

[2023-12-11..2023-12-18, +2.56% vs market]
During this period, the U.S. economy added 199,000 jobs in November 2023, exceeding expectations and indicating a stronger labor market. Consumer sentiment in the U.S. surged by 16.1% year-over-year in December, with inflation expectations declining. Technology firms, particularly Alphabet, focused on artificial intelligence (AI) advancements. Despite a shift in market sentiment towards value stocks, Cathie Wood of ARK Invest actively purchased shares in growth companies. Morgan Stanley closely watched several companies within the clean tech sector, including renewable energy, for potential growth trends in 2024.

[2023-12-14..2023-12-25, -0.11% vs market]
The DJIA experienced a weekly return of -0.11%, underperforming the market by 0.86%. Despite a surge in consumer confidence and a rise in durable-goods orders, the leading economic index fell for the 20th conse

In [32]:
# Test with a company-specific query: NVIDIA (NVDA)
company_query = "What are the latest developments and market sentiment for NVDA (NVIDIA)?"
analyze_news(company_query)


Long term news for NVDA in the last 68 weeks (2023-08-25..2024-12-16 00:00:00):

[2023-08-25..2023-09-04, +5.41% vs market]
Nvidia (NVDA) saw a weekly return of 5.41%, outperforming the market by 2.91%. The company's strong financial performance, partnerships, and growth potential in the AI and semiconductor markets were key factors driving this performance. The company's recent earnings report surpassed expectations, leading analysts to believe that the stock has room to grow further.

[2024-01-05..2024-01-15, +11.43% vs market]
Nvidia continued its strong performance with a weekly return of 11.43%, significantly outperforming the market by 9.59%. The company's dominance in the AI market and recent announcements of AI-driven solutions at CES 2024 contributed to its positive momentum. Wall Street analysts remained optimistic about the company's growth potential.

[2024-01-22..2024-01-29, +4.71% vs market]
In this period, Nvidia's weekly return was 4.71%, outperforming the market by 3.

In [33]:
# Test with a company-specific query: Amazon (AMZN)
company_query = "What are the latest developments and market sentiment for Amazon (AMZN)?"
analyze_news(company_query)


Long term news for AMZN in the last 64 weeks (2023-09-01..2024-11-25 00:00:00):

[2023-09-01..2023-09-11, +3.61% vs market]
During this period, Amazon's stock showed a positive performance with a weekly return of 3.61%, outperforming the market by 4.23%. The company's growth was driven by its competitive advantage in logistics and the performance of its web services segment. Despite potential legal and regulatory concerns, the overall market sentiment remained bullish.

[2023-10-31..2023-11-06, +5.30% vs market]
Amazon's stock continued to perform well, with a weekly return of 5.30%, slightly above the market by 0.52%. The company's positive performance was highlighted by a 60% increase in 2023 and improved operating margins. Amazon's e-commerce and cloud computing segments showed strong performance, driving investor interest. However, there were conflicting viewpoints on whether it was a good time to buy the stock.

[2024-02-27..2024-03-04, +1.63% vs market]
In this period, Amazon's 

In [34]:
# Test with a company-specific query: Tesla (TSLA)
company_query = "What are the latest developments and market sentiment for Tesla (TSLA)?"
analyze_news(company_query)


Long term news for TSLA in the last 55 weeks (2023-12-01..2024-12-23 00:00:00):

[2023-12-01..2023-12-11, +1.77% vs market]
Tesla's stock saw a weekly return of 1.77%, outperforming the market by 0.61%. The period was marked by discussions about the potential impact of tax credits on Tesla vehicle purchases, varying stock price predictions for 2024, and debates about the future of artificial intelligence in Tesla's vehicles. The Cybertruck generated significant consumer interest, and there were discussions about potential competition in the electric vehicles market.

[2024-04-02..2024-04-08, -1.28% vs market]
Tesla's stock experienced a weekly return of -1.28%, underperforming the market by 0.49%. The period was characterized by significant attention and volatility, with news about the launch of the Tesla robotaxi, increased investor searches, and debates about the company's focus on autonomy versus lower-cost electric vehicles. Analysts were divided on the stock's future, with opinio

In [35]:
# Test with a company-specific query: Apple (AAPL)
company_query = "What are the latest developments and market sentiment for Apple (AAPL)?"
analyze_news(company_query)


Long term news for AAPL in the last 63 weeks (2023-09-15..2024-12-02 00:00:00):

[2023-09-15..2023-09-25, -1.06% vs market]
During this period, Apple's stock saw a weekly return of -1.06%, performing better than the market which had a weekly return of -2.61%. The key developments included Morgan Stanley analyst Erik Woodring reiterating an "overweight" rating and a $215 price target for Apple due to strong demand for the iPhone 15. However, Apple's FineWoven iPhone cases faced criticism for their susceptibility to scratches and premature wear.

[2023-11-24..2023-12-04, -0.19% vs market]
Apple's stock had a weekly return of -0.19%, underperforming the market's weekly return of 0.43%. Analysts remained optimistic about Apple's future, citing strong demand for the iPhone 15 and potential for expansion into new services. Despite recent declines, Apple was still seen as outperforming its competitors.

[2024-03-05..2024-03-11, -1.34% vs market]
In this period, Apple's stock saw a weekly ret

In [36]:
# Test with a company-specific query: Alphabet/Google (GOOG)
company_query = "What are the latest developments and market sentiment for Alphabet/Google (GOOG)?"
analyze_news(company_query)


Long term news for GOOG in the last 63 weeks (2023-09-08..2024-11-25 00:00:00):

[2023-09-08..2023-09-18, +0.89% vs market]
Alphabet Inc. (GOOG) saw a weekly return of 0.89%, outperforming the market by 1.64%. The company's stock performance, growth potential in artificial intelligence, and positive earnings reports were highlighted. A survey indicated that over half of Americans believe artificial intelligence could spread misinformation in the 2024 election.

[2024-02-05..2024-02-12, +2.62% vs market]
Alphabet Inc. (GOOG) experienced a weekly return of 2.62%, outperforming the market by 1.02%. Despite a recent dip in its stock price following a quarterly earnings report, Alphabet's potential for growth and its massive cash reserves suggest an optimistic outlook for its future performance. Google's AI chatbot "Google Gemini" was also noted for its impact in the AI chatbot landscape.

[2024-02-13..2024-02-19, -5.63% vs market]
Alphabet Inc. (GOOG) saw a weekly return of -5.63%, underp

In [37]:
# Test with a company-specific query: Microsoft (MSFT)
company_query = "What are the latest developments and market sentiment for Microsoft (MSFT)?"
analyze_news(company_query)


Long term news for MSFT in the last 43 weeks (2024-01-29..2024-11-25 00:00:00):

[2024-01-29..2024-02-05, -0.99% vs market]
Microsoft's stock performance was under scrutiny as the company's earnings report showed a significant increase in cloud revenue, driven by investments in artificial intelligence (AI). Despite a weekly return of -0.99%, analysts maintained mostly positive opinions, raising price targets and predicting substantial growth potential. The company's strong performance led it to surpass Apple as the world's largest company.

[2024-02-05..2024-02-12, +2.37% vs market]
Microsoft's stock was closely watched due to its earnings growth, price strength, and strategic developments. The company's launch of Azure Quantum, a cloud-based platform for quantum programming, was seen as a positive factor. Despite discussions around the company's valuation, Microsoft's focus on AI and its potential impact on the company's future performance garnered attention. The stock saw a weekly r

In [38]:
# Test with a company-specific query: Supermicro (SMCI)
company_query = "What are the latest developments and market sentiment for Supermicro (SMCI)?"
analyze_news(company_query)


Long term news for SMCI in the last 42 weeks (2024-03-05..2024-12-23 00:00:00):

[2024-09-10..2024-09-16, +9.57% vs market]
Super Micro Computer, Inc. (SMCI) faced multiple class action lawsuits alleging violations of federal securities laws, including misrepresentations of financial growth, related party transactions, and compliance with export restrictions. Despite these allegations, the company's stock saw a weekly return of 9.57%, 6.61% above the market.

[2024-09-17..2024-09-23, +3.75% vs market]
The company continued to face legal challenges with more class action lawsuits being filed against it. Despite these challenges, the company's stock saw a weekly return of 3.75%, 2.23% above the market.

[2024-10-01..2024-10-07, +14.65% vs market]
SMCI stock rebounded by up to 18% following an update on robust sales of its server systems, including liquid cooling solutions. However, the company continued to face legal challenges, making it a risky investment. The stock saw a weekly retur

In [39]:
# Test with a query spanning the market tickers (SPY, QQQ, DJIA).
# The printed header names a single ticker because analyze_news reports only
# the most common ticker among the retrieved individual sources.
company_query = "What are the latest developments and market sentiment for the market tickers (SPY, QQQ, or DJIA)?"
analyze_news(company_query)


Long term news for SPY in the last 49 weeks (2023-12-03..2024-11-11 00:00:00):

[2023-12-14..2023-12-25, +0.92% vs market]
During this period, the S&P 500 ETF Trust (SPY) saw a weekly return of 0.92%, outperforming the market by 0.17%. Key developments included a downward revision of the Q3 GDP from 5.2% to 4.9%, an influx of $20.8 billion into the S&P 500 ETF Trust in a single day, and Goldman Sachs' revision of the S&P 500 index target to 5100. President Joe Biden's approval rating hit a new low of 34%.

[2024-02-05..2024-02-12, +1.71% vs market]
In this period, SPY had a weekly return of 1.71%, outperforming the market by 0.11%. The period was marked by mixed consumer sentiment, with stable inflation expectations and growing optimism about household finances, credit access, and the stock market. However, economists expressed unease with the Federal Reserve's monetary policies, viewing them as overly restrictive. The Consumer Price Index inflation adjustments for the final quarter o