In [None]:
%load_ext autoreload

%autoreload 2

import pandas as pd
import os
from openai import OpenAI
from langchain_openai import OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.schema import Document
from dotenv import load_dotenv
from tqdm import tqdm

# Load environment variables (python-dotenv)
load_dotenv()

# OpenAI client; the API key is read from the OPENAI_API_KEY environment variable
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Read the news feed (already joined with market statistics) and preview its last rows
df = pd.read_parquet(path='data/news_feed_with_market_stats.parquet')
df.tail(5)

Unnamed: 0,type,start_date,end_date,ticker,count,text,link,model,growth_last_day,weekly_return,market_daily_return,market_weekly_return,growth_above_market
854,individual,2023-08-08,2023-08-14,DJIA,9,U.S. bank lending remained flat in the latest ...,https://pythoninvest.com/tpost/u7h0i2kxy1-week...,,0.004,0.007191,0.00575,-0.006356,0.013547
855,individual,2023-08-08,2023-08-14,RIVN,12,"Rivian, the electric vehicle maker, reported i...",https://pythoninvest.com/tpost/u7h0i2kxy1-week...,,-0.0005,-0.112026,0.00575,-0.006356,-0.10567
856,individual,2023-08-08,2023-08-14,SPY,7,Here's a summary of the news: - Jim Cramer adv...,https://pythoninvest.com/tpost/u7h0i2kxy1-week...,,0.0055,-0.005769,0.00575,-0.006356,0.000587
857,market_1_day,2023-08-12,2023-08-13,multiple_tickers,228,This week's financial news was characterized b...,https://pythoninvest.com/tpost/u7h0i2kxy1-week...,market news,,-0.003122,-0.00107,-0.003122,
858,market_1_week,2023-08-08,2023-08-14,multiple_tickers,2210,This week's financial news was characterized b...,https://pythoninvest.com/tpost/u7h0i2kxy1-week...,market news,,-0.006356,0.00575,-0.006356,


In [None]:
# Create one Document per DataFrame row.
#
# Missing metrics are written as "N/A" instead of the literal "nan%", and both period
# bounds are normalized to YYYY-MM-DD, so the text that gets embedded and later shown
# to the LLM is clean and consistent.


def _format_pct(value):
    """Render a fractional value as a percentage, or "N/A" when it is missing."""
    return "N/A" if pd.isna(value) else f"{value:.2%}"


def _format_date(value):
    """Render a date-like value as YYYY-MM-DD, or "N/A" when it is missing."""
    if pd.isna(value):
        return "N/A"
    try:
        return pd.Timestamp(value).date().isoformat()
    except (ValueError, TypeError):
        # Not interpretable as a date: keep the raw text rather than failing.
        return str(value)


documents = []
for _, row in df.iterrows():
    start_date = _format_date(row['start_date'])
    end_date = _format_date(row['end_date'])

    # Combine all relevant fields into a single text
    content = "\n".join([
        f"Type: {row['type']}",
        f"Period: {start_date} to {end_date}",
        f"Ticker: {row['ticker']}",
        f"Growth (last day): {_format_pct(row['growth_last_day'])}",
        f"Weekly Return: {_format_pct(row['weekly_return'])}",
        f"Market Daily Return: {_format_pct(row['market_daily_return'])}",
        f"Market Weekly Return: {_format_pct(row['market_weekly_return'])}",
        f"Growth Above Market: {_format_pct(row['growth_above_market'])}",
        f"Count: {row['count']}",
        f"Content: {row['text']}",
    ])

    # Numeric metadata is stored unchanged (possibly NaN) so that code reading it can
    # keep applying numeric format specs such as ':.2%'.
    doc = Document(
        page_content=content,
        metadata={
            'type': row['type'],
            'ticker': row['ticker'],
            'link': row['link'],
            'start_date': start_date,
            'end_date': end_date,
            'weekly_return': row['weekly_return'],
            'market_daily_return': row['market_daily_return'],
            'market_weekly_return': row['market_weekly_return'],
            'growth_above_market': row['growth_above_market']
        }
    )
    documents.append(doc)

In [None]:
# Top 10 individual tickers: 11 rows are requested because the aggregate
# 'multiple_tickers' bucket takes the first place in the counts.
df['ticker'].value_counts().head(n=11)

ticker
multiple_tickers    268
NVDA                 54
AMZN                 53
TSLA                 51
AAPL                 47
GOOG                 39
SPY                  37
MSFT                 31
DJIA                 24
SMCI                 21
QQQ                  20
Name: count, dtype: int64

In [None]:
# Text splitter: chunks of up to 1000 characters (measured with len),
# with 200 characters of overlap between consecutive chunks
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=200,
    chunk_size=1000,
)

# Break every document into chunks
splits = text_splitter.split_documents(documents)

In [None]:
# Embedding model (default OpenAIEmbeddings configuration)
embeddings = OpenAIEmbeddings()

# Build the FAISS vector store from the document chunks
vectorstore = FAISS.from_documents(documents=splits, embedding=embeddings)

In [None]:
# Chat model used to write the final answer
llm = ChatOpenAI(temperature=0, model_name="gpt-4")

# Retrieval QA chain: the retriever is configured with k=7, the "stuff" chain type
# fills the prompt below, and the source documents are returned with the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    return_source_documents=True,
    retriever=vectorstore.as_retriever(search_kwargs={"k": 7}),
    chain_type_kwargs={
        "prompt": PromptTemplate(
            input_variables=["context", "question"],
            template="""You are a financial news analyst assistant. Your task is to provide accurate, 
                well-structured responses based on the provided news articles context. 
                Focus on summarizing insights and trends, with special emphasis on outstanding 
                performance metrics. When analyzing news, prioritize the following:

                1. News about stocks with significant growth above market (Growth Above Market)
                2. News about stocks with high weekly returns (Weekly Return)
                3. Recent market developments with notable trends

                For individual stocks, always mention their performance relative to the market when
                available. Highlight cases where a stock significantly outperforms or underperforms
                the market average.

                Do not exceed the line length of 80 characters to ensure readability. Use concise
                and clear language when summarizing complex details.

                Insert approximate dates where possible (e.g., Nov'24) if the exact date isn't
                available.

                USE ONLY FACTS YOU SEE IN THE NEWS, DO NOT HALLUCINATE. If details are missing,
                omit them or state that the information is not available.

            Question: {question}
            Context: {context}

            Answer: Let's analyze this based on the provided information.""",
        )
    },
)

In [None]:
def get_response(query):
    """Run a query through the RAG chain.

    Args:
        query: User's query string

    Returns:
        A tuple ``(answer, source_documents)`` taken from the ``"result"`` and
        ``"source_documents"`` entries of the chain output.
    """
    # `.invoke` is used instead of the deprecated direct call `qa_chain({...})`.
    chain_output = qa_chain.invoke({"query": query})

    answer = chain_output["result"]
    source_docs = chain_output["source_documents"]
    return answer, source_docs

In [None]:
# Pharma sector query.
# NOTE: the variable is called `tech_query` although the question is about pharma;
# the name is kept unchanged in case other cells refer to it.
tech_query = "What are the recent trends in pharma sector news? Focus on growth patterns and significant developments."

response, sources = get_response(tech_query)
print("Generated Response:\n")
print(response)
print("\nSource Documents:")
for idx, source in enumerate(sources, start=1):
    meta = source.metadata
    print(f"\nSource {idx}:")
    print(f"Type: {meta['type']}")
    print(f"Ticker: {meta['ticker']}")
    print(f"Period: {meta['start_date']} to {meta['end_date']}")
    print(f"Link: {meta['link']}")
    # The three return metrics share the same percentage format
    for label, key in (
        ("Weekly Return", "weekly_return"),
        ("Market Weekly Return", "market_weekly_return"),
        ("Growth Above Market", "growth_above_market"),
    ):
        print(f"{label}: {meta[key]:.2%}")

Generated Response:

In the pharmaceutical sector, there are significant advancements in drug pipelines, 
particularly in obesity and diabetes treatments, attracting investor interest. 
Additionally, there's an increased focus on personalized and precision medicine, 
indicating growing investment opportunities in healthcare, especially in drug 
development and diagnostics. However, AbbVie faced a setback as its schizophrenia 
drug failed to meet trial endpoints, negatively impacting its stock. The healthcare 
sector anticipates growth in areas like neuroscience, needle-free injectors, and 
orthobiologics markets. Overall, the market climate reflects cautious optimism, 
with a clear focus on healthcare advancements.

Source Documents:

Source 1:
Type: market_1_week
Ticker: multiple_tickers
Period: 2024-11-12 to 2024-12-16 00:00:00
Link: https://pythoninvest.com/tpost/cmkmvphfr1-week-9-16-december-2024
Weekly Return: 0.35%
Market Weekly Return: 0.35%
Growth Above Market: nan%

Source 2:


In [None]:
# Single-company query (NVDA)
company_query = "What are the latest developments and market sentiment for NVDA (NVIDIA)?"

response, sources = get_response(company_query)
print("Generated Response:\n")
print(response)
print("\nSource Documents:")
for idx, source in enumerate(sources, start=1):
    meta = source.metadata
    print(f"\nSource {idx}:")
    print(f"Type: {meta['type']}")
    print(f"Ticker: {meta['ticker']}")
    print(f"Period: {meta['start_date']} to {meta['end_date']}")
    print(f"Link: {meta['link']}")
    # The three return metrics share the same percentage format
    for label, key in (
        ("Weekly Return", "weekly_return"),
        ("Market Weekly Return", "market_weekly_return"),
        ("Growth Above Market", "growth_above_market"),
    ):
        print(f"{label}: {meta[key]:.2%}")

Generated Response:

From Nov'24 to Dec'24, NVDA saw a weekly return of -4.91%, underperforming the market's 
weekly return of 0.35%. The news highlighted NVDA's potential in the AI market, with 
upcoming product launches and strong financial performance being key factors. However, 
concerns were raised about its high valuation and rising competition.

In a later period, NVDA's weekly return was 6.77%, slightly outperforming the market's 
5.05% return. Wall Street analysts were bullish on NVDA due to its strong AI ecosystem 
and expected earnings growth. The consensus estimate suggested around $1.29 trillion in 
revenue between 2025 and 2030.

In Jan'24, NVDA had a weekly return of 4.71%, significantly outperforming the market's 
1.60% return. Analysts were enthusiastic about NVDA's position in the AI market and its 
growth potential. However, some investors expressed caution about market valuations.

In another period in Jan'24, NVDA saw a weekly return of 11.43%, greatly outperformin

In [None]:
# Single-company query (TSLA)
company_query = "What are the latest developments and market sentiment for Tesla (TSLA)?"

response, sources = get_response(company_query)
print("Generated Response:\n")
print(response)
print("\nSource Documents:")
for idx, source in enumerate(sources, start=1):
    meta = source.metadata
    print(f"\nSource {idx}:")
    print(f"Type: {meta['type']}")
    print(f"Ticker: {meta['ticker']}")
    print(f"Period: {meta['start_date']} to {meta['end_date']}")
    print(f"Link: {meta['link']}")
    # The three return metrics share the same percentage format
    for label, key in (
        ("Weekly Return", "weekly_return"),
        ("Market Weekly Return", "market_weekly_return"),
        ("Growth Above Market", "growth_above_market"),
    ):
        print(f"{label}: {meta[key]:.2%}")

Generated Response:

In the period from Dec'23, Tesla (TSLA) saw a weekly return of 1.77%, outperforming the market weekly return of 1.15%. The news highlighted discussions about the impact of tax credits, stock price predictions, and the future of AI in Tesla's vehicles. 

By Sep'24, Tesla's weekly return increased to 4.86%, again surpassing the market's 2.96%. The news was mixed, with optimism about increased production in China and the rollout of self-driving software, but also concerns over Tesla's dominance and pricing power.

In the following week, Tesla's weekly return dropped to 1.01%, but it still outperformed the market, which saw a weekly return of -3.14%. The news was again mixed, with positive reports about software rollout and robotics ambitions, but also concerns about competition and potential safety recalls.

By Nov'24, Tesla's weekly return fell to -3.22%, underperforming the market's -1.80%. The news highlighted Elon Musk's support for the Trump administration, poten

In [None]:
# Market-wide summary query restricted (in the question text only) to December 2024
market_query = "Provide a summary of recent market trends in December 2024. Do not add news before December 2024"

response, sources = get_response(market_query)
print("Generated Response:\n")
print(response)
print("\nSource Documents:")
for idx, source in enumerate(sources, start=1):
    meta = source.metadata
    print(f"\nSource {idx}:")
    print(f"Type: {meta['type']}")
    print(f"Ticker: {meta['ticker']}")
    print(f"Period: {meta['start_date']} to {meta['end_date']}")
    print(f"Link: {meta['link']}")
    # The three return metrics share the same percentage format
    for label, key in (
        ("Weekly Return", "weekly_return"),
        ("Market Weekly Return", "market_weekly_return"),
        ("Growth Above Market", "growth_above_market"),
    ):
        print(f"{label}: {meta[key]:.2%}")

Generated Response:

The market trends for December 2023 show a weekly return of 2.56%, matching the market's weekly return. The U.S. economy added 199,000 jobs in November, exceeding expectations and indicating a robust labor market. Consumer sentiment surged by 16.1% YoY in December, with a decline in inflation expectations. Technology firms, particularly Alphabet, are focusing on AI, introducing new models to enhance generative AI efforts. Despite a market shift towards value stocks, Cathie Wood of ARK Invest continues to invest in growth companies. Morgan Stanley is closely monitoring the clean tech sector, including renewable energy, for potential growth trends in 2024.

However, the DJIA underperformed the market with a weekly return of -0.11%, compared to the market's weekly return of 0.75%. Despite a rise in consumer confidence and durable-goods orders, the leading economic index fell for the 20th consecutive month, signaling a potential recession. The Federal Reserve plans to 