In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [9]:
import pandas as pd
import os
from openai import OpenAI
# from langchain.embeddings import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings

from langchain.prompts import PromptTemplate
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.schema import Document
from dotenv import load_dotenv

# Read settings (e.g. the API key) via python-dotenv before touching the environment
load_dotenv()

# OpenAI client, authenticated with the key found in OPENAI_API_KEY
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

In [3]:
# Load the data
df = pd.read_parquet('data/news_feed_flattened.parquet')

# Create one Document per DataFrame row
documents = []
for _, row in df.iterrows():
    # Combine all relevant fields into a single text, one "Label: value" per line.
    lines = [
        f"Type: {row['type']}",
        f"Period: {row['start_date']} to {row['end_date']}",
        f"Ticker: {row['ticker']}",
    ]
    # Some rows (e.g. the market_* summaries) carry no growth value. Formatting
    # the missing value would put a literal "Growth: nan"/"Growth: None" into
    # the text that gets embedded and later handed to the LLM as context, so
    # the line is emitted only when a value is actually present.
    if pd.notna(row['growth']):
        lines.append(f"Growth: {row['growth']}")
    lines.append(f"Count: {row['count']}")
    lines.append(f"Content: {row['text']}")

    # Create Document object; the metadata is what the query cells print as
    # the provenance of each retrieved source.
    doc = Document(
        page_content="\n".join(lines),
        metadata={
            'type': row['type'],
            'ticker': row['ticker'],
            'link': row['link'],
            'start_date': str(row['start_date']),
            'end_date': str(row['end_date'])
        }
    )
    documents.append(doc)

In [4]:
# Preview the last rows of the loaded frame as a sanity check on its columns
df.tail()

Unnamed: 0,type,start_date,end_date,ticker,count,growth,text,link,model
712,individual,2024-11-17,2024-12-23,CELH,18,-0.96%,Celsius Holdings Inc. (CELH) is facing a decli...,https://pythoninvest.com/tpost/ve9dp9v641-week...,
713,individual,2024-11-17,2024-12-23,SMCI,16,2.56%,Super Micro Computer (SMCI) has been experienc...,https://pythoninvest.com/tpost/ve9dp9v641-week...,
714,individual,2024-11-17,2024-12-23,CMG,29,-0.47%,"Chipotle Mexican Grill, Inc. is facing a secur...",https://pythoninvest.com/tpost/ve9dp9v641-week...,
715,market_1_day,2024-12-22,2024-12-23,multiple_tickers,64,,Recent market updates indicate a mix of positi...,https://pythoninvest.com/tpost/ve9dp9v641-week...,GPT4
716,market_1_week,2024-11-17,2024-12-23,multiple_tickers,2332,,1. **GLP-1 Analogues Growth**: The global mark...,https://pythoninvest.com/tpost/ve9dp9v641-week...,GPT4


In [5]:
# TOP 10 tickers by number of rows.
# head(11) rather than head(10): the most frequent value, "multiple_tickers",
# appears to be a placeholder used by market-wide rows rather than a real
# ticker, so one extra row is needed to see ten actual tickers.
df.ticker.value_counts().head(11)

ticker
multiple_tickers    134
AMZN                 52
NVDA                 51
TSLA                 50
AAPL                 47
SPY                  39
GOOG                 37
MSFT                 33
DJIA                 26
SMCI                 21
QQQ                  20
Name: count, dtype: int64

In [6]:
# Chunking settings: target size 1000 with an overlap of 200 between
# neighbouring chunks, both measured with len (i.e. in characters)
splitter_config = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "length_function": len,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_config)

# Break every document into chunks
splits = text_splitter.split_documents(documents)

In [7]:
# Embedding model (library defaults) used to vectorise the chunks
embeddings = OpenAIEmbeddings()

# Build the FAISS vector store from the chunked documents
vectorstore = FAISS.from_documents(documents=splits, embedding=embeddings)

In [28]:
# Prompt for the answer-generation step; it declares the {context} and
# {question} placeholders used inside the template text.
qa_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""You are a financial news analyst assistant. Your task is to provide accurate, 
                well-structured responses based on the provided news articles context. 
                 Focus on summarizing the most important insights and trends, and explain any relationships between different pieces of information. Ensure that you maintain an objective and neutral tone in all responses, citing specific details from the articles when relevant.

                 Do not exceed the line length of 80 characters to ensure readability. Use concise and clear language when summarizing complex details.

                Insert approximate dates where possible (e.g., Nov'24) if the exact date isn't available.
                Organize your response by presenting the most important or recent news first, followed by less important or older information.

                USE ONLY FACTS YOU SEE IN THE NEWS, DO NOT HALLUCINATE. If details are missing, omit them or state that the information is not available.

            Question: {question}
            Context: {context}

            Answer: Let's analyze this based on the provided information.""",
)

# Chat model that writes the answer; temperature 0 is requested
llm = ChatOpenAI(model_name="gpt-4", temperature=0)

# Retriever over the vector store, configured with k=5
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# Assemble the retrieval QA chain; source documents are returned alongside
# the answer so that callers can show where it came from
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": qa_prompt},
)

In [29]:
def get_response(query, chain=None):
    """Get response using the RAG chain.

    Args:
        query: User's query string. Must contain non-whitespace text.
        chain: Optional object exposing ``invoke({"query": ...})`` that returns
            a mapping with the keys "result" and "source_documents". When
            omitted, the notebook-level ``qa_chain`` is used, which keeps
            existing ``get_response(query)`` calls working unchanged.

    Returns:
        Tuple ``(answer, source_documents)`` taken from the chain's result.

    Raises:
        ValueError: If ``query`` is not a string or is empty/whitespace only.
    """
    # Reject blank input up front instead of spending an LLM call on it
    if not isinstance(query, str) or not query.strip():
        raise ValueError("query must be a non-empty string")

    # Look up the global only at call time, so a rebuilt qa_chain is picked up
    active_chain = qa_chain if chain is None else chain

    # .invoke() is used instead of the deprecated direct call chain({...})
    result = active_chain.invoke({"query": query})

    return result["result"], result["source_documents"]

In [30]:
# Pharma-sector smoke test (note: the query is stored in `tech_query` even
# though it asks about pharma)
tech_query = "What are the recent trends in pharma sector news? Focus on growth patterns and significant developments."

response, sources = get_response(tech_query)

# Answer first ...
print("Generated Response:\n")
print(response)

# ... then the provenance of every retrieved chunk
print("\nSource Documents:")
for idx, source in enumerate(sources, start=1):
    meta = source.metadata
    print(
        f"\nSource {idx}:",
        f"Type: {meta['type']}",
        f"Ticker: {meta['ticker']}",
        f"Period: {meta['start_date']} to {meta['end_date']}",
        f"Link: {meta['link']}",
        sep="\n",
    )

Generated Response:

In the pharma sector, there's a growing focus on personalized and precision medicine, 
indicating increased investment opportunities in drug development and diagnostics 
(recent trends). Notably, Regeneron and Sanofi received FDA approval for Dupixent, a 
significant development in treating chronic lung diseases (recent developments). 
However, AbbVie faced a setback as its schizophrenia drug emraclidine failed to meet 
trial endpoints, negatively impacting its stock. The healthcare sector anticipates 
growth in areas like neuroscience, needle-free injectors, and orthobiologics markets. 
The global urology devices market is also projected to grow significantly due to 
advancements in healthcare technology and increasing prevalence of related disorders. 
Despite mixed drug efficacy results causing market volatility, the sector shows 
resilience.

Source Documents:

Source 1:
Type: market_1_week
Ticker: multiple_tickers
Period: 2024-11-12 to 2024-12-16
Link: https://

In [31]:
# Single-company smoke test: NVIDIA
company_query = "What are the latest developments and market sentiment for NVDA (NVIDIA)?"

response, sources = get_response(company_query)

# Answer first ...
print("Generated Response:\n")
print(response)

# ... then the provenance of every retrieved chunk
print("\nSource Documents:")
for idx, source in enumerate(sources, start=1):
    meta = source.metadata
    print(
        f"\nSource {idx}:",
        f"Type: {meta['type']}",
        f"Ticker: {meta['ticker']}",
        f"Period: {meta['start_date']} to {meta['end_date']}",
        f"Link: {meta['link']}",
        sep="\n",
    )

Generated Response:

From Nov'24 to Dec'24, Nvidia (NVDA) stock saw a growth of -1.68%. The stock's potential in the AI market is a key focus, with some analysts suggesting Nvidia is well-positioned to benefit from the AI boom. However, concerns about its high valuation and rising competition are also noted. Key factors driving the debate around the stock's potential include upcoming product launches, major partnerships, and strong financial performance. 

In the same period, the semiconductor industry highlighted Nvidia as a top large-cap pick, predicting competitive advantages from system-level efficiencies in computing. Despite a decline in U.S. factory orders, the ISM services PMI rose, signaling an uptick in service sector activity. Bitcoin surpassed $42,000, buoyed by positive investor sentiment and future financial market speculation. 

From Sep'24 to Sep'24, Nvidia's stock saw a growth of -1.95%. The company's recent stock split and CEO's selling of shares prompted various inte

In [34]:
# Single-company smoke test: Tesla (TSLA)
company_query = "What are the latest developments and market sentiment for Tesla (TSLA)?"

response, sources = get_response(company_query)

# Answer first ...
print("Generated Response:\n")
print(response)

# ... then the provenance of every retrieved chunk
print("\nSource Documents:")
for idx, source in enumerate(sources, start=1):
    meta = source.metadata
    print(
        f"\nSource {idx}:",
        f"Type: {meta['type']}",
        f"Ticker: {meta['ticker']}",
        f"Period: {meta['start_date']} to {meta['end_date']}",
        f"Link: {meta['link']}",
        sep="\n",
    )

Generated Response:

In the most recent period (Nov-Dec'24), Tesla's stock grew by 2.27%. Analysts are divided on the company's future, with some expressing concerns about its high valuation and potential downside, while others are optimistic about its self-driving technology and future plans such as budget-priced electric vehicles and robotaxis. There are also speculations about Tesla's potential launches and developments in China. However, the cryptocurrency Dogecoin, which has been associated with Tesla due to CEO Elon Musk's interest, suffered a significant drop due to the Federal Reserve's revised interest rate outlook.

Earlier, in Sep'24, Tesla's stock saw a mix of optimism and caution, with a growth of -1.52%. Positive news included increased production in China and the rollout of full self-driving software in Europe and China. However, there were also concerns over Tesla's dominance, pricing power, and waning interest in electric vehicles.

In Apr'24, the stock grew by 4.9%. T

In [33]:
# Market-wide smoke test restricted (in the prompt only) to December 2024.
# NOTE(review): the saved output of this cell summarises Dec'23 - Jan'24 news,
# so the date restriction in the question does not appear to constrain what is
# retrieved - consider filtering on the start_date/end_date metadata.
market_query = "Provide a summary of recent market trends in December 2024. Do not add news before December 2024"

response, sources = get_response(market_query)

# Answer first ...
print("Generated Response:\n")
print(response)

# ... then the provenance of every retrieved chunk
print("\nSource Documents:")
for idx, source in enumerate(sources, start=1):
    meta = source.metadata
    print(
        f"\nSource {idx}:",
        f"Type: {meta['type']}",
        f"Ticker: {meta['ticker']}",
        f"Period: {meta['start_date']} to {meta['end_date']}",
        f"Link: {meta['link']}",
        sep="\n",
    )

Generated Response:

In the week from late Dec'23 to early Jan'24, Wall Street saw mixed movements with the Dow Jones gaining modestly and the S&P 500 approaching record levels. Jobless claims rose slightly, while wholesale inventories declined by 0.2%. Companies like Cingulate Inc. and Microbot Medical saw significant stock price increases. AI continued to influence the market, particularly for companies like Microsoft and Nvidia. The consumer healthcare sector is expected to thrive in 2024 despite staffing shortages. Financial services stocks capitalized on stable consumer spending and a resilient labor market. Tech, energy, and healthcare sectors dominated the news, indicating a dynamic market landscape for 2024.

In the week from early to mid-Dec'23, the U.S. economy added 199,000 jobs, outperforming expectations. Consumer sentiment surged by 16.1% YoY in Dec'23, with declining inflation expectations. Tech firms focused on AI, with Alphabet introducing new models. Cathie Wood of AR