In [8]:
import os
from langchain_groq import ChatGroq
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import OllamaEmbeddings

from langchain.vectorstores.cassandra import Cassandra
import cassio
from dotenv import load_dotenv
load_dotenv()
from sentence_transformers import SentenceTransformer

In [9]:
groq_api_key=os.environ['GROQ_API_KEY']

## connection of the ASTRA DB
ASTRA_DB_APPLICATION_TOKEN="AstraCS:mCvoKdLGapjJkCwAqMdrDyrW:cbaa287e58a235a57a1f5ae356d0f5b2c3fccf4fdd" # enter the "AstraCS:..." string found in in your Token JSON file"
ASTRA_DB_ID="c06fb4ed-0c34-43c5db4f851c"
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN,database_id=ASTRA_DB_ID)

In [12]:

import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Define a simple document class if not provided by LangChain
#class Document:
 #   def __init__(self, text):
  #      self.page_content = text
#        self.metadata = {}


class Document:
    def __init__(self, text, vector=None):
        self.page_content = text
        self.metadata = {}
        self.vector = vector  # Add vector attribute




def fetch_and_parse_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Will raise an HTTPError for bad responses
        soup = BeautifulSoup(response.text, 'html.parser')
        
        
        paragraphs = [p.get_text() for p in soup.find_all('p')]
        headers = [h.get_text() for h in soup.find_all(['h1', 'h2', 'h3'])]
        # Additional extraction for other potential content areas
        divs = [div.get_text() for div in soup.find_all('div', class_=lambda x: x and 'content' in x)]
        
        
        full_text = "\n".join(headers + paragraphs + divs)
        return full_text
    
    except requests.RequestException as e:
        print(f"Error fetching data from {url}: {e}")
        return ""


url = "https://edition.cnn.com/business"
full_text = fetch_and_parse_url(url)


documents = [Document(full_text)]


text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = text_splitter.split_documents(documents)  # Pass list of Document objects


print("no of chunks created:", len(docs))
for i, doc in enumerate(docs[:5]):  # Print the first 5 chunks
    print(f"--- Chunk {i+1} ---")
    print(doc.page_content[:500])  # Print first 500 characters of each chunk for preview
    print()

Number of chunks created: 27
--- Chunk 1 ---
CNN values your feedback
Market Movers
                  
What to Watch
In Case You Missed It
More from Video
Success
Tech
Media
Underscored Money
Markets 



Hot Stocks 



Fear & Greed Index 




            Latest Market News 




            Hot Stocks

--- Chunk 2 ---
Quote Search
Most stock quote data provided by BATS. US market indices are shown in real time, except for the S&P 500 which is refreshed every two minutes. All times are ET. Factset: FactSet Research Systems Inc. All rights reserved. Chicago Mercantile: Certain market data is the property of Chicago Mercantile Exchange Inc. and its licensors. All rights reserved. Dow Jones: The Dow Jones branded indices are proprietary to and are calculated, distributed and marketed by DJI Opco, a subsidiary of 

--- Chunk 3 ---
© 2024 Cable News Network. A Warner Bros. Discovery Company. All Rights Reserved.  CNN Sans ™ & © 2016 Cable News Network.

--- Chunk 4 ---
1. How relevant is this

In [4]:
"""def embed_text(chunks, model_name='all-MiniLM-L6-v2'):
    # Load Sentence Transformer model
    model = SentenceTransformer(model_name)
    embeddings = model.encode(chunks, convert_to_tensor=False)  # Convert to list directly
    return embeddings


# Generate embeddings
embeddings = embed_text(chunks)
print(f"Embedding dimension: {len(embeddings[0])}")"""


Embedding dimension: 384


In [16]:
embeddings=OllamaEmbeddings()
astra_vector_store=Cassandra(
    embedding=embeddings,
    table_name="cnntable",
    session=None,
    keyspace=None

)
print(embeddings)

base_url='http://localhost:11434' model='llama2' embed_instruction='passage: ' query_instruction='query: ' mirostat=None mirostat_eta=None mirostat_tau=None num_ctx=None num_gpu=None num_thread=None repeat_last_n=None repeat_penalty=None temperature=None stop=None tfs_z=None top_k=None top_p=None show_progress=False headers=None model_kwargs=None


In [17]:
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
astra_vector_store.add_documents(docs)
print("Inserted %i headlines." % len(docs))

astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted 27 headlines.


In [18]:
llm=ChatGroq(groq_api_key=groq_api_key,
         model_name="mixtral-8x7b-32768")

from langchain_core.prompts import ChatPromptTemplate
prompt = ChatPromptTemplate.from_template("""
Answer the following question based only on the provided context. 
Elon Musk is going all-in on Donald Trump. 
<context>
{context}
</context>

Question: {input}""")

In [22]:
astra_vector_index.query("One has made cutting greenhouse emissions a major business selling point.",llm=llm)

'Based on the provided context, I cannot directly answer the question about SCOTUS One making cutting greenhouse emissions a major business selling point. However, it mentions that SCOTUS One has made a significant impact on the political landscape. It would be best to look up the latest news and decisions related to SCOTUS One to get a better understanding of their impact on businesses and greenhouse emissions.'

In [23]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

retriever=astra_vector_store.as_retriever()
document_chain=create_stuff_documents_chain(llm,prompt)
retrieval_chain=create_retrieval_chain(retriever,document_chain)

In [25]:
response=retrieval_chain.invoke({"input":"One has made cutting greenhouse emissions a major business selling point."})
response

{'input': 'One has made cutting greenhouse emissions a major business selling point.',
 'context': [Document(page_content='KDKA\n\n\n\n\n\n\n\n\n‘We’re screwed’: Stranded passenger frustrated at tech outage\n1:47\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n•\n          Video\n        \n3:37\n\n\n\nKent Nishimura/Getty Images/File\n\n\n\n\n\n\n\n\nCyber outage disrupts globe. Expert explains how long it could take to fix\n3:37\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSuccess\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nKirkikis/iStock Editorial/Getty Images/File\n\n\n\n\n\n\n\n\nSteps you can take now to avoid college sticker shock\n\n\n\n\n\n\n\n\n\n\n\n\n\nFG Trade/E+/Getty Images\n\n\n\n\n\n\n\n\nUnhappy workers may reduce global GDP by as much as 9%, Gallup estimates\n\n\n\n\n\n\n\n\n\n\n\n\n\ndamircudic/E+/Getty Images\n\n\n\n\n\n\n\n\nFed decision buys more time for savers to profit from high interest rates\n\n\n\n\n\n\n\n\n\n\n\n\n\nshapecharge/E+/Getty Images\n\n\n\n\n\n\n\n\nYou’ve retired. Can you get a m

In [26]:
response["answer"]

'Based on the provided context, Elon Musk has made cutting greenhouse emissions a major business selling point. Musk is the CEO of Tesla, a company known for producing electric vehicles and clean energy solutions, both of which contribute to reducing greenhouse gas emissions.'