In [1]:
# RAG with Langchain
# source: https://python.langchain.com/docs/concepts/rag/

In [2]:
from dotenv import load_dotenv
import os

# 1st method: using .env file.
load_dotenv()
# Access them using os.getenv or os.environ
api_key = os.getenv("GROQ_API_KEY")

# 2nd method: using hard code
# api_key = "<put the api key here>"
# if not os.environ.get("GROQ_API_KEY"):
#     os.environ["GROQ_API_KEY"] = api_key #getpass.getpass("Enter API key for Groq: ")




from langchain_groq import ChatGroq

llm = ChatGroq(model="llama3-8b-8192")

In [3]:
# !pip install langchain_groq

In [4]:
# !pip install langchain

In [5]:
# !pip install sentence-transformers

In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
# from langchain_community.chat_models import ChatGroq
import os

# # 1. Set your Groq API key
# os.environ["GROQ_API_KEY"] = "your-groq-api-key"

# 2. Load and split your text
text = """
The company said the Starship "experienced a major anomaly" at about 11 p.m. while on the test stand preparing for the tenth flight test at Starbase, SpaceX's launch site at the southern tip of Texas.

"A safety clear area around the site was maintained throughout the operation and all personnel are safe and accounted for," SpaceX said in a statement on the social platform X.

It marked the latest in a series of incidents involving Starship rockets. On Jan. 16, one of the massive rockets broke apart in what the company called a "rapid unscheduled disassembly," sending trails of flaming debris near the Caribbean. Two months later, Space X lost contact with another Starship during a March 6 test flight as the spacecraft broke apart, with wreckage seen streaming over Florida.
"""
splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=20)
docs = splitter.create_documents([text])

# 3. Use HuggingFace Embeddings (local + open source)
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 4. Store embeddings in FAISS
vectorstore = FAISS.from_documents(docs, embedding)
retriever = vectorstore.as_retriever()

# # 5. Use Groq's LLM (Mixtral here)
# llm = ChatGroq(model_name="mixtral-8x7b-32768")

# 6. Create a RetrievalQA chain
qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

# 7. Ask a question
query = "What happend to Space X recently?"
answer = qa.run(query)
print(answer)


Created a chunk of size 201, which is longer than the specified 100
Created a chunk of size 176, which is longer than the specified 100


In [None]:
# !pip install faiss-cpu

In [None]:
# !pip install -U langchain-community

In [None]:
# from langchain_openai import ChatOpenAI
from langchain_core.messages import SystemMessage, HumanMessage

# Define a system prompt that tells the model how to use the retrieved context
system_prompt = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise.
Context: {context}:"""
    
# Define a question
question = """why the spacecraft failed?"""

# Retrieve relevant documents
docs = retriever.invoke(question)

# Combine the documents into a single string
docs_text = "".join(d.page_content for d in docs)

# Populate the system prompt with the retrieved context
system_prompt_fmt = system_prompt.format(context=docs_text)

# # Create a model
# model = ChatOpenAI(model="gpt-4o", temperature=0) 

# Generate a response
questions = llm.invoke([SystemMessage(content=system_prompt_fmt),
                          HumanMessage(content=question)])

In [None]:
questions.content

In [None]:
!conda env list

In [None]:
# how to get the transcript of a video (url given)? 

In [None]:
video_url = "https://youtu.be/sJ9kSzMbMRY?si=k4dnmxrt-yMkuSAy"


In [None]:
from urllib.parse import urlparse, parse_qs

def extract_video_id(url):
    parsed_url = urlparse(url)
    if parsed_url.hostname in ['www.youtube.com', 'youtube.com']:
        return parse_qs(parsed_url.query).get('v', [None])[0]
    elif parsed_url.hostname == 'youtu.be':
        return parsed_url.path[1:]
    return None

# Example usage
video_url = "https://youtu.be/sJ9kSzMbMRY?si=k4dnmxrt-yMkuSAy"
video_id = extract_video_id(video_url)
print("Video ID:", video_id)

In [None]:
from youtube_transcript_api import YouTubeTranscriptApi

video_id = extract_video_id(video_url)
if video_id:
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    full_text = " ".join([entry['text'] for entry in transcript])
    print(full_text)
else:
    print("Could not extract video ID.")