# Loading and chunking the data

In [1]:
from langchain_community.document_loaders import PyPDFLoader   
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Source PDF, resolved relative to the notebook's working directory
pdf_path = "../data/DSA.pdf"

# Parse the PDF into LangChain documents for the splitting step
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Splitting the data into chunks

In [None]:
# Cut the loaded documents into pieces of at most 300 characters, letting
# neighbouring pieces share up to 50 characters of context.
splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
chunks = splitter.split_documents(documents)

# Print every chunk, numbered from 1
for number, piece in enumerate(chunks, start=1):
    print(f"\n chunk {number} \n {piece.page_content}")

<class 'list'>


In [3]:
import os
from textwrap import dedent


# Destination of the generated sample document
file_path = "data/cloud_computing.txt"

# Make sure its parent folder is there (no-op when it already exists)
os.makedirs(os.path.dirname(file_path), exist_ok=True)


# Sample document text that is written to file_path further down.
# The literal opens and closes with a newline for readability; .strip() removes
# both, and dedent() drops any leading whitespace common to every line.
content = dedent("""
Cloud Computing — Overview and Essentials


1. What is Cloud Computing?
Cloud computing is the delivery of computing services — including servers, storage, databases, networking, software,
analytics, and intelligence — over the Internet (the "cloud") to offer faster innovation, flexible resources,
and economies of scale. Instead of owning their own computing infrastructure or data centers, organizations can
rent access to computing resources from cloud providers.


2. Key Service Models
- Infrastructure as a Service (IaaS): Provides virtualized computing resources over the internet.
  Examples: virtual machines, storage, networking. Users manage OS and applications.
- Platform as a Service (PaaS): Provides a platform allowing customers to develop, run, and manage applications
  without handling the underlying infrastructure.
- Software as a Service (SaaS): Delivers software applications over the internet on a subscription basis.
  The provider manages everything; users simply use the software through a web browser or API.


3. Deployment Models
- Public Cloud: Services offered over the public internet and available to anyone (e.g., AWS, Azure, GCP).
- Private Cloud: Cloud infrastructure operated solely for a single organization, either on-site or hosted.
- Hybrid Cloud: A combination of public and private clouds, allowing data and applications to move between them.
- Multi-Cloud: Using multiple cloud providers to avoid vendor lock-in and optimize services/costs.


4. Core Benefits
- Scalability: Scale resources up or down on demand.
- Cost Efficiency: Pay-as-you-go pricing and reduced capital expenditures.
- Agility & Speed: Faster provisioning of resources and shorter time-to-market.
- Reliability: High availability and disaster recovery options.
- Global Reach: Deploy applications closer to users via region/availability zones.


5. Common Use Cases
- Web and mobile applications hosting
- Data storage, backup and disaster recovery
- Big data processing and analytics
- Machine learning and AI workloads
- DevOps and CI/CD pipelines
- IoT backends and real-time telemetry processing


6. Challenges & Considerations
- Security & Compliance: Ensuring data protection, identity management, and regulatory compliance.
- Cost Management: Unexpected costs if resources are not monitored or optimized.
- Latency & Performance: Some workloads require low-latency or specialized hardware.
- Vendor Lock-in: Migration complexity between providers or back on-premises.
- Governance & Control: Ensuring policies, access controls, and auditing are in place.


7. Best Practices
- Design for failure: build systems to handle outages gracefully.
- Automate provisioning and configuration with infrastructure-as-code tools.
- Monitor and log proactively; use centralized observability.
- Optimize costs by rightsizing, reserved instances, and lifecycle policies.
- Implement strong identity and access management (IAM) and encryption.


8. Getting Started (Practical Steps)
- Create a free-tier account with a major cloud provider to explore services.
- Learn core services: compute (VMs, containers), storage (object/block), networking (VPC), and IAM.
- Try a small project: deploy a static website or a simple API with managed database.
- Explore managed developer tools: serverless functions, container services, and managed databases.


Summary:
Cloud computing provides flexible, on-demand access to computing resources and enables modern applications to scale, innovate, and operate cost-effectively. Understanding service models (IaaS/PaaS/SaaS), deployment options, and basic best practices will make it easier to start building cloud-native systems.
""").strip()


# Persist the text as UTF-8 (it contains non-ASCII dashes); an existing file is replaced
with open(file_path, mode="w", encoding="utf-8") as out_file:
    out_file.write(content)

# Report what was written so the cell output confirms size and location
line_count = len(content.splitlines())
char_count = len(content)
print(f"Created file: {file_path} ({line_count} lines, {char_count} characters)")
print("You can now load this file with LlamaIndex (e.g., SimpleDirectoryReader('data').load_data()).")


Created file: data/cloud_computing.txt (68 lines, 3677 characters)
You can now load this file with LlamaIndex (e.g., SimpleDirectoryReader('data').load_data()).


In [4]:

import os
from llama_index.llms.groq import Groq
from llama_index.core import SimpleDirectoryReader,VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding




KeyboardInterrupt: 

In [None]:
import os
from llama_index.llms.groq import Groq
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

from getpass import getpass

# Credentials: never overwrite an exported key with a blank placeholder.
# Reuse GROQ_API_KEY when it is already set and only prompt (input is not
# echoed) when it is missing or empty.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")

# HF_TOKEN is deliberately left as-is: blanking it would discard a token the
# user may have configured. NOTE(review): the embedding model below is
# presumably downloadable without a token — confirm for your environment.

# NOTE(review): confirm this model id is still served by Groq; older Llama 3
# ids have been deprecated over time.
llm = Groq(model="llama3-8b-8192")

# Local sentence-transformers model used to embed the document chunks
emdb = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Load only the generated sample file rather than the whole data folder
file_path = "data/cloud_computing.txt"
documents = SimpleDirectoryReader(input_files=[file_path]).load_data()

# Build an in-memory vector index using the embedding model above
index = VectorStoreIndex.from_documents(documents, embed_model=emdb)

# Answers are generated by the Groq-hosted LLM over the retrieved chunks
query_engine = index.as_query_engine(llm=llm)

response = query_engine.query("Summarize the document into 3 points")

print("\nDocument Summary:\n", response)

# 2. Ask a Question About the Document

In [None]:
import os
from llama_index.llms.groq import Groq
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

from getpass import getpass

# Credentials: never overwrite an exported key with a blank placeholder.
# Reuse GROQ_API_KEY when it is already set and only prompt (input is not
# echoed) when it is missing or empty.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")

# NOTE(review): confirm this model id is still served by Groq; older Llama 3
# ids have been deprecated over time.
llm = Groq(model="llama3-8b-8192")

# Local sentence-transformers model used to embed the document chunks
emdb = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

# NOTE(review): absolute path that looks like a Colab working directory, and
# Stack.txt is not created by any cell visible here — the file must already
# exist at this location before the cell runs.
file_path = "/content/data/Stack.txt"
documents = SimpleDirectoryReader(input_files=[file_path]).load_data()

# Build an in-memory vector index using the embedding model above
index = VectorStoreIndex.from_documents(documents, embed_model=emdb)

# Answers are generated by the Groq-hosted LLM over the retrieved chunks
query_engine = index.as_query_engine(llm=llm)

# NOTE(review): the section heading announces a question-answering step, but
# this still sends the summarization prompt; swap in the intended question.
response = query_engine.query("Summarize the document into 3 points")

print("\nDocument Summary:\n", response)