### 1. Imports

In [1]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from uuid import uuid4
from langchain_core.documents import Document
import os
from groq import Groq
import pandas as pd

### 2 - Function to take content & chunk size and returns a list of chunks

In [2]:
def chunk_data(content,chunk_size):
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=200)
    documents = text_splitter.create_documents(texts = [content])
    return documents
        

### 3 - Take an individual chunk, db collection and stores in DB

In [3]:
def embed_and_store(chunk, db, uuid=None):
    if uuid == None:
        uuid = str(uuid4())
    chunk.metadata['id'] = uuid
    db.add_documents(documents = [chunk], ids=[uuid],)
    return uuid

In [4]:
def create_document_from_string(content):
    return Document(
        page_content=content,
    )

### 4 - Function to take chunk and returns tags

In [5]:
client = Groq(
    api_key="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",
)

def create_tags(doc):
    question = """Based on the given content generate 10 or less tags in the form of list seperated by comma.
    Don't return redundant or uneccessary tags, just return the tags and nothing else,
    Your output should look like <tag1>,<tag2>,<tag3>,"""
    content = doc.page_content
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"""{question}\n\n {content} """,
            }
        ],
        model="llama3-8b-8192",
    )
    return chat_completion.choices[0].message.content


## 4 - Driver code (also stores tags against chunks to a dataframe)

In [6]:
class PersistData:
    _instance = None

    def __new__(cls, *args, **kwargs):
        if not cls._instance:
            cls._instance = super().__new__(cls, *args, **kwargs)
        return cls._instance
    
    def __init__(self) -> None:
        columns = ['uuid', 'content','tags']
        self.content_df = pd.DataFrame(columns=columns)
        
        self.tag_db = Chroma(
            collection_name="tag_collection",
            embedding_function=HuggingFaceEmbeddings(),
            persist_directory="./our_db",  # Where to save data locally, remove if not neccesary
        )
        
        self.content_db = Chroma(
            collection_name="content_collection",
            embedding_function=HuggingFaceEmbeddings(),
            persist_directory="./our_db",  # Where to save data locally, remove if not neccesary
        )
    
    @staticmethod
    def create(self):
        if self.obj ==None:
            self.obj = PersistData()
        return self.obj
    
    def store_data(self, content):
        chunks = chunk_data(content,chunk_size = 1500)
        uuid_list = []
        for chunk in chunks:
            tags = create_tags(chunk) # a string generated here, we need to split them into words
            uuid = embed_and_store(create_document_from_string(tags),self.tag_db)
            uuid = embed_and_store(chunk,self.content_db,uuid)
            self.content_df = self.content_df.append({'uuid': uuid,
                                  'content':chunk.page_content,
                                  'tags':tags},
                                 ignore_index=True)
        uuid_list.append(uuid)
        return uuid_list

    def search_tags(self, query):
        docs = self.tag_db.similarity_search_with_score(query, k=4)
        nearest_embeddings = [(doc[0].metadata['id'],doc[1]) for doc in docs if 'id' in doc[0].metadata]
        score_df = pd.DataFrame(nearest_embeddings, columns=['uuid', 'score'])
        filtered_df = self.content_df[self.content_df['uuid'].isin([doc[0] for doc in nearest_embeddings])]
        result = pd.merge(filtered_df, score_df, on='uuid', how='inner')
        return result.sort_values(by='score')
    
    def search_content(self,query):
        docs = self.content_db.similarity_search_with_score(query, k=4)
        nearest_embeddings = [(doc[0].metadata['id'],doc[1]) for doc in docs if 'id' in doc[0].metadata]
        score_df = pd.DataFrame(nearest_embeddings, columns=['uuid', 'score'])
        filtered_df = self.content_df[self.content_df['uuid'].isin([doc[0] for doc in nearest_embeddings])]
        result = pd.merge(filtered_df, score_df, on='uuid', how='inner')
        return result.sort_values(by='score')
    


In [7]:
obj = PersistData()

  from tqdm.autonotebook import tqdm, trange


In [8]:
obj.store_data("""Foreign, economic and strategic relations
Main articles: Foreign relations of India and Indian Armed Forces

During the 1950s and 60s, India played a pivotal role in the Non-Aligned Movement.[269] From left to right: Gamal Abdel Nasser of United Arab Republic (now Egypt), Josip Broz Tito of Yugoslavia and Jawaharlal Nehru in Belgrade, September 1961.
In the 1950s, India strongly supported decolonisation in Africa and Asia and played a leading role in the Non-Aligned Movement.[270] After initially cordial relations with neighbouring China, India went to war with China in 1962 and was widely thought to have been humiliated.[271] This was followed by another military conflict in 1967 in which India successfully repelled Chinese attack.[272] India has had tense relations with neighbouring Pakistan; the two nations have gone to war four times: in 1947, 1965, 1971, and 1999. Three of these wars were fought over the disputed territory of Kashmir, while the third, the 1971 war, followed from India's support for the independence of Bangladesh.[273] In the late 1980s, the Indian military twice intervened abroad at the invitation of the host country: a peace-keeping operation in Sri Lanka between 1987 and 1990; and an armed intervention to prevent a 1988 coup d'état attempt in the Maldives. After the 1965 war with Pakistan, India began to pursue close military and economic ties with the Soviet Union; by the late 1960s, the Soviet Union was its largest arms supplier.[274]

Aside from its ongoing special relationship with Russia,[275] India has wide-ranging defence relations with Israel and France. In recent years, it has played key roles in the South Asian Association for Regional Cooperation and the World Trade Organization. The nation has provided 100,000 military and police personnel to serve in 35 UN peacekeeping operations across four continents. It participates in the East Asia Summit, the G8+5, and other multilateral forums.[276] India has close economic ties with countries in South America,[277] Asia, and Africa; it pursues a "Look East" policy that seeks to strengthen partnerships with the ASEAN nations, Japan, and South Korea that revolve around many issues, but especially those involving economic investment and regional security.[278][279]


The Indian Air Force contingent marching at the 221st Bastille Day military parade in Paris, on 14 July 2009. The parade at which India was the foreign guest was led by India's oldest regiment, the Maratha Light Infantry, founded in 1768.[280]
China's nuclear test of 1964, as well as its repeated threats to intervene in support of Pakistan in the 1965 war, convinced India to develop nuclear weapons.[281] India conducted its first nuclear weapons test in 1974 and carried out additional underground testing in 1998. Despite criticism and military sanctions, India has signed neither the Comprehensive Nuclear-Test-Ban Treaty nor the Nuclear Non-Proliferation Treaty, considering both to be flawed and discriminatory.[282] India maintains a "no first use" nuclear policy and is developing a nuclear triad capability as a part of its "Minimum Credible Deterrence" doctrine.[283][284] It is developing a ballistic missile defence shield and, a fifth-generation fighter jet.[285][286] Other indigenous military projects involve the design and implementation of Vikrant-class aircraft carriers and Arihant-class nuclear submarines.[287]

Since the end of the Cold War, India has increased its economic, strategic, and military co-operation with the United States and the European Union.[288] In 2008, a civilian nuclear agreement was signed between India and the United States. Although India possessed nuclear weapons at the time and was not a party to the Nuclear Non-Proliferation Treaty, it received waivers from the International Atomic Energy Agency and the Nuclear Suppliers Group, ending earlier restrictions on India's nuclear technology and commerce. As a consequence, India became the sixth de facto nuclear weapons state.[289] India subsequently signed co-operation agreements involving civilian nuclear energy with Russia,[290] France,[291] the United Kingdom,[292] and Canada.[293]


Prime Minister Narendra Modi of India (left, background) in talks with President Enrique Peña Nieto of Mexico during a visit to Mexico, 2016
The President of India is the supreme commander of the nation's armed forces; with 1.45 million active troops, they compose the world's second-largest military. It comprises the Indian Army, the Indian Navy, the Indian Air Force, and the Indian Coast Guard.[294] The official Indian defence budget for 2011 was US$36.03 billion, or 1.83% of GDP.[295] Defence expenditure was pegged at US$70.12 billion for fiscal year 2022–23 and, increased 9.8% than previous fiscal year.[296][297] India is the world's second-largest arms importer; between 2016 and 2020, it accounted for 9.5% of the total global arms imports.[298] Much of the military expenditure was focused on defence against Pakistan and countering growing Chinese influence in the Indian Ocean.[299] In May 2017, the Indian Space Research Organisation launched the South Asia Satellite, a gift from India to its neighbouring SAARC countries.[300] In October 2018, India signed a US$5.43 billion (over ₹400 billion) agreement with Russia to procure four S-400 Triumf surface-to-air missile defence systems, Russia's most advanced long-range missile defence system.[301]

Economy
Main article: Economy of India

A farmer in northwestern Karnataka ploughs his field with a tractor even as another in a field beyond does the same with a pair of oxen. In 2019, 43% of India's total workforce was employed in agriculture.[302]

India is the world's largest producer of milk, with the largest population of cattle. In 2018, nearly 80% of India's milk was sourced from small farms with herd size between one and two, the milk harvested by hand milking.[304]

Women tend to a recently planted rice field in Junagadh district in Gujarat. 55% of India's female workforce was employed in agriculture in 2019.[303]
According to the International Monetary Fund (IMF), the Indian economy in 2024 was nominally worth $3.94 trillion; it was the fifth-largest economy by market exchange rates and is, at around $15.0 trillion, the third-largest by purchasing power parity (PPP).[17] With its average annual GDP growth rate of 5.8% over the past two decades, and reaching 6.1% during 2011–2012,[305] India is one of the world's fastest-growing economies.[306] However, the country ranks 136th in the world in nominal GDP per capita and 125th in GDP per capita at PPP.[307] Until 1991, all Indian governments followed protectionist policies that were influenced by socialist economics. Widespread state intervention and regulation largely walled the economy off from the outside world. An acute balance of payments crisis in 1991 forced the nation to liberalise its economy;[308] since then, it has moved increasingly towards a free-market system[309][310] by emphasising both foreign trade and direct investment inflows.[311] India has been a member of World Trade Organization since 1 January 1995.[312]

The 522-million-worker Indian labour force is the world's second-largest, as of 2017.[294] The service sector makes up 55.6% of GDP, the industrial sector 26.3% and the agricultural sector 18.1%. India's foreign exchange remittances of US$100 billion in 2022,[313] highest in the world, were contributed to its economy by 32 million Indians working in foreign countries.[314] Major agricultural products include rice, wheat, oilseed, cotton, jute, tea, sugarcane, and potatoes.[13] Major industries include textiles, telecommunications, chemicals, pharmaceuticals, biotechnology, food processing, steel, transport equipment, cement, mining, petroleum, machinery, and software.[13] In 2006, the share of external trade in India's GDP stood at 24%, up from 6% in 1985.[309] In 2008, India's share of world trade was 1.7%;[315] In 2021, India was the world's ninth-largest importer and the sixteenth-largest exporter.[316] Major exports include petroleum products, textile goods, jewellery, software, engineering goods, chemicals, and manufactured leather goods.[13] Major imports include crude oil, machinery, gems, fertiliser, and chemicals.[13] Between 2001 and 2011, the contribution of petrochemical and engineering goods to total exports grew from 14% to 42%.[317] India was the world's second-largest textile exporter after China in the 2013 calendar year.[318]

Averaging an economic growth rate of 7.5% for several years prior to 2007,[309] India has more than doubled its hourly wage rates during the first decade of the 21st century.[319] Some 431 million Indians have left poverty since 1985; India's middle classes are projected to number around 580 million by 2030.[320] Though ranking 68th in global competitiveness,[321] as of 2010, India ranks 17th in financial market sophistication, 24th in the banking sector, 44th in business sophistication, and 39th in innovation, ahead of several advanced economies.[322] With seven of the world's top 15 information technology outsourcing companies based in India, as of 2009, the country is viewed as the second-most favourable outsourcing destination after the United States.[323] India is ranked 40th in the Global Innovation Index in 2023.[324] As of 2023, India's consumer market was the world's fifth-largest.[325]

Driven by growth, India's nominal GDP per capita increased steadily from US$308 in 1991, when economic liberalisation began, to US$1,380 in 2010, to an estimated US$2,731 in 2024. It is expected to grow to US$3,264 by 2026.[17] However, it has remained lower than those of other Asian developing countries such as Indonesia, Malaysia, Philippines, Sri Lanka, and Thailand, and is expected to remain so in the near future.


A panorama of Bangalore, the centre of India's software development economy. In the 1980s, when the first multinational corporations began to set up centres in India, they chose Bangalore because of the large pool of skilled graduates in the area, in turn due to the many science and engineering colleges in the surrounding region.[326]
According to a 2011 PricewaterhouseCoopers (PwC) report, India's GDP at purchasing power parity could overtake that of the United States by 2045.[327] During the next four decades, Indian GDP is expected to grow at an annualised average of 8%, making it potentially the world's fastest-growing major economy until 2050.[327] The report highlights key growth factors: a young and rapidly growing working-age population; growth in the manufacturing sector because of rising education and engineering skill levels; and sustained growth of the consumer market driven by a rapidly growing middle-class.[327] The World Bank cautions that, for India to achieve its economic potential, it must continue to focus on public sector reform, transport infrastructure, agricultural and rural development, removal of labour regulations, education, energy security, and public health and nutrition.[328]

According to the Worldwide Cost of Living Report 2017 released by the Economist Intelligence Unit (EIU) which was created by comparing more than 400 individual prices across 160 products and services, four of the cheapest cities were in India: Bangalore (3rd), Mumbai (5th), Chennai (5th) and New Delhi (8th).[329]

Industries

A tea garden in Sikkim. India, the world's second-largest producer of tea, is a nation of one billion tea drinkers, who consume 70% of India's tea output.
India's telecommunication industry is the second-largest in the world with over 1.2 billion subscribers. It contributes 6.5% to India's GDP.[330] After the third quarter of 2017, India surpassed the US to become the second-largest smartphone market in the world after China.[331]

The Indian automotive industry, the world's second-fastest growing, increased domestic sales by 26% during 2009–2010,[332] and exports by 36% during 2008–2009.[333] In 2022, India became the world's third-largest vehicle market after China and the United States, surpassing Japan.[334] At the end of 2011, the Indian IT industry employed 2.8 million professionals, generated revenues close to US$100 billion equalling 7.5% of Indian GDP, and contributed 26% of India's merchandise exports.[335]

The pharmaceutical industry in India emerged as a global player. As of 2021, with 3000 pharmaceutical companies and 10,500 manufacturing units India is the world's third-largest pharmaceutical producer, largest producer of generic medicines and supply up to 50–60% of global vaccines demand, these all contribute up to US$24.44 billions in exports and India's local pharmaceutical market is estimated up to US$42 billion.[336][337] India is among the top 12 biotech destinations in the world.[338][339] The Indian biotech industry grew by 15.1% in 2012–2013, increasing its revenues from ₹204.4 billion (Indian rupees) to ₹235.24 billion (US$3.94 billion at June 2013 exchange rates).[340]

Energy
Main articles: Energy in India and Energy policy of India
India's capacity to generate electrical power is 300 gigawatts, of which 42 gigawatts is renewable.[341] The country's usage of coal is a major cause of greenhouse gas emissions by India but its renewable energy is competing strongly.[342] India emits about 7% of global greenhouse gas emissions. This equates to about 2.5 tons of carbon dioxide per person per year, which is half the world average.[343][344] Increasing access to electricity and clean cooking with liquefied petroleum gas have been priorities for energy in India.[345]

Socio-economic challenges

Health workers about to begin another day of immunisation against infectious diseases in 2006. Eight years later, and three years after India's last case of polio, the World Health Organization declared India to be polio-free.[346]
Despite economic growth during recent decades, India continues to face socio-economic challenges. In 2006, India contained the largest number of people living below the World Bank's international poverty line of US$1.25 per day.[347] The proportion decreased from 60% in 1981 to 42% in 2005.[348] Under the World Bank's later revised poverty line, it was 21% in 2011.[p][350] 30.7% of India's children under the age of five are underweight.[351] According to a Food and Agriculture Organization report in 2015, 15% of the population is undernourished.[352][353] The Midday Meal Scheme attempts to lower these rates.[354]

A 2018 Walk Free Foundation report estimated that nearly 8 million people in India were living in different forms of modern slavery, such as bonded labour, child labour, human trafficking, and forced begging, among others.[355] According to the 2011 census, there were 10.1 million child labourers in the country, a decline of 2.6 million from 12.6 million in 2001.[356]

Since 1991, economic inequality between India's states has consistently grown: the per-capita net state domestic product of the richest states in 2007 was 3.2 times that of the poorest.[357] Corruption in India is perceived to have decreased. According to the Corruption Perceptions Index, India ranked 78th out of 180 countries in 2018 with a score of 41 out of 100, an improvement from 85th in 2014.[358][359]""")

  self.content_df = self.content_df.append({'uuid': uuid,
  self.content_df = self.content_df.append({'uuid': uuid,
  self.content_df = self.content_df.append({'uuid': uuid,
  self.content_df = self.content_df.append({'uuid': uuid,
  self.content_df = self.content_df.append({'uuid': uuid,
  self.content_df = self.content_df.append({'uuid': uuid,
  self.content_df = self.content_df.append({'uuid': uuid,
  self.content_df = self.content_df.append({'uuid': uuid,
  self.content_df = self.content_df.append({'uuid': uuid,
  self.content_df = self.content_df.append({'uuid': uuid,
  self.content_df = self.content_df.append({'uuid': uuid,
  self.content_df = self.content_df.append({'uuid': uuid,


  self.content_df = self.content_df.append({'uuid': uuid,
  self.content_df = self.content_df.append({'uuid': uuid,


['dcad6239-6315-40e0-85fc-9a79e9c86bb8']

In [9]:
obj.store_data("""What is Retrieval-Augmented Generation (RAG)?
RAG (Retrieval-Augmented Generation) is an AI framework that combines the strengths of traditional information retrieval systems (such as databases) with the capabilities of generative large language models (LLMs).  By combining this extra knowledge with its own language skills, the AI can write text that is more accurate, up-to-date, and relevant to your specific needs.

Get started for free
image of what is RAG
35:30
Grounding for Gemini with Vertex AI Search and DIY RAG
How does Retrieval-Augmented Generation work?
RAGs operate with a few main steps to help enhance generative AI outputs: 

Retrieval and Pre-processing: RAGs leverage powerful search algorithms to query external data, such as web pages, knowledge bases, and databases. Once retrieved, the relevant information undergoes pre-processing, including tokenization, stemming, and removal of stop words.
Generation: The pre-processed retrieved information is then seamlessly incorporated into the pre-trained LLM. This integration enhances the LLM's context, providing it with a more comprehensive understanding of the topic. This augmented context enables the LLM to generate more precise, informative, and engaging responses. 
RAG operates by first retrieving relevant information from a database using a query generated by the LLM. This retrieved information is then integrated into the LLM's query input, enabling it to generate more accurate and contextually relevant text. RAG leverages vector databases, which store data in a way that facilitates efficient search and retrieval.

Why Use RAG?
RAG offers several advantages over traditional methods of text generation, especially when dealing with factual information or data-driven responses. Here are some key reasons why using RAG can be beneficial:

Access to updated information
Traditional LLMs are often limited to their pre-trained knowledge and data. This could lead to potentially outdated or inaccurate responses. RAG overcomes this by granting LLMs access to external information sources, ensuring accurate and up-to-date answers.

Factual grounding
LLMs are powerful tools for generating creative and engaging text, but they can sometimes struggle with factual accuracy. This is because LLMs are trained on massive amounts of text data, which may contain inaccuracies or biases.

RAG helps address this issue by providing LLMs with access to a curated knowledge base, ensuring that the generated text is grounded in factual information. This makes RAG particularly valuable for applications where accuracy is paramount, such as news reporting, scientific writing, or customer service.

Note: RAG may also assist in preventing hallucinations being sent to the end user. The LLM will still generate solutions from time to time where its training is incomplete but the RAG technique helps improve the user experience.

Contextual relevance
The retrieval mechanism in RAG ensures that the retrieved information is relevant to the input query or context.

By providing the LLM with contextually relevant information, RAG helps the model generate responses that are more coherent and aligned with the given context.

This contextual grounding helps to reduce the generation of irrelevant or off-topic responses.

Factual consistency
RAG encourages the LLM to generate responses that are consistent with the retrieved factual information.

By conditioning the generation process on the retrieved knowledge, RAG helps to minimize contradictions and inconsistencies in the generated text.

This promotes factual consistency and reduces the likelihood of generating false or misleading information.

Utilizes vector databases
RAGs leverage vector databases to efficiently retrieve relevant documents. Vector databases store documents as vectors in a high-dimensional space, allowing for fast and accurate retrieval based on semantic similarity.

Improved response accuracy
RAGs complement LLMs by providing them with contextually relevant information. LLMs can then use this information to generate more coherent, informative, and accurate responses, even multi-modal ones.

RAGs and chatbots
RAGs can be integrated into a chatbot system to enhance their conversational abilities. By accessing external information, RAG-powered chatbots helps leverage external knowledge to provide more comprehensive,informative, and context-aware responses, improving the overall user experience.""")

  self.content_df = self.content_df.append({'uuid': uuid,
  self.content_df = self.content_df.append({'uuid': uuid,
  self.content_df = self.content_df.append({'uuid': uuid,
  self.content_df = self.content_df.append({'uuid': uuid,


['36ac1c5c-ca19-4306-b7f3-53bc2e7f0791']

In [27]:
query = "Indian labour force"

In [28]:
result = obj.search_tags(query)

docs[0]: ['dcad6239-6315-40e0-85fc-9a79e9c86bb8', 'fecdc1b7-7a72-4bcc-9867-5e528db563e3', 'd9f49f81-418c-4184-bf90-47ea268f8b5e', 'a4b3e6d3-12ee-4b62-9a20-851909aad26e']
                                    uuid  \
5   a4b3e6d3-12ee-4b62-9a20-851909aad26e   
7   d9f49f81-418c-4184-bf90-47ea268f8b5e   
8   fecdc1b7-7a72-4bcc-9867-5e528db563e3   
13  dcad6239-6315-40e0-85fc-9a79e9c86bb8   

                                              content  \
5   Economy\nMain article: Economy of India\n\nA f...   
7   The 522-million-worker Indian labour force is ...   
8   Averaging an economic growth rate of 7.5% for ...   
13  Since 1991, economic inequality between India'...   

                                                 tags  
5            Economy, Agriculture, Milk, Farms, India  
7   economy, India, industry, agriculture, trade, ...  
8   Economic growth, India, Poverty, Middle class,...  
13  India, economy, inequality, corruption, develo...  


In [29]:
for c in result['tags']:
    print(c)
    print("##"*25)

India, economy, inequality, corruption, development
##################################################
Economic growth, India, Poverty, Middle class, Competitiveness, Finance, Banking, Innovation, IT outsourcing, GDP, Asia
##################################################
economy, India, industry, agriculture, trade, exports, imports, manufacturing, textiles
##################################################
Economy, Agriculture, Milk, Farms, India
##################################################


In [30]:
for c in result['content']:
    print(c)
    print("##"*25)

Since 1991, economic inequality between India's states has consistently grown: the per-capita net state domestic product of the richest states in 2007 was 3.2 times that of the poorest.[357] Corruption in India is perceived to have decreased. According to the Corruption Perceptions Index, India ranked 78th out of 180 countries in 2018 with a score of 41 out of 100, an improvement from 85th in 2014.[358][359]
##################################################
Averaging an economic growth rate of 7.5% for several years prior to 2007,[309] India has more than doubled its hourly wage rates during the first decade of the 21st century.[319] Some 431 million Indians have left poverty since 1985; India's middle classes are projected to number around 580 million by 2030.[320] Though ranking 68th in global competitiveness,[321] as of 2010, India ranks 17th in financial market sophistication, 24th in the banking sector, 44th in business sophistication, and 39th in innovation, ahead of several adv

In [31]:
result_c = obj.search_content(query)

docs[0]: ['fecdc1b7-7a72-4bcc-9867-5e528db563e3', 'd9f49f81-418c-4184-bf90-47ea268f8b5e', 'e5d715cf-da9d-4a70-90d0-2a30bff7aab1', 'a4b3e6d3-12ee-4b62-9a20-851909aad26e']
                                   uuid  \
5  a4b3e6d3-12ee-4b62-9a20-851909aad26e   
6  e5d715cf-da9d-4a70-90d0-2a30bff7aab1   
7  d9f49f81-418c-4184-bf90-47ea268f8b5e   
8  fecdc1b7-7a72-4bcc-9867-5e528db563e3   

                                             content  \
5  Economy\nMain article: Economy of India\n\nA f...   
6  Women tend to a recently planted rice field in...   
7  The 522-million-worker Indian labour force is ...   
8  Averaging an economic growth rate of 7.5% for ...   

                                                tags  
5           Economy, Agriculture, Milk, Farms, India  
6  India, economy, agriculture, GDP, growth, mark...  
7  economy, India, industry, agriculture, trade, ...  
8  Economic growth, India, Poverty, Middle class,...  


In [32]:
for c in result_c['tags']:
    print(c)
    print("##"*25)

Economic growth, India, Poverty, Middle class, Competitiveness, Finance, Banking, Innovation, IT outsourcing, GDP, Asia
##################################################
economy, India, industry, agriculture, trade, exports, imports, manufacturing, textiles
##################################################
India, economy, agriculture, GDP, growth, market, purchasing.power, protectionism, world trade
##################################################
Economy, Agriculture, Milk, Farms, India
##################################################


In [33]:
for c in result_c['content']:
    print(c)
    print("##"*25)

Averaging an economic growth rate of 7.5% for several years prior to 2007,[309] India has more than doubled its hourly wage rates during the first decade of the 21st century.[319] Some 431 million Indians have left poverty since 1985; India's middle classes are projected to number around 580 million by 2030.[320] Though ranking 68th in global competitiveness,[321] as of 2010, India ranks 17th in financial market sophistication, 24th in the banking sector, 44th in business sophistication, and 39th in innovation, ahead of several advanced economies.[322] With seven of the world's top 15 information technology outsourcing companies based in India, as of 2009, the country is viewed as the second-most favourable outsourcing destination after the United States.[323] India is ranked 40th in the Global Innovation Index in 2023.[324] As of 2023, India's consumer market was the world's fifth-largest.[325]

Driven by growth, India's nominal GDP per capita increased steadily from US$308 in 1991, w

In [34]:
result

Unnamed: 0,uuid,content,tags,score
3,dcad6239-6315-40e0-85fc-9a79e9c86bb8,"Since 1991, economic inequality between India'...","India, economy, inequality, corruption, develo...",0.981657
2,fecdc1b7-7a72-4bcc-9867-5e528db563e3,Averaging an economic growth rate of 7.5% for ...,"Economic growth, India, Poverty, Middle class,...",0.989452
1,d9f49f81-418c-4184-bf90-47ea268f8b5e,The 522-million-worker Indian labour force is ...,"economy, India, industry, agriculture, trade, ...",1.000755
0,a4b3e6d3-12ee-4b62-9a20-851909aad26e,Economy\nMain article: Economy of India\n\nA f...,"Economy, Agriculture, Milk, Farms, India",1.009177


In [35]:
result_c

Unnamed: 0,uuid,content,tags,score
3,fecdc1b7-7a72-4bcc-9867-5e528db563e3,Averaging an economic growth rate of 7.5% for ...,"Economic growth, India, Poverty, Middle class,...",0.814134
2,d9f49f81-418c-4184-bf90-47ea268f8b5e,The 522-million-worker Indian labour force is ...,"economy, India, industry, agriculture, trade, ...",0.819212
1,e5d715cf-da9d-4a70-90d0-2a30bff7aab1,Women tend to a recently planted rice field in...,"India, economy, agriculture, GDP, growth, mark...",0.827677
0,a4b3e6d3-12ee-4b62-9a20-851909aad26e,Economy\nMain article: Economy of India\n\nA f...,"Economy, Agriculture, Milk, Farms, India",0.836985


In [2]:
import requests
from bs4 import BeautifulSoup
from langchain.document_loaders.base import BaseLoader
from langchain.docstore.document import Document
from typing import Callable, Union

class GenericHTMLDocumentLoader(BaseLoader):
    def __init__(self, url: str, selectors: Union[dict, None] = None, custom_parser: Callable = None):
        """
        A flexible HTML document loader that can extract content from an HTML page using CSS selectors, XPath,
        or a custom parser function.
        
        :param url: The URL of the webpage to load.
        :param selectors: A dictionary mapping section names to CSS or XPath selectors. If None, extracts all text.
        :param custom_parser: A custom function for parsing content from the BeautifulSoup object.
        """
        self.url = url
        self.selectors = selectors or {}
        self.custom_parser = custom_parser

    def load(self, text) -> list[Document]:
        
        # Parse the HTML using BeautifulSoup
        soup = BeautifulSoup(text, "html.parser")

        # If a custom parser is provided, use it
        if self.custom_parser:
            content = self.custom_parser(soup)
            return [Document(page_content=content, metadata={"source": self.url})]

        documents = []

        # If no selectors are provided, extract all text from the page
        if not self.selectors:
            all_text = soup.get_text(separator="\n")
            documents.append(Document(page_content=all_text, metadata={"source": self.url}))
        else:
            # Iterate through the provided selectors
            for selector_name, selector in self.selectors.items():
                elements = soup.select(selector)  # Using CSS selectors by default
                content = "\n".join([element.get_text() for element in elements])
                
                # Create a LangChain Document object
                if content:
                    documents.append(Document(page_content=content, metadata={"source": self.url, "section": selector_name}))
        
        return documents

# Example usage:
if __name__ == "__main__":
    url = "https://example.com"

    # Example 1: Generic extraction (no specific selectors, all text)
    loader_generic = GenericHTMLDocumentLoader(url)
    docs_generic = loader_generic.load()
    print("Generic extraction:")
    for doc in docs_generic:
        print(doc.page_content)

Generic extraction:








Vector database - Wikipedia






















































Jump to content
















Main menu












Main menu


move to sidebar


hide







		Navigation
	






Main page
Contents
Current events
Random article
About Wikipedia
Contact us
Donate











		Contribute
	






Help
Learn to edit
Community portal
Recent changes
Upload file








































Search
























Search














































Appearance


































Create account




Log in


















Personal tools












 
Create account
 
Log in











		Pages for logged out editors 
learn more








Contributions
Talk


























































Contents


move to sidebar


hide










(Top)












1


Techniques


















2


Implementations


















3


See also


















4


References













In [3]:
from langchain_community.document_loaders import WebBaseLoader
loader = WebBaseLoader("https://en.wikipedia.org/wiki/Vector_database")
docs = loader.load()
for doc in docs:
   print(doc.page_content)





Vector database - Wikipedia


























Jump to content







Main menu





Main menu
move to sidebar
hide



		Navigation
	


Main pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate





		Contribute
	


HelpLearn to editCommunity portalRecent changesUpload file



















Search











Search






















Appearance
















Create account

Log in








Personal tools





 Create account Log in





		Pages for logged out editors learn more



ContributionsTalk




























Contents
move to sidebar
hide




(Top)





1
Techniques








2
Implementations








3
See also








4
References








5
External links


















Toggle the table of contents







Vector database



6 languages




CatalàČeštinaDeutschEspañolFrançaisУкраїнська

Edit links











ArticleTalk





English

















ReadEditView history







Tools





Tools
move to sidebar
hide



		Actions
	




In [16]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import spacy
import json


class TextProcessor:

	def generate_tag_metadata(self, tags):
		# Load English language model
		nlp = spacy.load("en_core_web_sm")
		# Process the text with spaCy
		doc = nlp(tags)
		# Extract named entities and their labels
		meta_data = [{"text": ent.text, "label": ent.label_} for ent in doc.ents]
		# Convert meta data to JSON format
		meta_data_json = json.dumps(meta_data)
		return meta_data_json


	def preprocess_content(self, text):
		# Tokenization
		tokens = word_tokenize(text)
		
		# Remove Noise
		cleaned_tokens = [re.sub(r'[^\w\s]', '', token) for token in tokens]
		
		# Normalization (convert to lowercase)
		cleaned_tokens = [token.lower() for token in cleaned_tokens]
		
		# Remove Stopwords
		stop_words = set(stopwords.words('english'))
		cleaned_tokens = [token for token in cleaned_tokens if token not in stop_words]
		
		# Lemmatize data
		formatted_text = ' '.join(cleaned_tokens)
		nlp = spacy.load("en_core_web_sm")
		lemmatized_tokens = [token.lemma_ for token in nlp(formatted_text)]
		
		return lemmatized_tokens


In [17]:
text_processor = TextProcessor()
text_processor.generate_tag_metadata('Economic growth, India, Poverty, Middle class, Competitiveness, Finance, Banking, Innovation, IT outsourcing, GDP, Asia')

'[{"text": "India", "label": "GPE"}, {"text": "Competitiveness, Finance", "label": "ORG"}, {"text": "Banking, Innovation", "label": "ORG"}, {"text": "Asia", "label": "LOC"}]'

In [18]:
text = """A vector database, vector store or vector search engine is a database that can store vectors (fixed-length lists of numbers) along with other data items. Vector databases typically implement one or more Approximate Nearest Neighbor (ANN) algorithms,[1][2] so that one can search the database with a query vector to retrieve the closest matching database records.

Vectors are mathematical representations of data in a high-dimensional space. In this space, each dimension corresponds to a feature of the data, with the number of dimensions ranging from a few hundred to tens of thousands, depending on the complexity of the data being represented. A vector's position in this space represents its characteristics. Words, phrases, or entire documents, as well as images, audio, and other types of data, can all be vectorized.[3]

These feature vectors may be computed from the raw data using machine learning methods such as feature extraction algorithms, word embeddings[4] or deep learning networks. The goal is that semantically similar data items receive feature vectors close to each other."""

In [21]:
' '.join(text_processor.preprocess_content(text))

'vector database   vector store vector search engine database store vector   fixedlength list number   along data item   vector database typically implement one approximate near neighbor   ann   algorithm    1    2   one search database query vector retrieve close matching database record   vector mathematical representation datum highdimensional space   space   dimension correspond feature data   number dimension range hundred ten thousand   depend complexity datum represent   vector position space represent characteristic   word   phrase   entire document   well image   audio   type datum   vectorize    3   feature vector may compute raw datum use machine learning method feature extraction algorithm   word embedding   4   deep learning network   goal semantically similar datum item receive feature vector close'