# **ENHANCE RAG WITH RE-RANKING**

### Extracting data from websites (Wikipedia web scraping)


In [None]:
!apt-get install -y wkhtmltopdf


In [None]:
!pip install pdfkit
!pip install requests
!pip install beautifulsoup4


In [3]:
import requests
from bs4 import BeautifulSoup
import pdfkit

def scrape_content(url, timeout=30):
    """Download a web page and return the text of its paragraph tags.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The text of every ``<p>`` element on the page, joined by newlines.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses; otherwise the body of an error page
    # would be scraped and silently mixed into the corpus.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    return '\n'.join(para.get_text() for para in soup.find_all('p'))

def preprocess_content(content):
    """Return *content* without backslashes and with whitespace normalised.

    Every backslash is dropped, then each run of whitespace (spaces, tabs,
    newlines) is collapsed to a single space and the ends are trimmed.
    """
    without_backslashes = content.replace('\\', '')
    words = without_backslashes.split()
    return ' '.join(words)

import html

# Wikipedia articles that make up the knowledge base
urls = [
    'https://en.wikipedia.org/wiki/Artificial_intelligence',
    'https://en.wikipedia.org/wiki/Machine_learning',
    'https://en.wikipedia.org/wiki/Deep_learning',
    'https://en.wikipedia.org/wiki/Natural_language_processing',
    'https://en.wikipedia.org/wiki/Image_processing',
    'https://en.wikipedia.org/wiki/Computer_vision',
    'https://en.wikipedia.org/wiki/Speech_recognition'
]

# Scrape and clean each article (one string per URL)
contents = [preprocess_content(scrape_content(url)) for url in urls]

# Plain-text version of the corpus, articles separated by blank lines
full_content = "\n\n".join(contents)

# wkhtmltopdf renders HTML, so write real markup rather than raw text:
# - one <p> per article, because bare newlines collapse in HTML;
# - html.escape so '<', '>' and '&' in the text are not read as markup;
# - an explicit UTF-8 charset so non-ASCII characters survive conversion.
article_paragraphs = "\n".join(
    f"<p>{html.escape(article)}</p>" for article in contents
)
html_document = (
    '<!DOCTYPE html>\n'
    '<html>\n'
    '<head><meta charset="utf-8"></head>\n'
    f'<body>\n{article_paragraphs}\n</body>\n'
    '</html>\n'
)

# Write with an explicit encoding instead of the platform default
with open('content.html', 'w', encoding='utf-8') as file:
    file.write(html_document)

# Path to the installed wkhtmltopdf
path_wkhtmltopdf = '/usr/bin/wkhtmltopdf'

config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)

# Convert the HTML file to a PDF
pdfkit.from_file(
    'content.html',
    'AI.pdf',
    configuration=config,
    options={'encoding': 'UTF-8'},
)


True

## **RAG Implementation**

To begin, install the required packages and import the libraries.

In [None]:
%pip install pypdf

In [None]:
%pip install langchain-chroma

In [None]:
%pip install sentence_transformers

In [None]:
%pip install torch torchvision

In [None]:
%pip install pinecone langchain openai

In [None]:
%pip install langchain_community

In [None]:
%pip install langchain_openai

In [11]:
import os
import time
import getpass
import pandas as pd
import numpy as np
import pinecone
from pinecone import Pinecone
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import LanceDB
import openai

In [39]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings

# Load the source PDF into a list of documents (`pages`), which the next
# cell passes to the text splitter.
# NOTE(review): "input file path" is a placeholder. The scraping cell above
# writes its output to 'AI.pdf' — confirm that is the intended input here.
loader = PyPDFLoader(r"input file path")
pages = loader.load_and_split()

In [40]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer

# Hugging Face tokenizer handed to the splitter (MiniLM-L12 checkpoint)
minilm_tokenizer = AutoTokenizer.from_pretrained(
    "sentence-transformers/all-MiniLM-L12-v2"
)

# Recursive splitter configured with chunk_size=300 and chunk_overlap=100
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer=minilm_tokenizer,
    chunk_size=300,
    chunk_overlap=100,
    strip_whitespace=True,
)

# Split the loaded PDF documents into the chunks that will be embedded
docs = text_splitter.split_documents(pages)

In [41]:
# Never hard-code the secret in the notebook: use a key already exported in
# the environment, otherwise prompt for it without echoing it to the screen.
openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass(
    "Enter your OpenAI API key: "
)
# Export the key through the environment variable OPENAI_API_KEY
os.environ["OPENAI_API_KEY"] = openai_api_key

# Embedding model used to build the vector store in the next cell
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [42]:
from langchain_chroma import Chroma

# Keep `embedding` as an alias of the configured `embeddings` model instead
# of building a second client on the library's default model: vectors from a
# different model would not be comparable with the ones stored in the index.
embedding = embeddings

# Embed every chunk and build the Chroma vector store from them
vectordb = Chroma.from_documents(documents=docs, embedding=embeddings)

In [None]:
def preprocess_content(content):
    """Strip backslashes from *content* and squeeze whitespace.

    Runs of spaces, tabs and newlines become a single space; leading and
    trailing whitespace is removed.
    """
    return ' '.join(content.replace('\\', '').split())

# Retrieve the stored chunks most similar to the question. `query` and
# `docsnew` are reused by the cross-encoder cell further down.
query = "What is supervised learning?"
docsnew = vectordb.similarity_search(query)

# Clean the first retrieved chunk and display it as the cell's output.
# NOTE(review): indexing [0] raises IndexError if the search returns nothing.
cleaned_content = preprocess_content(docsnew[0].page_content)
cleaned_content

In [None]:
from sentence_transformers import CrossEncoder

# Instantiate a cross-encoder from the TinyBERT MS MARCO checkpoint with
# max_length=512, pinned to the CPU.
# NOTE(review): the next cell rebinds `cross_encoder` to
# 'cross-encoder/ms-marco-MiniLM-L-6-v2' before any scoring happens, so this
# instance is not the one that produces the scores — confirm which model is
# intended.
cross_encoder = CrossEncoder(
    "cross-encoder/ms-marco-TinyBERT-L-2-v2", max_length=512, device="cpu"
)

In [None]:
# Cross-encoder re-ranker
from sentence_transformers import CrossEncoder
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Cleaned text of each retrieved chunk, in retrieval order
document_texts = [preprocess_content(doc.page_content) for doc in docsnew]

# [query, passage] pairs to be scored by the cross-encoder
response = [[query, doc_text] for doc_text in document_texts]

# One relevance score per pair, aligned index-for-index with `response`
scores = cross_encoder.predict(response)

print("Scores:")
for score in scores:
    print(score)

# Re-rank: order the retrieved chunks by descending cross-encoder score.
# Sorting on the score alone keeps ties in retrieval order and avoids
# comparing document objects.
ranked = sorted(
    zip(scores, docsnew, document_texts),
    key=lambda item: item[0],
    reverse=True,
)
reranked_docs = [doc for _, doc, _ in ranked]

print("Re-ranked passages:")
for rank, (ranked_score, _, ranked_text) in enumerate(ranked, start=1):
    print(f"{rank}. score={ranked_score:.4f} {ranked_text}")

In [None]:
# Show every [query, passage] pair that was sent to the cross-encoder
print("Responses:")
for question, passage in response:
    print(question, passage)