In [None]:
!pip install llama-index-vector-stores-deeplake
!pip install deeplake
!pip install llama-index
!pip install sentence-transformers
!pip install langchain-google-genai

In [None]:
import os
import openai
import requests
import re

from google.colab import userdata
from bs4 import BeautifulSoup

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Document
from llama_index.vector_stores.deeplake import DeepLakeVectorStore

### 1. Retrieve and clean documents

In [None]:
# Secrets are read through google.colab's `userdata` helper rather than being
# hard-coded in the notebook.
openai.api_key = userdata.get('OPENAI_API_KEY')
# NOTE(review): USER_AGENT is only exported as an environment variable here;
# the requests.get call in fetch() does not pass it as a header — confirm intent.
os.environ['USER_AGENT'] = 'RAGUserAgent'
os.environ['ACTIVELOOP_TOKEN']= userdata.get('ACTIVELOOP_TOKEN')

In [None]:
# Source pages to download. Each URL is passed to fetch(); the text that comes
# back (if any) is written to one .txt file per URL.
urls = [
    "https://github.com/VisDrone/VisDrone-Dataset",
    "https://paperswithcode.com/dataset/visdrone",
    "https://openaccess.thecvf.com/content_ECCVW_2018/papers/11133/Zhu_VisDrone-DET2018_The_Vision_Meets_Drone_Object_Detection_in_Image_Challenge_ECCVW_2018_paper.pdf",
    "https://github.com/VisDrone/VisDrone2018-MOT-toolkit",
    "https://en.wikipedia.org/wiki/Object_detection",
    "https://en.wikipedia.org/wiki/Computer_vision",
    "https://en.wikipedia.org/wiki/Convolutional_neural_network",
    "https://en.wikipedia.org/wiki/Unmanned_aerial_vehicle",
    "https://www.faa.gov/uas/",
    "https://www.tensorflow.org/",
    "https://pytorch.org/",
    "https://keras.io/",
    "https://arxiv.org/abs/1804.06985",
    "https://arxiv.org/abs/2202.11983",
    "https://motchallenge.net/",
    "http://www.cvlibs.net/datasets/kitti/",
    "https://www.dronedeploy.com/",
    "https://www.dji.com/",
    "https://arxiv.org/",
    "https://openaccess.thecvf.com/",
    "https://roboflow.com/",
    "https://www.kaggle.com/",
    "https://paperswithcode.com/",
    "https://github.com/"
]

In [None]:
def clean_text(content):
    """Remove bracketed reference markers and most punctuation from text.

    Two passes are applied in order:
      1. bracketed markers holding only digits or the word ``edit``
         (optionally padded with whitespace) are deleted;
      2. every character that is not a word character, whitespace or a
         period is deleted.

    Args:
        content: The raw text to clean.

    Returns:
        The cleaned text. Whitespace is left as-is, so removed tokens can
        leave consecutive spaces behind.
    """
    without_markers = re.sub(r'\[\s*(\d+|edit)\s*\]', '', content)
    return re.sub(r'[^\w\s\.]', '', without_markers)

In [None]:
def fetch(url, timeout=10):
    """Download a page and return its cleaned main text.

    The main container is looked up as ``div.mw-parser-output`` first and
    ``div#content`` second. For each of a fixed set of trailing section titles,
    the matching ``span`` heading's parent is removed together with every
    sibling that follows it. The remaining text is passed through
    ``clean_text``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The cleaned text, or ``None`` when the request fails, the server
        answers with an HTTP error status, or no known content container is
        found in the page.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing the error page.
        # HTTPError is a RequestException, so it is reported by the handler below.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        content = soup.find('div',{'class': 'mw-parser-output'}) or soup.find('div', {'id': 'content'})
        if content is None:
            return None

        for section_title in ['References', 'Bibliography', 'External links', 'See also', 'Notes']:
            section = content.find('span',id=section_title)
            while section:
                # Drop everything after the heading first, then the heading itself.
                for sib in section.parent.find_next_siblings():
                    sib.decompose()
                section.parent.decompose()
                section = content.find('span',id=section_title)

        text = content.get_text(separator=' ', strip=True)
        text = clean_text(text)
        return text

    except requests.exceptions.RequestException as e:
        print(f'error from {url}: {e}')
        return None




In [None]:
# Directory the fetched articles are written to, and that SimpleDirectoryReader
# later loads documents from.
LLM_PATH = './contents/'

In [None]:

# The output directory must exist before any file is opened for writing.
os.makedirs(LLM_PATH, exist_ok=True)

for url in urls:
    # Strip the trailing slash before taking the last path segment: otherwise
    # every URL ending in '/' yields an empty name and they all overwrite the
    # same '.txt' file.
    article = url.rstrip('/').split('/')[-1].replace('.html','')

    filename = os.path.join(LLM_PATH, f'{article}.txt')
    clean_article = fetch(url)
    # fetch() returns None on failure; empty text is skipped as well.
    if clean_article:
        with open(filename,'w',encoding='utf-8') as file:
            file.write(clean_article)
        print(f'\tContent was written to {filename}')

print('Content writing done...')

In [None]:
# Load every file found under LLM_PATH into a list of documents.
documents = SimpleDirectoryReader(LLM_PATH).load_data()

In [None]:
documents[1]

### 2. Create the Deep Lake vector store and load the data

In [None]:
from llama_index.core import StorageContext
from pydantic.v1 import BaseModel,Field
from typing import ClassVar
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import deeplake


In [None]:
# Location of the Deep Lake dataset used by the vector store.
# NOTE(review): vector_path holds the same value as dataset_path and is not
# referenced anywhere else in the visible notebook — confirm whether it is needed.
vector_path = 'hub://pythoninaction/drone_1000'
dataset_path = 'hub://pythoninaction/drone_1000'

# NOTE(review): this embedding object is created but not passed to the
# VectorStoreIndex.from_documents calls in this notebook — confirm whether it
# is meant to be used, and which credentials it requires.
gemini_embedding = GoogleGenerativeAIEmbeddings(model='models/embedding-001')

In [None]:
# Create the vector store on dataset_path and wrap it in a storage context so
# the index can write into it.
# NOTE(review): overwrite=True presumably discards any dataset already stored
# at dataset_path on every run — confirm against the DeepLakeVectorStore docs.
vector_store = DeepLakeVectorStore(dataset_path=dataset_path, overwrite=True)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
# Build the index from the loaded documents, storing through storage_context.
# NOTE(review): no embedding model is passed explicitly here, so whatever
# default the library is configured with is used — confirm this is intended.
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

In [None]:
# Load the dataset back from dataset_path so its tensors can be inspected.
ds = deeplake.load(dataset_path)

In [None]:
import json
import pandas as pd
import numpy as np

# Turn the loaded Deep Lake dataset `ds` into a pandas DataFrame with one
# column per tensor.
data = {}

for tensor_name in ds.tensors:
    tensor_data = ds[tensor_name].numpy()

    if tensor_data.ndim > 1:
        # Multi-dimensional tensor: each entry is flattened to a plain list.
        column = [np.array(e).flatten().tolist() for e in tensor_data]
    elif tensor_name == "text":
        # 1-D text tensor: decode each non-empty entry from its raw bytes.
        column = [t.tobytes().decode('utf-8') if t else "" for t in tensor_data]
    else:
        # Any other 1-D tensor converts directly.
        column = tensor_data.tolist()

    data[tensor_name] = column

df = pd.DataFrame(data)

In [None]:
# Function to display a selected record
def display_record(record_number):
    """Print the id, metadata, text and embedding of one row of ``df``.

    Args:
        record_number: Positional index of the row in the module-level ``df``.

    Each field is printed under its own heading followed by a blank line.
    A field that is missing from the row is shown as ``N/A``. Metadata that is
    a list is printed item by item as ``key: value`` lines; any other metadata
    value is printed as-is.
    """
    record = df.iloc[record_number]
    record_id = record.get("id", "N/A")
    metadata = record.get("metadata", "N/A")
    text = record.get("text", "N/A")
    embedding = record.get("embedding", "N/A")

    print("ID:")
    print(record_id)
    print()

    print("Metadata:")
    if not isinstance(metadata, list):
        print(metadata)
    else:
        for item in metadata:
            for key, value in item.items():
                print(f"{key}: {value}")
            print()
    print()

    # Text and embedding share the same heading / value / blank-line layout.
    for heading, value in (("Text:", text), ("Embedding:", embedding)):
        print(heading)
        print(value)
        print()

# Function call to display a record
rec = 0  # Replace with the desired record number
display_record(rec)

In [None]:
# Cast the 'text' column to str, then wrap each (text, id) pair of the
# DataFrame in a Document whose doc_id is the stringified id.
df['text'] = df['text'].astype(str)
documents = [
    Document(text=text, doc_id=str(doc_id))
    for text, doc_id in zip(df['text'], df['id'])
]

### 3. Index-based RAG

In [None]:
# Query sent to the query engine. The typo "boad" was corrected to "boat",
# because the string is passed verbatim to retrieval.
question = "Can drones identify moving objects like boat or vehicle?"

# CONSTANT
K=3              # passed as similarity_top_k when the query engine is built
TEMPERATURE=0.1  # passed as temperature when the query engine is built
MT=1024          # passed as num_output when the query engine is built

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

import numpy as np

In [None]:
# Sentence-embedding model used by calculate_cosine_similarity. The identifier
# is spelled with the casing of the published model name ("L6", not "l6") so
# the lookup does not depend on case-insensitive name resolution.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
def calculate_cosine_similarity(str1,str2):
  """Return the cosine similarity between the embeddings of two strings.

  Both strings are encoded with the module-level ``model`` and compared with
  ``cosine_similarity``, which yields a 2x2 pairwise matrix.

  Args:
    str1: First text.
    str2: Second text.

  Returns:
    The similarity between ``str1`` and ``str2``.
  """
  embeddings = model.encode([str1,str2])
  similarity = cosine_similarity(embeddings)
  # [0][1] is the str1-vs-str2 entry. [0][0] would be str1 compared with
  # itself, which is always ~1.0 regardless of the inputs.
  return similarity[0][1]

### Index Query Engine

In [None]:
from llama_index.core import VectorStoreIndex
# Build an index over the documents reconstructed from the DataFrame. No
# storage_context is passed here, unlike the earlier index creation.
vector_store_index = VectorStoreIndex.from_documents(documents)

In [None]:
print(type(vector_store_index))

In [None]:
# Query engine over the index, retrieving the K most similar nodes per query.
# NOTE(review): confirm that num_output and temperature are actually honoured
# when passed to as_query_engine this way.
vector_engine = vector_store_index.as_query_engine(similarity_top_k=K,num_output=MT,temperature=TEMPERATURE)

In [None]:
print(type(vector_engine))

### Test Response of Index Query Engine

In [None]:
import pandas as pd
import textwrap

def test_index_query_engine(query):
    """Run a query through ``vector_engine`` and tabulate its source nodes.

    The response text is printed wrapped at 100 characters.

    Args:
        query: The question to send to the module-level ``vector_engine``.

    Returns:
        A ``(df, response)`` tuple: ``df`` has one row per source node with the
        columns ``NodeID``, ``Score`` and ``Text``; ``response`` is the object
        returned by the query engine.
    """
    response = vector_engine.query(query)
    print(textwrap.fill(str(response), 100))

    rows = [
        {
            'NodeID': hit.node.id_,
            'Score': hit.score,
            'Text': hit.node.text,
        }
        for hit in response.source_nodes
    ]
    return pd.DataFrame(rows), response

In [None]:
import time

# Time the query with perf_counter: it is a monotonic, high-resolution clock,
# so the measured interval is not affected by system clock adjustments.
start_time = time.perf_counter()
df, response = test_index_query_engine(question)
end_time = time.perf_counter()

# elapsed_time is read later by metric_performance.
elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")

# Display the DataFrame using markdown
print(df.to_markdown(index=False, numalign="left", stralign="left"))

# Show the best-ranked source node, guarding against an empty retrieval
# result (indexing [0] unconditionally would raise IndexError).
nodeid = None
text = ""
if response.source_nodes:
    top_node = response.source_nodes[0]
    nodeid = top_node.node.id_
    print(nodeid)
    text = top_node.get_text()
    print(text)
else:
    print("No source nodes were returned for this query.")

### Metric Performance

In [None]:
def metric_performance(response, elapsed=None):
  """Print a score-per-second performance summary for a query response.

  The average is a softmax-weighted mean of the source-node scores (nodes
  whose score is ``None`` are ignored). The performance metric is that
  average divided by the query time.

  Args:
    response: Object exposing ``source_nodes``, each with a ``score``.
    elapsed: Query duration in seconds. When omitted, the module-level
      ``elapsed_time`` is used.

  Prints the average score, the execution time and the performance metric.
  With no valid scores, or a non-positive duration, the affected values are
  reported as 0 instead of raising.
  """
  if elapsed is None:
    elapsed = elapsed_time

  # Calculate the performance (handling None scores)
  scores = [node.score for node in response.source_nodes if node.score is not None]
  if scores:
    weights = np.exp(scores) / np.sum(np.exp(scores))
    average_score = np.average(scores, weights=weights)
  else:
    # Previously `weights` was undefined on this path and the function crashed.
    average_score = 0.0

  perf = average_score / elapsed if elapsed > 0 else 0.0

  print(f"Average score: {average_score:.4f}")
  print(f"Query execution time: {elapsed:.4f} seconds")
  print(f"Performance metric: {perf:.4f}")


metric_performance(response)