<a href="https://colab.research.google.com/github/raiaiaia/llm-code-review-clj/blob/main/creating_rag_db.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Criação da Base de Dados para o RAG


Document Loaders: https://python.langchain.com/v0.1/docs/modules/data_connection/document_loaders/


# Instalações e importações necessárias

In [None]:
%%capture
!pip install langchain.community
!pip install chromadb --upgrade
!pip install langchain

In [None]:
!pip show chromadb

Name: chromadb
Version: 0.5.5
Summary: Chroma.
Home-page: https://github.com/chroma-core/chroma
Author: 
Author-email: Jeff Huber <jeff@trychroma.com>, Anton Troynikov <anton@trychroma.com>
License: 
Location: /usr/local/lib/python3.10/dist-packages
Requires: bcrypt, build, chroma-hnswlib, fastapi, grpcio, httpx, importlib-resources, kubernetes, mmh3, numpy, onnxruntime, opentelemetry-api, opentelemetry-exporter-otlp-proto-grpc, opentelemetry-instrumentation-fastapi, opentelemetry-sdk, orjson, overrides, posthog, pydantic, pypika, PyYAML, tenacity, tokenizers, tqdm, typer, typing-extensions, uvicorn
Required-by: 


In [None]:
# Mount Google Drive at /content/drive; every dataset and Chroma path used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
from transformers import BertTokenizer, BertModel
from langchain.document_loaders import CSVLoader
from langchain.vectorstores import Chroma
from tqdm import tqdm
import pandas as pd
import numpy as np
import chromadb
import torch

# Preparando os dados

In [None]:
# Read the source dataset (filtered_by_number_of_lines.csv) from Drive into `df`.
df = pd.read_csv('/content/drive/MyDrive/rayanne-llm-code-review-clj/Dataset/com-filtro/filtered_by_number_of_lines.csv')

In [None]:
# (rows, columns) of the dataset as loaded, before any filtering.
print(df.shape)

(11890, 6)


In [None]:
# Keep only rows whose `diff_lines` value is >= 5, then order them from the
# largest `diff_lines` to the smallest.
df = (
    df.loc[df['diff_lines'] >= 5]
      .sort_values(by='diff_lines', ascending=False)
)

In [None]:
# (rows, columns) after the `diff_lines` filter.
print(df.shape)

(5105, 6)


In [None]:
# Take the metadata columns first: the next statement narrows `df`, after which
# `comment_id` is no longer available on it.
df_metadata = df[['comment_id','content']]
# Keep only the two columns that are written to the RAG CSV.
df = df[['content', 'cleaned_diff_hunk']]
# Persist without the index column.
df.to_csv('/content/drive/MyDrive/rayanne-llm-code-review-clj/Dataset/com-filtro/filtered_rag.csv', index=False)

In [None]:
# (rows, columns) after narrowing `df` to the two saved columns.
print(df.shape)

(5105, 2)


# ChromaDB

In [None]:
# Reload the CSV written above. `df` gets a fresh default index here, while
# `df_metadata` still carries the index of the sorted frame.
# NOTE(review): later cells pair `df` and `df_metadata` by position, so the two
# must stay in the same row order.
df = pd.read_csv('/content/drive/MyDrive/rayanne-llm-code-review-clj/Dataset/com-filtro/filtered_rag.csv')

In [None]:
# (rows, columns) of the reloaded RAG CSV.
print(df.shape)

(5105, 2)


In [None]:
def load_documents(file_path):
    """Load the CSV at `file_path` with `CSVLoader` and return the result of its `load()` call."""
    return CSVLoader(file_path).load()

# NOTE(review): these two assignments look like a sketch of the fields of a
# loaded document (its text plus source/row metadata) rather than live code.
# Neither name is read at module level in the visible cells, and `page_concent`
# is presumably a typo for `page_content` — TODO confirm and turn into a comment.
page_concent='...'
metadata={'source': 'data/file.csv', 'row':0}



In [None]:
# Load the RAG CSV through `load_documents`; the result is a list (see the type check below).
documents = load_documents('/content/drive/MyDrive/rayanne-llm-code-review-clj/Dataset/com-filtro/filtered_rag.csv')

In [None]:
# Keep only the text of each loaded document; these strings are what gets embedded below.
text_documents = [doc.page_content for doc in documents]

In [None]:
# Sanity check of the container type returned by `load_documents`.
print(type(documents))

<class 'list'>


In [None]:
%%capture
# Checkpoint name shared by the tokenizer and the model so the two always match.
MODEL_NAME = 'bert-base-uncased'
# Module-level tokenizer/model; `get_embedding_function` reads both as globals.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)

In [None]:
def get_embedding_function(text: str) -> list:
  """Embed `text` with the module-level BERT `tokenizer` and `model`.

  The embedding is the mean of the model's last hidden state over the token
  axis (mean pooling).

  Args:
    text: Text to embed. It is tokenized with `truncation=True`, so over-long
      input is cut rather than rejected.

  Returns:
    The pooled embedding converted to a plain Python list via `.tolist()`.
  """
  # Use the GPU when one is available, otherwise fall back to CPU instead of
  # failing on a hard-coded 'cuda' device.
  device = 'cuda' if torch.cuda.is_available() else 'cpu'

  inputs = tokenizer(
      text,
      return_tensors="pt",
      truncation=True,
      padding=True
      ).to(device)

  # Model and inputs must live on the same device.
  model.to(device)

  # Inference only: skip gradient tracking.
  with torch.no_grad():
    outputs = model(**inputs)
  # Average over the token axis (dim=1), drop size-1 dimensions, move to host memory.
  embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

  return embeddings.tolist()

In [None]:
# One embedding per entry of `text_documents`, in order; tqdm reports progress.
embeddings = [get_embedding_function(text) for text in tqdm(text_documents)]

100%|██████████| 5105/5105 [01:48<00:00, 47.01it/s]


In [None]:
# Inspect the container type and the first embedding vector.
# NOTE(review): the second print dumps the entire vector; printing its length or a short slice would be easier to read.
print(type(embeddings))
print(embeddings[0])

<class 'list'>
[-0.22231262922286987, -0.07508031278848648, 0.4689426124095917, -0.29574331641197205, 0.3117946982383728, -0.20683801174163818, -0.032172348350286484, 0.21376684308052063, 0.07578279078006744, -0.01353347860276699, -0.14933307468891144, -0.2965100407600403, -0.1757403314113617, 0.08353213965892792, 0.1226666122674942, 0.5420349836349487, 0.1307133436203003, 0.12233331799507141, -0.3132737874984741, 0.3223687410354614, 0.3755011260509491, 0.055463433265686035, -0.05541156232357025, 0.4610636830329895, 0.627261221408844, 0.09314967691898346, -0.01631273701786995, -0.08863542228937149, -0.4421256184577942, -0.00018328707665205002, 0.47657328844070435, -0.0882616713643074, 0.01959901861846447, -0.33824998140335083, 0.06597252190113068, -0.25769686698913574, 0.008636277168989182, -0.29588964581489563, 0.3613778352737427, 0.3001830577850342, -0.5867102146148682, -0.4902265667915344, 0.18207333981990814, -0.07177406549453735, -0.18687596917152405, -0.3219350576400757, -0.10273

In [None]:
def generate_metadata(df):
  """Build one metadata dict per row of `df`, in row order.

  Each dict holds the row's `comment_id` and `content` values, both converted
  to `str`. Other columns are ignored.
  """
  return [
      {'comment_id': str(record['comment_id']), 'content': str(record['content'])}
      for _, record in df.iterrows()
  ]

In [None]:
# Directory on Drive passed to the persistent Chroma client.
CHROMA_PATH = '/content/drive/MyDrive/rayanne-llm-code-review-clj/Dataset/rag/chroma_data'
# Module-level client; `add_to_chroma` reads it as the global `db`.
db = chromadb.PersistentClient(CHROMA_PATH)

In [None]:
# Show which collections exist before anything is inserted.
collections = db.list_collections()
print(collections)

[]


In [None]:
def add_to_chroma(documents, embeddings):
  """Write the diff hunks, their embeddings and metadata to the "rag" collection.

  Args:
    documents: The texts that were embedded. Only their count is used here, to
      generate one string id per entry ("0", "1", ...).
    embeddings: One embedding per entry, in the same order as `documents`.

  Raises:
    ValueError: If the ids, embeddings, stored texts and metadata do not all
      have the same length.

  NOTE(review): the stored text comes from the global `df['cleaned_diff_hunk']`
  and the metadata from the global `df_metadata`, not from `documents`. All of
  them are paired by position, so they must be row-aligned — confirm that
  storing the diff hunk (rather than the embedded text) is intended.
  """
  stored_texts = df['cleaned_diff_hunk'].tolist()
  metadatas = generate_metadata(df_metadata)

  # Derive ids from the `documents` argument instead of the global `text_documents`.
  documents_ids = [str(i) for i in range(len(documents))]

  # Fail early, with the actual sizes, if the parallel lists are out of step.
  sizes = {
      'ids': len(documents_ids),
      'embeddings': len(embeddings),
      'documents': len(stored_texts),
      'metadatas': len(metadatas),
  }
  if len(set(sizes.values())) != 1:
    raise ValueError(f"Mismatched input lengths: {sizes}")

  # get_or_create + upsert keep this cell re-runnable: a plain create would
  # fail once the "rag" collection already exists in the persistent store.
  collection = db.get_or_create_collection(
      name="rag",
      metadata={"hnsw:space": "cosine"}
  )

  collection.upsert(
      documents=stored_texts,
      embeddings=embeddings,
      metadatas=metadatas,
      ids=documents_ids
  )

  print('done')

In [None]:
# Populate the "rag" collection with the embedded texts and their embeddings.
add_to_chroma(text_documents, embeddings)

done


In [None]:
# List collections again to confirm the "rag" collection now exists.
collections = db.list_collections()
print(collections)

[Collection(id=f8a3bfb5-c8de-4910-9d9f-d4a833007c9b, name=rag)]
