<a href="https://colab.research.google.com/github/raiaiaia/llm-code-review-clj/blob/main/rag_experiment_mistral.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Experiment RAG

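Code-review generation with the Mistral-7B-Instruct model, using retrieval-augmented generation (RAG): similar past review examples are retrieved from a Chroma collection and added to the prompt as context.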

# Instalações e importações necessárias

In [None]:
%%capture
!pip install chromadb --upgrade
!pip install langchain.community --upgrade
!pip install langchain --upgrade
!pip install -q sentence_transformers \
                accelerate \
                transformers \
                bitsandbytes

In [3]:
# Print the metadata (including the version) of the installed langchain package.
!pip show langchain

In [2]:
# Log in to the Hugging Face Hub through its CLI.
# NOTE(review): presumably required to download the Mistral weights — confirm.
!huggingface-cli login

In [1]:
# Show which Hugging Face account the CLI is currently authenticated as.
!huggingface-cli whoami

In [None]:
%%capture
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, pipeline
from langchain.vectorstores.chroma import Chroma
from huggingface_hub import InferenceClient
from time import time
import transformers
import pandas as pd
import numpy as np
import argparse
import chromadb
import torch
import os

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset, the Chroma
# data directory and the output CSV used in this notebook all live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Carregando os dados

In [None]:
# Read the review-comment dataset from the mounted Drive folder and preview
# its first rows.
df = pd.read_csv('/content/drive/MyDrive/rayanne-llm-code-review-clj/Dataset/com-filtro/filtered_by_number_of_lines.csv')
df.head()

Unnamed: 0,pull_request_url,comment_id,diff_hunk,content,cleaned_diff_hunk,diff_lines
0,https://api.github.com/repos/adamtornhill/code...,33513420,"@@ -1,3 +1,5 @@+[![Build Status](https://travi...",Ah sorry there is clearly an issue here @adamt...,[![Build Status](https:,1
1,https://api.github.com/repos/adamtornhill/code...,95127516,"@@ -10,52 +10,106 @@ ""/some/path => G1 /anothe...",The test cases are a bit too noisy. We need to...,"(def ^:const multi-regexp-group-spec\n""^/some/...",9
2,https://api.github.com/repos/anmonteiro/lumo/p...,87304994,"@@ -35,7 +35,7 @@ Check out `lumo -h` for usag...",:+1:,"2. At the root of the repository, run: `boot r...",1
3,https://api.github.com/repos/anmonteiro/lumo/p...,88037369,"@@ -0,0 +1,35 @@+import net from 'net';",this needs a `/* @flow */` header,import net from 'net';,1
4,https://api.github.com/repos/anmonteiro/lumo/p...,88787684,"@@ -75,30 +76,35 @@ function consumeBuffer(buf...",can we keep the readline interface as the firs...,"export function processLine(sessionId: number,...",1


In [None]:
# Keep only the rows whose `diff_lines` value is at least 5 and renumber the
# index from 0 so that labels and positions coincide.
df = df.loc[df['diff_lines'].ge(5)].reset_index(drop=True)

In [None]:
# Sanity check: (rows, columns) remaining after the filter.
print(df.shape)

(5105, 6)


# Preparando o ambiente

In [None]:
# Open a persistent Chroma client on the Drive directory and print the
# collections it contains (the prompt builder later reads the 'rag' collection).
CHROMA_PATH = '/content/drive/MyDrive/rayanne-llm-code-review-clj/Dataset/rag/chroma_data'
db = chromadb.PersistentClient(CHROMA_PATH)
collections = db.list_collections()
print(collections)

[Collection(id=f8a3bfb5-c8de-4910-9d9f-d4a833007c9b, name=rag)]


In [None]:
%%capture
# Load the BERT tokenizer and encoder that are used to embed code snippets
# for retrieval.
# NOTE: `tokenizer` is the BERT tokenizer, not the tokenizer of the
# generation model.
MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
embedding_model = BertModel.from_pretrained(MODEL_NAME)

In [None]:
# Hugging Face Hub identifiers of the Mistral checkpoints.
MISTRAL_INSTRUCT = 'mistralai/Mistral-7B-Instruct-v0.2'  # instruction-tuned variant
MISTRAL = 'mistralai/Mistral-7B-v0.1'                    # base variant

In [None]:
# Load the instruction-tuned Mistral checkpoint with 4-bit quantized weights
# to reduce GPU memory use; computation runs in float16.
quantization = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

mistral_model = AutoModelForCausalLM.from_pretrained(
    MISTRAL_INSTRUCT,
    device_map="auto",
    quantization_config=quantization,
)

# Inference only: put the model in evaluation mode.
mistral_model.eval()

mistral_tokenizer = AutoTokenizer.from_pretrained(
    MISTRAL_INSTRUCT,
    padding_side="left",
    model_max_length=4096,
)

# Reuse the Mistral tokenizer's own EOS token as its padding token.
# This must not read from `tokenizer`: that name is the BERT tokenizer, which
# has no EOS token (its `eos_token` is None), so the pad token would stay unset.
mistral_tokenizer.pad_token = mistral_tokenizer.eos_token

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

# Utilizando RAG para otimizar as revisões com MISTRAL

In [None]:
def get_embedding_function(text: str) -> list:
  """Embed `text` with the BERT encoder using mean pooling.

  The text is tokenized with the module-level BERT `tokenizer` (truncation
  enabled), passed through the module-level `embedding_model`, and the last
  hidden states are averaged over the token dimension.

  Args:
    text: The string to embed.

  Returns:
    The pooled embedding converted to a plain Python list.
  """
  # Prefer the GPU, but fall back to the CPU instead of crashing when no
  # CUDA device is available.
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  # Model and inputs must live on the same device; after the first call the
  # model is already there and this does nothing.
  embedding_model.to(device)

  inputs = tokenizer(
      text,
      return_tensors="pt",
      truncation=True,
      padding=True
      ).to(device)

  with torch.no_grad():
    outputs = embedding_model(**inputs)

  # Average over the sequence axis, then drop the singleton batch axis.
  embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

  return embeddings.tolist()

In [None]:
def create_prompt(code_snippet, context):
  """Render the review prompt sent to the Mistral instruct model.

  Args:
    code_snippet: The code to be reviewed; inserted verbatim.
    context: Text with retrieved example reviews; inserted verbatim after the
      snippet.

  Returns:
    The full prompt string, wrapped in Mistral's `[INST]` ... `[/INST]`
    instruction markers.

  NOTE(review): the template still closes the first part with `</s>` and then
  opens a second `[INST]`; Mistral's model card documents the single-turn
  layout as `<s>[INST] ... [/INST]`. Consider restructuring — confirm first
  that nothing relies on the exact character length of this template.
  """
  return f'''
      <s>[INST] You are an experienced programmer in Clojure.
      Review the following code snippet and provide feedback on: readability, efficiency, maintainability, and potential bugs.

      {code_snippet}

      ---
      Conduct your review based on the following review examples:

      {context}

      </s>

      [INST] Format your review as text with itemized concrete instructions to the author of the code. Do not add this prompt to the output. [/INST]
  '''

In [None]:
def format_prompt_mistral(code, comment_id):
  """Build the retrieval-augmented review prompt for one code snippet.

  The snippet is embedded, the 5 nearest entries of the 'rag' Chroma
  collection are retrieved, and each retrieved (code, review comment) pair is
  appended to the context passed to `create_prompt`.

  Args:
    code: The code snippet to review.
    comment_id: Identifier of the review comment attached to `code` in the
      dataset. Retrieved entries carrying the same `comment_id` are left out
      so the snippet's own review is never used as an example.

  Returns:
    The prompt string produced by `create_prompt`.
  """
  code_snippet = code
  query_embedding = get_embedding_function(code_snippet)
  collection = db.get_collection('rag')

  # Ask Chroma for the documents and metadata of the hits in the same call,
  # instead of issuing one extra `collection.get` per returned id.
  results = collection.query(
      query_embeddings=[query_embedding],
      n_results=5,
      include=['metadatas', 'documents']
  )

  # Results are nested per query; there is a single query, hence index 0.
  hit_metadatas = results['metadatas'][0]
  hit_documents = results['documents'][0]

  context = ''
  for metadata, diff_hunk in zip(hit_metadatas, hit_documents):
    # NOTE(review): assumes the stored `comment_id` has the same type as the
    # dataframe value, otherwise this comparison never matches — confirm.
    if metadata['comment_id'] != comment_id:
      comment = metadata['content']
      context += f'\n The following code \n {diff_hunk}\n generates this review comment\n {comment}\n'

  return create_prompt(code_snippet=code_snippet, context=context)

In [1]:
t = time()

# Checkpoint file on Drive; the whole dataframe is rewritten every
# SAVE_EVERY examples and once more when the loop ends.
OUTPUT_CSV = '/content/drive/MyDrive/rayanne-llm-code-review-clj/Dataset/output/rag_model_output_2.csv'
SAVE_EVERY = 5

if 'model_comment' not in df.columns:
  df['model_comment'] = [None for _ in range(df.shape[0])]

# Row position to resume from; set to 0 to process the whole dataframe.
# NOTE(review): rows before `start_index` keep whatever `model_comment`
# they already hold in `df` (None on a fresh kernel) and are written to
# OUTPUT_CSV as such.
start_index = 2650

for index, row in df.iloc[start_index:].iterrows():
  print(f'Running example {index+1}')

  prompt = format_prompt_mistral(row['cleaned_diff_hunk'], row['comment_id'])
  model_input = mistral_tokenizer(prompt, return_tensors='pt').to('cuda')

  # top_k=1 limits sampling to the single most likely token, so decoding is
  # effectively greedy even though do_sample=True.
  generated_ids = mistral_model.generate(
      **model_input,
      temperature=0.1,
      top_k=1,
      top_p=1.0,
      repetition_penalty=1.4,
      min_new_tokens=16,
      max_new_tokens=512,
      do_sample=True,
      # Explicit pad id avoids the per-call "Setting pad_token_id" warning.
      pad_token_id=mistral_tokenizer.eos_token_id
  )

  # `generate` returns the prompt tokens followed by the new tokens. Slice
  # off the prompt by token count rather than by a character offset into the
  # decoded text, which depends on how special tokens and whitespace decode.
  prompt_length = model_input['input_ids'].shape[1]
  new_token_ids = generated_ids[0, prompt_length:]
  clean_text = mistral_tokenizer.decode(new_token_ids, skip_special_tokens=True)

  df.at[index, 'model_comment'] = clean_text
  if ((index+1) % SAVE_EVERY == 0):
    print(f'Saving results. (current shape: {index+1})')
    df.to_csv(OUTPUT_CSV, index=False)

# Final save so examples processed after the last periodic checkpoint are
# not lost.
df.to_csv(OUTPUT_CSV, index=False)

print(f'Demanded time: {time() - t} seconds')