In [1]:
# https://www.kaggle.com/code/hotchpotch/llm-detect-pip 
!pip install -q -U accelerate --no-index --find-links ../input/llm-detect-pip/
!pip install -q -U bitsandbytes --no-index --find-links ../input/llm-detect-pip/
!pip install -q -U transformers --no-index --find-links ../input/llm-detect-pip/

In [2]:
import sys
import torch
import random
import numpy as np
import pandas as pd
import gc
from IPython.display import Markdown

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

sys.path.append("/kaggle/input/peft-main/src")
from peft import PeftModel

#https://github.com/Lightning-AI/lit-gpt/issues/327
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

if (not torch.cuda.is_available()): print("Sorry - GPU required!")

In [3]:
model_name = '/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1'

# Load base model(Mistral 7B)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
)
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()
You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


In [4]:
def display_formatted(input_text):
    input_text = input_text.replace("<s>", "").replace("</s>", "")
    user_start = input_text.find("[INST]") + len("[INST]")
    user_end = input_text.find("[/INST]")
    user_text = input_text[user_start:user_end].strip()
    llm_response = input_text[user_end + len("[/INST]"):].strip()
    
    formatted_text = f"<b>User:</b><br>{user_text}\n\n<b>LLM Response:</b><br>{llm_response}"

    display(Markdown(formatted_text))
    
def get_mistral_response(prompt):
    # Construct the prompt
    messages = [
        {"role": "user", "content": prompt},
    ]
    
    # Apply the chat template and tokenize
    model_inputs = tokenizer.apply_chat_template(messages, return_tensors="pt")
    model_inputs = model_inputs.to("cuda")
    
    # Generate the response with stopping criteria
    generated_ids = model.generate(
        model_inputs, 
        max_new_tokens=128, 
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,  # Ensure end-of-sequence token is used
        num_beams=4,
        early_stopping=True
    )

    # Decode the generated ids
    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]

In [5]:
response = get_mistral_response("What is the capital of France?")
display_formatted(response)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
2024-07-30 12:13:44.595535: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-30 12:13:44.595653: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-30 12:13:44.736881: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


<b>User:</b><br>What is the capital of France?

<b>LLM Response:</b><br>The capital of France is Paris.

In [6]:
model = PeftModel.from_pretrained(model, "/kaggle/input/mistral_finetuned-quora/pytorch/default/1")

In [7]:
response = get_mistral_response("What is the capital of France?")
display_formatted(response)

<b>User:</b><br>What is the capital of France?

<b>LLM Response:</b><br>The capital city of France is Paris.

In [8]:
response = get_mistral_response("Can you explain me theory of relativity")
display_formatted(response)

<b>User:</b><br>Can you explain me theory of relativity

<b>LLM Response:</b><br>Certainly! The theory of relativity is a fundamental concept in physics that explains the relationship between space and time. It was first proposed by Albert Einstein in the early 20th century and has since become one of the most well-established theories in modern physics.
There are two parts to the theory of relativity: the special theory of relativity and the general theory of relativity.
The special theory of relativity states that the laws of physics are the same for all observers moving at a constant speed in a straight line. It also states that the speed of light is always the same, regardless of the

In [11]:
import pandas as pd
# Load your test dataset
test_data = pd.read_csv('/kaggle/input/mistral-qna/quora_dataset.csv')


# Assuming your test data has columns 'source_text' and 'reference_summary'
source_texts = test_data['question'].tolist()
reference_summaries = test_data['answer'].tolist()

In [12]:
predictions = []

for text in source_texts:
    response = get_mistral_response(text)
    predictions.append(response)

In [13]:
!pip install rouge-score

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=cbdb1a3697aff3f4627757d1dc97b2956471f869ad6a2f1a1bc8f8f198718ca5
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [18]:
from rouge_score import rouge_scorer

# Function to compute ROUGE score
def compute_rouge(predictions, references):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = []
    for pred, ref in zip(predictions, references):
        score = scorer.score(ref, pred)
        scores.append(score)
    return scores

# Compute ROUGE scores
rouge_scores = compute_rouge(predictions, reference_summaries)

# Extract fmeasure values and compute averages
average_rouge1 = sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores)
average_rouge2 = sum([score['rouge2'].fmeasure for score in rouge_scores]) / len(rouge_scores)
average_rougeL = sum([score['rougeL'].fmeasure for score in rouge_scores]) / len(rouge_scores)

print(f'Average ROUGE-1: {average_rouge1*100}')
print(f'Average ROUGE-2: {average_rouge2*100}')
print(f'Average ROUGE-L: {average_rougeL*100}')


Average ROUGE-1: 9.42831563783953
Average ROUGE-2: 1.6946171722881727
Average ROUGE-L: 7.541255808609577


In [None]:
model.push_to_hub("llama-3-8b-chat-Qs-Ans-Quora", use_temp_dir=False)
tokenizer.push_to_hub("llama-3-8b-chat-Qs-Ans-Quora", use_temp_dir=False)