In [1]:
!pip install transformers pydantic bitsandbytes-cuda110 bitsandbytes

Collecting bitsandbytes-cuda110
  Downloading bitsandbytes_cuda110-0.26.0.post2-py3-none-any.whl.metadata (6.3 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Downloading bitsandbytes_cuda110-0.26.0.post2-py3-none-any.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m53.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl (69.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes-cuda110, bitsandbytes
Successfully installed bitsandbytes-0.45.2 bitsandbytes-cuda110-0.26.0.post2


In [2]:
import json

with open("/kaggle/input/general-animal-names/general_animal_names.json") as json_data:
    animal_samples = json.load(json_data)["animals"]

print(animal_samples[:10])

['cat', 'lion', 'fish', 'spider', 'fly', 'butterfly', 'horse', 'bull', 'cow', 'dog']


In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import pandas as pd
from tqdm import tqdm
from pydantic import BaseModel, Field, ValidationError
from typing import List
import spacy
from spacy.tokens import DocBin, Span
import re
import gc 
import random

random.seed(42)

nlp = spacy.load("en_core_web_sm")

model_dir = "/kaggle/input/llama-3.1/transformers/8b-instruct/2/"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)


model = AutoModelForCausalLM.from_pretrained(
    model_dir, 
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

model.config.use_cache = False
model.config.pretraining_tp = 1

# # Load CSV files
# animal_df = pd.read_csv("/kaggle/input/animal-names/processed_animal_names.csv")
# needed_animal_classes = ["Mammalia", "Aves", "Arachnida", "Insecta", "Pisces", "Amphibia", "Mollusca", "Crustacea", "Cnidaria"]

# animal_samples = list(animal_df[animal_df["class"].isin(needed_animal_classes)]["common_name"])


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
system_prompt = """
You are an advanced AI trained in natural language processing and synthetic data generation.
Your task is to read the following animal name and generate 10 unique sentences using given animal name.
Make main focus on diversifying sentences - sentence structures and words.

Make sure to extract the exact and use string of the animal name without any changes in it.
For each sentence, highlight the name of the given animal string by setting "||" around it.
You are not allowed to use words that may have a meaning of the animal except given animal name.
Do not provide any explanations.
Only respond with the JSON structured data, structure of JSON should be strictly as in examples.

### Example 1:
Input: 'bald eagle'

Output:
[
    {
        "bald eagle": [
            "The majestic ||bald eagle|| soars high above the tranquil lake, its keen eyes scanning for prey.",
            "With powerful wings, the ||bald eagle|| glides effortlessly through the morning sky.",
            "A symbol of strength and freedom, the ||bald eagle|| commands attention wherever it flies.",
            "The sharp talons of the ||bald eagle|| make it a formidable hunter among the skies.",
            "The call of the ||bald eagle|| echoes through the valleys, a sound both haunting and beautiful.",
            "Under the golden sunset, the silhouette of the ||bald eagle|| is a breathtaking sight.",
            "The ||bald eagle|| is often seen perched on rocky cliffs, surveying the world below.",
            "The ||bald eagle|| is easily identifiable by its white head and tail feathers.",
            "From a distance, it was hard to tell that was the ||bald eagle||.",
            "||eagle|| enthusiasts pay much attention to ||bald eagle||, admiring its regal presence and hunting prowess."
        ]
    }
]


### Example 2:
Input: 'cow'

Output:
[
    {
        "cow": [
            "In the serene meadow, the ||cow|| grazes peacefully under the warm sun.",
            "Once revered in ancient cultures, the ||cow|| holds symbolic meaning even today.",
            "Researchers study the digestion of the ||cow|| to improve agricultural efficiency.",
            "The ||cow||, known for its gentle demeanor, is a beloved farm animal worldwide.",
            "On the rolling hills, the ||cow|| is a symbol of pastoral beauty.",
            "||Cow|| enthusiasts pay much attention for ||cows|| with exceptional milk production.",
            "Farmers appreciate the ||cow|| not only for milk but also for the role of the ||cow|| in sustainable agriculture.",
            "The bell around the ||cow||'s neck jingles as the ||cow|| moves through the pasture.",
            "From ancient myths to modern-day farming, the ||cow|| has always held a special place in human society, symbolizing abundance and nurturing.",
            "The ||cow||'s milk is used to make cheese, yogurt, and butter, highlighting the ||cow||'s importance in the culinary world."
        ]
    }
]


Continue with the task and stop after generating valid output for the given animal by the user by outputting '### Output ends here.'
Don't forget this strict rules.
"""


In [5]:
from transformers import StoppingCriteria, StoppingCriteriaList
from torch import cuda, LongTensor, FloatTensor
import os

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
failed_animal_names = []
import json
import re

import json
import re

import json
import re

def extract_json_from_response(response):
    """
    Extracts JSON content from the response string.
    It looks for the first occurrence of '[' or '{' after the keyword "Output:" 
    and uses the last occurrence of the corresponding closing bracket before "### Output ends here.".
    If extraction or parsing fails, returns an empty list.
    """
    try:
        # Locate the start of the output segment and the end marker
        output_index = response.find("\nUser:")
        end_index = response.rfind("Output ends here")
        if output_index == -1 or end_index == -1:
            print("Output or end marker not found")
            return []
        # Consider only the text between "Output:" and "### Output ends here."
        segment = response[output_index:end_index]
        # Find the first occurrence of '[' or '{' in the segment
        match = re.search(r'([\[\{])', segment)
        if not match:
            print("No JSON start character found in segment")
            return []
        start_bracket = match.start()
        json_content = segment[start_bracket:].strip()
        # Depending on the first character, find the last matching closing bracket
        if json_content[0] == '[':
            end_bracket = json_content.rfind(']')
        else:
            end_bracket = json_content.rfind('}')
        if end_bracket == -1:
            print("No closing bracket found")
            return []
        json_content = json_content[:end_bracket+1]
        # Debug print the extracted JSON content
        parsed = json.loads(json_content)
        # Wrap dictionary in a list if needed
        if not isinstance(parsed, list):
            parsed = [parsed]
        return parsed
    except json.JSONDecodeError as e:
        print(f"Failed to decode JSON: {e}")
        return []





def create_stopping_criteria(stop_words, tokenizer, device):
    class StoppingCriteriaSub(StoppingCriteria):
        def __init__(self, stops = [], device=device, encounters = 1):
            super().__init__()
            self.stops = stops = [stop.to(device) for stop in stops]

        def __call__(self, input_ids: LongTensor, scores: FloatTensor) -> bool:
            last_token = input_ids[0][-1]
            for stop in self.stops:
                if tokenizer.decode(stop) == tokenizer.decode(last_token):
                    return True
            return False

    stop_word_ids = [tokenizer(stop_word,
                               return_tensors="pt", 
                               add_special_tokens=False)["input_ids"].squeeze() 
                               for stop_word in stop_words]

    stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_word_ids)])
    return stopping_criteria


stop_words_list = ["Output ends"]
stopping_criteria = None
if stop_words_list is not None:
    stopping_criteria = create_stopping_criteria(stop_words_list, tokenizer, device)

def write_batch_to_json(entities, start_idx):
    filename = os.path.join("/kaggle/working/generated/", "entity_" + str(start_idx) + ".json")
    with open(filename, "w", encoding='utf-8') as f:
        json.dump(entities, f)
        

def predict_entities_in_batches(test_dataset, model, tokenizer, system_prompt):
    text_generation_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

    with torch.no_grad():
        for i in tqdm(range(len(test_dataset)), desc="Processing batches"):

            prompt = test_dataset[i]
            prompt = f"{prompt}"
            chat_input = [
                f"{system_prompt}\nUser: {prompt}"
            ]
            results = text_generation_pipeline(chat_input,
                                               max_new_tokens=400,
                                               do_sample=True,
                                               temperature=1.1,
                                               top_p=0.9)
            gc.collect()
            torch.cuda.empty_cache()
            entities = []
            for result in results:
                generated_text = result[0]['generated_text']
                entity = extract_json_from_response(generated_text)
                if entity == []:
                    failed_animal_names.append(prompt)
                else:
                    entities.append(entity)
            
            write_batch_to_json(entities, i) 


os.makedirs("/kaggle/working/generated/", exist_ok=True)

processed_data = predict_entities_in_batches(animal_samples, model, tokenizer, system_prompt)

Processing batches:   0%|          | 0/239 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:   0%|          | 1/239 [00:23<1:32:49, 23.40s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:   1%|          | 2/239 [00:45<1:29:36, 22.69s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:   1%|▏         | 3/239 [01:02<1:19:04, 20.11s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:   2%|▏         | 4/239 [01:24<1:22:07, 20.97s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:   2%|▏         | 5/239 [01:47<1:23:33, 21.43s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:   3%|▎         | 6/239 [02:09<1:24:24, 21.74s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:   3%|▎    

Failed to decode JSON: Extra data: line 5 column 1 (char 31)


Processing batches:  27%|██▋       | 64/239 [21:55<53:53, 18.48s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  27%|██▋       | 65/239 [22:17<56:54, 19.63s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  28%|██▊       | 66/239 [22:40<58:55, 20.44s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Failed to decode JSON: Expecting ',' delimiter: line 14 column 10 (char 1163)


Processing batches:  28%|██▊       | 67/239 [23:02<1:00:15, 21.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  28%|██▊       | 68/239 [23:24<1:00:58, 21.39s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  29%|██▉       | 69/239 [23:46<1:01:23, 21.67s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  29%|██▉       | 70/239 [24:09<1:01:34, 21.86s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  30%|██▉       | 71/239 [24:31<1:01:30, 21.97s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  30%|███       | 72/239 [24:45<54:23, 19.54s/it]  Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  31%|███       | 73/239 [25:07<56:23, 20.38s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batc

No JSON start character found in segment


Processing batches:  33%|███▎      | 79/239 [27:13<55:47, 20.92s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  33%|███▎      | 80/239 [27:35<56:28, 21.31s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  34%|███▍      | 81/239 [27:58<56:54, 21.61s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  34%|███▍      | 82/239 [28:20<57:10, 21.85s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  35%|███▍      | 83/239 [28:42<57:06, 21.97s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  35%|███▌      | 84/239 [29:04<56:56, 22.04s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  36%|███▌      | 85/239 [29:19<50:37, 19.73s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  36%|██

No JSON start character found in segment


Processing batches:  41%|████      | 97/239 [33:16<48:16, 20.40s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  41%|████      | 98/239 [33:38<49:13, 20.95s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  41%|████▏     | 99/239 [34:00<49:45, 21.32s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  42%|████▏     | 100/239 [34:18<47:08, 20.35s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  42%|████▏     | 101/239 [34:41<48:08, 20.93s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


No JSON start character found in segment


Processing batches:  43%|████▎     | 102/239 [34:56<44:06, 19.32s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  43%|████▎     | 103/239 [35:18<45:49, 20.22s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  44%|████▎     | 104/239 [35:41<46:53, 20.84s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  44%|████▍     | 105/239 [36:03<47:25, 21.24s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  44%|████▍     | 106/239 [36:25<47:44, 21.53s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  45%|████▍     | 107/239 [36:47<47:52, 21.76s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  45%|████▌     | 108/239 [37:10<47:49, 21.90s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches: 

Failed to decode JSON: Invalid control character at: line 10 column 133 (char 885)


Processing batches:  48%|████▊     | 114/239 [39:23<46:08, 22.15s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  48%|████▊     | 115/239 [39:45<45:45, 22.14s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  49%|████▊     | 116/239 [40:07<45:25, 22.16s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  49%|████▉     | 117/239 [40:29<45:02, 22.15s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  49%|████▉     | 118/239 [40:52<44:41, 22.16s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  50%|████▉     | 119/239 [41:14<44:22, 22.19s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  50%|█████     | 120/239 [41:36<44:02, 22.20s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches: 

No JSON start character found in segment


Processing batches:  55%|█████▍    | 131/239 [45:29<36:42, 20.39s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  55%|█████▌    | 132/239 [45:51<37:19, 20.93s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  56%|█████▌    | 133/239 [46:14<37:42, 21.34s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  56%|█████▌    | 134/239 [46:36<37:47, 21.59s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Failed to decode JSON: Extra data: line 5 column 1 (char 28)


Processing batches:  56%|█████▋    | 135/239 [46:58<37:45, 21.79s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  57%|█████▋    | 136/239 [47:20<37:36, 21.91s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  57%|█████▋    | 137/239 [47:43<37:29, 22.06s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  58%|█████▊    | 138/239 [48:05<37:14, 22.12s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  58%|█████▊    | 139/239 [48:27<36:56, 22.17s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  59%|█████▊    | 140/239 [48:49<36:34, 22.17s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  59%|█████▉    | 141/239 [49:11<36:10, 22.15s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches: 

Failed to decode JSON: Extra data: line 4 column 1 (char 29)


Processing batches:  85%|████████▍ | 203/239 [1:11:17<12:49, 21.38s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  85%|████████▌ | 204/239 [1:11:39<12:36, 21.60s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  86%|████████▌ | 205/239 [1:12:01<12:18, 21.73s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  86%|████████▌ | 206/239 [1:12:23<11:58, 21.78s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  87%|████████▋ | 207/239 [1:12:44<11:26, 21.46s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  87%|████████▋ | 208/239 [1:12:59<10:04, 19.50s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing batches:  87%|████████▋ | 209/239 [1:13:21<10:08, 20.27s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Proces

Failed to decode JSON: Expecting ',' delimiter: line 9 column 95 (char 672)





In [6]:
filename = "/kaggle/working/failed_animal_names.json"
with open(filename, "w", encoding='utf-8') as f:
    json.dump({"failed_animal_names": failed_animal_names}, f)


In [7]:
import shutil
shutil.make_archive('/kaggle/working/generated', "zip", '/kaggle/working/generated/')

'/kaggle/working/generated.zip'