In [None]:
! pip install -q sentence_transformers==2.2.2
! pip install -q -U langchain
! pip install -q -U tiktoken
! pip install -q -U pypdf
! pip install -q -U faiss-gpu
! pip install -q -U InstructorEmbedding 
! pip install -q -U bitsandbytes
! pip install -q -U peft
! pip install -q -U trl 
! pip install -q -U transformers 
! pip install -q -U accelerate
! pip install -q -U bitsandbytes
!pip install -q -U datasets==2.16.0

In [None]:
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os,torch
from datasets import Dataset,load_dataset

In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face Hub login; presumably required for the
# trainer.push_to_hub() call later in the notebook.
notebook_login()

In [None]:
# Quantisation settings passed to from_pretrained() when the base model is loaded:
# 4-bit NF4 weights, bfloat16 compute, no double quantisation.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and the 4-bit quantised base model.
# NOTE(review): "IndicBERTv2-SS" looks like a BERT-style encoder checkpoint;
# confirm that loading it through AutoModelForCausalLM is intended.
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicBERTv2-SS")
model = AutoModelForCausalLM.from_pretrained(
    "ai4bharat/IndicBERTv2-SS",
    low_cpu_mem_usage=True,
    quantization_config=bnb_config,
)

In [None]:
# Training-time model settings: disable the generation KV cache and set
# pretraining_tp to 1.
model.config.use_cache = False
model.config.pretraining_tp = 1

# Reload the tokenizer and pad on the right.
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicBERTv2-SS", trust_remote_code=True)
tokenizer.padding_side = 'right'
# Only fall back to EOS when the tokenizer has no pad token of its own.
# Assigning eos_token unconditionally would replace a valid pad token with
# None for tokenizers that do not define an EOS token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
# Displayed as the cell output to confirm the flag was set.
tokenizer.add_eos_token

In [None]:
# Make the quantised model trainable, then attach LoRA adapters to the
# "query" and "value" modules.
# Note: this cell rebinds `model`, so re-running it wraps the model again.
model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["query", "value"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

model = get_peft_model(model, peft_config)

In [None]:
# Hyper-parameters for the fine-tuning run, grouped by concern.
training_arguments = TrainingArguments(
    # Output / reporting
    output_dir="./results",
    hub_model_id="srinija2005/Test_1",
    report_to="tensorboard",
    save_steps=25,
    logging_steps=25,
    # Run length and batching
    num_train_epochs=1,
    max_steps=-1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # Optimisation
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # Mixed precision (both disabled)
    fp16=False,
    bf16=False,
)

In [None]:
# Load the "livinNector/indic_corp" dataset; the "train" split is sampled in a later cell.
dataset = load_dataset("livinNector/indic_corp")

In [None]:
# Make sure a padding token is registered before fixed-length tokenisation.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Use EOS as the pad token when the tokenizer defines one, else the literal "[PAD]".
if tokenizer.eos_token:
    tokenizer.pad_token = tokenizer.eos_token
else:
    tokenizer.pad_token = "[PAD]"

# Same 512-token limit that tokenize_function passes as max_length.
tokenizer.model_max_length = 512


In [None]:
# Draw a reproducible (seed=42) random 0.5% sample of the training split.
train_split = dataset["train"]
subset_size = int(0.005 * len(train_split))
small_dataset = train_split.shuffle(seed=42).select(range(subset_size))

In [None]:
def tokenize_function(examples):
    """Tokenise the "text" field of a batch to fixed-length sequences.

    Args:
        examples: A batch mapping that contains a "text" entry.

    Returns:
        The tokenizer output for the batch, padded and truncated to 512 tokens.
    """
    texts = examples["text"]
    # max_length matches tokenizer.model_max_length set earlier in the notebook.
    encoding_options = dict(padding="max_length", truncation=True, max_length=512)
    return tokenizer(texts, **encoding_options)

# Tokenise the sampled subset in batches and drop the raw "text" column.
tokenized_datasets = small_dataset.map(tokenize_function, batched=True, remove_columns=["text"])


In [None]:
# Build the supervised fine-tuning trainer.
# `model` was already wrapped with the LoRA adapters by get_peft_model(), so
# peft_config is deliberately not passed again: handing the trainer both a
# PEFT-wrapped model and a peft_config is redundant.
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_datasets,  # Ensure tokenized dataset is used
    tokenizer=tokenizer,
    args=training_arguments,
)

In [None]:
# Run the fine-tuning loop configured by training_arguments.
trainer.train()

In [None]:
import torch

# Define save path
save_directory = "./fine_tuned_model"

# Save the model (creates the directory if needed)
model.save_pretrained(save_directory)

# Save the tokenizer
tokenizer.save_pretrained(save_directory)

# Save the training arguments next to the model.
# The arguments object is named `training_arguments` in this notebook, and
# TrainingArguments exposes to_json_string() rather than save_to_json().
with open(f"{save_directory}/training_args.json", "w", encoding="utf-8") as args_file:
    args_file.write(training_arguments.to_json_string())

print(f"Model saved to {save_directory}")


In [None]:
# Upload the trained model to the Hugging Face Hub (hub_model_id is set in training_arguments).
trainer.push_to_hub()

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Load your fine-tuned model
# NOTE(review): training_arguments pushes to hub_model_id "srinija2005/Test_1";
# confirm that "srinija2005/MyModel" is the repository you mean to load.
model_name = "srinija2005/MyModel"  # Replace with your Hugging Face model ID or local path
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Ensure padding is correctly set: keep the tokenizer's own pad token and only
# fall back to EOS when none is defined (EOS may itself be missing).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Create a text-generation pipeline
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.pad_token_id,
    max_length=256,  # Adjust as needed
    do_sample=True,  # Required for temperature / top_p to take effect
    temperature=0.7,  # Adjust for randomness
    top_p=0.95,
    repetition_penalty=1.15
)

# Sample prompts used to smoke-test the generation pipeline
test_prompts = [
    "भारत का प्रधानमंत्री कौन है?",
    "AI का भविष्य क्या है?",
    "नमस्ते, आप कैसे हैं?",
]

# Echo each prompt, then print the single sequence generated for it
for prompt in test_prompts:
    print(f"📝 **Prompt:** {prompt}")
    generations = pipe(prompt, max_length=100, num_return_sequences=1)
    first_generation = generations[0]
    print(f"🤖 **Model Response:** {first_generation['generated_text']}\n")


In [None]:
# Rebuild the pipeline with deterministic (greedy) decoding.
# temperature=0 is not a valid sampling temperature and top_p is ignored without
# sampling, so greedy decoding is requested explicitly via do_sample=False.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.pad_token_id,  # Pad with the tokenizer's pad token
    max_length=512,  # Matches the 512-token limit used during tokenisation
    do_sample=False,
    repetition_penalty=1.15
)

In [None]:
! pip install  langchain_community


In [None]:
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import PromptTemplate, LLMChain
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.chains import RetrievalQA

In [None]:
# Wrap the transformers pipeline so LangChain chains can use it as an LLM.
llm = HuggingFacePipeline(pipeline = pipe)

In [None]:
# Show where the model's modules were placed. The attribute is presumably only
# set when the model was loaded with a device map, so fall back to None
# instead of raising AttributeError.
getattr(model, "hf_device_map", None)

In [None]:
# Quick check that the LangChain wrapper generates text for a short prompt.
llm.invoke("வணக்கம்")

In [None]:
# Read every PDF directly inside the input directory using PyPDFLoader,
# with multithreading and a progress bar enabled.
loader = DirectoryLoader(
    "/kaggle/input/theripistbot",
    loader_cls=PyPDFLoader,
    glob="./*.pdf",
    use_multithreading=True,
    show_progress=True,
)
documents = loader.load()

In [None]:
# Number of documents returned by the loader.
len(documents)

In [None]:
# Inspect one loaded document.
# NOTE(review): index 72 is hard-coded and raises IndexError if fewer than 73 documents were loaded.
documents[72]

In [None]:
# Merge the text of all loaded documents into one string, replacing tabs with spaces.
# A single join avoids the quadratic cost of repeated string concatenation, and
# page_content is read directly instead of going through a dict conversion.
st = "".join(doc.page_content.replace("\t", " ") for doc in documents)

In [None]:
# Total number of characters in the merged text.
len(st)

In [None]:
# Cut the merged text into chunks of at most 800 characters with no overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=800)
texts = text_splitter.split_text(st)

In [None]:
# Embedding model, run on the GPU.
# NOTE(review): an Instruct-embeddings wrapper is used with a MiniLM
# sentence-transformers checkpoint; confirm this pairing is intended.
embeddings = HuggingFaceInstructEmbeddings(
    model_kwargs={"device": "cuda"},
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Index every text chunk in a FAISS vector store.
vectordb = FAISS.from_texts(texts=texts, embedding=embeddings)

In [None]:
# Sanity-check retrieval with a sample query.
vectordb.similarity_search('depression')

In [None]:
# Instruction prompt for the QA chain; {context} and {question} are filled in
# when the prompt is formatted.
prompt_template = """
Ask for questions about how they feel about their problem.
Make the user feel comfortable.
Answer in the same language the question was asked.

{context}

Question: {question}
Answer:"""

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [None]:
# Plain prompt -> LLM chain.
# NOTE(review): llm_chain is not referenced again in the visible cells; the RetrievalQA chain is built separately.
llm_chain = LLMChain(prompt=PROMPT, llm=llm)

In [None]:
# Retriever returning the 3 most similar chunks.
# search_type is an argument of as_retriever() itself; search_kwargs should only
# carry options forwarded to the underlying search call (here: k).
retriever = vectordb.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

# "stuff" chain: the retrieved chunks are inserted into PROMPT's {context} slot.
qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff",
    retriever = retriever,
    chain_type_kwargs = {"prompt": PROMPT},
    return_source_documents = True,
    verbose = True
)

In [None]:
def wrap_text_preserve_newlines(text, width=700):
    """Wrap each line of ``text`` to ``width`` columns, keeping existing line breaks.

    Args:
        text: The string to wrap; it is split on newline characters.
        width: Maximum line width passed to ``textwrap.fill``.

    Returns:
        The individually wrapped lines joined back together with newlines.
    """
    rewrapped = []
    for segment in text.split('\n'):
        rewrapped.append(textwrap.fill(segment, width=width))
    return '\n'.join(rewrapped)

def process_llm_response(llm_response):
    """Return the chain's 'result' text, wrapped line by line.

    Args:
        llm_response: Mapping returned by the QA chain; must contain 'result'.

    Returns:
        The wrapped answer string.
    """
    return wrap_text_preserve_newlines(llm_response['result'])

In [None]:
import time
import textwrap

In [None]:
def llm_ans(query):
    """Answer ``query`` with the retrieval QA chain and append the elapsed time.

    Args:
        query: The user question passed to ``qa_chain.invoke``.

    Returns:
        The formatted answer followed by a "Time elapsed: N s" footer.
    """
    started_at = time.time()
    answer_text = process_llm_response(qa_chain.invoke(query))
    finished_at = time.time()

    # Whole seconds, rounded to the nearest integer.
    elapsed_seconds = int(round(finished_at - started_at, 0))
    return answer_text + f'\n\nTime elapsed: {elapsed_seconds} s'

In [None]:
# End-to-end check of the retrieval QA pipeline with a sample user message.
llm_ans("i feel sad.")