<a href="https://colab.research.google.com/github/polyexplorer/open-llm/blob/main/Calance_Otsuka_ProtocolScoring_Instruction_FineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
#@markdown # Dependencies
#@markdown - pymupdf
! pip install pymupdf

Collecting pymupdf
  Downloading PyMuPDF-1.23.6-cp310-none-manylinux2014_x86_64.whl (4.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/4.3 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.23.6 (from pymupdf)
  Downloading PyMuPDFb-1.23.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.6/30.6 MB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.23.6 pymupdf-1.23.6


In [10]:
#@markdown Compile Training Data from PDFs List.

import os

training_data_path = "/content/drive/MyDrive/protocol_scoring/ift"

pdfs = [os.path.join(training_data_path, x) for x in os.listdir(training_data_path) if x.endswith('.pdf')]
csvs = [os.path.join(training_data_path, x) for x in os.listdir(training_data_path) if x.endswith('.csv')]
excels = [os.path.join(training_data_path, x) for x in os.listdir(training_data_path) if x.endswith('.xlsx')]


In [5]:
csvs

['/content/drive/MyDrive/protocol_scoring/ift/031-201-00559_Protocol_Approved.csv',
 '/content/drive/MyDrive/protocol_scoring/ift/031-201-00301 Protocol Amendment 2_20Jul2020 - 031-201-00301 Protocol Amendment 2_20Jul2020.csv',
 '/content/drive/MyDrive/protocol_scoring/ift/31-14-204_Protocol_Amendment_1_30Apr2018.csv']

In [17]:
#@markdown Safely read CSV and Excel files with robust encoding error handling
import pandas as pd
import io



def safe_read(file, file_type):

    common_encodings = ['utf-8', 'windows-1252', 'iso-8859-1']  # Add more if necessary

    if file_type == 'csv':
        for encoding in common_encodings:
            try:
                with open(file, 'rb') as f:
                    content = f.read()
                decoded_content = content.decode(encoding, errors='replace')
                return pd.read_csv(io.StringIO(decoded_content))
            except UnicodeDecodeError:
                pass  # Try the next encoding
            except Exception as e:
                print(f"Error reading {file} with encoding {encoding}: {e}")

    elif file_type == 'excel':
        try:
            return pd.read_excel(file)
        except Exception as e:
            print(f"Error reading {file}: {e}")

qa_data = pd.concat([safe_read(x, 'csv') for x in csvs] + [safe_read(x, 'excel') for x in excels])


In [23]:
#@markdown Read text from PDF
import fitz

def extract_pdf_text(pdf_path):
    try:
        with fitz.open(pdf_path) as doc:
            text = ""
            for page in doc:
                text += page.get_text()
            return text
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {e}")
        return ""


# Modified to use a dictionary
def process_file(file, file_type):
    if file_type in ['csv', 'excel']:
        df = safe_read(file, file_type)
        pdf_path = file.rsplit('.', 1)[0] + '.pdf'
        pdf_text = extract_pdf_text(pdf_path) if os.path.exists(pdf_path) else None
        return {file: {'dataframe': df, 'pdf_text': pdf_text}}
    else:
        return None

# Using a dictionary to maintain the relationship
file_data = {}
for file in csvs:
    file_data.update(process_file(file, 'csv'))
for file in excels:
    file_data.update(process_file(file, 'excel'))


In [25]:
file_data.keys()

dict_keys(['/content/drive/MyDrive/protocol_scoring/ift/031-201-00559_Protocol_Approved.csv', '/content/drive/MyDrive/protocol_scoring/ift/031-201-00301 Protocol Amendment 2_20Jul2020 - 031-201-00301 Protocol Amendment 2_20Jul2020.csv', '/content/drive/MyDrive/protocol_scoring/ift/31-14-204_Protocol_Amendment_1_30Apr2018.csv', '/content/drive/MyDrive/protocol_scoring/ift/031-201-00469_Protocol_22Feb2021.xlsx'])

In [41]:
#@markdown ### Instruction Template


# General Instruction Fine-Tune Format
def create_instruction(instruction, input, response):
  return f"""### Instruction: {instruction}
### INPUT:
{input}
### Response:
{response}
"""

# Protocol PDF to Instruction Format
def create_protocol_instruction(pdf_text, question,answer, context):
  instruction = """Given the Context of a Protocol PDF and a Query, Find relevant information from the context, and formulate an answer . Also provide relevant text from the context that supports the answer.
   If you cannot find the answer, simply answer 'I don't know'. Do not try to come up with an answer."""
  input = f"""QUERY : {question}
-------------------
CONTEXT :
{pdf_text}
"""
  response = f"""
{answer}
---------------
SUPPORTING CONTEXT:
{context}
"""
  return create_instruction(instruction,input,response)


In [42]:
print(create_protocol_instruction("{pdf_text}","{question}","{answer}","{context}"))

### Instruction: Given the Context of a Protocol PDF and a Query, Find relevant information from the context, and formulate an answer . Also provide relevant text from the context that supports the answer.
   If you cannot find the answer, simply answer 'I don't know'. Do not try to come up with an answer.
### INPUT:
QUERY : {question}
-------------------
CONTEXT :
{pdf_text}

### Response:

{answer}
---------------
SUPPORTING CONTEXT:
{context}
 



Remove General Phrases from ChatPDF response:

```text
"To answer your question"
"Regarding your question"
"I apolopgize"
"I'm sorry(optionally followed by ', but')"
```



In [43]:
instruction_texts = []

for key, value in file_data.items():
  pdf_text = value['pdf_text']
  df = value['dataframe']
  for idx, row in df.iterrows():
    question = row['question']
    answer = row['answer']
    context = row['context']
    full_instruction_data = create_protocol_instruction(pdf_text, question, answer, context)
    instruction_texts.append(full_instruction_data)

In [56]:
#@markdown Approx Instruction Token Size

max_token_size = max([int(len(instruction_text.split(" ")) * 4/3) for instruction_text in instruction_texts])
min_token_size = min([int(len(instruction_text.split(" ")) * 4/3) for instruction_text in instruction_texts])
mean_token_size = sum([int(len(instruction_text.split(" ")) * 4/3) for instruction_text in instruction_texts])/len(instruction_texts)
with_context_instructions = [x for x in instruction_texts if (len(x.split(" ")) * 4/3) >1000]
relevant_mean_token_size = sum([int(len(instruction_text.split(" ")) * 4/3) for instruction_text in with_context_instructions])/len(with_context_instructions)

print(f"Min Token Size:{min_token_size}")
print(f"Max Token Size:{max_token_size}")
print(f"Avg Token Size:{mean_token_size}")
print(f"Actual Avg. Token Size:{relevant_mean_token_size}")

#@markdown Thus 64k context length model will be used.

Min Token Size:102
Max Token Size:39952
Avg Token Size:14132.10447761194
Actual Avg. Token Size:21564.618320610687


# 64k Context Length LLM

In [60]:
! pip install transformers optimum flash_attn
! pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/  # Use cu117 if on CUDA 11.7

Collecting flash_attn
  Downloading flash_attn-2.3.3.tar.gz (2.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting einops (from flash_attn)
  Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ninja (from flash_attn)
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.2/307.2 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: flash_attn
  Building wheel for flash_attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash_attn: filename=flash_attn-2.3.3-cp310-cp310-linux_x86_64.whl size=57075008 sha256=bcb63b64213ab61590b340b77de84e448a442e19c100480895194df39ad7673d


In [61]:
from transformers import AutoModelForCausalLM, AutoTokenizer,GPTQConfig, pipeline,TextStreamer
model_name_or_path = "TheBloke/Yarn-Mistral-7B-64k-GPTQ"
# To use a different branch, change revision
# For example: revision="main"
quantization_config_loading = GPTQConfig(bits=4, use_exllama = False)
model = AutoModelForCausalLM.from_pretrained(model_name_or_path,

                                          quantization_config=quantization_config_loading,
                                          device_map="cuda",
                                          trust_remote_code=True,
                                          revision="gptq-4bit-32g-actorder_True")

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)



You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute and has already quantized weights. However, loading attributes (e.g. use_exllama, exllama_config, use_cuda_fp16, max_input_length) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored.


model.safetensors:   0%|          | 0.00/4.57G [00:00<?, ?B/s]

(…)32g-actorder_True/generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

(…)-GPTQ/resolve/main/tokenizer_config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

(…)-7B-64k-GPTQ/resolve/main/tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

(…)PTQ/resolve/main/special_tokens_map.json:   0%|          | 0.00/145 [00:00<?, ?B/s]

In [62]:
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=1000,
    do_sample=True,
    temperature=0.1,
    top_k=40,
    top_p=0.95,
    repetition_penalty=1.15,
    streamer=streamer,
)

In [69]:
len(with_context_instructions[0].split(" "))

14373

In [70]:
pipe(with_context_instructions[0])[0]['generated_text']



OutOfMemoryError: ignored