# Part3: Basic QA-QC

This notebook shows one way to compare the downloaded JSONL files with the generated .hdf5 files, and to compute some metrics on the training files before uploading them to SambaStudio.
The steps shown in this notebook are optional and can be skipped.

In [None]:
import os
import sys
import glob
import json
import h5py
from transformers import AutoTokenizer, AutoModelForCausalLM

# Resolve the kit and repository roots relative to the notebook's working directory
current_dir = os.getcwd()
kit_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
repo_dir = os.path.abspath(os.path.join(kit_dir, os.pardir))

# Put both roots on the module search path
sys.path.extend([kit_dir, repo_dir])

# Tokenizer to use; change the model id here to load a different one
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

## Fine-tuning nstext2sql dataset:


### Compare jsonl and hdf5 entries

In [None]:
# Dataset output folder: jsonl splits are globbed from `splits/`, hdf5 files from the folder itself
data_dir = os.path.join(kit_dir, "data", "output", "fine-tune-nstext2sql")

sql_files = sorted(glob.glob(f'{data_dir}/splits/*.jsonl'))
hdf_files = sorted(glob.glob(f'{data_dir}/*.hdf5'))

# Number of leading rows printed side by side
rows_to_compare = 5

# Reset on every run so a missing dataset can never show rows left over from another cell
sql_data = []
hdf_data = []

if sql_files and hdf_files:
    # Only the first (jsonl, hdf5) pair in sorted order is inspected
    sql_file, hdf_file = sql_files[0], hdf_files[0]

    # Read sql file (blank lines are skipped so they do not break json.loads)
    with open(sql_file) as f:
        sql_data = [json.loads(line) for line in f if line.strip()]

    # Read hdf file; the context manager closes the handle once the rows are decoded
    with h5py.File(hdf_file, 'r') as f:
        input_ids = f['input_ids']
        # Decode only the rows that will be printed instead of the whole file
        for i in range(min(rows_to_compare, input_ids.shape[0])):
            hdf_data.append(tokenizer.decode(input_ids[i]))
else:
    print(f"No jsonl/hdf5 file pair found under {data_dir}")

for i, (d_sql, d_hdf) in enumerate(zip(sql_data[:rows_to_compare], hdf_data[:rows_to_compare])):
    print(f"{i}")
    print("jsonl row:")
    print(f"Prompt:\n{d_sql['prompt']}\nCompletion:\n{d_sql['completion']}\n")
    print('----------------------------------------------------------------')
    print("hdf5:\n")
    print(f"{d_hdf}")
    print('----------------------------------------------------------------\n\n')

### Calculate metrics from hdf files

In [None]:
# Count the sequences stored across all hdf5 files and how many times the
# '</s>' marker appears in their decoded text
total_sequences = 0
occurrences = 0  # single, consistently spelled counter (the mismatched names raised NameError)
for hdf_file in hdf_files:
    # Context manager closes each file as soon as it has been scanned
    with h5py.File(hdf_file, 'r') as f:
        input_ids = f['input_ids']
        n_sequences = input_ids.shape[0]
        total_sequences += n_sequences
        for i in range(n_sequences):
            text = tokenizer.decode(input_ids[i])
            occurrences += text.count('</s>')
print(f'total_sequences: {total_sequences}')
print(f'occurrences: {occurrences}')

## Pretrain squad-smol-sql dataset

### Compare jsonl and hdf5 entries

In [None]:
# Dataset output folder: jsonl splits are globbed from `splits/`, hdf5 files from the folder itself
data_dir = os.path.join(kit_dir, "data", "output", "pretrain-squad-smol-sql")

sql_files = sorted(glob.glob(f'{data_dir}/splits/*.jsonl'))
hdf_files = sorted(glob.glob(f'{data_dir}/*.hdf5'))

# Number of leading rows printed side by side
rows_to_compare = 5

# Reset on every run so a missing dataset can never show rows left over from another cell
sql_data = []
hdf_data = []

if sql_files and hdf_files:
    # Only the first (jsonl, hdf5) pair in sorted order is inspected
    sql_file, hdf_file = sql_files[0], hdf_files[0]

    # Read sql file (blank lines are skipped so they do not break json.loads)
    with open(sql_file) as f:
        sql_data = [json.loads(line) for line in f if line.strip()]

    # Read hdf file; the context manager closes the handle once the rows are decoded
    with h5py.File(hdf_file, 'r') as f:
        input_ids = f['input_ids']
        # Decode only the rows that will be printed instead of the whole file
        for i in range(min(rows_to_compare, input_ids.shape[0])):
            hdf_data.append(tokenizer.decode(input_ids[i]))
else:
    print(f"No jsonl/hdf5 file pair found under {data_dir}")

for i, (d_sql, d_hdf) in enumerate(zip(sql_data[:rows_to_compare], hdf_data[:rows_to_compare])):
    print(f"{i}")
    print("jsonl row:")
    print(f"Prompt:\n{d_sql['prompt']}\nCompletion:\n{d_sql['completion']}\n")
    print('----------------------------------------------------------------')
    print("hdf5:\n")
    print(f"{d_hdf}")
    print('----------------------------------------------------------------\n\n')

### Calculate metrics from hdf files

In [None]:
# Count the sequences stored across all hdf5 files and how many times the
# '</s>' marker appears in their decoded text
total_sequences = 0
occurrences = 0  # single, consistently spelled counter (the mismatched names raised NameError)
for hdf_file in hdf_files:
    # Context manager closes each file as soon as it has been scanned
    with h5py.File(hdf_file, 'r') as f:
        input_ids = f['input_ids']
        n_sequences = input_ids.shape[0]
        total_sequences += n_sequences
        for i in range(n_sequences):
            text = tokenizer.decode(input_ids[i])
            occurrences += text.count('</s>')
print(f'total_sequences: {total_sequences}')
print(f'occurrences: {occurrences}')

## Pretrain stack dedup dataset

### Compare jsonl and hdf5 entries

In [None]:
# Dataset output folder: jsonl splits are globbed from `splits/`, hdf5 files from the folder itself
data_dir = os.path.join(kit_dir, "data", "output", "pretrain-the-stack-dedup")

sql_files = sorted(glob.glob(f'{data_dir}/splits/*.jsonl'))
hdf_files = sorted(glob.glob(f'{data_dir}/*.hdf5'))

# Number of leading rows printed side by side
rows_to_compare = 5

# Reset on every run so a missing dataset can never show rows left over from another cell
sql_data = []
hdf_data = []

if sql_files and hdf_files:
    # Only the first (jsonl, hdf5) pair in sorted order is inspected
    sql_file, hdf_file = sql_files[0], hdf_files[0]

    # Read sql file (blank lines are skipped so they do not break json.loads)
    with open(sql_file) as f:
        sql_data = [json.loads(line) for line in f if line.strip()]

    # Read hdf file; the context manager closes the handle once the rows are decoded
    with h5py.File(hdf_file, 'r') as f:
        input_ids = f['input_ids']
        # Decode only the rows that will be printed instead of the whole file
        for i in range(min(rows_to_compare, input_ids.shape[0])):
            hdf_data.append(tokenizer.decode(input_ids[i]))
else:
    print(f"No jsonl/hdf5 file pair found under {data_dir}")

for i, (d_sql, d_hdf) in enumerate(zip(sql_data[:rows_to_compare], hdf_data[:rows_to_compare])):
    print(f"{i}")
    print("jsonl row:")
    print(f"Prompt:\n{d_sql['prompt']}\nCompletion:\n{d_sql['completion']}\n")
    print('----------------------------------------------------------------')
    print("hdf5:\n")
    print(f"{d_hdf}")
    print('----------------------------------------------------------------\n\n')

### Calculate metrics from hdf files

In [None]:
# Count the sequences stored across all hdf5 files and how many times the
# '</s>' marker appears in their decoded text
total_sequences = 0
occurrences = 0  # single, consistently spelled counter (the mismatched names raised NameError)
for hdf_file in hdf_files:
    # Context manager closes each file as soon as it has been scanned
    with h5py.File(hdf_file, 'r') as f:
        input_ids = f['input_ids']
        n_sequences = input_ids.shape[0]
        total_sequences += n_sequences
        for i in range(n_sequences):
            text = tokenizer.decode(input_ids[i])
            occurrences += text.count('</s>')
print(f'total_sequences: {total_sequences}')
print(f'occurrences: {occurrences}')