# **Finding the 8 most SIMILAR training samples per test entry & creating 150 txt files**

In [None]:
!pip install sentence_transformers


Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence_transform

In [None]:
import json
from tqdm import tqdm
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import euclidean_distances

# Load the sentence-embedding model once at module level. find_similar_entries
# and create_samples read this global `model` rather than taking it as an
# argument, so this cell must run before either function is called.
# The checkpoint files are downloaded from the Hugging Face Hub on first use.
model = SentenceTransformer("pritamdeka/PubMedBERT-mnli-snli-scinli-scitail-mednli-stsb")


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.46k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/706k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Function to find most similar entries
def find_similar_entries(test_entry, train_encodings, train_data, k=8):
    """Return the ``k`` training entries whose embeddings are closest to ``test_entry``.

    The text under ``test_entry['inputs']`` is embedded with the module-level
    ``model`` and compared to every row of ``train_encodings`` by Euclidean
    distance.

    Args:
        test_entry: Mapping with an ``'inputs'`` key holding the text to embed.
        train_encodings: Array with one embedding per row; row ``i`` is
            expected to correspond to ``train_data[i]``.
        train_data: Sequence of training entries, index-aligned with
            ``train_encodings``.
        k: Maximum number of neighbours to return.

    Returns:
        A list of at most ``k`` items of ``train_data``, nearest first. Entries
        at equal distance keep their original relative order. When fewer than
        ``k`` encodings are available each entry is returned once (the result
        is never padded with duplicates).
    """
    test_encoding = model.encode(test_entry['inputs'], convert_to_tensor=True).detach().cpu().numpy()
    distances = euclidean_distances(test_encoding.reshape(1, -1), train_encodings)[0]

    # Clamp k. The previous repeated-argmin loop masked each pick with inf, so
    # once every distance was inf it kept returning index 0 and duplicated the
    # first training entry whenever k exceeded the number of rows.
    top_k = max(0, min(k, distances.shape[0]))

    # A stable sort keeps the "lowest index wins" tie-breaking of np.argmin.
    nearest = np.argsort(distances, kind='stable')[:top_k]
    return [train_data[int(i)] for i in nearest]

In [None]:
import os
def create_samples(test_data, train_data, output_dir, num_samples=8):
    """Write one JSON-lines file per test entry with its most similar training entries.

    For the i-th test entry (1-based) the file
    ``<output_dir>/similar_samples_<i>.txt`` is written. Its first line is
    ``{"type": "TEST_ENTRY", "data": <test entry>}`` and every following line
    is ``{"type": "TRAIN_ENTRY", "data": <train entry>}`` in the order returned
    by ``find_similar_entries``.

    Args:
        test_data: Iterable of test entries; each is passed to
            ``find_similar_entries`` and must be JSON-serialisable.
        train_data: Sequence of training entries, each with an ``'inputs'``
            text that is embedded with the module-level ``model``.
        output_dir: Directory to write into; created if it does not exist.
        num_samples: Number of similar training entries requested per test entry.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_dir, exist_ok=True)

    # Embed the training texts once; every test entry is compared against them.
    train_texts = [entry['inputs'] for entry in train_data]
    train_encodings = model.encode(train_texts, convert_to_tensor=True).detach().cpu().numpy()

    for file_no, test_entry in enumerate(test_data, start=1):
        similar_entries = find_similar_entries(test_entry, train_encodings, train_data, num_samples)

        # Test entry first, then its neighbours; one compact JSON object per line.
        records = [{"type": "TEST_ENTRY", "data": test_entry}]
        records.extend({"type": "TRAIN_ENTRY", "data": entry} for entry in similar_entries)

        output_file = os.path.join(output_dir, f'similar_samples_{file_no}.txt')
        with open(output_file, 'w', encoding='utf-8') as file:
            file.writelines(json.dumps(record) + '\n' for record in records)
        print(f"Similar samples for test + train entry {file_no} have been saved to '{output_file}'")

def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path to a UTF-8 text file with one JSON document per line.

    Returns:
        A list with the parsed value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Skip whitespace-only lines (e.g. a trailing blank line) instead of
        # letting json.loads fail on them.
        return [json.loads(line) for line in f if line.strip()]

# Define file paths

# Directory (relative to the current working directory) that receives one
# similar_samples_<i>.txt file per test entry.
output_dir = '8_output_samples'

# NOTE(review): absolute, machine-specific data location -- adjust to wherever
# train.jsonl / test.jsonl live in your environment.
folder_name = '/data/chq'
train_file_name = 'train.jsonl'
test_file_name = 'test.jsonl'

# Join the folder and file names to create a full path
train_file = os.path.join(folder_name, train_file_name)
test_file = os.path.join(folder_name, test_file_name)

# Driver: load both splits, then write the similar-sample files.
if __name__ == "__main__":
    # Load train and test data
    train_data = load_jsonl(train_file)
    test_data = load_jsonl(test_file)

    # Create and save similar samples
    # (num_samples is left at its default of 8 neighbours per test entry)
    create_samples(test_data, train_data, output_dir)
    print(f"Similar samples have been saved to '{output_dir}'")


Similar samples for test + train entry 1 have been saved to '8_output_samples/similar_samples_1.txt'
Similar samples for test + train entry 2 have been saved to '8_output_samples/similar_samples_2.txt'
Similar samples for test + train entry 3 have been saved to '8_output_samples/similar_samples_3.txt'
Similar samples for test + train entry 4 have been saved to '8_output_samples/similar_samples_4.txt'
Similar samples for test + train entry 5 have been saved to '8_output_samples/similar_samples_5.txt'
Similar samples for test + train entry 6 have been saved to '8_output_samples/similar_samples_6.txt'
Similar samples for test + train entry 7 have been saved to '8_output_samples/similar_samples_7.txt'
Similar samples for test + train entry 8 have been saved to '8_output_samples/similar_samples_8.txt'
Similar samples for test + train entry 9 have been saved to '8_output_samples/similar_samples_9.txt'
Similar samples for test + train entry 10 have been saved to '8_output_samples/similar_samp

In [None]:
import shutil
from google.colab import files

# Folder to zip: the same relative directory the similar-sample files were
# written to, so this cell works from whatever working directory produced them
# instead of assuming it is /content.
folder_to_zip = '8_output_samples'
# Name of the zip file
zip_filename = '8_output_similar_samples.zip'

# Create a zip file. make_archive appends '.zip' itself, so it is given the
# name without the extension; it returns the path of the archive it wrote.
archive_path = shutil.make_archive(zip_filename.replace('.zip', ''), 'zip', folder_to_zip)

# Download exactly the archive that was just created
files.download(archive_path)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## **Creating a prompt txt file from each of the 150 SIMILAR sample files**

In [None]:
def process_and_save_samples(input_dir, output_dir, num_files=150):
    """Turn each similar-samples file into a few-shot summarisation prompt.

    For ``i`` in ``1..num_files`` the file ``<input_dir>/similar_samples_<i>.txt``
    is read as JSON lines. The first record is the test entry and every later
    record is a training example; each record exposes its text under
    ``['data']['inputs']`` and training examples also carry ``['data']['target']``.
    The resulting prompt (instruction line, the numbered examples, then the
    test query with an open "summarized question" slot) is written to
    ``<output_dir>/<i>.txt``.

    Args:
        input_dir: Directory containing the ``similar_samples_<i>.txt`` files.
        output_dir: Directory for the prompt files; created if missing.
        num_files: How many consecutively numbered input files to process.
            Defaults to 150, the count that used to be hard-coded.

    Raises:
        FileNotFoundError: If one of the expected input files does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)

    for file_no in range(1, num_files + 1):
        input_file = os.path.join(input_dir, f'similar_samples_{file_no}.txt')
        output_file = os.path.join(output_dir, f'{file_no}.txt')

        with open(input_file, 'r', encoding='utf-8') as file:
            data = [json.loads(line) for line in file]

        # Collect the prompt pieces and join once instead of growing a string.
        parts = ['summarize the patient health query into one question of 15 words or less, using the provided examples to guide word choice \n\n']
        for shot_no, entry in enumerate(data[1:], start=1):
            # NOTE(review): newlines are removed without inserting a space, so
            # words separated only by a line break get glued together -- confirm
            # this is intended before changing it.
            input_text = entry['data']['inputs'].strip().replace('\n', '')
            target = entry['data']['target'].strip().replace('\n', '')
            parts.append(f'query {shot_no}: {input_text}\n')
            parts.append(f'summarized question {shot_no}: {target}\n##\n')

        # NOTE(review): unlike the examples above, the test query is inserted
        # verbatim (not stripped, newlines kept) -- kept as-is to preserve the
        # generated prompts; confirm whether it should be normalised too.
        test_text = data[0]['data']['inputs']
        # The test query is numbered len(data): one past the last example.
        query_no = len(data)
        parts.append(f'query {query_no}: {test_text}\n')
        parts.append(f'summarized question {query_no}:')

        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(''.join(parts))
        print(f"Processed and saved '{output_file}'")

In [None]:
# Input: directory holding the similar_samples_<i>.txt files.
output_samples_dir = '8_output_samples'
# Output: directory that receives the per-entry prompt files (<i>.txt).
final_output_dir = 'final_8_output_samples'
if __name__ == "__main__":
    # Process and save the final output samples
    process_and_save_samples(output_samples_dir, final_output_dir)

Processed and saved 'final_8_output_samples/1.txt'
Processed and saved 'final_8_output_samples/2.txt'
Processed and saved 'final_8_output_samples/3.txt'
Processed and saved 'final_8_output_samples/4.txt'
Processed and saved 'final_8_output_samples/5.txt'
Processed and saved 'final_8_output_samples/6.txt'
Processed and saved 'final_8_output_samples/7.txt'
Processed and saved 'final_8_output_samples/8.txt'
Processed and saved 'final_8_output_samples/9.txt'
Processed and saved 'final_8_output_samples/10.txt'
Processed and saved 'final_8_output_samples/11.txt'
Processed and saved 'final_8_output_samples/12.txt'
Processed and saved 'final_8_output_samples/13.txt'
Processed and saved 'final_8_output_samples/14.txt'
Processed and saved 'final_8_output_samples/15.txt'
Processed and saved 'final_8_output_samples/16.txt'
Processed and saved 'final_8_output_samples/17.txt'
Processed and saved 'final_8_output_samples/18.txt'
Processed and saved 'final_8_output_samples/19.txt'
Processed and saved '

In [None]:
import shutil
from google.colab import files

# Folder to zip: the same relative directory the prompt files were written to,
# so this cell works from whatever working directory produced them instead of
# assuming it is /content.
folder_to_zip = 'final_8_output_samples'
# Name of the zip file
zip_filename = 'final_8_each_similar_samples.zip'

# Create a zip file. make_archive appends '.zip' itself, so it is given the
# name without the extension; it returns the path of the archive it wrote.
archive_path = shutil.make_archive(zip_filename.replace('.zip', ''), 'zip', folder_to_zip)

# Download exactly the archive that was just created
files.download(archive_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>