# **Creating 4 RANDOM train samples per test entry & building the 150 txt files** #

In [None]:
!pip install sentence_transformers




In [None]:
import json
from tqdm import tqdm
import numpy as np
from sentence_transformers import SentenceTransformer
#from sklearn.metrics.pairwise import euclidean_distances
import random

# Load the model.
# Instantiated once at module level so later cells can reuse the same `model`
# object instead of constructing it again.
# NOTE(review): presumably sentence_transformers resolves/downloads this model
# name on first use — confirm network access is available when running.
model = SentenceTransformer("pritamdeka/PubMedBERT-mnli-snli-scinli-scitail-mednli-stsb")


In [None]:
import os
def create_samples(test_data, train_data, output_dir, num_samples=4):
    """Write one JSON Lines file per test entry with randomly drawn train entries.

    For the i-th test entry (1-based) a file ``random_samples_<i>.txt`` is
    written to ``output_dir``. Its first line is
    ``{"type": "TEST_ENTRY", "data": <test entry>}`` and every following line
    is ``{"type": "TRAIN_ENTRY", "data": <train entry>}`` for one sampled
    train entry.

    Args:
        test_data: Iterable of JSON-serialisable test entries.
        train_data: Sequence of JSON-serialisable train entries to sample from.
        output_dir: Directory receiving the files; created if it is missing.
        num_samples: How many train entries to draw (without replacement) per
            test entry. Capped at ``len(train_data)``.

    Note:
        Sampling uses the global ``random`` module state, so call
        ``random.seed(...)`` beforehand if reproducible files are required.
        The selection is purely random; no sentence embeddings are needed,
        so the training set is no longer encoded here.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # Loop-invariant: never ask for more entries than the training set holds.
    sample_size = min(num_samples, len(train_data))

    for entry_no, test_entry in enumerate(test_data, start=1):
        random_samples = random.sample(train_data, sample_size)
        output_file = os.path.join(output_dir, f'random_samples_{entry_no}.txt')
        with open(output_file, 'w', encoding='utf-8') as file:
            # Test entry first, as a single-line JSON object.
            file.write(json.dumps({"type": "TEST_ENTRY", "data": test_entry}) + '\n')
            # Then each sampled train entry, one JSON object per line.
            for entry in random_samples:
                file.write(json.dumps({"type": "TRAIN_ENTRY", "data": entry}) + '\n')
        print(f"Random samples for test + train entry {entry_no} have been saved to '{output_file}'")

def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path to a UTF-8 encoded file with one JSON value per line.

    Returns:
        A list with the decoded value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Blank / whitespace-only lines (e.g. a trailing empty line) are not
        # records; skipping them avoids a JSONDecodeError on otherwise valid files.
        return [json.loads(line) for line in f if line.strip()]

# Directory that receives one sample file per test entry
output_dir = '4_output_samples'

# Dataset folder and the names of its two JSONL splits
folder_name = '/data/chq'
train_file_name = 'train.jsonl'
test_file_name = 'test.jsonl'

# Resolve both split names against the dataset folder in one step
train_file, test_file = (
    os.path.join(folder_name, split_name)
    for split_name in (train_file_name, test_file_name)
)


if __name__ == "__main__":
    # Read the train split first, then the test split
    train_data, test_data = load_jsonl(train_file), load_jsonl(test_file)

    # Draw the random samples and write them to disk
    create_samples(test_data, train_data, output_dir)
    print(f"Random samples have been saved to '{output_dir}'")


Random samples for test + train entry 1 have been saved to '4_output_samples/random_samples_1.txt'
Random samples for test + train entry 2 have been saved to '4_output_samples/random_samples_2.txt'
Random samples for test + train entry 3 have been saved to '4_output_samples/random_samples_3.txt'
Random samples for test + train entry 4 have been saved to '4_output_samples/random_samples_4.txt'
Random samples for test + train entry 5 have been saved to '4_output_samples/random_samples_5.txt'
Random samples for test + train entry 6 have been saved to '4_output_samples/random_samples_6.txt'
Random samples for test + train entry 7 have been saved to '4_output_samples/random_samples_7.txt'
Random samples for test + train entry 8 have been saved to '4_output_samples/random_samples_8.txt'
Random samples for test + train entry 9 have been saved to '4_output_samples/random_samples_9.txt'
Random samples for test + train entry 10 have been saved to '4_output_samples/random_samples_10.txt'
Random s

In [None]:
import shutil
from google.colab import files

# Archive file name and the folder whose contents go into it
zip_filename = '4_output_random_samples.zip'
folder_to_zip = '/content/4_output_samples'

# make_archive appends the '.zip' extension itself, so pass the bare base name
shutil.make_archive(
    base_name=zip_filename.replace('.zip', ''),
    format='zip',
    root_dir=folder_to_zip,
)

# Hand the finished archive to the browser for download
files.download(zip_filename)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Creating a prompt txt file for each of the 150 RANDOM sample files**

In [None]:
def process_and_save_samples(input_dir, output_dir, num_files=150):
    """Turn each sample file into a few-shot prompt text file.

    Reads ``random_samples_<i>.txt`` for ``i`` in ``1..num_files`` from
    ``input_dir``. Each file is JSON Lines: the first record holds the test
    entry, the remaining records are the example entries. Every record is
    expected to look like ``{"data": {"inputs": ..., "target": ...}}`` (the
    test entry only needs ``inputs``). The prompt written to
    ``<output_dir>/<i>.txt`` consists of the instruction header, one
    ``query k`` / ``summarized question k`` pair per example separated by
    ``##``, and finally the test query with an open ``summarized question``
    slot.

    Args:
        input_dir: Directory containing the ``random_samples_<i>.txt`` files.
        output_dir: Directory receiving the prompt files; created if missing.
        num_files: Number of consecutively numbered input files to process.
            Defaults to 150, the previously hard-coded count.

    Raises:
        FileNotFoundError: If one of the expected input files is missing.
        ValueError: If an input file contains no records.
    """
    def _one_line(text):
        # Same normalisation for examples and the test query: trim the ends
        # and drop embedded newlines so each query stays on a single line.
        return text.strip().replace('\n', '')

    os.makedirs(output_dir, exist_ok=True)

    for file_no in range(1, num_files + 1):
        input_file = os.path.join(input_dir, f'random_samples_{file_no}.txt')
        output_file = os.path.join(output_dir, f'{file_no}.txt')

        with open(input_file, 'r', encoding='utf-8') as file:
            # Ignore blank lines so a stray empty line does not abort the run.
            data = [json.loads(line) for line in file if line.strip()]
        if not data:
            raise ValueError(f"'{input_file}' contains no entries")

        test_entry, examples = data[0], data[1:]

        parts = ['''summarize the patient health query into one question of 15 words or less, using the provided examples to guide word choice \n\n''']
        for example_no, entry in enumerate(examples, start=1):
            input_text = _one_line(entry['data']['inputs'])
            target = _one_line(entry['data']['target'])
            parts.append(f'query {example_no}: {input_text}\n' + f'summarized question {example_no}: {target}\n##\n')

        # The test query is numbered right after the examples and is
        # normalised like them; previously it was written verbatim, so
        # embedded newlines broke the one-line-per-query layout.
        query_no = len(data)
        input_text = _one_line(test_entry['data']['inputs'])
        parts.append(f'query {query_no}: {input_text}\n' + f'summarized question {query_no}:')

        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(''.join(parts))
        print(f"Processed and saved '{output_file}'")

In [None]:
# Input: the per-test-entry sample files; output: the finished prompt files
output_samples_dir = '4_output_samples'
final_output_dir = 'final_4_output_samples'
if __name__ == "__main__":
    # Build and write the final prompt file for every sample file
    process_and_save_samples(
        input_dir=output_samples_dir,
        output_dir=final_output_dir,
    )

Processed and saved 'final_4_output_samples/1.txt'
Processed and saved 'final_4_output_samples/2.txt'
Processed and saved 'final_4_output_samples/3.txt'
Processed and saved 'final_4_output_samples/4.txt'
Processed and saved 'final_4_output_samples/5.txt'
Processed and saved 'final_4_output_samples/6.txt'
Processed and saved 'final_4_output_samples/7.txt'
Processed and saved 'final_4_output_samples/8.txt'
Processed and saved 'final_4_output_samples/9.txt'
Processed and saved 'final_4_output_samples/10.txt'
Processed and saved 'final_4_output_samples/11.txt'
Processed and saved 'final_4_output_samples/12.txt'
Processed and saved 'final_4_output_samples/13.txt'
Processed and saved 'final_4_output_samples/14.txt'
Processed and saved 'final_4_output_samples/15.txt'
Processed and saved 'final_4_output_samples/16.txt'
Processed and saved 'final_4_output_samples/17.txt'
Processed and saved 'final_4_output_samples/18.txt'
Processed and saved 'final_4_output_samples/19.txt'
Processed and saved '

In [None]:
import shutil
from google.colab import files

# Archive file name and the folder whose contents go into it
zip_filename = 'final_4_each_random_samples.zip'
folder_to_zip = '/content/final_4_output_samples'

# make_archive appends the '.zip' extension itself, so pass the bare base name
shutil.make_archive(
    base_name=zip_filename.replace('.zip', ''),
    format='zip',
    root_dir=folder_to_zip,
)

# Hand the finished archive to the browser for download
files.download(zip_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>