In [1]:
import csv
import numpy as np
import os

In [2]:
def load_csv_to_list(file_path):
    """Read a CSV file and return every cell as one flat list of strings.

    Cells come back in reading order (row by row, left to right); the row
    structure of the file is discarded.
    """
    flattened = []
    with open(file_path, mode='r', newline='') as handle:
        for record in csv.reader(handle):
            flattened.extend(record)
    return flattened

def list_to_csv(string_list, csv_file_path):
    """Write each element of ``string_list`` as its own single-column CSV row.

    The file at ``csv_file_path`` is opened in write mode, so any existing
    content is replaced.
    """
    with open(csv_file_path, mode='w', newline='') as out_file:
        # One-element rows: every string ends up on a line of its own.
        csv.writer(out_file).writerows([entry] for entry in string_list)

In [3]:
def split_names_to_csv(names, output_prefix, chunk_size=10):
    """Write ``names`` into consecutive CSV files of at most ``chunk_size`` rows.

    Files are named ``query_list_0.csv``, ``query_list_1.csv``, ... and are
    placed inside ``output_prefix``, which is used as a directory path and is
    not created here. Each name occupies one single-column row.
    """
    chunk_starts = range(0, len(names), chunk_size)
    for chunk_index, start in enumerate(chunk_starts):
        batch = names[start:start + chunk_size]
        target = f'{output_prefix}/query_list_{chunk_index}.csv'
        with open(target, 'w', newline='') as out_file:
            out = csv.writer(out_file)
            for name in batch:
                out.writerow([name])


In [4]:
def make_query_lists(path_to_queries, path_to_coords, chunk_size, output_prefix):
    """Split the query names into length-sorted CSV chunks under ``output_prefix``.

    Args:
        path_to_queries: CSV file whose cells are the query names.
        path_to_coords: ``.npz`` archive holding one array per query name; the
            first dimension of each array is used as that query's length.
        chunk_size: maximum number of names per output CSV (must be >= 1).
        output_prefix: directory to create and fill with ``query_list_<k>.csv``
            files. If it already exists, nothing is done and a message is
            printed.

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
        KeyError: if a query name is missing from the coordinate archive.
    """
    if os.path.exists(output_prefix):
        print("already directory there")
        return

    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size!r}")

    # Load and validate every input BEFORE creating the output directory.
    # Otherwise a failure here would leave an empty directory behind and every
    # later call would take the "already exists" branch and silently do nothing.
    all_queries = load_csv_to_list(path_to_queries)

    # The context manager closes the archive's underlying file handle.
    with np.load(path_to_coords) as coords:
        key_shape_pairs = [(key, coords[key].shape[0]) for key in all_queries]

    # Sort the database by length for better batching (stable for equal lengths)
    sorted_keys = sorted(key_shape_pairs, key=lambda pair: pair[1])
    sorted_names = [key for key, _length in sorted_keys]

    # write csvs
    os.makedirs(output_prefix)
    split_names_to_csv(sorted_names, output_prefix, chunk_size)
    
    

In [6]:
# Make the test queries for search: the names that appear in both
# queryProts.txt and test.csv.
data_path = "/cluster/tufts/pettilab/shared/structure_comparison_data"
all_test = load_csv_to_list(f"{data_path}/train_test_val/test.csv")
all_query = load_csv_to_list(f"{data_path}/train_test_val/queryProts.txt")
# Sort the intersection: iteration order of a set of strings can change from
# one interpreter run to the next, which would make test_queries.csv differ
# between otherwise identical runs of this notebook.
test_query = sorted(set(all_test).intersection(all_query))
print(len(test_query))

list_to_csv(test_query, f"{data_path}/train_test_val/test_queries.csv")

1442


In [9]:
# Chunk the test queries into CSV files of 10 names each.
make_query_lists(
    path_to_queries=f"{data_path}/train_test_val/test_queries.csv",
    path_to_coords=f"{data_path}/alphabets_blosum_coordinates/allCACoord.npz",
    chunk_size=10,
    output_prefix=f"{data_path}/test_queries_by_10",
)