In [None]:
import os
import re
import pandas as pd
import pickle
import os.path
import numpy as np
from scipy import sparse
import time
print('Packages loaded')

In [None]:
# Matches a standalone "MAF=<value>" field; the value runs up to the next ';'.
_maf_re = re.compile(r'\bMAF=([^;]+)')

def extract_maf_regex(info_str):
    """
    Use a regular expression to grab the MAF= value.

    Parameters
    ----------
    info_str : str
        Semicolon-delimited text such as ``"AC=1;MAF=0.25;NS=5"``. Because
        the pattern starts with a word-boundary anchor, keys that merely end
        in ``MAF`` after a word character (e.g. ``EAS_MAF=``) do not match.

    Returns
    -------
    float or None
        The parsed value, or None when the input is not a string (e.g. a
        missing value coming out of a DataFrame column), when there is no
        ``MAF=`` field, or when the field cannot be parsed as a float.
    """
    # Missing entries (None / NaN) are not strings; treat them like "no MAF field"
    # instead of letting the regex raise TypeError.
    if not isinstance(info_str, str):
        return None
    m = _maf_re.search(info_str)
    if not m:
        return None
    try:
        return float(m.group(1))
    except ValueError:
        return None

def classify_maf(maf):
    """
    Bucket a minor-allele frequency into a category label.

    Parameters
    ----------
    maf : float or None
        Minor-allele frequency.

    Returns
    -------
    str or None
        ``'rare'`` when ``maf < 0.001``, ``'common'`` when ``maf > 0.05``,
        ``'none'`` for anything in between (both cut-offs themselves map to
        ``'none'``), and None when the frequency is missing (None or NaN) so
        that a missing value is not mistaken for an intermediate frequency.
    """
    # NaN is the only float that is not equal to itself.
    if maf is None or maf != maf:
        return None
    if maf < 0.001:
        return 'rare'
    if maf > 0.05:
        return 'common'
    return 'none'

In [None]:
# Read the MAF table, label every row with its frequency category, then
# discard rows containing any missing value and renumber the index.
inPath = '/scratch/ml-csm/datasets/genomics/ref-genome/human/GRCh38/ensembl/variants/processed/'
df_maf = (
    pd.read_parquet(inPath + '1000GENOMES-release114-maf.parquet.gz')
    .assign(category=lambda frame: frame['maf'].apply(classify_maf))
    .dropna(ignore_index=True)
)
df_maf

## Predictions 150

In [3]:
# Directory holding the per-file predictions for the selected group.
# NOTE(review): the folder is called "pred200_merged" while this section is
# titled "Predictions 150" - confirm this is the intended input.
group = 1
inPath = f'/home/sdodl001/Desktop/DNA_Methylation_Scripts/cpg_util_scripts/data/kmeans/uncert_gve_direction/{group}/pred200_merged/'

# Alphabetical order keeps the file -> column assignment stable between runs
files = sorted(os.listdir(inPath))
len(files)

148

In [None]:
# Collect every distinct value of the 'name' column across all prediction files
unique_names = set()

# Track progress, and remember which files could not be read
total_files = len(files)
processed = 0
failed_files = []

# Files are read one at a time so only one file's names are in memory at once
for file in files:
    try:
        # Only load the 'name' column to minimize memory usage
        data = pd.read_parquet(inPath + file, columns=['name'])

        # Update the set with new unique values
        unique_names.update(data['name'])
    except Exception as e:
        # Best effort: keep going, but record the failure so an incomplete
        # result is reported below instead of passing unnoticed.
        failed_files.append(file)
        print(f"Error processing {file}: {str(e)}")
        continue

    # Update and display progress
    processed += 1
    if processed % 10 == 0 or processed == total_files:
        print(f"Processed {processed}/{total_files} files. Current unique names: {len(unique_names)}")

print(f"\nTotal unique names collected: {len(unique_names)}")
if failed_files:
    print(f"WARNING: {len(failed_files)} of {total_files} files could not be read, so unique_names is incomplete: {failed_files}")
# NOTE: this figure is the summed character count of the names expressed in MB,
# not the real footprint of the set (per-object and hash-table overhead are excluded).
print(f"Memory used by unique_names set: {sum(len(name) for name in unique_names) / (1024*1024):.2f} MB")


# Save the unique_names to a file

# Create outputs directory if it doesn't exist (this won't be tracked by Git).
# Notebooks do not define __file__, so the working directory is used as the base.
output_dir = os.path.join(os.getcwd(), 'outputs')
os.makedirs(output_dir, exist_ok=True)
print(f"Saving large files to {output_dir} (not tracked by Git)")

# Save as pickle (Python's binary format - preserves the set data structure)
output_path = os.path.join(output_dir, f'unique_names_group_{group}.pkl')
with open(output_path, 'wb') as f:
    pickle.dump(unique_names, f)
print(f"Saved unique names to: {output_path}")
print("WARNING: This file is large and should not be committed to Git.")

# # Optionally save as text file (one name per line)
# text_output_path = os.path.join(output_dir, f'unique_names_group_{group}.txt')
# with open(text_output_path, 'w') as f:
#     for name in sorted(unique_names):
#         f.write(f"{name}\n")
# print(f"Saved unique names to: {text_output_path}")

Processed 10/148 files. Current unique names: 3645640
Processed 20/148 files. Current unique names: 4195255
Processed 30/148 files. Current unique names: 4554064
Processed 40/148 files. Current unique names: 4695465
Processed 50/148 files. Current unique names: 4931860
Processed 60/148 files. Current unique names: 5099470
Processed 70/148 files. Current unique names: 5138015
Processed 80/148 files. Current unique names: 5149679
Processed 90/148 files. Current unique names: 5257657
Processed 100/148 files. Current unique names: 7958108
Processed 110/148 files. Current unique names: 8837202
Processed 120/148 files. Current unique names: 9015023
Processed 130/148 files. Current unique names: 9120615
Processed 140/148 files. Current unique names: 10246896
Processed 148/148 files. Current unique names: 10358075

Total unique names collected: 10358075
Memory used by unique_names set: 114.34 MB


['rs567865040', 'rs1426319946', 'rs927581744', 'rs1038629972', 'rs928388916']

# Loading Saved Unique Names

If you have previously saved the unique names to a file, you can load them instead of recreating the set. This is useful for:
- Separating the data collection and analysis steps
- Rerunning analyses with the same variant set
- Sharing variant sets between different analyses

In [None]:
# Reload a previously saved set of unique names instead of rebuilding it
group = 1

# The collection cell writes the pickle into ./outputs, so look there first;
# the bare file name in the working directory (the only location this cell
# used to check) is kept as a fallback.
unique_names_filename = f'unique_names_group_{group}.pkl'
candidate_paths = [
    os.path.join(os.getcwd(), 'outputs', unique_names_filename),
    unique_names_filename,
]
unique_names_file = next(
    (path for path in candidate_paths if os.path.isfile(path)),
    candidate_paths[0],
)

# Check if the file exists
if os.path.isfile(unique_names_file):
    # Load unique names from pickle file.
    # NOTE: unpickling can execute arbitrary code - only load files this notebook produced.
    with open(unique_names_file, 'rb') as f:
        unique_names = pickle.load(f)
    print(f"Loaded {len(unique_names)} unique names from {unique_names_file}")
    # Summed character count of the names in MB, not the true footprint of the set
    print(f"Memory used by unique_names set: {sum(len(name) for name in unique_names) / (1024*1024):.2f} MB")

    # Preview some unique names without copying the whole set into a list
    print("\nSample names:")
    print([name for _position, name in zip(range(5), unique_names)])
else:
    print(f"File {unique_names_file} not found. Please run the cell that creates and saves unique_names first.")
    print(f"Locations checked: {candidate_paths}")


# Alternatively, you can load from text file if you don't have the pickle file.
# (Kept as comments: a bare triple-quoted string at the end of a cell is echoed
# back as the cell's output.)
#
# unique_names_file = f'unique_names_group_{group}.txt'
#
# if os.path.isfile(unique_names_file):
#     # Load unique names from text file
#     unique_names = set()
#     with open(unique_names_file, 'r') as f:
#         for line in f:
#             unique_names.add(line.strip())
#
#     print(f"Loaded {len(unique_names)} unique names from {unique_names_file}")
# else:
#     print(f"File {unique_names_file} not found. Please run the cell that creates and saves unique_names first.")

# Creating a Binary Membership Matrix

Now we'll create a binary matrix where:
- Each row corresponds to a unique variant name
- Each column corresponds to a file
- Cell values are 1 (variant present in file) or 0 (variant absent)

This implementation uses:
1. A dictionary for fast lookups of variant indices
2. Sparse matrix construction for memory efficiency
3. File-by-file processing, so only one file's names are held in memory at a time

In [None]:
# Create a mapping of variant names to row indices.
# The names are sorted first: iterating a set of strings directly gives an order
# that can change from one kernel session to the next, which would make the row
# numbering (and every list derived from it) non-reproducible.
start_time = time.time()
name_to_idx = {name: idx for idx, name in enumerate(sorted(unique_names))}
print(f"Created name to index mapping in {time.time() - start_time:.2f} seconds")

# Coordinates of the non-zero cells; every stored value is 1, so the value
# array is created in one go after the loop.
row_indices = []
col_indices = []
failed_files = []

# Process files to build the membership matrix
start_time = time.time()
total_files = len(files)
processed = 0

for file_idx, file in enumerate(files):
    try:
        # Only load the 'name' column
        file_data = pd.read_parquet(inPath + file, columns=['name'])

        # set() keeps each name once per file, so no (row, column) pair repeats.
        # Names missing from the mapping are skipped (should not happen when
        # unique_names was built from these same files).
        file_rows = [
            name_to_idx[name]
            for name in set(file_data['name'])
            if name in name_to_idx
        ]
    except Exception as e:
        # Best effort: keep going, but record the file - its column stays all zero
        failed_files.append(file)
        print(f"Error processing {file} for matrix: {str(e)}")
        continue

    row_indices.extend(file_rows)
    col_indices.extend([file_idx] * len(file_rows))

    # Update progress
    processed += 1
    if processed % 10 == 0 or processed == total_files:
        print(f"Processed {processed}/{total_files} files for matrix construction")
        print(f"Current non-zero elements: {len(row_indices)}")

if failed_files:
    print(f"WARNING: {len(failed_files)} of {total_files} files could not be read; their columns are empty: {failed_files}")

# Create a sparse matrix in CSR format (efficient for row operations)
num_variants = len(unique_names)
num_files = len(files)

data_values = np.ones(len(row_indices), dtype=np.int64)
membership_matrix = sparse.csr_matrix(
    (data_values, (row_indices, col_indices)),
    shape=(num_variants, num_files)
)

nonzero_count = membership_matrix.count_nonzero()
total_cells = num_variants * num_files

print(f"\nMatrix shape: {membership_matrix.shape} (variants × files)")
print(f"Number of non-zero elements: {nonzero_count} (variant occurrences)")
if total_cells:  # avoid dividing by zero when there are no variants or no files
    print(f"Sparsity: {100 - 100 * nonzero_count / total_cells:.2f}%")
print(f"Memory usage: {membership_matrix.data.nbytes / 1024**2:.2f} MB (data)")
print(f"Total construction time: {time.time() - start_time:.2f} seconds")

# Analyzing the Membership Matrix

With the membership matrix constructed, you can perform various analyses:

In [None]:
# Example 1: Count variants per file (column sums)
file_variant_counts = membership_matrix.sum(axis=0).A1  # A1 converts to 1D array

# Show files with most and least variants
largest_file_idx = file_variant_counts.argmax()
smallest_file_idx = file_variant_counts.argmin()
print(f"Average variants per file: {file_variant_counts.mean():.2f}")
print(f"Max variants in a file: {file_variant_counts.max()} (file #{largest_file_idx}: {files[largest_file_idx]})")
print(f"Min variants in a file: {file_variant_counts.min()} (file #{smallest_file_idx}: {files[smallest_file_idx]})")

# Example 2: Count how many files each variant appears in (row sums)
variant_file_counts = membership_matrix.sum(axis=1).A1

# Distribution of variant occurrence: {number of files: number of variants}.
# np.unique tallies the array in one vectorized pass.
from collections import Counter
occurrence_values, occurrence_tallies = np.unique(variant_file_counts, return_counts=True)
occurrence_dist = Counter(dict(zip(occurrence_values.tolist(), occurrence_tallies.tolist())))

# Reverse lookup (row index -> name), built once and reused by every example below
idx_to_name = {idx: name for name, idx in name_to_idx.items()}

# Find variants that appear in exactly 1 file
min_file_threshold = 1
cell_specific_variants = np.where(variant_file_counts == min_file_threshold)[0]
print(f"\nNumber of variants appearing in exactly {min_file_threshold} file: {len(cell_specific_variants)}")

# Calculate percentage of these variants relative to all variants
percentage = 100 * len(cell_specific_variants) / len(variant_file_counts)
print(f"This represents {percentage:.2f}% of all variants")

cell_specific_variant_names150 = [idx_to_name[idx] for idx in cell_specific_variants]
print(f"Cell specific variants: {len(cell_specific_variant_names150)}")


# Print the first 10 entries of the distribution, ordered by ascending file count.
# The loop variable is deliberately not called num_variants, which would
# overwrite the total variant count defined by the matrix-construction cell.
print("\nVariant occurrence distribution:")
for count, variants_with_count in sorted(occurrence_dist.items())[:10]:
    print(f"{variants_with_count} variants appear in exactly {int(count)} files")

# Example 3: Find variants that appear in at least 50 files
min_file_threshold = 50
frequent_variants = np.where(variant_file_counts >= min_file_threshold)[0]
print(f"\nNumber of variants appearing in at least {min_file_threshold} files: {len(frequent_variants)}")
# Calculate percentage of these variants relative to all variants
percentage = 100 * len(frequent_variants) / len(variant_file_counts)
print(f"This represents {percentage:.2f}% of all variants")

# Example 4: Find variants that appear in at least 80 files
min_file_threshold = 80
cell_nonspecific_variants = np.where(variant_file_counts >= min_file_threshold)[0]
print(f"\nNumber of variants appearing in at least {min_file_threshold} files: {len(cell_nonspecific_variants)}")
# Calculate percentage of these variants relative to all variants
percentage = 100 * len(cell_nonspecific_variants) / len(variant_file_counts)
print(f"This represents {percentage:.2f}% of all variants")

cell_nonspecific_variant_names150 = [idx_to_name[idx] for idx in cell_nonspecific_variants]
print(f"Cell non-specific variants: {len(cell_nonspecific_variant_names150)}")

# Example 5: Find variants that appear in all files
universal_variants = np.where(variant_file_counts == len(files))[0]
print(f"\nNumber of variants appearing in all {len(files)} files: {len(universal_variants)}")

# If you need to get the names of specific variants (e.g., universal ones)
if len(universal_variants) > 0:
    universal_variant_names = [idx_to_name[idx] for idx in universal_variants[:5]]  # Show first 5
    print(f"Sample universal variants: {universal_variant_names}")

# Example 6: Save the matrix for future use (if needed)

# # Save the sparse matrix
# import pickle
# with open(f'variant_membership_matrix_group_{group}.pkl', 'wb') as f:
#     pickle.dump({
#         'matrix': membership_matrix,
#         'variant_names': list(name_to_idx.keys()), 
#         'file_names': files
#     }, f)


## Prediction 1

group = 1
thr = 0.10

# NOTE(review): formatting 0.10 inside an f-string yields "0.1", so this path
# resolves to ".../thr0.1/..." - confirm the directory is not named "thr0.10".
inPath = f'/scratch/ml-csm/projects/fgenom/gve/output/kmeans/pred1/aggr/thr{thr}/{group}/'

# os.listdir returns entries in arbitrary order; sorting keeps the
# file -> matrix column assignment stable between runs (the pred150 file
# list is sorted as well).
files = sorted(os.listdir(inPath))
print(len(files))

In [None]:

# Collect every distinct value of the 'name' column across all prediction files
unique_names = set()

# Track progress, and remember which files could not be read
total_files = len(files)
processed = 0
failed_files = []

# Files are read one at a time so only one file's names are in memory at once
for file in files:
    try:
        # Only load the 'name' column to minimize memory usage
        data = pd.read_parquet(inPath + file, columns=['name'])

        # Update the set with new unique values
        unique_names.update(data['name'])
    except Exception as e:
        # Best effort: keep going, but record the failure so an incomplete
        # result is reported below instead of passing unnoticed.
        failed_files.append(file)
        print(f"Error processing {file}: {str(e)}")
        continue

    # Update and display progress
    processed += 1
    if processed % 10 == 0 or processed == total_files:
        print(f"Processed {processed}/{total_files} files. Current unique names: {len(unique_names)}")

print(f"\nTotal unique names collected: {len(unique_names)}")
if failed_files:
    print(f"WARNING: {len(failed_files)} of {total_files} files could not be read, so unique_names is incomplete: {failed_files}")
# NOTE: this figure is the summed character count of the names expressed in MB,
# not the real footprint of the set (per-object and hash-table overhead are excluded).
print(f"Memory used by unique_names set: {sum(len(name) for name in unique_names) / (1024*1024):.2f} MB")


# Save the unique_names to a file

# Save as pickle (Python's binary format - preserves the set data structure).
# The file name is built once so the write and the log line cannot drift apart;
# it is written relative to the current working directory.
unique_names_output = f'unique_names_group_{group}_pred1.pkl'
with open(unique_names_output, 'wb') as f:
    pickle.dump(unique_names, f)
print(f"Saved unique names to: {unique_names_output}")

# # Optionally save as text file (one name per line)
# with open(f'unique_names_group_{group}.txt', 'w') as f:
#     for name in sorted(unique_names):
#         f.write(f"{name}\n")
# print(f"Saved unique names to: unique_names_group_{group}.txt")

In [None]:
# Reload a previously saved set of pred1 unique names instead of rebuilding it

# The collection cell writes the pickle relative to the working directory, so
# look there first; the absolute path this cell originally read from is kept
# as a fallback.
unique_names_filename = f'unique_names_group_{group}_pred1.pkl'
candidate_paths = [
    unique_names_filename,
    f'/home/sdodl001/UAVarPrior/uavarprior/interpret/{unique_names_filename}',
]
unique_names_file = next(
    (path for path in candidate_paths if os.path.isfile(path)),
    candidate_paths[0],
)

# Check if the file exists
if os.path.isfile(unique_names_file):
    # Load unique names from pickle file.
    # NOTE: unpickling can execute arbitrary code - only load files this notebook produced.
    with open(unique_names_file, 'rb') as f:
        unique_names = pickle.load(f)
    print(f"Loaded {len(unique_names)} unique names from {unique_names_file}")
    # Summed character count of the names in MB, not the true footprint of the set
    print(f"Memory used by unique_names set: {sum(len(name) for name in unique_names) / (1024*1024):.2f} MB")

    # Preview some unique names without copying the whole set into a list
    print("\nSample names:")
    print([name for _position, name in zip(range(5), unique_names)])
else:
    print(f"File {unique_names_file} not found. Please run the cell that creates and saves unique_names first.")
    print(f"Locations checked: {candidate_paths}")


# Alternatively, you can load from text file if you don't have the pickle file.
# (Kept as comments: a bare triple-quoted string at the end of a cell is echoed
# back as the cell's output.)
#
# unique_names_file = f'unique_names_group_{group}.txt'
#
# if os.path.isfile(unique_names_file):
#     # Load unique names from text file
#     unique_names = set()
#     with open(unique_names_file, 'r') as f:
#         for line in f:
#             unique_names.add(line.strip())
#
#     print(f"Loaded {len(unique_names)} unique names from {unique_names_file}")
# else:
#     print(f"File {unique_names_file} not found. Please run the cell that creates and saves unique_names first.")

In [None]:
# Create a mapping of variant names to row indices.
# The names are sorted first: iterating a set of strings directly gives an order
# that can change from one kernel session to the next, which would make the row
# numbering (and every list derived from it) non-reproducible.
start_time = time.time()
name_to_idx = {name: idx for idx, name in enumerate(sorted(unique_names))}
print(f"Created name to index mapping in {time.time() - start_time:.2f} seconds")

# Coordinates of the non-zero cells; every stored value is 1, so the value
# array is created in one go after the loop.
row_indices = []
col_indices = []
failed_files = []

# Process files to build the membership matrix
start_time = time.time()
total_files = len(files)
processed = 0

for file_idx, file in enumerate(files):
    try:
        # Only load the 'name' column
        file_data = pd.read_parquet(inPath + file, columns=['name'])

        # set() keeps each name once per file, so no (row, column) pair repeats.
        # Names missing from the mapping are skipped (should not happen when
        # unique_names was built from these same files).
        file_rows = [
            name_to_idx[name]
            for name in set(file_data['name'])
            if name in name_to_idx
        ]
    except Exception as e:
        # Best effort: keep going, but record the file - its column stays all zero
        failed_files.append(file)
        print(f"Error processing {file} for matrix: {str(e)}")
        continue

    row_indices.extend(file_rows)
    col_indices.extend([file_idx] * len(file_rows))

    # Update progress
    processed += 1
    if processed % 10 == 0 or processed == total_files:
        print(f"Processed {processed}/{total_files} files for matrix construction")
        print(f"Current non-zero elements: {len(row_indices)}")

if failed_files:
    print(f"WARNING: {len(failed_files)} of {total_files} files could not be read; their columns are empty: {failed_files}")

# Create a sparse matrix in CSR format (efficient for row operations)
num_variants = len(unique_names)
num_files = len(files)

data_values = np.ones(len(row_indices), dtype=np.int64)
membership_matrix = sparse.csr_matrix(
    (data_values, (row_indices, col_indices)),
    shape=(num_variants, num_files)
)

nonzero_count = membership_matrix.count_nonzero()
total_cells = num_variants * num_files

print(f"\nMatrix shape: {membership_matrix.shape} (variants × files)")
print(f"Number of non-zero elements: {nonzero_count} (variant occurrences)")
if total_cells:  # avoid dividing by zero when there are no variants or no files
    print(f"Sparsity: {100 - 100 * nonzero_count / total_cells:.2f}%")
print(f"Memory usage: {membership_matrix.data.nbytes / 1024**2:.2f} MB (data)")
print(f"Total construction time: {time.time() - start_time:.2f} seconds")

In [None]:
# Example 1: Count variants per file (column sums)
file_variant_counts = membership_matrix.sum(axis=0).A1  # A1 converts to 1D array

# Show files with most and least variants
largest_file_idx = file_variant_counts.argmax()
smallest_file_idx = file_variant_counts.argmin()
print(f"Average variants per file: {file_variant_counts.mean():.2f}")
print(f"Max variants in a file: {file_variant_counts.max()} (file #{largest_file_idx}: {files[largest_file_idx]})")
print(f"Min variants in a file: {file_variant_counts.min()} (file #{smallest_file_idx}: {files[smallest_file_idx]})")

# Example 2: Count how many files each variant appears in (row sums)
variant_file_counts = membership_matrix.sum(axis=1).A1

# Distribution of variant occurrence: {number of files: number of variants}.
# np.unique tallies the array in one vectorized pass.
from collections import Counter
occurrence_values, occurrence_tallies = np.unique(variant_file_counts, return_counts=True)
occurrence_dist = Counter(dict(zip(occurrence_values.tolist(), occurrence_tallies.tolist())))

# Reverse lookup (row index -> name), built once and reused by every example below
idx_to_name = {idx: name for name, idx in name_to_idx.items()}

# Find variants that appear in exactly 1 file
min_file_threshold = 1
cell_specific_variants = np.where(variant_file_counts == min_file_threshold)[0]
print(f"\nNumber of variants appearing in exactly {min_file_threshold} file: {len(cell_specific_variants)}")

# Calculate percentage of these variants relative to all variants
percentage = 100 * len(cell_specific_variants) / len(variant_file_counts)
print(f"This represents {percentage:.2f}% of all variants")

cell_specific_variant_names1 = [idx_to_name[idx] for idx in cell_specific_variants]
print(f"Cell specific variants: {len(cell_specific_variant_names1)}")


# Print entries 2-10 of the distribution, ordered by ascending file count.
# NOTE(review): the slice starts at 1, so the smallest file count is skipped,
# whereas the matching pred150 cell prints from index 0 - confirm which is intended.
# The loop variable is deliberately not called num_variants, which would
# overwrite the total variant count defined by the matrix-construction cell.
print("\nVariant occurrence distribution:")
for count, variants_with_count in sorted(occurrence_dist.items())[1:10]:
    print(f"{variants_with_count} variants appear in exactly {int(count)} files")

# Example 3: Find variants that appear in at least 50 files
min_file_threshold = 50
frequent_variants = np.where(variant_file_counts >= min_file_threshold)[0]
print(f"\nNumber of variants appearing in at least {min_file_threshold} files: {len(frequent_variants)}")
# Calculate percentage of these variants relative to all variants
percentage = 100 * len(frequent_variants) / len(variant_file_counts)
print(f"This represents {percentage:.2f}% of all variants")

# Example 4: Find variants that appear in at least 80 files
min_file_threshold = 80
cell_nonspecific_variants = np.where(variant_file_counts >= min_file_threshold)[0]
print(f"\nNumber of variants appearing in at least {min_file_threshold} files: {len(cell_nonspecific_variants)}")
# Calculate percentage of these variants relative to all variants
percentage = 100 * len(cell_nonspecific_variants) / len(variant_file_counts)
print(f"This represents {percentage:.2f}% of all variants")

cell_nonspecific_variant_names1 = [idx_to_name[idx] for idx in cell_nonspecific_variants]
print(f"Cell non-specific variants: {len(cell_nonspecific_variant_names1)}")

# Example 5: Find variants that appear in all files
universal_variants = np.where(variant_file_counts == len(files))[0]
print(f"\nNumber of variants appearing in all {len(files)} files: {len(universal_variants)}")

# If you need to get the names of specific variants (e.g., universal ones)
if len(universal_variants) > 0:
    universal_variant_names = [idx_to_name[idx] for idx in universal_variants[:5]]  # Show first 5
    print(f"Sample universal variants: {universal_variant_names}")

# Example 6: Save the matrix for future use (if needed)

# # Save the sparse matrix
# import pickle
# with open(f'variant_membership_matrix_group_{group}_pred{1}.pkl', 'wb') as f:
#     pickle.dump({
#         'matrix': membership_matrix,
#         'variant_names': list(name_to_idx.keys()), 
#         'file_names': files
#     }, f)

In [None]:
# Rows of the MAF table whose 'id' is one of the pred1 cell-specific variants
ind1 = df_maf['id'].isin(cell_specific_variant_names1)
df_maf_pred1 = df_maf.loc[ind1]

# Tally the 'rare' and 'common' categories within that subset
rare_count = df_maf_pred1['category'].eq('rare').sum()
common_count = df_maf_pred1['category'].eq('common').sum()
(rare_count, common_count, df_maf_pred1.shape[0])

In [None]:
# Rows of the MAF table whose 'id' is one of the pred1 cell-non-specific variants
ind1 = df_maf['id'].isin(cell_nonspecific_variant_names1)
df_maf_pred1 = df_maf.loc[ind1]

# Tally the 'rare' and 'common' categories within that subset
rare_count = df_maf_pred1['category'].eq('rare').sum()
common_count = df_maf_pred1['category'].eq('common').sum()
(rare_count, common_count, df_maf_pred1.shape[0])

In [None]:
# Rows of the MAF table whose 'id' is one of the pred150 cell-specific variants
ind150 = df_maf['id'].isin(cell_specific_variant_names150)
df_maf_pred150 = df_maf.loc[ind150]

# Tally the 'rare' and 'common' categories within that subset
rare_count150 = df_maf_pred150['category'].eq('rare').sum()
common_count150 = df_maf_pred150['category'].eq('common').sum()
(rare_count150, common_count150, df_maf_pred150.shape[0])

In [None]:
# Rows of the MAF table whose 'id' is one of the pred150 cell-non-specific variants
ind150 = df_maf['id'].isin(cell_nonspecific_variant_names150)
df_maf_pred150 = df_maf.loc[ind150]

# Tally the 'rare' and 'common' categories within that subset
rare_count150 = df_maf_pred150['category'].eq('rare').sum()
common_count150 = df_maf_pred150['category'].eq('common').sum()
(rare_count150, common_count150, df_maf_pred150.shape[0])

In [None]:
# Pred1 & Pred150: cell-specific & Cell non-specific: rare/common
# Hard-coded counts, presumably copied from the outputs of the four tally cells
# above, in the order: pred1 cell-specific, pred1 cell-non-specific,
# pred150 cell-specific, pred150 cell-non-specific - TODO confirm.
(
    414426 / 67388,
    193 / 19,
    305524 / 49357,
    112 / 8,
)

## Save Cell-specific and Cell-nonspecific Variant Names

Let's save the variant names we've identified for both prediction models to files for future use.

In [None]:
# Create output directory if it doesn't exist.
# Notebooks do not define __file__, so the working directory is used as the base.
output_dir = os.path.join(os.getcwd(), 'outputs')
os.makedirs(output_dir, exist_ok=True)

# Pickle destinations (one per variant list)
pred1_cell_specific_output = os.path.join(output_dir, f'cell_specific_variants_pred1_group_{group}.pkl')
pred1_cell_nonspecific_output = os.path.join(output_dir, f'cell_nonspecific_variants_pred1_group_{group}.pkl')
pred150_cell_specific_output = os.path.join(output_dir, f'cell_specific_variants_pred150_group_{group}.pkl')
pred150_cell_nonspecific_output = os.path.join(output_dir, f'cell_nonspecific_variants_pred150_group_{group}.pkl')

# Text destinations (one variant name per line) for easier inspection.
# These are more readable but take more space than pickle files.
pred1_cell_specific_txt = os.path.join(output_dir, f'cell_specific_variants_pred1_group_{group}.txt')
pred1_cell_nonspecific_txt = os.path.join(output_dir, f'cell_nonspecific_variants_pred1_group_{group}.txt')
pred150_cell_specific_txt = os.path.join(output_dir, f'cell_specific_variants_pred150_group_{group}.txt')
pred150_cell_nonspecific_txt = os.path.join(output_dir, f'cell_nonspecific_variants_pred150_group_{group}.txt')

# One entry per variant list: (label used in the log line, names, pickle path, text path).
# Driving the writes from this table replaces eight copy-pasted stanzas, so a
# list can no longer be paired with the wrong file by a copy/paste slip.
variant_sets = [
    ("cell-specific variants (pred1)", cell_specific_variant_names1,
     pred1_cell_specific_output, pred1_cell_specific_txt),
    ("cell-nonspecific variants (pred1)", cell_nonspecific_variant_names1,
     pred1_cell_nonspecific_output, pred1_cell_nonspecific_txt),
    ("cell-specific variants (pred150)", cell_specific_variant_names150,
     pred150_cell_specific_output, pred150_cell_specific_txt),
    ("cell-nonspecific variants (pred150)", cell_nonspecific_variant_names150,
     pred150_cell_nonspecific_output, pred150_cell_nonspecific_txt),
]

for description, names, pickle_path, text_path in variant_sets:
    # Binary copy: reloads as the same Python list
    with open(pickle_path, 'wb') as f:
        pickle.dump(names, f)
    print(f"Saved {len(names)} {description} to: {pickle_path}")

    # Plain-text copy: one variant name per line
    with open(text_path, 'w') as f:
        f.writelines(f"{name}\n" for name in names)

print("\nAlso saved all variant names as text files for easier inspection.")

## Loading Saved Variant Names

Here's how to load the saved variant names in the future:

In [None]:
# Example: Loading the saved variant names
def load_variant_names(file_path):
    """Read back a pickled collection of variant names.

    Parameters
    ----------
    file_path : str
        Location of the pickle file to open.

    Returns
    -------
    object
        Whatever object was stored in the pickle.
    """
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

# Example usage (uncomment when needed).
# Kept as comments rather than a triple-quoted string: a bare string at the end
# of a cell is echoed back as the cell's output.
#
# # Define the file paths
# group = 1  # Set the appropriate group number
# output_dir = os.path.join(os.path.dirname(os.path.abspath('__file__')), 'outputs')
#
# # Load cell-specific variants for pred1
# pred1_cell_specific_file = os.path.join(output_dir, f'cell_specific_variants_pred1_group_{group}.pkl')
# cell_specific_variants_pred1 = load_variant_names(pred1_cell_specific_file)
# print(f"Loaded {len(cell_specific_variants_pred1)} cell-specific variants for pred1")
#
# # Load cell-nonspecific variants for pred1
# pred1_cell_nonspecific_file = os.path.join(output_dir, f'cell_nonspecific_variants_pred1_group_{group}.pkl')
# cell_nonspecific_variants_pred1 = load_variant_names(pred1_cell_nonspecific_file)
# print(f"Loaded {len(cell_nonspecific_variants_pred1)} cell-nonspecific variants for pred1")
#
# # Load cell-specific variants for pred150
# pred150_cell_specific_file = os.path.join(output_dir, f'cell_specific_variants_pred150_group_{group}.pkl')
# cell_specific_variants_pred150 = load_variant_names(pred150_cell_specific_file)
# print(f"Loaded {len(cell_specific_variants_pred150)} cell-specific variants for pred150")
#
# # Load cell-nonspecific variants for pred150
# pred150_cell_nonspecific_file = os.path.join(output_dir, f'cell_nonspecific_variants_pred150_group_{group}.pkl')
# cell_nonspecific_variants_pred150 = load_variant_names(pred150_cell_nonspecific_file)
# print(f"Loaded {len(cell_nonspecific_variants_pred150)} cell-nonspecific variants for pred150")