## Create annotation key file (YAML) for CWAS analyses

In [None]:
import yaml
import pandas as pd
import os, glob

In [None]:
# Location of the existing annotation key file (YAML)
yaml_file_path = '/Users/yujinkim/Dropbox/CWAS_paper_WD/Data/raw_annotations/annotation_keys.yaml'

# Parse the file with the safe loader; the handle is closed when the `with` block exits
with open(yaml_file_path, mode='r') as key_stream:
    yaml_data = yaml.safe_load(key_stream)

# Show the parsed content (later cells index it by 'functional_annotation' and 'functional_score')
print(yaml_data)


In [None]:
# Workbook that holds the annotation table, and the worksheet to read from it
xlsx_file_path = '/Users/yujinkim/Dropbox/CWAS_paper_WD/CWAS-Plus_annotations_v5_edited.xlsx'
sheet_name = 'Annotation v5'

# Load that single worksheet into a DataFrame
df = pd.read_excel(io=xlsx_file_path, sheet_name=sheet_name)

In [None]:
# Distinct entries of the 'Annotation Key' column, kept as a set for membership tests
annotation_key_column = df['Annotation Key']
df_keys = set(annotation_key_column.values)

In [None]:
# Keep only the functional_annotation entries whose value appears among the spreadsheet keys
filtered_keys = dict(
    (file_name, annotation_name)
    for file_name, annotation_name in yaml_data['functional_annotation'].items()
    if annotation_name in df_keys
)

In [None]:
# Replace the functional_annotation section with the filtered mapping
yaml_data.update({'functional_annotation': filtered_keys})

# Show the whole structure after the replacement
print(yaml_data)

In [None]:
# Keep only the functional_score entries whose value appears among the spreadsheet keys
filtered_keys = dict(
    (file_name, score_name)
    for file_name, score_name in yaml_data['functional_score'].items()
    if score_name in df_keys
)

# Replace the functional_score section with the filtered mapping
yaml_data.update({'functional_score': filtered_keys})

# Show the whole structure after the replacement
print(yaml_data)

In [None]:
# Requires yaml_data from the cells above.

# File name (key) and annotation name (value) of the entry to register
new_key = 'primate.phyloP447way.hg38.over1.3.bed.gz'
new_value = 'phyloP447way'

# Insert the pair into the functional_score section (replaces an entry with the same key)
yaml_data['functional_score'].update({new_key: new_value})

## Bgzip and tabix-index the files that are not yet compressed

In [None]:
# Every path in the annotation directory whose name ends in .bed
file_paths = glob.glob(
    pathname="/Users/yujinkim/Dropbox/CWAS_paper_WD/Data/cwas_annotation.v5/*.bed"
)
# Bare expression so the notebook displays the list
file_paths

In [None]:
# Local import: this cell is the only one that spawns external programs.
import subprocess

# Compress every listed BED file with bgzip and index the result with tabix.
# Both tools are invoked with argument lists (no shell), so paths containing spaces or
# shell metacharacters are passed verbatim, and check=True raises CalledProcessError
# instead of printing a success message after a failed command.
for file_path in file_paths:
    # The compressed copy is written next to the input as '<name>.gz'
    base_path, file_name = os.path.split(file_path)
    output_file_path = os.path.join(base_path, f"{file_name}.gz")

    # 'bgzip -c' writes to stdout; stdout is redirected into the output file
    with open(output_file_path, "wb") as compressed_out:
        subprocess.run(["bgzip", "-c", file_path], stdout=compressed_out, check=True)

    print(f"File '{file_path}' gzipped and saved to '{output_file_path}'")

    # Path reported for the index file
    tabixed_file_path = f"{output_file_path}.tbi"

    # Same flags as before (-p bed, -f), placed ahead of the file operand so they are
    # parsed as options regardless of how the platform handles trailing options
    subprocess.run(["tabix", "-f", "-p", "bed", output_file_path], check=True)

    print(f"File '{output_file_path}' indexed with tabix and saved to '{tabixed_file_path}'")

## Rename gz to bed.gz

In [None]:
# Every path in the annotation directory whose name ends in .gz
file_paths = glob.glob(
    pathname="/Users/yujinkim/Dropbox/CWAS_paper_WD/Data/cwas_annotation.v5/*.gz"
)
# Bare expression so the notebook displays the list
file_paths

In [None]:
# Collect base names (last extension removed) of paths that end in 'gz' but not in 'bed.gz'
filtered_filenames = []
for path in file_paths:
    if not path.endswith('gz') or path.endswith('bed.gz'):
        continue
    stem, _last_extension = os.path.splitext(os.path.basename(path))
    filtered_filenames.append(stem)

# Report which names were selected
print("Filenames ending with 'gz' but not 'bed.gz': ", filtered_filenames)

In [None]:
# Rename each selected '<name>.gz' to '<name>.bed.gz' inside the annotation directory
for filename in filtered_filenames:
    original_path = f"~/Dropbox/CWAS_paper_WD/Data/cwas_annotation.v5/{filename}.gz"
    new_path = f"~/Dropbox/CWAS_paper_WD/Data/cwas_annotation.v5/{filename}.bed.gz"

    # No shell is involved, so '~' is expanded explicitly and names containing spaces or
    # shell metacharacters are used verbatim. Like 'mv', os.replace overwrites an existing
    # destination; unlike the ignored os.system status, it raises OSError on failure, so
    # the message below is only printed after a successful rename.
    os.replace(os.path.expanduser(original_path), os.path.expanduser(new_path))

    print(f"Renamed '{original_path}' to '{new_path}'")

## Add new annotations

In [None]:
# Every path in the annotation directory whose name ends in .bed.gz
file_paths = glob.glob(
    pathname="/Users/yujinkim/Dropbox/CWAS_paper_WD/Data/cwas_annotation.v5/*.bed.gz"
)
# Bare expression so the notebook displays the list
file_paths

In [None]:
# Requires yaml_data from the cells above.
# Current functional_annotation mapping, or an empty dict when that section is absent
if 'functional_annotation' in yaml_data:
    yaml_functional_annotation = yaml_data['functional_annotation']
else:
    yaml_functional_annotation = {}
# Bare expression so the notebook displays the mapping
yaml_functional_annotation

In [None]:
# Base names of the files listed in file_paths
file_names = [os.path.basename(path) for path in file_paths]

# File names that are not yet keys of yaml_data['functional_annotation']
new_values = set(file_names) - set(yaml_functional_annotation)

# The message names the section that was actually compared (functional_annotation)
print("Values in file_paths but not in yaml_data['functional_annotation']: ", new_values)

In [None]:
import os

# Requires yaml_data and new_values from the cells above.
# Current functional_score mapping, or an empty dict when that section is absent
yaml_functional_score = yaml_data.get('functional_score', {})

# Drop every candidate that is already a functional_score key. A single difference is
# enough: subtracting the same key set a second time cannot remove anything further.
diff_Val = set(new_values) - set(yaml_functional_score)

# Display the candidates that are not keys of yaml_data['functional_score']
print("Values in file_paths but not in yaml_data['functional_score']: ", diff_Val)

# Same content, kept as its own set under the name the later cells read
new_values_without_overlap = set(diff_Val)

# Display the final set of values after removing the overlap
print("Filtered values without overlap: ", new_values_without_overlap)

## ASD5TF

In [None]:
# Requires yaml_data from the cells above.

# File name (key) and annotation name (value) of the entry to register
new_key = 'ASD5TF_common.sorted.bed.gz'
new_value = 'ASD5TF.Common'

# Insert the pair into the functional_annotation section (replaces an entry with the same key)
yaml_data['functional_annotation'].update({new_key: new_value})

In [None]:
# Requires yaml_data and new_values_without_overlap from the cells above.
# Current functional_annotation mapping, or an empty dict when that section is absent.
# It is bound to a name that matches its content, so yaml_functional_score keeps
# referring to the functional_score mapping.
yaml_functional_annotation = yaml_data.get('functional_annotation', {})

# Drop every candidate that is already a functional_annotation key. A single difference
# is enough: subtracting the same key set a second time cannot remove anything further.
diff_Val = set(new_values_without_overlap) - set(yaml_functional_annotation)

# Same content, kept as its own set
updated_non_overlap = set(diff_Val)

# Display the final set of values after removing the overlap
print("Filtered values without overlap: ", updated_non_overlap)

In [None]:
# Requires yaml_data and new_values_without_overlap from the cells above.
# Current functional_annotation mapping, or an empty dict when that section is absent
yaml_functional_annotation = yaml_data.get('functional_annotation', {})

# new_values_without_overlap may predate entries added to functional_annotation since it
# was computed. Names that already have an entry are skipped, so an existing annotation
# name is never overwritten by an index.
sorted_new_values = sorted(
    name for name in new_values_without_overlap if name not in yaml_functional_annotation
)

# Register each remaining file name with its 1-based position in the sorted list as value
for index, value in enumerate(sorted_new_values, start=1):
    yaml_functional_annotation[value] = index

# Store the mapping back (needed when the section was absent and a new dict was created)
yaml_data['functional_annotation'] = yaml_functional_annotation

# Display the updated dictionary
print(yaml_data)

In [None]:
# Destination of the updated annotation key file
output_yaml_path = '/Users/yujinkim/Dropbox/CWAS_paper_WD/Data/cwas_annotation.v5/annotation_keys.yaml'

# Write yaml_data in block style (default_flow_style=False); the handle is closed on exit
with open(output_yaml_path, mode='w') as out_stream:
    yaml.dump(yaml_data, stream=out_stream, default_flow_style=False)

print(f"YAML data saved to {output_yaml_path}")

In [None]:
# Rows whose 'Study/Database' entry equals this citation string exactly
is_herring_2022 = df['Study/Database'] == 'Herring et al., Cell (2022)'
filtered_df = df.loc[is_herring_2022]
# Bare expression so the notebook displays the selection
filtered_df