In [None]:
# Script to parse the UniProt report on variants of a given protein

In [1]:
import json
import csv

In [2]:
def extract_variant_info(json_file):
    """Extract amino-acid changes and phenotypes from a variants JSON report.

    The file is expected to hold a JSON object with a 'features' list. Every
    feature whose 'type' is 'VARIANT' contributes one record built from its
    'begin', 'wildType', 'mutatedType' and 'association' entries.

    Args:
        json_file: Path to the JSON file to read.

    Returns:
        A list of dicts, one per variant feature in file order, with keys:
            'Position': the feature's 'begin' value, unchanged.
            'Amino Acid Change': wild type + position + mutated type,
                e.g. 'A2S'.
            'Associated Phenotypes': the non-empty association names joined
                with ', ' (an empty string when there are none).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode as UTF-8 explicitly rather than with the platform default.
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    variants_info = []

    # `or []` covers both a missing key and an explicit JSON null; a plain
    # .get(key, []) would return None for null and the loop would raise.
    for feature in data.get('features') or []:
        if feature.get('type') != 'VARIANT':
            continue

        position = feature.get('begin')
        original_aa = feature.get('wildType')
        mutated_aa = feature.get('mutatedType')
        # NOTE: a missing 'wildType'/'mutatedType'/'begin' is rendered as the
        # literal text 'None' here; kept as-is so existing output is unchanged.
        amino_acid_change = f"{original_aa}{position}{mutated_aa}"

        # Keep only associations that carry a non-empty name.
        phenotypes = [
            association.get('name')
            for association in feature.get('association') or []
            if association.get('name')
        ]

        variants_info.append({
            'Position': position,
            'Amino Acid Change': amino_acid_change,
            'Associated Phenotypes': ', '.join(phenotypes)
        })

    return variants_info


In [3]:
def save_to_csv(data, csv_file):
    """Write variant records to a CSV file with a header row.

    Args:
        data: Iterable of dicts carrying the keys 'Position',
            'Amino Acid Change' and 'Associated Phenotypes'.
        csv_file: Destination path; an existing file is overwritten.

    The output columns are 'Position', 'AAChange' and 'Phenotype'.

    Raises:
        KeyError: If a record lacks one of the expected keys.
        OSError: If the destination cannot be opened for writing.
    """
    # Explicit UTF-8 so non-ASCII phenotype names are written the same way on
    # every platform instead of depending on the locale's default encoding.
    # newline='' lets the csv module control line endings itself.
    with open(csv_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['Position', 'AAChange', 'Phenotype'])
        writer.writeheader()
        writer.writerows(
            {
                'Position': row['Position'],
                'AAChange': row['Amino Acid Change'],
                'Phenotype': row['Associated Phenotypes']
            }
            for row in data
        )


In [4]:
# Parse the variants report that sits next to this notebook
json_file = 'O00555-variants.json'
variant_info_list = extract_variant_info(json_file)

# Show one summary line per extracted variant
for variant in variant_info_list:
    position = variant['Position']
    aa_change = variant['Amino Acid Change']
    phenotypes = variant['Associated Phenotypes']
    print(f"Position: {position}, Amino Acid Change: {aa_change}, Associated Phenotypes: {phenotypes}")


Position: 2, Amino Acid Change: A2S, Associated Phenotypes: Developmental and epileptic encephalopathy, 42 (DEE42), Episodic ataxia type 2 (EA2)
Position: 2, Amino Acid Change: A2V, Associated Phenotypes: Developmental and epileptic encephalopathy, 42 (DEE42), Episodic ataxia type 2 (EA2)
Position: 3, Amino Acid Change: R3C, Associated Phenotypes: Developmental and epileptic encephalopathy, 42 (DEE42), Episodic ataxia type 2 (EA2)
Position: 3, Amino Acid Change: R3G, Associated Phenotypes: Developmental and epileptic encephalopathy, 42 (DEE42), Episodic ataxia type 2 (EA2)
Position: 5, Amino Acid Change: G5V, Associated Phenotypes: 
Position: 8, Amino Acid Change: M8I, Associated Phenotypes: 
Position: 9, Amino Acid Change: P9L, Associated Phenotypes: Developmental and epileptic encephalopathy, 42 (DEE42), Episodic ataxia type 2 (EA2)
Position: 9, Amino Acid Change: P9S, Associated Phenotypes: 
Position: 10, Amino Acid Change: A10S, Associated Phenotypes: 
Position: 10, Amino Acid Chan

In [13]:
# Persist the extracted variants as CSV in the sibling data directory
csv_file = '../data/O00555-variants.csv'
save_to_csv(data=variant_info_list, csv_file=csv_file)
