In [None]:
# Script to parse the UniProt report on variants of a given protein

In [1]:
import json
import csv

In [24]:
def extract_variant_info(json_file):
    with open(json_file, 'r') as file:
        data = json.load(file)

    variants_info = []

    for feature in data.get('features', []):
        if feature.get('type') == 'VARIANT':
            try:
                genomicLocation = feature.get('genomicLocation')[0]

            except TypeError:
                pass
                
            if '>' in genomicLocation:
                values = (genomicLocation[15:]).split('>')
                mutated_n = values[1]
                position = values[0][:-1]
                original_n = values[0][-1]
                
                amino_acid_change = f"{original_n}{position}{mutated_n}"
                phenotypes = []

            for association in feature.get('association', []):
                phenotype_name = association.get('name')
                if phenotype_name:
                    phenotypes.append(phenotype_name)

            variant_info = {
                'Position': position,
                'Amino Acid Change': amino_acid_change,
                'Associated Phenotypes': ', '.join(phenotypes)
            }
            variants_info.append(variant_info)

    return variants_info


In [3]:
def save_to_csv(data, csv_file):
    with open(csv_file, 'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=['Position', 'AAChange', 'Phenotype'])
        writer.writeheader()
        for row in data:
            writer.writerow({
                'Position': row['Position'],
                'AAChange': row['Amino Acid Change'],
                'Phenotype': row['Associated Phenotypes']
            })


In [25]:
# Usage
json_file = 'O00555-variants.json'
variant_info_list = extract_variant_info(json_file)

# Printing the extracted information
for variant in variant_info_list:
    print(f"Position: {variant['Position']}, Amino Acid Change: {variant['Amino Acid Change']}, Associated Phenotypes: {variant['Associated Phenotypes']}")


Position: 13506221, Amino Acid Change: C13506221A, Associated Phenotypes: Developmental and epileptic encephalopathy, 42 (DEE42), Episodic ataxia type 2 (EA2)
Position: 13506220, Amino Acid Change: G13506220A, Associated Phenotypes: Developmental and epileptic encephalopathy, 42 (DEE42), Episodic ataxia type 2 (EA2)
Position: 13506218, Amino Acid Change: G13506218A, Associated Phenotypes: Developmental and epileptic encephalopathy, 42 (DEE42), Episodic ataxia type 2 (EA2)
Position: 13506218, Amino Acid Change: G13506218C, Associated Phenotypes: Developmental and epileptic encephalopathy, 42 (DEE42), Episodic ataxia type 2 (EA2)
Position: 13506211, Amino Acid Change: C13506211A, Associated Phenotypes: 
Position: 13506201, Amino Acid Change: C13506201G, Associated Phenotypes: 
Position: 13506199, Amino Acid Change: G13506199A, Associated Phenotypes: Developmental and epileptic encephalopathy, 42 (DEE42), Episodic ataxia type 2 (EA2)
Position: 13506200, Amino Acid Change: G13506200A, Asso

In [26]:
len(variant_info_list)

2855

In [13]:
# save to a file
csv_file = '../data/O00555-variants.csv'
save_to_csv(variant_info_list, csv_file)
