In [None]:
# Script to parse the UniProt report on variants of a given protein

In [1]:
import json
import csv

In [82]:
def extract_variant_info(json_file):
    with open(json_file, 'r') as file:
        data = json.load(file)

    variants_info = []

    for feature in data.get('features', []):
        if feature.get('type') == 'VARIANT':
            if feature.get('consequenceType') == 'missense':
                try:
                    genomicLocation = feature.get('genomicLocation')[0]
            
                    if '>' in genomicLocation:
                        values = (genomicLocation[15:]).split('>')
                        mutated_n = values[1]
                        n_position = values[0][:-1]
                        original_n = values[0][-1]
                        
                        nucleotide_change = f"{original_n}>{mutated_n}"

                        try:
                            chromosome = feature.get('cytogeneticBand')[:2]
                
                            aa_position = feature.get('begin')
                            original_aa = feature.get('wildType')
                            mutated_aa = feature.get('mutatedType')
                            aa_change = f"{original_aa}{aa_position}{mutated_aa}"
                            phenotypes = []

                            for association in feature.get('association', []):
                                phenotype_name = association.get('name')
                                if phenotype_name:
                                    phenotypes.append(phenotype_name)
                            
                            variant_info = {
                                'Chromosome': chromosome,
                                'NTPosition': n_position,
                                'NTChange': nucleotide_change,
                                'AAPosition': aa_position,
                                'AAChange': aa_change,
                                'Phenotype': ', '.join(phenotypes)
                            }
                            variants_info.append(variant_info)
                        except:
                            continue

                except TypeError:
                    continue

    return variants_info


In [87]:
def save_to_csv(data, csv_file):
    with open(csv_file, 'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=['Chromosome', 'NTPosition', 'NTChange', 'AAPosition', 'AAChange', 'Phenotype'])
        writer.writeheader()
        for row in data:
            writer.writerow({
                'Chromosome': row['Chromosome'],
                'NTPosition': row['NTPosition'],
                'NTChange': row['NTChange'], 
                'AAPosition':row['AAPosition'], 
                'AAChange':row['AAChange'], 
                'Phenotype': row['Phenotype']
            })


In [83]:
# Usage
json_file = 'O00555-variants.json'
variant_info_list = extract_variant_info(json_file)

# Printing the extracted information
for variant in variant_info_list:
    print(f"Chromosome: {variant['Chromosome']}, NTPosition: {variant['NTPosition']}, NTChange: {variant['NTChange']}, AAPosition:{variant['AAPosition']}, AAChange:{variant['Phenotype']}, Phenotype: {variant['Phenotype']}")


Chromosome: 19, NTPosition: 13506221, NTChange: C>A, AAPosition:2, AAChange:Developmental and epileptic encephalopathy, 42 (DEE42), Episodic ataxia type 2 (EA2), Phenotype: Developmental and epileptic encephalopathy, 42 (DEE42), Episodic ataxia type 2 (EA2)
Chromosome: 19, NTPosition: 13506220, NTChange: G>A, AAPosition:2, AAChange:Developmental and epileptic encephalopathy, 42 (DEE42), Episodic ataxia type 2 (EA2), Phenotype: Developmental and epileptic encephalopathy, 42 (DEE42), Episodic ataxia type 2 (EA2)
Chromosome: 19, NTPosition: 13506218, NTChange: G>A, AAPosition:3, AAChange:Developmental and epileptic encephalopathy, 42 (DEE42), Episodic ataxia type 2 (EA2), Phenotype: Developmental and epileptic encephalopathy, 42 (DEE42), Episodic ataxia type 2 (EA2)
Chromosome: 19, NTPosition: 13506218, NTChange: G>C, AAPosition:3, AAChange:Developmental and epileptic encephalopathy, 42 (DEE42), Episodic ataxia type 2 (EA2), Phenotype: Developmental and epileptic encephalopathy, 42 (DEE42

In [88]:
# save to a file
csv_file = 'O00555-variants.csv'
save_to_csv(variant_info_list, csv_file)
