In [None]:
# Script to parse the UniProt report on variants of a given protein

In [1]:
import json
import csv

In [24]:
def extract_variant_info(json_file):
    with open(json_file, 'r') as file:
        data = json.load(file)

    variants_info = []

    for feature in data.get('features', []):
        if feature.get('type') == 'VARIANT':
            try:
                genomicLocation = feature.get('genomicLocation')[0]

            except TypeError:
                pass
                
            if '>' in genomicLocation:
                values = (genomicLocation[15:]).split('>')
                mutated_n = values[1]
                position = values[0][:-1]
                original_n = values[0][-1]
                
                amino_acid_change = f"{original_n}{position}{mutated_n}"
                phenotypes = []

            for association in feature.get('association', []):
                phenotype_name = association.get('name')
                if phenotype_name:
                    phenotypes.append(phenotype_name)

            variant_info = {
                'Position': position,
                'Amino Acid Change': amino_acid_change,
                'Associated Phenotypes': ', '.join(phenotypes)
            }
            variants_info.append(variant_info)

    return variants_info


In [3]:
def save_to_csv(data, csv_file):
    with open(csv_file, 'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=['Position', 'AAChange', 'Phenotype'])
        writer.writeheader()
        for row in data:
            writer.writerow({
                'Position': row['Position'],
                'AAChange': row['Amino Acid Change'],
                'Phenotype': row['Associated Phenotypes']
            })


In [30]:
# Usage
json_file = 'O00555-variants.json'
variant_info_list = extract_variant_info(json_file)

# Printing the extracted information
for variant in variant_info_list:
    print(f"Position: {variant['Position']}, Nucleotide Change: {variant['Amino Acid Change']}")#, Associated Phenotypes: {variant['Associated Phenotypes']}")


Position: 13506221, Nucleotide Change: C13506221A
Position: 13506220, Nucleotide Change: G13506220A
Position: 13506218, Nucleotide Change: G13506218A
Position: 13506218, Nucleotide Change: G13506218C
Position: 13506211, Nucleotide Change: C13506211A
Position: 13506201, Nucleotide Change: C13506201G
Position: 13506199, Nucleotide Change: G13506199A
Position: 13506200, Nucleotide Change: G13506200A
Position: 13506197, Nucleotide Change: C13506197A
Position: 13506196, Nucleotide Change: G13506196A
Position: 13506193, Nucleotide Change: C13506193T
Position: 13506190, Nucleotide Change: T13506190C
Position: 13506190, Nucleotide Change: T13506190A
Position: 13506191, Nucleotide Change: A13506191G
Position: 13506187, Nucleotide Change: C13506187G
Position: 13506187, Nucleotide Change: C13506187T
Position: 13506188, Nucleotide Change: C13506188T
Position: 13506188, Nucleotide Change: C13506188T
Position: 13506188, Nucleotide Change: C13506188T
Position: 13506184, Nucleotide Change: C13506184T


In [26]:
len(variant_info_list)

2855

In [13]:
# save to a file
csv_file = '../data/O00555-variants.csv'
save_to_csv(variant_info_list, csv_file)
