In [1]:
import requests as req

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Render pyplot figures as interactive plots inside the notebook
%matplotlib notebook

In [4]:
# Root URL of the SGD web service; per-locus endpoints are built by
# appending '/locus/<SGD ID>' (and '/sequence_details') to it.
SGD_API_BASE_URL = 'https://www.yeastgenome.org/webservice'

In [5]:
# How many loci to download: an integer N restricts the queries to the first N
# SGD IDs read from the features file; the string 'all' queries every ID.
GENES_TO_QUERY_COUNT = 100

In [6]:
# Open (read-only, text mode) the tab-separated file whose rows are read in the
# next cell; the path is relative to the notebook's working directory.
sgdids_with_features_file = open(
    file='resources/all_chromosome_features_sgdids.tsv',
    mode='r',
)

FileNotFoundError: [Errno 2] No such file or directory: 'resources/all_chromosome_features_sgdids.tsv'

In [None]:
# For each line, split by tabs and take the first column, which is the SGD ID
# associated with that gene. Blank lines are skipped so that no empty ID (and
# therefore no request to a bare '/locus/' URL) is produced.
# The `with` block closes the handle opened in the previous cell once all lines
# have been consumed, so the file descriptor is not leaked for the lifetime of
# the kernel.
with sgdids_with_features_file:
    sgdids_with_features = [
        line.strip().split('\t')[0]
        for line in sgdids_with_features_file
        if line.strip()
    ]

In [None]:
# Download, for every selected SGD ID, the general locus record and its
# sequence details from the SGD web service.
# Result: all_locus_json_data maps sgdid -> (general JSON, sequence-details JSON).

# Upper bound (seconds) on each HTTP call so a stalled connection cannot hang
# the notebook indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

all_locus_json_data = {}

# GENES_TO_QUERY_COUNT is either the string 'all' or an integer N.
# Use == for the string comparison: `is` tests object identity, which only
# works for literals by accident of interning and raises a SyntaxWarning on
# Python 3.8+.
if GENES_TO_QUERY_COUNT == 'all':
    sgdids_to_query = sgdids_with_features
else:
    sgdids_to_query = sgdids_with_features[:GENES_TO_QUERY_COUNT]

for sgdid in sgdids_to_query:
    request_url = SGD_API_BASE_URL + '/locus/' + sgdid
    sequence_details_request_url = request_url + '/sequence_details'

    locus_general_response = req.get(url=request_url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail on HTTP errors here rather than with a confusing KeyError later,
    # when the error payload is indexed like a locus record.
    locus_general_response.raise_for_status()
    locus_general_json_data = locus_general_response.json()

    locus_sequence_details_response = req.get(
        url=sequence_details_request_url, timeout=REQUEST_TIMEOUT_SECONDS
    )
    locus_sequence_details_response.raise_for_status()
    locus_sequence_details_json_data = locus_sequence_details_response.json()

    # The message announces the SGD ID, so format the ID (the original passed
    # the request URL as the first argument and printed that instead).
    print("Got JSON response for locus with SGD ID: {}".format(sgdid))
    all_locus_json_data[sgdid] = locus_general_json_data, locus_sequence_details_json_data

In [None]:
# ID for the reference strain of yeast, S288C. It is compared against the
# strain id of each 'genomic_dna' entry to pick out the reference record.
REFERENCE_STRAIN_ID = 1364643

In [None]:
# Flatten the two JSON payloads stored for each locus into one table row and
# collect the rows into the `locus_data` DataFrame.
rows = []
column_headers = [
    'sgdid',
    'feature_type',
    'systematic_name',
    'gene_name',
    'aliases',
    'characterization',
    'location',
    'start',
    'end',
    'strand',
]
for sgdid, locus_payloads in all_locus_json_data.items():
    general_info, sequence_details = locus_payloads
    aliases_present = bool(general_info['aliases'])
    strain_entries = sequence_details['genomic_dna']

    # Keep only entries for the reference strain S288C; when several entries
    # match, the last one is used.
    reference_matches = [
        entry for entry in strain_entries
        if entry['strain']['id'] == REFERENCE_STRAIN_ID
    ]
    reference_entry = reference_matches[-1] if reference_matches else None

    # Fields from the general locus record; empty values become 'N/A'
    # (or the string "None" for a locus without aliases).
    row = [
        sgdid,
        general_info['locus_type'] or 'N/A',
        general_info['format_name'] or 'N/A',
        general_info['gene_name'] or 'N/A',
        "None" if not aliases_present else [alias['display_name'] for alias in general_info['aliases']],
        general_info['qualifier'] or 'N/A',
    ]

    # Position fields come from the reference-strain entry when one was found.
    if reference_entry:
        row += [
            reference_entry['contig']['display_name'],
            reference_entry['start'],
            reference_entry['end'],
            reference_entry['strand'],
        ]
    else:
        row += ['N/A'] * 4

    rows.append(row)

locus_data = pd.DataFrame(rows, columns=column_headers)
print(locus_data)