In [2]:
import json
from pathlib import Path
from collections import defaultdict

import requests
import pandas as pd

DATA_DIR = Path('../association/onek1k_website_data/')
OUTPUT_DIR = Path('../')

# API Clients

In [3]:
class ApiClient:
    """
    Base API client providing common cache operations.
    """
    def __init__(self, base_url, connection=None, cache=None):
        self.base_url = base_url
        self.connection = connection or requests
        self.cache = {}

        if isinstance(cache, (Path, str)):
            self.load_cache(file_path=cache)
        
    def save_cache(self, file_path):
        with open(file_path, 'wt') as file:
            json.dump(self.cache, file)
        return self
            
    def load_cache(self, file_path):
        with open(file_path, 'rt') as file:
            self.cache = json.load(file)
        return self

## MyVariant
Simple client for [MyVariant](https://myvariant.info/v1/api) service to retrieve dbSNP record information such as consequence etc.

In [4]:
class MyVariantClient(ApiClient):
    """
    Simple client for https://myvariant.info/v1/api
    """
    
    def __init__(self, base_url='http://myvariant.info/v1/api', connection=None, cache=None):
        super.__init__(self, base_url=base_url, connection=connection, cache=cache)
        
    def fetch_rsid(self, identifier):
        if identifier in self.cache:
            return self.cache[identifier]
        
        result = self.connection.get(f"{self.base_url}/variant/{identifier}").json()
        self.cache[identifier] = result
        return result

## HGNC
Simple client for [HGNC rest API](https://www.genenames.org/help/rest/) to retrieve gene information such as names, aliases, transcripts etc.

In [17]:
class HgncClient(ApiClient):   
    """
    Simple client for https://www.genenames.org/help/rest/
    """
        
    def __init__(self, base_url='http://rest.genenames.org', connection=None, cache=None):
        super().__init__(base_url=base_url, connection=connection, cache=cache)

    def fetch(self, identifier, fields=('hgnc_id')):
        if isinstance(fields, str):
            fields = (fields,)
            
        if identifier in self.cache.keys():
            field_mapping = self.cache[identifier]
            requested_fields = set(fields) & set(field_mapping.keys())
            for field in requested_fields:
                if field_mapping.get(field, None):
                    return field_mapping[field]
            if
        else:
            self.cache[identifier] = {}
        
        for field in fields:
            response = self.connection.get(
                f"{self.base_url}/fetch/{field}/{identifier}",
                headers={'Accept': 'application/json'}
            ).json()

            records = response['response'].get('docs', [])
            if not records:
                records = None
                
            self.cache[identifier][field] = records
            
            if records:
                break
        
        return records
    
    def fetch_many(self, identifiers, fields=('hgnc_id')):
        records = {}
        for identifier in identifiers:
            records[identifier] = None
            result = self.fetch(identifier, field=field)
            for field in fields:
                
                if result is not None:
                    records[identifier] = result
                    break
        return records

# Generate Gene search file
Generate `gene_search_terms.json.txt` file where each line consists of a valid `JSON` list element. Each of these elements is a tuple that looks like `[gene_name, [search_terms]]`.

In [None]:
gene_search_terms = defaultdict(list)
gene_metadata = {}

# ---- Manually insert info not available online
gene_metadata['AC007308.6'] = [{
    'ensembl_gene_id': 'ENSG00000234252',
    'symbol': 'AC007308.6',
    'name': 'AC007308.6 (Clone-based (Vega))',
    'location': '22qxx.x'
}]
gene_metadata['AC002472.13'] = [{
    'ensembl_gene_id': 'ENSG00000187905',
    'symbol': 'AC002472.13',
    'name': 'Leucine-rich repeat-containing protein LOC400891',
    'location': '22qxx.x'
}]
gene_metadata['CTA-29F11.1'] = [{
    'ensembl_gene_id': 'CTA-29F11.1',
    'symbol': 'CTA-29F11.1',
    'name': 'lncRNA',
    'location': '0qxx.x'
}]
gene_metadata['AC000068.5'] = [{
    'ensembl_gene_id': 'ENSG00000185065',
    'symbol': 'AC000068.5',
    'name': 'AC000068.5 (Clone-based (Vega) gene)',
    'location': '19qxx.x'
}]
# ----------------------------------------------
cache_path = OUTPUT_DIR / 'hgnc_cache.json'
if cache_path.exists():
    hgnc_client = HgncClient(cache=cache_path)
else:
    hgnc_client = HgncClient(cache=None)

cell_labels = sorted([
    expression_file.name.split('_')[0].strip() 
    for expression_file in DATA_DIR.glob('*_eQTLs.tsv')
])
    
for label in cell_labels:
    df = pd.read_csv(DATA_DIR / f"{label}_eQTLs.tsv", header=0, delimiter='\t')
    for gene, group in df.groupby('GENE'):
        print(f"Fetching metadata for '{gene}'")
        metadata = hgnc_client.fetch(
            identifier=gene,
            fields=('symbol', 'prev_symbol', 'alias_symbol', 'vega_id', 'uniprot_ids')
        )
        
        if metadata:
            gene_metadata[gene] = metadata
            gene_search_terms[gene].append(metadata[0]['ensembl_gene_id'])
            gene_search_terms[gene].append(metadata[0]['hgnc_id'])
            gene_search_terms[gene].append(metadata[0]['symbol'])
            gene_search_terms[gene].append(metadata[0]['name'])
        
        gene_search_terms[gene].append(label)
        gene_search_terms[gene].append(gene)
        gene_search_terms[gene] += list(group['SNP'])
        gene_search_terms[gene] += list(f"chr{chrom}" for chrom in group['CHR'])
        
hgnc_client.save_cache(OUTPUT_DIR / 'hgnc_cache.json')

with open(OUTPUT_DIR / 'gene_search_terms.json.txt', 'wt') as file:
    for gene, search_terms in gene_search_terms.items():
        file.write(json.dumps([gene, list(set(search_terms))]))
        file.write('\n')
        
with open(OUTPUT_DIR / 'gene_metadata.json', 'wt') as file:
    json.dump(gene_metadata, file)

Fetching metadata for 'ARSA'
Fetching metadata for 'C22orf34'
Fetching metadata for 'CHCHD10'
Fetching metadata for 'CRYBB2'
Fetching metadata for 'CTA-29F11.1'
Fetching metadata for 'APOBEC3C'
Fetching metadata for 'ARFGAP3'
Fetching metadata for 'BCR'
Fetching metadata for 'CHCHD10'
Fetching metadata for 'CRYBB2'
Fetching metadata for 'AC000068.5'
Fetching metadata for 'APOBEC3G'
Fetching metadata for 'APOL2'
Fetching metadata for 'ARSA'
Fetching metadata for 'BCR'
Fetching metadata for 'ADRBK2'
Fetching metadata for 'APOBEC3G'
Fetching metadata for 'APOBEC3H'
Fetching metadata for 'APOL6'
Fetching metadata for 'ARSA'
Fetching metadata for 'AC002472.13'
Fetching metadata for 'APOBEC3G'
Fetching metadata for 'APOBEC3H'
Fetching metadata for 'ARSA'
Fetching metadata for 'ASPHD2'
Fetching metadata for 'ARSA'
Fetching metadata for 'CTA-29F11.1'
Fetching metadata for 'DDT'
Fetching metadata for 'FAM118A'
Fetching metadata for 'GGT1'
Fetching metadata for 'DDT'
Fetching metadata for 'FAM11

# Site metadata file
This is a metadata file containing configuration options for the TOB browser. These are used by the framework to automate table rendering and file serving on the client and server side.

Framework *requires* the keys `reference_genome`, `gene_result_analysis_groups` and `gene_group_result_field_names`. Other fields are TOB specific.

In [13]:
human_readable_labels = {
    "BimmNaive": "Bimm Naive",
    "Bmem": "B Memory",
    "CD4all": "CD4 All",
    "CD8all": "CD8 All",
    "CD8eff": "CD8 Eff",
    "CD8unknown": "CD8 Unknown",
    "DC": "DC",
    "MonoC": "Mono C",
    "MonoNC": "Mono CN",
    "NKact": "NK Act",
    "NKmat": "NK Mat",
    "Plasma": "Plasma"
}

metadata = {
    "datasets": {
        "tob": {
            "reference_genome": "GRCh37",
            "gene_result_analysis_groups": ["All"],
            "gene_group_result_field_names": cell_labels,
            "gene_results_table_headings": {
                l: human_readable_labels[l] for l in cell_labels
            },
            "gene_symbols": sorted(gene_search_terms.keys())
        }
    }
}

with open(OUTPUT_DIR / 'metadata.json', 'wt') as file:
    json.dump(metadata, file)

# Generate gene results file
Generates the gene results `JSON` file that the exome browser framework uses to render a gene results table when requesting the `/results` URL.

In [14]:
gene_results = {"results": []}
for gene in gene_search_terms.keys():
    cell_label_results = []
    for cell_label in cell_labels:
        df = pd.read_csv(DATA_DIR / f"{cell_label}_eQTLs.tsv", header=0, delimiter='\t')
        cell_label_results.append(df[df['GENE'] == gene].shape[0])
        
    metadata = gene_metadata[gene][0]
    record = [
        metadata.get('ensembl_gene_id'),
        gene,
        metadata.get('name'),
        int(metadata.get('location').split('q')[0]),
        1,
        [cell_label_results]
    ]
    
    gene_results['results'].append(record)

with open(OUTPUT_DIR / 'results'/ 'tob.json', 'wt') as file:
    json.dump(gene_results, file)

KeyError: 'C22orf34'

# Process eQTL association files
Process the raw association files into a `JSON` mapping indexed by cell type and then further by gene name. Values are a list of SNP mappings containing p-values, log10 p-values, dbSNP identifiers and other analysis data.

In [None]:
snp_associations = defaultdict(lambda: defaultdict(list))

for label in cell_labels:
    df = pd.read_csv(DATA_DIR / f"{label}_eQTLs.tsv", header=0, delimiter='\t')
    for gene, group in df.groupby('GENE'):
        records = list(group.to_dict(orient='records'))
        for r in records:
            r['ID'] = f"{r['CHR']}-{r['BP']}-{r['A1']}-{r['A2']}"
            r['CONSEQUENCE'] = 'TODO'
            r['HGVSP'] = 'TODO'
            r['HGVSC'] = 'TODO'
        snp_associations[label][gene] = records
        
with open(OUTPUT_DIR / 'snp_association.json', 'wt') as file:
    json.dump(snp_associations, file)

# Process expression files
Each file ending in `*_residual expression.tsv` contains expression data for a given gene listed in the columns. Each file lists different genes so we need to concatenate each dataframe initialising missing columns to `0`. This file will be served for UMAP construction. Saved in `

In [None]:
dfs = {
    label: pd.read_csv(DATA_DIR / f"{label}_residual_expressions.tsv", header=0, delimiter='\t')
    for label in cell_labels
}

genes = list(set(g for df in dfs.values() for g in df.columns ))
for gene in genes:
    for cell_label, df in dfs.items():
        df['cell_label'] = cell_label
        if gene not in df.columns:
            df[gene] = 0
            
expression = pd.concat(dfs.values(), axis=0, ignore_index=True)
expression = expression[["cell_label"] + sorted(genes)]
expression.to_csv(OUTPUT_DIR / 'results' / 'cell_label_expression.csv', index=False, sep=",")