# Analysis of SI Table 6 for genes overexpressed in cancer

**Genes overexpressed in different human solid cancers exhibit different tissue-specific expression profiles**

In [304]:
%load_ext blackcellmagic
from glob import glob as glob
from io import StringIO
import os as os

import pandas as pd
import urllib
import json

The blackcellmagic extension is already loaded. To reload it, use:
  %reload_ext blackcellmagic


To make things easier, I'm going to split Table 6 into separate tables for each type of cancer and then save as a `csv`.

In [310]:
file_list = glob("*.csv")

In [312]:
file_list

['melanoma.csv',
 'endometrial.csv',
 'lung.csv',
 'kidney.csv',
 'colon.csv',
 'ovarian.csv',
 'astrocytoma.csv',
 'liver.csv',
 'thyroid.csv',
 'breast.csv',
 'glioblastoma.csv',
 'prostate.csv']

Now, define the helper functions.

In [315]:
def get_uniprot(gene):
    """
    Look up gene name in Uniprot.
    """
    # print(f"Looking up {gene} in Uniprot database...")
    url = f"https://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:9606+AND+gene_exact:{gene}&format=tab"
    request = urllib.request.Request(url)
    try:
        response = urllib.request.urlopen(request)
    except urllib.error.HTTPError as e:
        return None
    page = response.read(200000)
    return pd.read_csv(StringIO(page.decode("utf-8")), sep="\t")


In [316]:
def get_bindingdb(uniprot):
    """
    Look up uniprot in Binding DB.
    """
    # print(f"Looking up {uniprot} in BindingDB...")
    url = f"http://www.bindingdb.org/axis2/services/BDBService/getLigandsByUniprots?uniprot={uniprot}&cutoff=1000&code=0&response=application/json"
    request = urllib.request.Request(url)
    try:
        response = urllib.request.urlopen(request)
    except urllib.error.HTTPError as e:
        return None

    page = response.read(200000)
    binding_db = json.loads(page.decode("utf-8"))
    if binding_db["getLigandsByUniprotsResponse"] == "":
        return None
    return pd.DataFrame(binding_db["getLigandsByUniprotsResponse"]["affinities"])


In [317]:
def bindingdb_to_table(table):
    """
    Parse the Binding DB `json` response.
    """

    query = table["query"].values[0]
    affinities = table["affinity"].values
    affinity_types = table["affinity_type"].values
    smiles = table["smile"].values
    return pd.DataFrame(
        {
            "Affinities": affinities,
            "Affinity type": affinity_types,
            "SMILES": smiles,
            "Query": query,
        }
    )


In [318]:
def manual_pivot(table):
    """
    The data we get from Binding DB will be in rows.
    But we want each row to basically be a new column, so that gene names are the rows.
    """
    # I had problems working with the built-in `pivot` because the data is non-numeric
    # tmp3 = tmp2.pivot_table(index="Query",
    #                        columns=tmp2.index,
    #                        values=["Affinities"],
    #                        aggfunc=lambda x: ' '.join(x))

    df = pd.DataFrame()
    for row in range(len(table)):
        df[f"Affinity {row:03d}"] = pd.Series(table["Affinities"][row])
        df[f"Type {row:03d}"] = pd.Series(table["Affinity type"][row])
        df[f"SMILES {row:03d}"] = pd.Series(table["SMILES"][row])
    return df


Loop over the files, then loop over the genes.

To start I'm just looping over the breast cancer data to see if I can match the one file that Tiqing returned.

First, I map from gene to protein and Uniprot ID. That is stored in the `gene_df` DataFrame.
Then I use the Uniprot ID to look up data from Binding DB.

Then I join `gene_df` and the Binding DB data.

In [319]:
for file in [file_list[-3]]:
    table = pd.read_csv(file, skiprows=1, names=["Unknown", "Gene", "Overexpression"])
    table = table.drop_duplicates(subset="Gene", keep="first")
    df = pd.DataFrame()
    for gene, overexpression in zip(
        table["Gene"].values, table["Overexpression"].values
    ):

        if gene == "---":
            continue
        if "///" in gene:
            gene = gene.split("///")[0]

        gene_table = get_uniprot(gene)
        if gene_table is None:
            continue
        uniprot = gene_table["Entry"].values[0]
        protein = gene_table["Protein names"].values[0]

        gene_df = pd.DataFrame()
        gene_df["Gene"] = pd.Series(gene)
        gene_df["Uniprot"] = pd.Series(uniprot)
        gene_df["Protein"] = pd.Series(protein)
        gene_df["Overexpression"] = pd.Series(overexpression)

        binding_db = get_bindingdb(uniprot)

        if binding_db is not None:
            print(f"{gene:10} → {uniprot:10} → {len(binding_db):4} ligands found...")
            binding_table = bindingdb_to_table(binding_db)
            binding_pivot = manual_pivot(binding_table)

            gene_df = gene_df.join(binding_pivot)
            # Only track the proteins with entries in BindingDB
            df = df.append(gene_df, ignore_index=True)
        else:
            print(f"{gene:10} → {uniprot:10} → {'0':>4} ligands found...")
            pass
    df.to_csv(os.path.splitext(file)[0] + "-ligands.csv")


COL11A1    → P12107     →    0 ligands found...
COL10A1    → Q03692     →    0 ligands found...
RRM2       → P31350     →    3 ligands found...
MELK       → Q14680     →  818 ligands found...
LRRC15     → Q8TF66     →    0 ligands found...
CDC2       → P06493     →  723 ligands found...
CENPF      → P49454     →    0 ligands found...


JSONDecodeError: Unterminated string starting at: line 1 column 199992 (char 199991)