# Data Download
#### Introduction:

This notebook lets the user specify a disease ID of interest for research purposes. The corresponding data are downloaded, parsed, and consolidated into a usable format.

Input:	Disease ID of interest. EFO_0000685 in this project

Process:	

-	Based on the input disease ID, get the genetic association data and drug association data from OpenTargets Platform using Google BigQuery API.
-	Download target (gene) data and molecular interaction data from the OpenTargets Platform FTP site using wget.
-	Parse the downloaded split files and consolidate them into single data files (JSON or CSV).

Output:	
-	Interaction data in json format.
-	gene id and gene symbol mapping in csv format.

Quality control:	Manually verify the number of records in the program with the result from the Open Targets web platform

Remarks:	Users can specify a different disease ID for other purposes.


#### Import Library

In [11]:
import os
from google.cloud import bigquery
import json
import pandas as pd
import subprocess

#### Define Helper Functions

In [14]:
# define a helper that combines the downloaded JSON split files into a single file

def combine_json_splits(json_files_path, output_file_path):
    '''
    Combine a directory of JSON split files into a single JSON file.

    Every file in json_files_path whose name ends with '.json' is read line by
    line, each non-blank line is parsed as one JSON record, and all records are
    collected into a Pandas DataFrame that is exported with DataFrame.to_json.

    parameters:
        json_files_path: string, the path contain the json files
        output_file_path: string, the path to store the processed files
    return:
        None
    exception:
        A line that cannot be decoded as JSON does not raise; the file name and
        the decode error are printed and the line is skipped.
        OSError is raised if the input directory cannot be listed or a file
        cannot be read or written.
    '''
    data = []

    # os.listdir gives no ordering guarantee; sorting keeps the row order of
    # the combined file reproducible between runs and machines
    for filename in sorted(os.listdir(json_files_path)):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(json_files_path, filename)
        # explicit encoding so the result does not depend on the platform default
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # blank lines (e.g. a trailing newline) are not records
                if not line.strip():
                    continue
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON in file: {filename}")
                    print(e)

    # make sure the destination directory exists before writing
    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df = pd.DataFrame(data)
    df.to_json(output_file_path)

    return None

In [8]:
def get_asso_data_google_big_query(api_key_path, disease_id, datatype_id, output_file_name_path):
    '''
    Query the OpenTargets BigQuery dataset for the target associations of one
    disease and one datatype, and export the result to a csv file.

    The associations table is joined with the diseases and targets tables so
    that each row carries the target id, target approved symbol, disease id,
    disease name and association score, ordered by descending score.

    parameters:
        api_key_path: str, the path where the api key is stored
        disease_id: str, the id of the disease
        datatype_id: str, the id of the datatype
        output_file_name_path: str, the path where the output file is stored
    return:
        None
    '''
    # set up the api client from the service account key file
    client = bigquery.Client.from_service_account_json(api_key_path)

    # prepare the query; the disease and datatype ids are passed as named
    # query parameters (@disease_id, @datatype_id) instead of being formatted
    # into the SQL text, so their content can never alter the statement
    # NOTE: the score column alias is kept unchanged for every datatype so the
    # header of the exported csv stays the same for existing readers
    query_statement = """
        SELECT
        associations.targetId AS target_id,
        targets.approvedSymbol AS target_approved_symbol,
        associations.diseaseId AS disease_id,
        diseases.name AS disease_name,
        associations.score AS genetic_association_score_Indirect_and_direct
        FROM
        `open-targets-prod.platform.associationByDatatypeIndirect` AS associations
        JOIN
        `open-targets-prod.platform.diseases` AS diseases
        ON
        associations.diseaseId = diseases.id
        JOIN
        `open-targets-prod.platform.targets` AS targets
        ON
        associations.targetId = targets.id
        WHERE
        associations.diseaseId = @disease_id
        AND
        associations.datatypeId = @datatype_id
        ORDER BY
        associations.score DESC
    """
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter('disease_id', 'STRING', disease_id),
            bigquery.ScalarQueryParameter('datatype_id', 'STRING', datatype_id),
        ]
    )
    asso_df = client.query(query_statement, job_config=job_config).to_dataframe()

    # make sure the destination directory exists before writing the csv
    output_dir = os.path.dirname(output_file_name_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    asso_df.to_csv(output_file_name_path, index=False)

    return None

#### Download Genetic Association Data and Drug Association Data from OpenTargets Platform using Google BigQuery API

In [10]:
# disease of interest for this project: RA
disease_id = 'EFO_0000685'

# service account key file used to authenticate against Google BigQuery
api_key_path = os.path.join(
    'google_bigquery_json_key',
    'opentargetquery-394208-e9aac3a40e3e.json',
)

# file names of the two association tables that will be exported as csv
genetic_asso_file_name = 'RAGeneticAssociationAll.csv'
drug_asso_file_name = 'RADrugAssociationAll.csv'

# both csv files are stored in the open_targets_data directory
genetic_asso_file_path = os.path.join('open_targets_data', genetic_asso_file_name)
drug_asso_file_path = os.path.join('open_targets_data', drug_asso_file_name)

In [11]:
# query Google BigQuery for the genetic association data of the disease id
# and export the result as a csv file
get_asso_data_google_big_query(
    api_key_path=api_key_path,
    disease_id=disease_id,
    datatype_id='genetic_association',
    output_file_name_path=genetic_asso_file_path,
)

# same query for the drug association data of the disease id
get_asso_data_google_big_query(
    api_key_path=api_key_path,
    disease_id=disease_id,
    datatype_id='known_drug',
    output_file_name_path=drug_asso_file_path,
)

#### Download Targets and PPI data from OpenTargets Platform by wget Method

In [3]:
# directory that receives the downloaded data (wget -P creates it if needed)
open_targets_data_directory = os.path.join('open_targets_data')

# Caution about the long download time as this is downloading 5GB data

# --cut-dirs=8 strips the eight leading remote directories
# (pub/databases/opentargets/platform/23.06/output/etl/json), so the files are
# stored under <open_targets_data_directory>/targets and .../interaction.
# check=True raises subprocess.CalledProcessError when wget exits with an
# error, so a failed download is noticed here and not in a later step.

# download targets data from OpenTargets platform
url = "ftp://ftp.ebi.ac.uk/pub/databases/opentargets/platform/23.06/output/etl/json/targets"
subprocess.run(
    ["wget", f"-P{open_targets_data_directory}", "--recursive", "--no-host-directories", "--cut-dirs=8", url],
    check=True,
)

# download interaction data from OpenTargets platform
# the ftp:// scheme is stated explicitly so this download uses the same FTP
# retrieval as the targets download instead of depending on how wget
# interprets a URL without a scheme
url = "ftp://ftp.ebi.ac.uk/pub/databases/opentargets/platform/23.06/output/etl/json/interaction"
subprocess.run(
    ["wget", f"-P{open_targets_data_directory}", "--recursive", "--no-host-directories", "--cut-dirs=8", url],
    check=True,
)

In [15]:
# integrate downloaded pieces into json file

# targets: these names stay dedicated to the targets data, because the gene ID
# to symbol mapping step reads targets_output_json_path afterwards
targets_files_path = os.path.join(open_targets_data_directory, 'targets')
targets_output_json_path = os.path.join(open_targets_data_directory, 'targets.json')
combine_json_splits(json_files_path=targets_files_path, output_file_path=targets_output_json_path)

# interaction: separate names are used so the targets paths above are not
# overwritten with the interaction paths
interaction_files_path = os.path.join(open_targets_data_directory, 'interaction')
interaction_output_json_path = os.path.join(open_targets_data_directory, 'interaction.json')
combine_json_splits(json_files_path=interaction_files_path, output_file_path=interaction_output_json_path)


#### Create Target Gene ID to Symbol Mapping file

In [16]:
# create data file that contain the gene ID to symbol mapping

# specify the output file name and path
mapping_file_path = os.path.join('data', 'others', 'target_gene_id_sym.csv')

# make sure the output directory exists before writing the csv
os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)

# read the combined targets file through an explicit path, so this step always
# loads targets.json regardless of what other path variables currently hold
targets_json_path = os.path.join(open_targets_data_directory, 'targets.json')

# use pandas to filter the data that is needed
targets = pd.read_json(targets_json_path)
target_genes = targets.loc[targets['biotype'] == 'protein_coding']  # for biotype = protein_coding, they are gene related targets
target_gene_id_sym = target_genes[['id', 'approvedSymbol', 'symbolSynonyms', 'alternativeGenes']]
target_gene_id_sym.to_csv(mapping_file_path, index=False)