## Gene Dataset Creation
### Date: October 17, 2024
### Author: Selin Kaplanoglu

**Goal:** To create a dataset of genes.

**Information:** Each gene entry in this dataset holds six fields: Ensembl ID, symbol, aliases, name, other names, and description. The dataset is saved in JSON format, with one dictionary per gene.

In [1]:
import json
import logging
import requests
import pandas as pd
import time
from urllib.parse import quote

In [2]:
# Route INFO-and-above records, each prefixed with a timestamp, to a log file
# (relative path, so it is created in the current working directory).
logging.basicConfig(
    filename="geneset_dict_log.log",
    level=logging.INFO,
    format="%(asctime)s - %(message)s",
)

In [12]:
def get_gene_info(gene_file_path):
    """
    Gets information on the genes by querying the gene symbol.

    Every non-missing value in the ``symbol`` column of the gene file is sent,
    one request per symbol, to the MyGene.info query endpoint (human only,
    all fields requested). The time taken by each request is logged.

    Parameters
    ----------
    gene_file_path: Path of your gene file (tab-separated, with a ``symbol``
        column)

    Returns
    -------
    responses: A list that contains all the responses after the query, one
        decoded JSON body per queried symbol, in file order. The list is
        empty when the file contains no symbols.

    Raises
    ------
    KeyError: If the gene file has no ``symbol`` column.
    requests.HTTPError: If a request comes back with an error status.
    requests.Timeout: If the service does not answer within the timeout.
    """
    gene_df = pd.read_csv(gene_file_path, sep="\t")
    # Rows without a symbol are read by pandas as NaN; drop them so the
    # literal string "nan" is never sent as a query.
    gene_symbol_list = gene_df['symbol'].dropna().tolist()
    responses = []
    # Iterate over every symbol. Indexing the list with [-1] here would loop
    # over the characters of the last symbol instead of over the symbols.
    for gene_symbol in gene_symbol_list:
        start_time = time.time()

        # Percent-encode the symbol so characters such as '&', '#' or spaces
        # cannot alter the structure of the query string.
        encoded_symbol = quote(str(gene_symbol), safe="")
        requestURL = f"http://mygene.info/v3/query?q={encoded_symbol}&species=human&fields=all"

        # Make the API request; the timeout keeps one stalled connection from
        # blocking the whole run indefinitely.
        r = requests.get(requestURL, headers={"Accept": "application/json"}, timeout=30)
        # No-op on success, raises requests.HTTPError on a 4xx/5xx status.
        r.raise_for_status()

        # Append the decoded response to the list
        responses.append(r.json())

        # Log the time taken for each gene
        runtime = time.time() - start_time
        logging.info(f"Time taken for term {gene_symbol}: {runtime:.2f} seconds")

    return responses  # return the list of all responses


In [11]:
def _first_ensembl_gene_id(best_hit):
    """Return the Ensembl gene id stored in a hit, or 'N/A' if there is none."""
    ensembl = best_hit.get('ensembl')
    # The 'ensembl' field may be a single dictionary or a list of them;
    # for a list, only the first entry is used.
    if isinstance(ensembl, list):
        ensembl = ensembl[0] if ensembl else None
    if isinstance(ensembl, dict):
        return ensembl.get('gene', 'N/A')
    return 'N/A'


def gene_dict(gene_file_path, out_path):
    """
    Creates the JSON dictionary file of gene information.

    Queries every gene through ``get_gene_info``, keeps the first hit of each
    response and writes one record per gene (Symbol, Gene_ID, Name, Aliases,
    Other names, Description) to ``out_path`` as a JSON list. Any field that
    is absent from the hit is stored as 'N/A'. Responses without hits are
    logged as warnings and skipped.

    Parameters
    ----------
    gene_file_path: Path of your gene file
    out_path: Output file for the JSON dictionary

    Returns
    -------
    out_path: Output file for the JSON dictionary
    """
    responses = get_gene_info(gene_file_path)
    gene_json = []
    for element in responses:
        # .get() so that a response without a "hits" key is treated like an
        # empty result instead of raising KeyError.
        hits = element.get("hits")
        if not hits:
            logging.warning(f"No hits found for element: {element}")
            continue

        # takes in the first 'hit' after the gene is queried (first result)
        best_hit = hits[0]
        # Every field falls back to 'N/A' so that one incomplete hit cannot
        # abort the whole run with a KeyError.
        gene_json.append({
            'Symbol': best_hit.get("symbol", 'N/A'),
            'Gene_ID': _first_ensembl_gene_id(best_hit),
            'Name': best_hit.get("name", 'N/A'),
            'Aliases': best_hit.get('alias', 'N/A'),
            'Other names': best_hit.get("other_names", 'N/A'),
            'Description': best_hit.get("summary", 'N/A'),
        })

    with open(out_path, 'w', encoding='utf-8') as json_file:
        json.dump(gene_json, json_file, indent=4)
    print(f'Successfully created JSON file at {out_path}')
    return out_path


In [13]:
# Input gene table and destination of the generated JSON dataset.
gene_file_path = "/mnt/DGX01/Personal/slndir/.gene_files/hgnc_complete_set_2024-10-01.txt"
out_path = "/mnt/DGX01/Personal/slndir/geneset_datasets/geneset_dict.json"

logging.info("Paths are initialized, starting dictionary creation.")

# Wall-clock timing around the whole dataset build.
total_start_time = time.time()
gene_dict(gene_file_path, out_path)
total_end_time = time.time()

total_runtime = total_end_time - total_start_time
logging.info("Total execution time: %.2f seconds", total_runtime)
print("Runtime logging complete.")

  gene_df = pd.read_csv(gene_file_path, sep="\t")


Successfully created JSON file at /mnt/DGX01/Personal/slndir/.gene_files/geneset_dict.json
Runtime logging complete.
