### Goal - Notebook Description


- Read gene expression data from MelanoDB
- Create precessed Table 
- Create simple vizualisation (heatmap)

In [1]:
# imports
import time
import pandas as pd
import numpy as np
import json
import sqlite3
import os
import re
import matplotlib.pyplot as plt
import ast
from joblib import Parallel, delayed
import multiprocessing as mp
from multiprocessing import Pool
from functools import partial
import get_ge

In [17]:
# Load MelanoDB
cnx = sqlite3.connect('../database/MelanoDB.db')
df_ge =  pd.read_sql_query("SELECT * FROM gene_expressions", cnx)

In [18]:
df_ge = df_ge[df_ge['temporality'] == 'pre treatment']
#df_ge = df_ge[df_ge['value'] != 0]
print(len(df_ge))

4466466


In [19]:
set(list(df_ge['source']))

{"{'title': 'Acquired resistance and clonal evolution in melanoma during BRAF inhibitor therapy', 'author': 'Hugo W., Roger S. Lo', 'journal': 'Cancers Discovery', 'location': 'Los Angeles (United State)', 'date': 2014}",
 "{'title': 'BRAF Inhibitor Resistance Mechanisms in Metastatic Melanoma: Spectrum and Clinical Impact', 'author': 'Helen Rizos, Georgina V.Long', 'journal': 'Clinical Cancer Research', 'location': 'Sydney (Australia)', 'date': 2014}",
 "{'title': 'Baseline Genomic Features in BRAFV600-Mutated Metastatic Melanoma Patients Treated with BRAF Inhibitor + MEK Inhibitor in Routine Care', 'author': 'Baptiste Louveau, Samia Mourah', 'journal': 'Cancers (Basel)', 'location': 'Paris (France)', 'date': 2019}",
 "{'title': 'Co-clinical assessment identifies patterns of BRAF inhibitor resistance in melanoma', 'author': 'Lawrence N. Kwong, Lynda Chin', 'journal': 'Journal of Clinical Investigations ', 'location': 'Boston (United States)', 'date': 2015}",
 "{'title': 'Genomic Featu

In [20]:
#Filter regarding sources
sources_to_keep = ["{'title': 'Acquired resistance and clonal evolution in melanoma during BRAF inhibitor therapy', 'author': 'Hugo W., Roger S. Lo', 'journal': 'Cancers Discovery', 'location': 'Los Angeles (United State)', 'date': 2014}",
                  "{'title': 'Co-clinical assessment identifies patterns of BRAF inhibitor resistance in melanoma', 'author': 'Lawrence N. Kwong, Lynda Chin', 'journal': 'Journal of Clinical Investigations ', 'location': 'Boston (United States)', 'date': 2015}",
                  "{'title': 'Genomic Features of Exceptional Response in Vemurafenib + Cobimetinib–treated Patients with BRAFV600-mutated Metastatic Melanoma', 'author': 'Yibing Yan, Antoni Ribas', 'journal': 'Clinical Cancer Research', 'location': 'San Francisco, California (United States)', 'date': 2019}"]
df_ge = df_ge[df_ge['source'].isin(sources_to_keep)]
df_ge.reset_index(inplace=True)
print(len(df_ge))
print(set(df_ge['source']))
df_ge.head()

2961950
{"{'title': 'Co-clinical assessment identifies patterns of BRAF inhibitor resistance in melanoma', 'author': 'Lawrence N. Kwong, Lynda Chin', 'journal': 'Journal of Clinical Investigations ', 'location': 'Boston (United States)', 'date': 2015}", "{'title': 'Acquired resistance and clonal evolution in melanoma during BRAF inhibitor therapy', 'author': 'Hugo W., Roger S. Lo', 'journal': 'Cancers Discovery', 'location': 'Los Angeles (United State)', 'date': 2014}", "{'title': 'Genomic Features of Exceptional Response in Vemurafenib + Cobimetinib–treated Patients with BRAFV600-mutated Metastatic Melanoma', 'author': 'Yibing Yan, Antoni Ribas', 'journal': 'Clinical Cancer Research', 'location': 'San Francisco, California (United States)', 'date': 2019}"}


Unnamed: 0,index,id,creation_datetime,patientID,sample_id,HGNC,GeneID,description,value,temporality,source
0,580,581,2024-07-10 11:47:53.298381,,03660196C,,GeneID:1,,6.898152,pre treatment,{'title': 'Genomic Features of Exceptional Res...
1,581,582,2024-07-10 11:47:53.298381,,03660196C,NAT2,GeneID:10,,0.068518,pre treatment,{'title': 'Genomic Features of Exceptional Res...
2,582,583,2024-07-10 11:47:53.298381,,03660196C,ADA,GeneID:100,,30.686816,pre treatment,{'title': 'Genomic Features of Exceptional Res...
3,583,584,2024-07-10 11:47:53.298381,,03660196C,CDH2,GeneID:1000,,0.180316,pre treatment,{'title': 'Genomic Features of Exceptional Res...
4,584,585,2024-07-10 11:47:53.298381,,03660196C,AKT3,GeneID:10000,,17.434889,pre treatment,{'title': 'Genomic Features of Exceptional Res...


In [21]:
def intersection(lst1, lst2, lst3):
    lst4 = [value for value in lst2 if value in lst3]
    intersect_all = [value for value in lst1 if value in lst4]
    return lst4

In [22]:
# Get list of common genes
list_genes_shi = list(set(df_ge[df_ge['source']==sources_to_keep[0]].HGNC))
list_genes_kwong = list(set(df_ge[df_ge['source']==sources_to_keep[1]].HGNC))
list_genes_ribas = list(set(df_ge[df_ge['source']==sources_to_keep[2]].HGNC))

list_genes = intersection(list_genes_shi, list_genes_kwong, list_genes_ribas)

In [23]:
# Filtering tab according to gene list
df_ge = df_ge[df_ge['HGNC'].isin(list_genes)]

In [24]:
print('# Patients Shi: '+str(len(df_ge[df_ge['source']==sources_to_keep[0]].sample_id.value_counts())))
print('# Patients kwong: ' +str(len(df_ge[df_ge['source']==sources_to_keep[1]].sample_id.value_counts())))
print('# Patients ribas: '+str(len(df_ge[df_ge['source']==sources_to_keep[2]].sample_id.value_counts())))

# Patients Shi: 12
# Patients kwong: 14
# Patients ribas: 72


## Get gene expression per sample - functions 

In [25]:
def set_ge_df_value(tab_ge, df, samp, list_g):
    print(samp)
    for gene in list_g:
        if (np.shape(df[(df['sample_id']==samp) & (df['HGNC']==gene)]) !=0):
            if (np.any(df[(df['sample_id']==samp) & (df['HGNC']==gene)])):
                val = df[(df['sample_id']==samp) & (df['HGNC']==gene)].value.to_numpy()[0]
                print(val)
                tab_ge.loc[gene, samp]=val
    return(tab_ge)

In [26]:
def set_ge_df_value_test(samp):
    print(samp)
    for gene in list_genes[0:5]:
        if (np.shape(df_ge[(df_ge['sample_id']==samp) & (df_ge['HGNC']==gene)]) !=0):
            if (np.any(df_ge[(df_ge['sample_id']==samp) & (df_ge['HGNC']==gene)])):
                val = df_ge[(df_ge['sample_id']==samp) & (df_ge['HGNC']==gene)].value.to_numpy()[0]
                print(val)
                GE_tab_test.loc[gene, samp]=val
    return(GE_tab_test)

In [27]:
def set_ge_df_value_col(samp, df_ge, list_genes):
    samp_tab = pd.DataFrame(index = list_genes[0:5])
    #print(samp)
    for gene in list_genes[0:5]:
        if (np.shape(df_ge[(df_ge['sample_id']==samp) & (df_ge['HGNC']==gene)]) !=0):
            if (np.any(df_ge[(df_ge['sample_id']==samp) & (df_ge['HGNC']==gene)])):
                val = df_ge[(df_ge['sample_id']==samp) & (df_ge['HGNC']==gene)].value.to_numpy()[0]
                print(val)
                samp_tab.loc[gene, samp]=val
    return(samp_tab)

## Get variance per genes & retrieve high variance gene list

In [28]:
df_ge.groupby('HGNC', as_index=False)['value'].var().sort_values(by='value', ascending=False)[0:100]

Unnamed: 0,HGNC,value
10727,MIR4649,2.096593e+10
10890,MIR5588,2.309566e+09
10700,MIR4517,9.855878e+08
10740,MIR4668,9.206484e+08
10644,MIR4442,5.202958e+08
...,...,...
8152,KRT6A,5.948337e+05
15378,RPS3A,5.849340e+05
15286,RPL21P28,5.796954e+05
8114,KRT10,5.620441e+05


In [29]:
high_variance_genes = list(df_ge.groupby('HGNC', as_index=False)['value'].var().sort_values(by='value', ascending=False)['HGNC'][0:500])

## Get gene expression tab - Sequential iterations 

In [30]:
GE_tab_test = pd.DataFrame(index = list_genes[0:2], columns = list(set(df_ge.sample_id)))
curr_time = time.time()
for samp in list(set(df_ge.sample_id)):
    test = set_ge_df_value_col(samp, df_ge, list_genes[0:2])
print(time.time()-curr_time)
print(f'Parralel process computed in: {(time.time()-curr_time)/60} min')

0.0
2.026665692
0.0
0.007951394
0.0
0.136472765
0.0
0.056639186
0.017666947
0.134264596
0.01
0.38
0.01
1.64
0.0
0.53
0.0
0.122812043
0.0
0.08
0.0
0.967963937
0.0
0.037499961
0.0
1.54
0.0
0.037028924
0.01597747
0.11675478
0.0
0.15
0.0
0.101516511
0.0
0.246343348
0.01784255
0.139947
0.0
0.084368629
0.01
0.39
0.0
0.094403548
0.0
0.191821195
0.0
0.412164683
0.0
0.226420602
0.0
0.104606095
0.01791285
2.398925
0.0
0.372707888
7.523176801
8.307384777
0.0
0.837986152
0.0
2.066005036
0.0
0.075774843
0.0
0.053561446
0.0
6.29765
0.0
0.181347251
0.0
0.153302772
0.0
0.403671
0.0
0.454132981
0.0
0.7915855
0.0
0.108534678
0.0
0.643877937
0.0
1.822334345
0.0
0.0
0.0
0.050971632
0.0
1.376429483
0.0
4.690820074
0.0
0.365583678
0.02
0.24
0.0
15.7861
0.0236756
2.496095
0.0
10.56356072
0.0
4.24862846
0.0
0.024176567
0.0
0.050734326
0.01274757
0.081974113
0.0
1.27
0.0
1.134199266
0.0
0.856619902
0.016571895
0.048439408
0.0155365
0.309980333
0.0
1.580358098
0.0
0.315283107
0.0
0.114436452
0.06
0.14
0.0
0.035

## Get gene expression tab - Parallel iterations

In [31]:
mp.cpu_count()

16

In [32]:
#pool = mp.Pool(processes= mp.cpu_count())
pool = mp.Pool(processes= 10)

In [None]:
from functools import partial
t = time.time()
GE_tab = pool.map(partial(get_ge.set_ge_df_value_col, df_ge=df_ge, list_genes=list_genes), 
                [samp for samp in list(set(df_ge.sample_id))], 
                chunksize=10)
pool.terminate()
print(f'Parralel process computed in: {(time.time()-t)/60} min')

In [51]:
# Concatenate all chunks to the final result
GE_tab = pd.concat(GE_tab, axis=1)
GE_tab.head()

Unnamed: 0,05320234B,03660196C,05320102B,Pt4-baseline,05320444B,Pt8-baseline,03660598B,04240151B,03660501B,05320446B,...,22A,05320384B,03660555B,05420169C,05320003B,05420159C,05320216B,05320459B,05420199C,05320385B
KAT5,19.739645,27.128648,12.539045,10.42225,16.242073,19.29555,26.425709,22.062084,19.457436,17.235895,...,32.34,14.58108,11.388035,17.090183,12.617297,16.12212,6.543853,12.887998,38.806821,20.574456
OR4C15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DEFB1,1.41903,0.097449,0.042814,25.54325,0.0,140.4855,0.048933,0.0,3.968038,0.0,...,0.34,2.61053,0.069665,0.0,3.20522,0.0,0.571171,0.0,0.412324,6.081646
TSSC2,1.828737,3.085323,0.28184,,1.146409,,1.372863,0.392965,1.665028,0.59388,...,3.08,0.25051,0.349408,2.490883,0.205272,7.73582,0.015569,3.198403,1.599501,0.767749
TTTY7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [83]:
# Register GEx table as csv
GE_tab.to_csv('./melanodb_ge.csv', sep=',', header=True, index=True)

## Get sample information table

In [33]:
# Register table with meta informations
GE_melanoDB_info = df_ge[['sample_id', 'source', 'temporality']]
GE_melanoDB_info['source'] = GE_melanoDB_info['source'].replace(["{'title': 'Genomic Features of Exceptional Response in Vemurafenib + Cobimetinib–treated Patients with BRAFV600-mutated Metastatic Melanoma', 'author': 'Yibing Yan, Antoni Ribas', 'journal': 'Clinical Cancer Research', 'location': 'San Francisco, California (United States)', 'date': 2019}", "{'title': 'Co-clinical assessment identifies patterns of BRAF inhibitor resistance in melanoma', 'author': 'Lawrence N. Kwong, Lynda Chin', 'journal': 'Journal of Clinical Investigations ', 'location': 'Boston (United States)', 'date': 2015}", "{'title': 'Acquired resistance and clonal evolution in melanoma during BRAF inhibitor therapy', 'author': 'Hubing Shi, Roger S. Lo', 'journal': 'Cancers Discovery', 'location': 'Los Angeles (United State)', 'date': 2014}"], [1,2,3])
GE_melanoDB_info = GE_melanoDB_info.drop_duplicates()
GE_melanoDB_info = GE_melanoDB_info.rename(columns = {"sample_id": "samplename","source": "Batch", "temporality": "category"})
GE_melanoDB_info.insert(0, 'Arrayname', 'melanoDB')
GE_melanoDB_info.to_csv('./melanodb_ge_info.csv', sep=',', header=True, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  GE_melanoDB_info['source'] = GE_melanoDB_info['source'].replace(["{'title': 'Genomic Features of Exceptional Response in Vemurafenib + Cobimetinib–treated Patients with BRAFV600-mutated Metastatic Melanoma', 'author': 'Yibing Yan, Antoni Ribas', 'journal': 'Clinical Cancer Research', 'location': 'San Francisco, California (United States)', 'date': 2019}", "{'title': 'Co-clinical assessment identifies patterns of BRAF inhibitor resistance in melanoma', 'author': 'Lawrence N. Kwong, Lynda Chin', 'journal': 'Journal of Clinical Investigations ', 'location': 'Boston (United States)', 'date': 2015}", "{'title': 'Acquired resistance and clonal evolution in melanoma during BRAF inhibitor therapy', 'author': 'Hubing Shi, Roger S. Lo'

In [34]:
GE_melanoDB_info

Unnamed: 0,Arrayname,samplename,Batch,category
1,melanoDB,03660196C,1,pre treatment
30728,melanoDB,03660356C,1,pre treatment
61455,melanoDB,03660445B,1,pre treatment
92182,melanoDB,03660447B,1,pre treatment
122909,melanoDB,03660501B,1,pre treatment
...,...,...,...,...
2835612,melanoDB,Pt9-baseline,{'title': 'Acquired resistance and clonal evol...,pre treatment
2860880,melanoDB,Pt10-baseline,{'title': 'Acquired resistance and clonal evol...,pre treatment
2886148,melanoDB,Pt15-baseline,{'title': 'Acquired resistance and clonal evol...,pre treatment
2911416,melanoDB,Pt16-baseline,{'title': 'Acquired resistance and clonal evol...,pre treatment
