In [1]:
# Imports for the whole notebook, plus two global display settings.
import pandas as pd
import csv
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
# Default size (width, height) applied to every matplotlib figure.
plt.rcParams["figure.figsize"]=20,10
import matplotlib.mlab as mlab
from IPython.display import display
# None removes the column cap, so wide DataFrames are displayed in full.
pd.options.display.max_columns = None
from numpy import random
from scipy import stats
import collections


In [2]:
# Read only the two identifier columns of the GTEx TPM table.
fields = ['Name', 'Description']
gtex = pd.read_csv(
    '/fs/cbcb-scratch/gowthami/GTF/GTF_Project_v3/data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.tsv',
    sep='\t',
    usecols=fields,
    skipinitialspace=True,
)
print(gtex.shape)

# Keep only the text before the first '.' of each ID (presumably this strips
# a version suffix -- confirm against the raw file). Distinct raw IDs can
# collapse to the same value here: the recorded run shows 56200 rows but only
# 56156 unique IDs.
gtex['Name'] = [raw_id.split('.')[0] for raw_id in gtex['Name']]

# Unique stripped IDs, and a Description -> Name lookup. When a Description
# occurs on several rows, the last row wins.
gtex_genes = set(gtex['Name'])
gtex_mapping_dict = dict(zip(gtex['Description'], gtex['Name']))
print('Gtex genes #: ', len(gtex_genes), sep='')
gtex.head(4)

(56200, 2)
Gtex genes #: 56156


Unnamed: 0,Name,Description
0,ENSG00000223972,DDX11L1
1,ENSG00000227232,WASH7P
2,ENSG00000278267,MIR6859-1
3,ENSG00000243485,MIR1302-2HG


In [3]:
# Two-column NCBI gene ID <-> Ensembl gene ID table: rows with a missing ID
# on either side are dropped, then exact duplicate rows are removed.
mapping = (
    pd.read_csv('/fs/cbcb-scratch/gowthami/GTF/GTF_Project_v3/data/mapping_ids.txt', sep='\t')
    .loc[:, ['NCBI gene ID', 'Gene stable ID']]
    .dropna()
    .drop_duplicates(keep='first')
)

# NCBI gene ID -> Ensembl gene ID. If one NCBI ID is paired with several
# Ensembl IDs, the last row in the table wins.
mapping_dict = dict(zip(mapping['NCBI gene ID'], mapping['Gene stable ID']))
mapping.head()

Unnamed: 0,NCBI gene ID,Gene stable ID
12,4535.0,ENSG00000198888
21,4536.0,ENSG00000198763
34,4512.0,ENSG00000198804
41,4513.0,ENSG00000198712
47,4509.0,ENSG00000228253


In [4]:
# Load the headerless HIPPIE interaction table and name its six columns.
hippie = pd.read_csv(
    '/fs/cbcb-scratch/gowthami/GTF/GTF_Project_v3/data/hippie_current.txt',
    sep='\t',
    header=None,
    names=['Gene1', 'Gene1_Entrezid', 'Gene2', 'Gene2_Entrezid', 'PPI_val', 'Procedure'],
)

# Keep interactions with a strictly positive score, drop the 'Procedure'
# column, and remove the '_HUMAN' marker from the gene names.
has_positive_score = hippie['PPI_val'] > 0
hippie = hippie.loc[
    has_positive_score,
    ['Gene1', 'Gene1_Entrezid', 'Gene2', 'Gene2_Entrezid', 'PPI_val'],
]
hippie = hippie.replace({'_HUMAN': ''}, regex=True)
print(hippie.shape)

# First mapping attempt: look each gene name up among the GTEx descriptions.
# Names with no match are left as NaN.
hippie = hippie.assign(
    Gene1_Ensmblid=hippie['Gene1'].map(gtex_mapping_dict),
    Gene2_Ensmblid=hippie['Gene2'].map(gtex_mapping_dict),
)
hippie.head()

(406111, 5)


Unnamed: 0,Gene1,Gene1_Entrezid,Gene2,Gene2_Entrezid,PPI_val,Gene1_Ensmblid,Gene2_Ensmblid
0,AL1A1,216,AL1A1,216,0.76,,
1,ITA7,3679,ACHA,1134,0.73,,
2,NEB1,55607,ACTG,71,0.65,,
3,SRGN,5552,CD44,960,0.63,ENSG00000122862,ENSG00000026508
4,GRB7,2886,ERBB2,2064,0.9,ENSG00000141738,ENSG00000141736


In [5]:
# Entrez IDs of every interaction partner whose Ensembl ID is still missing
# after the name-based lookup.
unmapped_genes = (
    set(hippie.loc[hippie['Gene1_Ensmblid'].isna(), 'Gene1_Entrezid'].tolist())
    | set(hippie.loc[hippie['Gene2_Ensmblid'].isna(), 'Gene2_Entrezid'].tolist())
)
print(len(unmapped_genes))

# The subset of those IDs that the Entrez -> Ensembl table can resolve.
mappable_genes = set(mapping_dict.keys()) & unmapped_genes
print(len(mappable_genes))

# Second mapping attempt, by Entrez ID. Every row whose Entrez ID is in
# `mappable_genes` gets the Ensembl ID from `mapping_dict`. This makes the
# same assignments as a per-gene `.loc` loop, but with one mask and one
# lookup per column instead of two full-frame scans per gene.
# `mapping_dict.get` is used as the mapper so the lookup relies on plain dict
# hashing, where an int Entrez ID matches a float key of equal value.
for side in ('Gene1', 'Gene2'):
    entrez_col = side + '_Entrezid'
    ensembl_col = side + '_Ensmblid'
    rescued = hippie[entrez_col].isin(list(mappable_genes))
    hippie.loc[rescued, ensembl_col] = hippie.loc[rescued, entrez_col].map(mapping_dict.get)

# All Ensembl IDs present on either side. Still-unmapped partners contribute
# NaN entries to this set.
hippie_genes = set(hippie['Gene1_Ensmblid'].tolist()) | set(hippie['Gene2_Ensmblid'].tolist())

# NOTE(review): despite the .tsv name this is written with pandas' default
# comma separator and with the index column. Left unchanged so existing
# readers of this file keep working.
hippie.to_csv('/fs/cbcb-scratch/gowthami/GTF/GTF_Project_v3/data/hippie_cleaned.tsv')
print(len(hippie_genes))

10637
9694
1000
2000
3000
4000
5000
6000
7000
8000
9000
17104


In [6]:
# This is just to get the list of missing genes from HIPPIE, GTEX overlap

# Ensembl ID -> HIPPIE gene name. Columns are walked in parallel (Gene1 then
# Gene2 for each row, rows in order), so when an ID occurs several times the
# last occurrence wins. This is much faster than DataFrame.iterrows().
hip_mapping_dict = {}
for ens1, name1, ens2, name2 in zip(hippie['Gene1_Ensmblid'], hippie['Gene1'],
                                    hippie['Gene2_Ensmblid'], hippie['Gene2']):
    hip_mapping_dict[ens1] = name1
    hip_mapping_dict[ens2] = name2

# Gene names that never received an Ensembl ID.
up = list(
    set(hippie.loc[hippie['Gene1_Ensmblid'].isna(), 'Gene1'].tolist())
    | set(hippie.loc[hippie['Gene2_Ensmblid'].isna(), 'Gene2'].tolist())
)

# Ensembl IDs that HIPPIE has but GTEx does not. NaN placeholders are removed
# so only real IDs are looked up.
nop = hippie_genes - gtex_genes
nop = {x for x in nop if pd.notna(x)}

# Lost genes = names behind those IDs plus the names that never mapped.
lost = {hip_mapping_dict[k] for k in nop} | set(up)
print(len(lost))

# The file receives the Python repr of the set (braces, quotes, arbitrary
# order), not one name per line.
with open("/fs/cbcb-scratch/gowthami/GTF/GTF_Project_v3/data/unmapped_genes.txt", "w") as output:
    output.write(str(lost))

In [7]:
# Ensembl IDs present in both the HIPPIE network and the GTEx table.
final_genes = hippie_genes.intersection(gtex_genes)
print('Total overlap genes #: ', len(final_genes), sep='')

Total overlap genes #: 16620


Shortlisting the GTEx expression file to the genes that overlap with HIPPIE.

In [10]:
# Parse just the first two data rows of the GTEx table; only its header
# (the column names) is needed.
temp = pd.read_csv(
    '/fs/cbcb-scratch'
    '/gowthami/GTF/GTF_Project_v3/data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.tsv',
    sep='\t',
    nrows=2,
    skipinitialspace=True,
)
col_names = temp.columns.tolist()

In [11]:
# shortlisting only with unique hippie genes from the whole samplespace

# Stream the GTEx table row by row and keep only the rows whose ID (the text
# before the first '.' of the first field) is in `final_genes`. The header
# row is filtered out the same way, because its first field is not a gene ID
# in that set.
shortlisted_rows = []
count = 0
with open('/fs/cbcb-scratch/gowthami/GTF/GTF_Project_v3/data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.tsv') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    for count, row in enumerate(reader, start=1):
        if count % 1000 == 0:
            print(count)
        if not row:
            # csv.reader yields [] for a blank line; row[0] would raise.
            continue
        e_id = row[0].split('.')[0]
        if e_id in final_genes:
            shortlisted_rows.append(row)

# Persist the raw rows (lists of strings). The context manager makes sure the
# pickle file is flushed and closed.
with open("/fs/cbcb-scratch/gowthami/GTF/GTF_Project_v3/data_generated/gtex_exp.p", "wb") as pickle_file:
    pickle.dump(shortlisted_rows, pickle_file)
print('pickle done')

# Same rows as a DataFrame with the original header. Every value is still a
# string at this point.
gtex_shortlisted = pd.DataFrame(shortlisted_rows, columns=col_names)
gtex_shortlisted.to_csv("/fs/cbcb-scratch/gowthami/GTF/GTF_Project_v3/data_generated/gtex_shortlisted.tsv",
                        sep="\t", index=False)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
pickle done


In [12]:
from collections import Counter

# Tally how many shortlisted rows carry each 'Description' value, then list
# the values that occur on more than one row (in first-seen order).
d = Counter(gtex_shortlisted['Description'].tolist())
res = []
for symbol, occurrences in d.items():
    if occurrences > 1:
        res.append(symbol)
print(res)

['PLCXD1', 'GTPBP6', 'PPP2R3B', 'SHOX', 'CRLF2', 'CSF2RA', 'IL3RA', 'SLC25A6', 'ASMTL', 'P2RY8', 'AKAP17A', 'ASMT', 'DHRSX', 'ZBED1', 'CD99', 'SPRY3', 'VAMP7', 'IL9R']
The history saving thread hit an unexpected error (OperationalError('database is locked',)).History will not be written to the database.


Get sample information

In [13]:
# Keep three columns of the GTEx sample annotation table.
fields = ['SAMPID', 'SMTS', 'SMTSD']

sampleid_mapping = pd.read_csv('/fs/cbcb-scratch/gowthami/GTF/'+
                               'GTF_Project_v3/data/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt',
                               sep='\t', usecols=fields)

# NOTE(review): the output is tab-separated even though the file is named
# .csv. Left unchanged so existing readers of this file keep working.
sampleid_mapping.to_csv("/fs/cbcb-scratch/gowthami/GTF/GTF_Project_v3/data_generated/GTEx_Tissue_Attributes.csv",
                        sep="\t", index=False)

# The preview has to be the last expression of the cell to be displayed;
# in the middle of the cell its result was silently discarded.
sampleid_mapping.head()

In [None]:
# Gene IDs in row order, and everything from the third column onward
# converted to a float32 matrix (one row per gene).
genes = list(gtex_shortlisted['Name'])
expression_columns = gtex_shortlisted.iloc[:, 2:]
expressions_np = expression_columns.values.astype(np.float32)

In [2]:
# Reload the raw, unfiltered HIPPIE table, including the 'Procedure' column.
# NOTE(review): this rebinds `hippie`, replacing the cleaned frame built
# earlier in the notebook.
hippie = pd.read_csv(
    '/fs/cbcb-scratch/gowthami/GTF/GTF_Project_v3/data/hippie_current.txt',
    sep='\t',
    header=None,
    names=['Gene1', 'Gene1_Entrezid', 'Gene2', 'Gene2_Entrezid', 'PPI_val', 'Procedure'],
)
hippie.head()

Unnamed: 0,Gene1,Gene1_Entrezid,Gene2,Gene2_Entrezid,PPI_val,Procedure
0,AL1A1_HUMAN,216,AL1A1_HUMAN,216,0.76,"experiments:in vivo,Two-hybrid;pmids:12081471,..."
1,ITA7_HUMAN,3679,ACHA_HUMAN,1134,0.73,"experiments:in vivo,Affinity Capture-Western,a..."
2,NEB1_HUMAN,55607,ACTG_HUMAN,71,0.65,"experiments:in vitro,in vivo;pmids:9362513,120..."
3,SRGN_HUMAN,5552,CD44_HUMAN,960,0.63,"experiments:in vivo;pmids:9334256,16189514,167..."
4,GRB7_HUMAN,2886,ERBB2_HUMAN,2064,0.9,"experiments:in vitro,in vivo,Reconstituted Com..."


In [7]:
print(hippie.shape)
# Distinct 'Procedure' values among interactions whose score is not positive.
nonpositive_score = hippie['PPI_val'] <= 0
hippie.loc[nonpositive_score, 'Procedure'].drop_duplicates().tolist()

(411430, 6)


['sources:IntAct',
 'sources:BIND',
 'sources:BIND,STRING',
 'sources:MINT',
 'sources:IntAct,MINT',
 'sources:KEGG,STRING',
 'sources:KEGG',
 'sources:STRING',
 'sources:BioCarta,KEGG,STRING',
 'sources:BioCarta,KEGG',
 nan,
 'sources:BioCarta,STRING',
 'sources:BioCarta']

In [14]:
# Distinct scores of the interactions whose 'Procedure' is exactly 'sources:BioCarta'.
biocarta_only = hippie['Procedure'] == 'sources:BioCarta'
set(hippie.loc[biocarta_only, 'PPI_val'].tolist())

{0.0}

In [15]:
# All interactions whose score is zero or negative.
hippie.loc[hippie['PPI_val'] <= 0]

Unnamed: 0,Gene1,Gene1_Entrezid,Gene2,Gene2_Entrezid,PPI_val,Procedure
58333,PSMD9_HUMAN,5715,PSA7_HUMAN,5688,0.0,sources:IntAct
58334,PSMD9_HUMAN,5715,PSB10_HUMAN,5699,0.0,sources:IntAct
58336,PRS8_HUMAN,5705,PSB10_HUMAN,5699,0.0,sources:IntAct
58414,PSB10_HUMAN,5699,PRS4_HUMAN,5700,0.0,sources:IntAct
58416,PSB4_HUMAN,5692,PSB4_HUMAN,5692,0.0,sources:IntAct
...,...,...,...,...,...,...
193561,BIRC3_HUMAN,330,M3K5_HUMAN,4217,0.0,sources:STRING
193562,CCR8_HUMAN,1237,DRD2_HUMAN,1813,0.0,sources:STRING
193563,EPHA5_HUMAN,2044,HCK_HUMAN,3055,0.0,sources:STRING
193564,GBG2_HUMAN,54331,5HT2C_HUMAN,3358,0.0,sources:STRING


In [16]:
# Fraction of raw HIPPIE rows that do not pass the `PPI_val > 0` filter.
# It is computed from the frame instead of typing the counts by hand; the
# recorded run had 5319 such rows out of 411430.
n_dropped = int((~(hippie['PPI_val'] > 0)).sum())
n_dropped / len(hippie)

0.012928080110832949