In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import os
import random

In [2]:
counts = pd.read_csv("all_samples_gene_counts.txt", sep="\t", header=0)
print(counts.head)
print(counts.shape)
print(counts.columns)
new_columns = ['Gene'] + [name[9:] for name in counts.columns[1:]]
print(new_columns)
counts.columns = new_columns

<bound method NDFrame.head of               Gene  HUVEC_RNAd0_W1_R1  HUVEC_RNAd0_W0_R1  HUVEC_RNAd0_W2_R1  \
0          DDX11L1                  0                  0                  0   
1           WASH7P                112                246                132   
2        MIR6859-1                  0                  2                  4   
3      MIR1302-2HG                  0                  0                  0   
4        MIR1302-2                  0                  0                  0   
...            ...                ...                ...                ...   
50001          ND6              10230              29851              15385   
50002         TRNE                  0                  0                  0   
50003         CYTB              47313             118262              68154   
50004         TRNT                  0                  0                  0   
50005         TRNP                507                686                636   

       HUVEC_RNAd0_W3

In [3]:
# TPM normalizes using the length of the gene
# Allows us to more accurately compare abundance between genes (I think)
tpm = pd.read_csv("all_samples_tpm_matrix.txt", sep="\t", header=0)
print(tpm.shape)
print(tpm.head)
print(tpm.columns)
dose_map = {"0.001" : "dA", "0.01": "dB", "0.1":"dC", "1.0": "dD", "2.0":"dE", "control": "d0"}
new_columns = ['Gene'] + [ dose_map[name.split("_")[1]] + "_" + name[0:2]+ "_" + name[-2:] for name in tpm.columns[1:] ]
print(new_columns)
tpm.columns = new_columns


(48021, 38)
<bound method NDFrame.head of        GeneID  W1_0.001_R1  W1_0.001_R2  W2_0.001_R1  W2_0.001_R2  \
0        A1BG     0.298326     0.673238     0.941353     1.074990   
1        A1CF     0.000000     0.000000     0.000000     0.000000   
2         A2M     0.030412     0.053326     0.056793     0.012807   
3       A2ML1     0.004552     0.006471     0.010968     0.002396   
4       A2MP1     0.000000     0.000000     0.000000     0.000000   
...       ...          ...          ...          ...          ...   
48016  ZYG11B     2.826560     3.174150     4.115840     4.209530   
48017     ZYX   239.398000   272.155000   365.450000   353.149000   
48018   ZYX_1   239.386000   272.140000   365.441000   353.141000   
48019   ZZEF1     2.533350     2.784970     3.693370     3.597940   
48020    ZZZ3     1.665650     1.949720     2.457780     2.410330   

       W3_0.001_R1  W3_0.001_R2  W1_0.01_R1  W1_0.01_R2  W2_0.01_R1  ...  \
0         1.076690     1.419650    0.583366    0.5152

In [4]:
metadata = pd.read_csv("metadata_rna.csv", header=0)
print(metadata)

   SampleName ExposureRate_mGh  Week  Replicate  TotalExposure_mG
0    dA_W1_R1            0.001     1          1             0.168
1    dA_W1_R2            0.001     1          2             0.168
2    dA_W2_R1            0.001     2          1             0.336
3    dA_W2_R2            0.001     2          2             0.336
4    dA_W3_R1            0.001     3          1             0.504
5    dA_W3_R2            0.001     3          2             0.504
6    dB_W1_R1             0.01     1          1             1.680
7    dB_W1_R2             0.01     1          2             1.680
8    dB_W2_R1             0.01     2          1             3.360
9    dB_W2_R2             0.01     2          2             3.360
10   dB_W3_R1             0.01     3          1             5.040
11   dB_W3_R2             0.01     3          2             5.040
12   dC_W1_R1              0.1     1          1            16.800
13   dC_W1_R2              0.1     1          2            16.800
14   dC_W2

In [5]:
count_data_only = counts.drop(columns='Gene').T
drop_inds_count = count_data_only != 0
count_data_only = count_data_only.loc[:,(drop_inds_count).any(axis=0)]
print(count_data_only.shape)
genes_count = counts['Gene'].loc[(drop_inds_count).any(axis=0)]
print(genes_count.head)

(37, 29628)
<bound method NDFrame.head of 1              WASH7P
2           MIR6859-1
7        LOC124903816
8        LOC124900384
9           LOC729737
             ...     
50000             ND5
50001             ND6
50002            TRNE
50003            CYTB
50005            TRNP
Name: Gene, Length: 29628, dtype: object>


In [6]:
tpm_data_only = tpm.drop(columns="Gene").T
drop_inds_tpm = tpm_data_only != 0
tpm_data_only = tpm_data_only.loc[:,(drop_inds_tpm).any(axis=0)]
print(tpm_data_only.shape)
genes_tpm = tpm['Gene'].loc[(drop_inds_tpm).any(axis=0)]
print(genes_tpm.head)
print(len(set(genes_count).intersection(set(genes_tpm))))

(37, 37864)
<bound method NDFrame.head of 0           A1BG
1           A1CF
2            A2M
3          A2ML1
5        A3GALT2
          ...   
48016     ZYG11B
48017        ZYX
48018      ZYX_1
48019      ZZEF1
48020       ZZZ3
Name: Gene, Length: 37864, dtype: object>
28216


In [7]:
use_tpm = False # Must use count data because DESEQ2 is run on count matrix, so there are de genes that are not in the TPM matrix
genes = genes_tpm if use_tpm else genes_count 
data = tpm if use_tpm else counts
matrix = tpm_data_only if use_tpm else count_data_only 
dose = [f"{metadata.loc[metadata['SampleName']== name]['TotalExposure_mG'].values[0]}Gy" for name in data.columns[1:]]
dose_rate = [metadata.loc[metadata['SampleName']== name]['ExposureRate_mGh'].values[0] for name in data.columns[1:]]
print(set(dose))
print(set(dose_rate))
matrix.columns=genes
matrix['Dose'] = dose
matrix['Dose Rate'] = dose_rate

{'1.68Gy', '16.8Gy', '0.168Gy', '504.0Gy', '168.0Gy', '5.04Gy', '0.336Gy', '0.504Gy', '3.36Gy', '0.0Gy', '672.0Gy', '33.6Gy', '1008.0Gy', '336.0Gy', '50.4Gy'}
{'0.1', '1.0', 'Control', '0.001', '0.01', '2.0'}


In [8]:
print(matrix.columns)

Index(['WASH7P', 'MIR6859-1', 'LOC124903816', 'LOC124900384', 'LOC729737',
       'DDX11L17', 'WASH9P', 'MIR6859-2', 'LOC127239154', 'LOC124903815',
       ...
       'ND4L', 'ND4', 'TRNH', 'ND5', 'ND6', 'TRNE', 'CYTB', 'TRNP', 'Dose',
       'Dose Rate'],
      dtype='object', name='Gene', length=29630)


In [9]:
# Get DE genes for each dose 
dose_to_file_map = {"dA" : ["deseq2_0.168_W1vs0_by_week_results.csv", "deseq2_0.336_W2vs0_by_week_results.csv", "deseq2_0.504_W3vs0_by_week_results.csv"],
                         "dB": ["deseq2_1.68_W1vs0_by_week_results.csv","deseq2_3.36_W2vs0_by_week_results.csv", "deseq2_5.04_W3vs0_by_week_results.csv"],
                         "dC" : ["deseq2_16.8_W1vs0_by_week_results.csv","deseq2_33.6_W2vs0_by_week_results.csv", "deseq2_50.4_W3vs0_by_week_results.csv"],
                         "dD": ["deseq2_168_W1vs0_by_week_results.csv", "deseq2_336_W2vs0_by_week_results.csv", "deseq2_504_W3vs0_by_week_results.csv"],
                         "dE": ["deseq2_336_W1vs0_by_week_results.csv", "deseq2_672_W2vs0_by_week_results.csv", "deseq2_1008_W3vs0_by_week_results.csv"]
                         }
padj_value = 0.05
all_de_genes = {}
for dose, list_of_files in dose_to_file_map.items():
    print(dose)
    de_by_dose = []
    for file in list_of_files:
        de_df = pd.read_csv(f"deseq2/{file}", header=0)
        de_genes = de_df.loc[de_df['padj']<=padj_value].iloc[:,0].values
        print(f"NUMBER OF DE GENES FOR WEEK {len(de_genes)}")
        print(f"NUMBER OF OVERLAP WITH PREVIOUS WEEK {len(set(de_by_dose).intersection(set(de_genes)))}")
        de_by_dose += list(de_genes)
    print(f"NUMBER OF DE GENES FOR DOSE {len(set(de_by_dose))}")
    all_de_genes[dose] = list(set(de_by_dose))
    
print(f"NUMBER OF DE GENES SHARED ACROSS ALL DOSES {len(set.intersection(*[set(i) for i in all_de_genes.values()]))}")

dA
NUMBER OF DE GENES FOR WEEK 2692
NUMBER OF OVERLAP WITH PREVIOUS WEEK 0
NUMBER OF DE GENES FOR WEEK 4133
NUMBER OF OVERLAP WITH PREVIOUS WEEK 1742
NUMBER OF DE GENES FOR WEEK 2404
NUMBER OF OVERLAP WITH PREVIOUS WEEK 1601
NUMBER OF DE GENES FOR DOSE 5886
dB
NUMBER OF DE GENES FOR WEEK 3701
NUMBER OF OVERLAP WITH PREVIOUS WEEK 0
NUMBER OF DE GENES FOR WEEK 4616
NUMBER OF OVERLAP WITH PREVIOUS WEEK 2324
NUMBER OF DE GENES FOR WEEK 3213
NUMBER OF OVERLAP WITH PREVIOUS WEEK 2228
NUMBER OF DE GENES FOR DOSE 6978
dC
NUMBER OF DE GENES FOR WEEK 3638
NUMBER OF OVERLAP WITH PREVIOUS WEEK 0
NUMBER OF DE GENES FOR WEEK 4676
NUMBER OF OVERLAP WITH PREVIOUS WEEK 2386
NUMBER OF DE GENES FOR WEEK 2202
NUMBER OF OVERLAP WITH PREVIOUS WEEK 1493
NUMBER OF DE GENES FOR DOSE 6637
dD
NUMBER OF DE GENES FOR WEEK 2658
NUMBER OF OVERLAP WITH PREVIOUS WEEK 0
NUMBER OF DE GENES FOR WEEK 2914
NUMBER OF OVERLAP WITH PREVIOUS WEEK 1246
NUMBER OF DE GENES FOR WEEK 2154
NUMBER OF OVERLAP WITH PREVIOUS WEEK 1332
N

In [10]:
# Now save files for each dose (should be 13 samples each for 3 weeks x 2 replicates + 7 controls)
for dose in dose_to_file_map.keys():
    print(dose)
    matrix_by_dose =  matrix.filter(regex=f"d0+|{dose}+", axis=0)
    print(matrix_by_dose.shape)
    print(set(matrix_by_dose.columns.values).intersection(all_de_genes[dose]))
    print(all_de_genes[dose])
    matrix_by_dose = matrix_by_dose[all_de_genes[dose]]
    print(matrix_by_dose.shape)
    matrix_by_dose.to_csv(f"cd_matrix_{dose}.csv", header=True, index=False)
    

dA
(13, 29630)
{'SHCBP1', 'C17orf58', 'CDC42SE1', 'RIN1', 'CACUL1', 'LMNA', 'SNHG8', 'LOC101927393', 'ADAM19', 'CCDC14', 'RBP1', 'FAM171B', 'XPC', 'SDHAF4', 'AOPEP', 'CDCA7L', 'FANCM', 'FANCA', 'SUMF2', 'DRP2', 'FYCO1', 'CFL2', 'VBP1', 'PKIG', 'EPB41L2', 'SQOR', 'PIGK', 'CASP9', 'TTL', 'GON4L', 'CDK5', 'SRSF2', 'SNX18', 'CIC', 'AP4E1', 'AACS', 'BLCAP', 'ALDH9A1', 'EXTL2', 'TPRG1L', 'PTX3', 'DUSP5', 'CHPT1', 'PDK4', 'TOB2', 'CDC42EP3', 'IFT52', 'PTPRN2', 'HAGH', 'MAGI1', 'HROB', 'SPRY4', 'IFT172', 'TOP2A', 'CARD19', 'TTLL11', 'AHI1', 'FRMD4A', 'KIF20A', 'ZNF528', 'FRRS1', 'MEGF6', 'GYG1', 'ARHGEF6', 'TTK', 'ITGA6', 'EMC10', 'NALCN', 'RAPH1', 'PIM3', 'KDM5A', 'NEMP1', 'FOXP1', 'BRIP1', 'PROM1', 'MRPL15', 'CTSB_1', 'GRK5', 'FXYD5', 'LFNG', 'SCN5A', 'ATXN1', 'HOPX', 'KCNAB2', 'PLD3', 'LOC107987020', 'RPL35A', 'POLE3', 'CEP19', 'TM4SF18', 'PITPNC1', 'EHBP1', 'GAREM2', 'RMND5A', 'ALG9', 'DCTN5', 'TNFRSF10D', 'RPS27A', 'LYL1', 'TCP11L2', 'FNTA', 'ACSL3', 'IMMT', 'ATP6V1D', 'FAM24B', 'TNRC6C',