### Unzip the downloaded data

```shell
gunzip /home/q5gong/CSE185FinalProject/Figure_1e/data/GSE132440_ATAC_PeakNorm.txt.gz
```

### Import python packages

In [1]:
pip install fastcluster

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import fastcluster

### ATAC Peak Normalized Data Processing
- data: `ATAC_PeakNorm.txt`
>- All ATAC peaks from all samples were merged by combining peaks within 500bp of each. featureCount was used to count the mapped reads for each sample. The resulting peak atlas was normalized using DESeq2.

In [3]:
data = pd.read_csv("/home/q5gong/CSE185FinalProject/Figure_1e/data/GSE132440_ATAC_PeakNorm.txt", sep="\t")
display(data.head())
data.columns

Unnamed: 0,Peaks,D13_2_T1.Normal (C;RIK+PBS-d2),D7_3_T1.Normal (C;RIK+PBS-d2),D1_2_C-Ren-PBS-D2 -female-,D8_2_T2.Reg-ADM (C;RIK+Caer-d2),D8_3_T2.Reg-ADM (C;RIK+Caer-d2),D11_1_T2.Reg-ADM (C;RIK+Caer-d2),D14_2_C-Ren-C-d2 (ADM-shRen+),D6_1_C-Ren-C-d2 (ADM-shRen+),D6_3_T3.KrasG12D (KC;RIK+PBS-d2),...,D19_1_T4.Tum-ADR (KC;RIK+Caer-d2),D19_3_T4.Tum-ADR (KC;RIK+Caer-d2),D4_2_KC-REN-C-d2 (ADR-shRen+),D11_3_KC-552OFF-C-d2 (ADR-sh552Off),D2_1_T7.PDAC (KPflC-GEMMs),D12_1_T7.PDAC (KPcr_organoids),D12_2_T7.PDAC (KPcr_organoids),D22_3_T7.PDAC (KPcr_organoids),D3_2_T7.PDAC (KPR127H_shp53_orgs),D3_3_T7.PDAC (KPR127H_Ren_orgs)
0,chr1_3001656_3001961,13.41996,5.36796,11.504871,2.645634,4.317972,6.110474,6.988018,4.69051,6.218671,...,5.297277,2.788978,7.399043,2.088005,12.56451,4.917989,3.185094,13.58476,6.75689,13.294766
1,chr1_3109719_3110326,9.585685,19.324655,34.514614,14.550989,12.474142,13.748566,29.564693,10.051092,9.567187,...,9.837801,7.66969,4.708482,4.69801,3.769353,6.762235,9.555281,81.508557,10.61797,5.908785
2,chr1_3192412_3193073,1.917137,8.588735,13.805846,8.598312,7.676395,6.110474,10.213258,12.061311,7.17539,...,6.810785,4.880712,6.053762,6.264014,6.282255,6.762235,11.678677,9.056506,14.47905,38.407103
3,chr1_3239108_3239709,15.337097,6.441552,32.21364,14.550989,14.39324,11.711741,10.213258,18.091966,15.785858,...,13.62157,16.73387,14.125445,9.396021,26.385471,11.065476,30.789239,31.697772,21.23594,29.543925
4,chr1_3351007_3351367,21.088508,7.515144,32.21364,8.598312,9.115719,7.638092,10.213258,10.051092,10.045546,...,11.351308,11.155913,10.089604,8.87402,15.077412,6.147486,8.493583,18.113013,33.78445,14.771963


Index(['Peaks', 'D13_2_T1.Normal (C;RIK+PBS-d2)',
       'D7_3_T1.Normal (C;RIK+PBS-d2)', 'D1_2_C-Ren-PBS-D2 -female-',
       'D8_2_T2.Reg-ADM (C;RIK+Caer-d2)', 'D8_3_T2.Reg-ADM (C;RIK+Caer-d2)',
       'D11_1_T2.Reg-ADM (C;RIK+Caer-d2)', 'D14_2_C-Ren-C-d2 (ADM-shRen+)',
       'D6_1_C-Ren-C-d2 (ADM-shRen+)', 'D6_3_T3.KrasG12D (KC;RIK+PBS-d2)',
       'D8_1_T3.KrasG12D (KC;RIK+PBS-d2)', 'D13_3_T3.KrasG12D (KC;RIK+PBS-d2)',
       'D7_1_T4.Tum-ADR (KC;RIK+Caer-d2)', 'D18_3_T4.Tum-ADR (KC;RIK+Caer-d2)',
       'D19_1_T4.Tum-ADR (KC;RIK+Caer-d2)',
       'D19_3_T4.Tum-ADR (KC;RIK+Caer-d2)', 'D4_2_KC-REN-C-d2 (ADR-shRen+)',
       'D11_3_KC-552OFF-C-d2  (ADR-sh552Off)', 'D2_1_T7.PDAC (KPflC-GEMMs)',
       'D12_1_T7.PDAC (KPcr_organoids)', 'D12_2_T7.PDAC (KPcr_organoids)',
       'D22_3_T7.PDAC (KPcr_organoids)', 'D3_2_T7.PDAC (KPR127H_shp53_orgs)',
       'D3_3_T7.PDAC (KPR127H_Ren_orgs)'],
      dtype='object')

In [4]:
# mannual column name replacement base on information of each samples 
# listed on https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE132440

columns = {"D1_2_C-Ren-PBS-D2 -female-":"Normal_ATAC1", 
           "D13_2_T1.Normal (C;RIK+PBS-d2)":"Normal_ATAC2", 
           "D7_3_T1.Normal (C;RIK+PBS-d2)":"Normal_ATAC3",
           "D11_1_T2.Reg-ADM (C;RIK+Caer-d2)":"Injury_ATAC1", 
           "D14_2_C-Ren-C-d2 (ADM-shRen+)":"Injury_ATAC2", 
           "D6_1_C-Ren-C-d2 (ADM-shRen+)":"Injury_ATAC3", 
           "D8_2_T2.Reg-ADM (C;RIK+Caer-d2)":"Injury_ATAC4", 
           "D8_3_T2.Reg-ADM (C;RIK+Caer-d2)":"Injury_ATAC5", 
           "D13_3_T3.KrasG12D (KC;RIK+PBS-d2)":"Kras*_ATAC1", 
           "D6_3_T3.KrasG12D (KC;RIK+PBS-d2)":"Kras*_ATAC2", 
           "D8_1_T3.KrasG12D (KC;RIK+PBS-d2)":"Kras*_ATAC3",
           "D11_3_KC-552OFF-C-d2  (ADR-sh552Off)":"Kras*+Injury_ATAC1", 
           "D18_3_T4.Tum-ADR (KC;RIK+Caer-d2)":"Kras*+Injury_ATAC2", 
           "D19_3_T4.Tum-ADR (KC;RIK+Caer-d2)":"Kras*+Injury_ATAC3", 
           "D4_2_KC-REN-C-d2 (ADR-shRen+)":"Kras*+Injury_ATAC4", 
           "D7_1_T4.Tum-ADR (KC;RIK+Caer-d2)":"Kras*+Injury_ATAC5", 
           "D19_1_T4.Tum-ADR (KC;RIK+Caer-d2)":"Kras*+Injury_ATAC6", 
           "D2_1_T7.PDAC (KPflC-GEMMs)":"PDAC_ATAC1", 
           "D12_1_T7.PDAC (KPcr_organoids)":"PDAC_ATAC2", 
           "D12_2_T7.PDAC (KPcr_organoids)":"PDAC_ATAC3",
           "D22_3_T7.PDAC (KPcr_organoids)":"PDAC_ATAC4"}

data = data.rename(columns=columns)
data.columns

Index(['Peaks', 'Normal_ATAC2', 'Normal_ATAC3', 'Normal_ATAC1', 'Injury_ATAC4',
       'Injury_ATAC5', 'Injury_ATAC1', 'Injury_ATAC2', 'Injury_ATAC3',
       'Kras*_ATAC2', 'Kras*_ATAC3', 'Kras*_ATAC1', 'Kras*+Injury_ATAC5',
       'Kras*+Injury_ATAC2', 'Kras*+Injury_ATAC6', 'Kras*+Injury_ATAC3',
       'Kras*+Injury_ATAC4', 'Kras*+Injury_ATAC1', 'PDAC_ATAC1', 'PDAC_ATAC2',
       'PDAC_ATAC3', 'PDAC_ATAC4', 'D3_2_T7.PDAC (KPR127H_shp53_orgs)',
       'D3_3_T7.PDAC (KPR127H_Ren_orgs)'],
      dtype='object')

In [5]:
# two PDAC data not found on. the website, 
# rename to PDAC_ATAC5 and PDAC_ATAC6 for now
data = data.rename(columns={"D3_2_T7.PDAC (KPR127H_shp53_orgs)":"PDAC_ATAC5",
                            "D3_3_T7.PDAC (KPR127H_Ren_orgs)":"PDAC_ATAC6"})

In [6]:
# set peak as index 
peak_count = data.drop("Peaks", axis="columns")
peak_count

Unnamed: 0,Normal_ATAC2,Normal_ATAC3,Normal_ATAC1,Injury_ATAC4,Injury_ATAC5,Injury_ATAC1,Injury_ATAC2,Injury_ATAC3,Kras*_ATAC2,Kras*_ATAC3,...,Kras*+Injury_ATAC6,Kras*+Injury_ATAC3,Kras*+Injury_ATAC4,Kras*+Injury_ATAC1,PDAC_ATAC1,PDAC_ATAC2,PDAC_ATAC3,PDAC_ATAC4,PDAC_ATAC5,PDAC_ATAC6
0,13.419960,5.367960,11.504871,2.645634,4.317972,6.110474,6.988018,4.690510,6.218671,9.213464,...,5.297277,2.788978,7.399043,2.088005,12.564510,4.917989,3.185094,13.584760,6.75689,13.294766
1,9.585685,19.324655,34.514614,14.550989,12.474142,13.748566,29.564693,10.051092,9.567187,8.555359,...,9.837801,7.669690,4.708482,4.698010,3.769353,6.762235,9.555281,81.508557,10.61797,5.908785
2,1.917137,8.588735,13.805846,8.598312,7.676395,6.110474,10.213258,12.061311,7.175390,7.239150,...,6.810785,4.880712,6.053762,6.264014,6.282255,6.762235,11.678677,9.056506,14.47905,38.407103
3,15.337097,6.441552,32.213640,14.550989,14.393240,11.711741,10.213258,18.091966,15.785858,13.162091,...,13.621570,16.733870,14.125445,9.396021,26.385471,11.065476,30.789239,31.697772,21.23594,29.543925
4,21.088508,7.515144,32.213640,8.598312,9.115719,7.638092,10.213258,10.051092,10.045546,7.897255,...,11.351308,11.155913,10.089604,8.874020,15.077412,6.147486,8.493583,18.113013,33.78445,14.771963
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103932,17.254234,9.662327,2.300974,5.291269,13.433691,14.766978,10.750798,16.751820,22.482889,23.033659,...,33.297171,23.706315,37.667853,28.188062,28.898373,3.073743,6.370187,4.528253,0.96527,1.477196
103933,9.585685,8.588735,2.300974,5.952677,3.838197,7.128886,10.213258,2.010218,8.132109,10.529673,...,22.702617,5.577957,32.286731,24.012053,17.590314,1.229497,3.185094,9.056506,0.96527,1.477196
103934,191.713707,122.389481,4.601949,142.202849,95.954936,106.424086,122.021553,137.364926,112.892803,102.006205,...,116.540098,147.118603,113.676200,119.538263,98.003177,111.884252,32.912635,58.867291,0.96527,1.477196
103935,13.419960,6.441552,2.300974,8.598312,11.994367,10.184123,13.976037,8.710947,21.047811,11.845882,...,17.405339,24.403560,22.869768,22.446049,25.129020,1.229497,2.123396,9.056506,1.93054,1.477196


In [None]:
cg = sns.clustermap(peak_count, method="fastcluster.linkage()", cmap="coolwarm")
cg.ax_row_dendrogram.set_visible(False)
cg.ax_col_dendrogram.set_visible(False)