In [3]:
import pandas as pd
import numpy as np
import string as s
import seaborn as sns
import math
# import tables
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import style, colors
from itertools import groupby
import matplotlib.patches as patches
from matplotlib import gridspec
import scipy.stats as stats
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, fclusterdata, leaves_list
from scipy.spatial.distance import pdist
from scipy.ndimage.interpolation import shift
# matplotlib.style.use('ggplot')
# %matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12, 6)
# matplotlib.rcParams['figure.facecolor'] = 'white'
import mpl_toolkits.mplot3d.axes3d as p3
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets.samples_generator import make_swiss_roll
matplotlib.style.use('ggplot')
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# constants

In [2]:
wkdir = '/projects/da_workspace/software/ChromHMM/Cervical_all_patients'
mark = ["H3K27ac","H3K27me3","H3K36me3","H3K4me1","H3K4me3","H3K9me3"]

# running ChromHMM

# prep input files: cellmarkerfiletable.txt, symlink peak files in  peak_bed_files directory

In [7]:
%%bash -s $wkdir $mark

wkdir=$1
# mark=$2
echo $wkdir, $mark
cd $wkdir
for mark in H3K27ac H3K27me3 H3K36me3 H3K4me1 H3K4me3 H3K9me3
do
    ls -1 /projects/chipseq/macs/A*/75nt/hg19a/bwa-mem-0.7.6a-sb/MACS_2.1.1.20160309/*/*$mark*Peak|grep -v gappedPeak > a.tmp
    cat a.tmp |awk -F "/" '{print $5"\t"$0}'|awk -F "_" '{print $3"\t"$0}'>d.tmp

    while read a b c;do grep $b 57_chipseq_samples_library_ids.csv|cut -f 1|\
    awk -v lib=$b -v path=$c -v mark=$a -F "-01" '{print $1"\t"lib"\t"mark"\t"path}' ;done \
    < d.tmp |sort -k1 -k2 > $mark"_peak_file_path.txt"

    rm a.tmp  d.tmp
    head -2 $mark"_peak_file_path.txt"
    wc -l $mark"_peak_file_path.txt";
done

/projects/da_workspace/software/ChromHMM/Cervical_all_patients,
HTMCP-03-06-02001	A84615	H3K27ac	/projects/chipseq/macs/A84615/75nt/hg19a/bwa-mem-0.7.6a-sb/MACS_2.1.1.20160309/370/A84615_H3K27ac_peaks.narrowPeak
HTMCP-03-06-02002	A94516	H3K27ac	/projects/chipseq/macs/A94516/75nt/hg19a/bwa-mem-0.7.6a-sb/MACS_2.1.1.20160309/691/A94516_H3K27ac_peaks.narrowPeak
54 H3K27ac_peak_file_path.txt
HTMCP-03-06-02001	A84613	H3K27me3	/projects/chipseq/macs/A84613/75nt/hg19a/bwa-mem-0.7.6a-sb/MACS_2.1.1.20160309/374/A84613_H3K27me3_peaks.broadPeak
HTMCP-03-06-02002	A94514	H3K27me3	/projects/chipseq/macs/A94514/75nt/hg19a/bwa-mem-0.7.6a-sb/MACS_2.1.1.20160309/689/A94514_H3K27me3_peaks.broadPeak
54 H3K27me3_peak_file_path.txt
HTMCP-03-06-02001	A84614	H3K36me3	/projects/chipseq/macs/A84614/75nt/hg19a/bwa-mem-0.7.6a-sb/MACS_2.1.1.20160309/371/A84614_H3K36me3_peaks.broadPeak
HTMCP-03-06-02002	A94515	H3K36me3	/projects/chipseq/macs/A94515/75nt/hg19a/bwa-mem-0.7.6a-sb/MACS_2.1.1.20160309/690/A94515_H3K36me3

In [None]:
# format and concatenate the peak files from each mark

In [10]:
%%bash
ssh gphost06
# step 1: make chromhmm input file cellmarkerfiletable.txt, this file needs to be 3 columns, patient, mark, and mark peak file.
# step 2: create symlink to the MCS2 broadPeaks and narrowPeaks files
cd /projects/da_workspace/software/ChromHMM/Cervical_all_patients
cat *.marks|awk -F "/" '{print $0"\t"$11}' > cellmarkfiletable.txt.full.path
while read a b c d;do grep $a cellmarkfiletable.txt.full.path;done < H3K9me3_peak_file_path.txt.with.6.marks>cellmarkfiletable.txt.full.path.6.marks
mkdir /projects/da_workspace/software/ChromHMM/Cervical_all_patients/peak_bed_files
cd /projects/da_workspace/software/ChromHMM/Cervical_all_patients/peak_bed_files
while read a b c d e;do ln -s $d $e;done < /projects/da_workspace/software/ChromHMM/Cervical_all_patients/cellmarkfiletable.txt.full.path.6.marks

# run chromHMM BinarizeBed, memory may need to significantly larger, e.g. -Xmx120G
cd /projects/da_workspace/software/ChromHMM
/gsc/software/linux-x86_64-centos6/jdk1.8.0_102/bin/java -mx1600M -jar ChromHMM.jar BinarizeBed -b 200 -peaks CHROMSIZES/hg19.txt /projects/da_workspace/software/ChromHMM/Cervical_all_patients/peak_bed_files/ /projects/da_workspace/software/ChromHMM/Cervical_all_patients/cellmarkfiletable.txt /projects/da_workspace/software/ChromHMM/Cervical_all_patients/binary/

# add chr suffix to binary files
cd /projects/da_workspace/software/ChromHMM/Cervical_all_patients/binary/
for f in *_binary.txt;do awk '/^HTMCP/{$2="chr"$2} 1' $f |sed 's/ /\t/g'> $f.chr_suffix;done 
rm *_binary.txt

# run chromHMM, need to run on gphost because of memory requirement
# /gsc/software/linux-x86_64-centos6/jdk1.8.0_102/bin/java -Xmx50g -jar ChromHMM.jar LearnModel -b 200 -p 0 ./Cervical_all_patients/binary/ ./Cervical_all_patients/18_states 18 hg19

# or run for multiple states models
cd /projects/da_workspace/software/ChromHMM
for f in 15 16 17 18 19 20 21;do mkdir ./Cervical_all_patients/$f"_states"; /gsc/software/linux-x86_64-centos6/jdk1.8.0_102/bin/java -Xmx50g -jar ChromHMM.jar LearnModel -b 200 -p 0 ./Cervical_all_patients/binary/ ./Cervical_all_patients/$f"_states" $f hg19;done
for f in 22 23 24;do /gsc/software/linux-x86_64-centos6/jdk1.8.0_102/bin/java -Xmx100g -jar ChromHMM.jar LearnModel -b 200 -p 0 ./Cervical_all_patients/binary/ ./Cervical_all_patients/$f"_states" $f hg19;done

Pseudo-terminal will not be allocated because stdin is not a terminal.
mkdir: cannot create directory `/projects/da_workspace/software/ChromHMM/Cervical_all_patients/peak_bed_files': File exists
ln: target `A84615_H3K27ac_peaks.narrowPeak' is not a directory
ln: target `A84613_H3K27me3_peaks.broadPeak' is not a directory
ln: target `A84614_H3K36me3_peaks.broadPeak' is not a directory
ln: target `A84610_H3K4me1_peaks.broadPeak' is not a directory
ln: target `A84611_H3K4me3_peaks.narrowPeak' is not a directory
ln: target `A84612_H3K9me3_peaks.broadPeak' is not a directory
ln: target `A84615_H3K27ac_peaks.narrowPeak' is not a directory
ln: target `A84613_H3K27me3_peaks.broadPeak' is not a directory
ln: target `A84614_H3K36me3_peaks.broadPeak' is not a directory
ln: target `A84610_H3K4me1_peaks.broadPeak' is not a directory
ln: target `A84611_H3K4me3_peaks.narrowPeak' is not a directory
ln: target `A84612_H3K9me3_peaks.broadPeak' is not a directory
ln: target `A94516_H3K27ac_peaks.narrowP