## Week 6: DNA Methylation
### Radhika Jangi 10/16/2020

### 1. Getting data
SRR3083926_1.chr6.fastq from STEM-seq E4.0ICM rep1<br/>
SRR3083926_2.chr6.fastq from STEM-seq E4.0ICM rep1<br/>
SRR3083929_1.chr6.fastq from STEM-seq E5.5Epi rep1<br/>
SRR3083929_2.chr6.fastq from STEM-seq E5.5Epi rep1

bismark_genome_preparation --bowtie2 chr6<br/>

bismark --genome chr6/ -1 SRR3083926_1.chr6.fastq,SRR3083929_1.chr6.fastq -2 SRR3083926_2.chr6.fastq,SRR3083929_2.chr6.fastq<br/>

samtools sort SRR3083926_1.chr6_bismark_bt2_pe.bam -o SRR26.sorted.bam<br/>
samtools sort SRR3083929_1.chr6_bismark_bt2_pe.bam -o SRR29.sorted.bam<br/>

samtools index SRR26.sorted.bam<br/>
samtools index SRR29.sorted.bam<br/>

bismark_methylation_extractor --bedgraph --comprehensive SRR3083926_1.chr6_bismark_bt2_pe.bam<br/>
bismark_methylation_extractor --bedgraph --comprehensive SRR3083929_1.chr6_bismark_bt2_pe.bam

In [22]:
# Load the two Bismark bedGraph files as lists of raw text lines.
# The first line of each file is dropped (slice [1:]) before the data is used.
# Context managers make sure each handle is closed even if reading fails.
with open('SRR3083926_1.chr6_bismark_bt2_pe.bedGraph', 'r') as f:
    SRR26 = f.readlines()[1:]
with open('SRR3083929_1.chr6_bismark_bt2_pe.bedGraph', 'r') as fs:
    SRR29 = fs.readlines()[1:]

In [24]:
# Read the gene annotation table as a list of raw text lines (one record each).
# The context manager closes the file even if reading raises.
with open('mm10_refseq_genes_chr6_50M_60M.bed', 'r') as f:
    refseq = f.readlines()

In [42]:
# Build a mapping of gene name -> coordinate pair. When a gene name appears on
# several annotation lines, only the first line's coordinates are kept.
# NOTE(review): coordinates come from zero-based fields 5 and 6, while the gene
# name comes from zero-based field 12 (the 13th column). If the coordinates
# were meant as 1-based columns 5 and 6, the slice would be [4:6] -- confirm
# against the annotation file layout before changing it.
unique_genes = {}
for line in refseq:
    fields = line.split()
    gene_coord = [int(value) for value in fields[5:7]]
    gene_name = fields[12]
    if gene_name not in unique_genes:
        unique_genes[gene_name] = gene_coord

In [43]:
# Show how many distinct gene names ended up as keys of unique_genes.
print(len(unique_genes))

115


In [48]:
def methylation_count(srr, search_range):
    '''Calculates the average methylation score over a genomic interval.

    INPUT:
        - srr: iterable of bedGraph text lines, each holding the
          whitespace-separated fields "chrom start end score". Blank lines
          and track/browser/'#' header lines are skipped.
        - search_range: pair of interval bounds. The two bounds may be given
          in either order and are both inclusive.
    OUTPUT:
        - meth_score/count: mean score of every site whose end coordinate
          (third bedGraph field) lies inside the interval
        - 0: if no site falls inside the interval

    The chromosome field is not compared, so srr should only contain records
    for the chromosome that search_range refers to.
    '''
    # Accept the bounds in either order so that both (start, end) and
    # (end, start) callers get the same, non-empty interval.
    low = min(search_range[0], search_range[1])
    high = max(search_range[0], search_range[1])

    meth_score = 0  # Running sum of scores for sites inside the interval
    count = 0  # Number of sites inside the interval (zero scores included)
    for line in srr:
        fields = line.split()
        if not fields or fields[0] in ('track', 'browser') or fields[0].startswith('#'):
            continue  # Not a data record
        position = int(fields[2])  # End coordinate of the site
        if low <= position <= high:
            meth_score += float(fields[3])
            count += 1
    if count != 0:
        return meth_score / count
    return 0

In [59]:
# For every gene, write its average methylation at E4.0 (SRR26) and at E5.5
# (SRR29) to their own files, and the relative change from E4.0 to E5.5,
# (e5_5 - e4) / e4, to foldchange.txt. Each row is "<gene>:\t<value>".
# NOTE(review): the files are opened in append mode, so running this cell again
# adds a second copy of every row -- switch to "w" if each run should start
# from empty files.
with open("foldchange.txt", "a") as f, \
        open("e4_avg_methylation.txt", "a") as fs, \
        open("e5_5_avg_methylation.txt", "a") as fl:
    for gene, coords in unique_genes.items():
        e4 = methylation_count(SRR26, coords)
        fs.write(gene + ":\t" + str(e4) + '\n')

        e5_5 = methylation_count(SRR29, coords)
        fl.write(gene + ":\t" + str(e5_5) + '\n')

        # methylation_count always returns a number (0 when no site is in
        # range), so the only case to guard is division by a zero baseline;
        # such genes get no fold-change row.
        if e4 != 0:
            f.write(gene + ":\t" + str((e5_5 - e4) / e4) + '\n')

{'Mpp6': 1.82170046254583, 'Dfna5': 3.0541991816405196, 'Osbpl3': 3.1545238303356444, 'Cycs': 27.51530612244897, '4921507P07Rik': 2.190640621972486, 'Npvf': 3.164835164835164, 'Nfe2l3': 2.4207464847441558, 'Hnrnpa2b1': 7.651479196329696, 'Cbx3': 3.916182988495173, 'Snx10': 3.515415453267093, 'Skap2': 1.8305249285330623, 'Hoxa3': 1.2286824375134617, 'Hoxa6': 12.333333333333306, 'Hoxa7': 0.3781250000000001, 'Hoxa9': 0.3858422939068102, 'Hoxa10': 0.3778579732609961, 'Hoxa11': -1.0, 'Hoxa13': 0.875, 'Evx1': -0.762396694214876, 'Hibadh': 1.7160588376653512, 'Tax1bp1': 1.3777291198509418, 'Jazf1': 1.9046040786157405, 'Creb5': 3.590647943184077, 'Tril': -1.0, 'Cpvl': 2.915324579296475, 'Chn2': 1.7562881807997404, 'Wipf3': 5.172231672480712, 'Scrn1': 5.147211791119173, 'Fkbp14': 2.2778068444220745, 'Plekha8': 2.5418391995075194, 'Mturn': 11.723113158790555, 'Znrf2': 3.93371270798042, 'Nod1': 3.9535689722423073, 'Ggct': 1.6787043851885985, 'Gars': 2.0046818048022534, 'Crhr2': 2.8459287757081624