In [1]:
import json
import os
import shutil
import subprocess
import tempfile
import glob
import pandas as pd
from functools import reduce
import re
import pandas as pd
import time

# reduce new function https://stackoverflow.com/questions/44327999/python-pandas-merge-multiple-dataframes

# Peak/box file (.bed) handed to performIntersect as the regions to intersect against.
commonpeaksfile = '/home/joaquin/projects/methylation/data/commonData/arabidopsisThaliana/intersect/allThePossiblePeaksnine.bed'

# JSON metadata file; its 'experiments' entry is loaded in the next cell.
metadata = '/home/joaquin/projects/methylation/data/commonData/ids_data_allReplicates_methylation.json'
# Root folder under which the candidate folders listed below are looked up.
basePathDataFolder = '/home/joaquin/projects/methylation/data'
# Candidate sub-folders of basePathDataFolder, searched in this order for each experiment.
narrowPeakLocationFolders = ['tfs_rep_1','tfs_rep_3_input_from_rep_2', 'tfs_rep_2', 'tfs_rep_4']
# Filled later as condition -> metState -> exptype -> replicate name -> folder path.
specificPathsSumary = {}


In [2]:
# Parse the metadata JSON, then keep only its 'experiments' entry.
with open(metadata) as jsonMetadata:
    parsedMetadata = json.load(jsonMetadata)
experimentsClasification = parsedMetadata['experiments']

In [None]:
def _findNarrowPeakFolder(expeId, expPath):
    """Return the first candidate folder that holds the given experiment.

    A candidate from narrowPeakLocationFolders matches when
    <basePathDataFolder>/<candidate>/<expPath> exists and contains a file whose
    name includes 'html' and starts with expeId.

    Returns the matching candidate folder name, or None when nothing matches.
    """
    for possiblenarrowPeakFolder in narrowPeakLocationFolders:
        path = os.path.join(basePathDataFolder, possiblenarrowPeakFolder, expPath)
        # The experiment may not have been done for this replicate folder; try the next one.
        try:
            filesInFolder = os.listdir(path)
        except FileNotFoundError:
            continue
        # Check every html file instead of only the last one listed (os.listdir order is
        # arbitrary), and never reuse a file name left over from a previous folder.
        if any('html' in file and file.startswith(expeId) for file in filesInFolder):
            return possiblenarrowPeakFolder
    return None


replicateNames = ['replicate1', 'replicate2', 'replicate3']
for experiment in experimentsClasification:
    condition = experiment['condition']
    specificPathsSumary[condition] = {}
    for metState in ['direct', 'amplified']:
        specificPathsSumary[condition][metState] = {}
        for exptype in ['sample', 'input']:
            specificPathsSumary[condition][metState][exptype] = {}
            for replicate, number in zip(experiment[metState], replicateNames):
                # Each entry is stored as "<experiment id>,<relative path>".
                expeId, expPath = replicate[number][0][exptype].strip().split(',')
                # If it is a missing experiment, do not continue with the analysis.
                if 'MISSING' in expeId:
                    print(number, expPath, expeId)
                    continue
                narrowPeakFolder = _findNarrowPeakFolder(expeId, expPath)
                if narrowPeakFolder is None:
                    # Previously this case crashed in os.path.join(..., None, ...).
                    # Later cells already tolerate a missing replicate key, so report and skip.
                    print(number, expPath, expeId, ' was not found in any of ', narrowPeakLocationFolders)
                    continue
                narrowpeakFileOriginalPath = os.path.join(
                    basePathDataFolder, narrowPeakFolder, expPath
                )
                specificPathsSumary[condition][metState][exptype][number] = narrowpeakFileOriginalPath

In [None]:
def performIntersect(folder, intersectFile):
    """Intersect the sorted BAM found in `folder` with `intersectFile` and count reads per box.

    Steps:
      1. Locate the single ``*orted.bam`` file in `folder`; if there is not exactly
         one, print a message and return None.
      2. Run ``samtools view -q1 -b <bam> | bedtools intersect -abam stdin
         -b <intersectFile> -bed -wb -f 0.5`` through the shell, writing
         ``<intersect name>_<bam name>.bed`` into `folder`. An existing output file
         is removed first, so the intersect is always recomputed.
      3. Count, for every box (chr, start, end, boxname), the number of distinct
         read names (the text before the first '/') and write them to
         ``<output without .bed>_boxtotals.csv``.

    Parameters:
        folder: directory containing the sorted BAM file; outputs are written here.
        intersectFile: path of the file passed to ``bedtools intersect -b``.

    Returns:
        None. Results are written to disk.
    """
    import shlex  # local import so the block does not depend on the notebook's import cell

    sortedBamFiles = glob.glob(f'{folder}/*orted.bam')
    if len(sortedBamFiles) != 1:
        print(folder, ' has a problem selecting File')
        return None
    sortedBamFile = sortedBamFiles[0]

    # [:-10] strips the trailing 'sorted.bam'; [:-4] strips the intersect file extension.
    sampleName = sortedBamFile.split('/')[-1][:-10]
    outputFile = intersectFile.strip().split('/')[-1][:-4] + '_' + sampleName + '.bed'
    outputFilePath = os.path.join(folder, outputFile)

    print(outputFilePath)

    # Any previous result is discarded, so the pipeline always runs.
    if os.path.isfile(outputFilePath):
        os.remove(outputFilePath)

    # Paths are shell-quoted so spaces or shell metacharacters cannot break or alter the
    # command. For plain paths the resulting string is the same as before.
    command = (
        'samtools view -q1 -b ' + shlex.quote(sortedBamFile) + ' | ' +
        'bedtools intersect -abam stdin -b ' + shlex.quote(intersectFile) + ' -bed -wb -f 0.5 ' +
        '> ' + shlex.quote(outputFilePath)
    )
    returnCode = subprocess.call(command, shell=True)
    if returnCode != 0:
        # Only the status of the last command in the pipe is reported by the shell.
        print(outputFile, ' pipeline exited with status ', returnCode)

    try:
        intersectDf = pd.read_csv(
            outputFilePath, sep='\t', usecols=[3, 12, 13, 14, 15],
            names=['intersected', 'chr', 'start', 'end', 'boxname'],
        )
    except pd.errors.EmptyDataError:
        # An empty output file would otherwise crash here; write a header-only totals file.
        print(outputFile, ' is empty')
        intersectDf = None

    # box -> set of distinct read names (mate suffix after '/' removed).
    readsPerBox = {}
    if intersectDf is not None:
        rows = zip(
            intersectDf['intersected'], intersectDf['chr'], intersectDf['start'],
            intersectDf['end'], intersectDf['boxname'],
        )
        for intersected, chrom, start, end, boxname in rows:
            intersectOcurrence = str(intersected.split('/')[0])
            box = ','.join([str(chrom), str(start), str(end), str(boxname)])
            readsPerBox.setdefault(box, set()).add(intersectOcurrence)

    with open(outputFilePath[:-4] + '_boxtotals.csv', 'w') as elcsv:
        elcsv.write('chr,start,end,boxname,{}\n'.format(sampleName))
        for box, readNames in readsPerBox.items():
            elcsv.write('{},{}\n'.format(box, len(readNames)))

In [None]:
# Walk every folder stored in specificPathsSumary
# (condition -> metState -> exptype -> replicate name -> folder path).
# The performIntersect call is commented out, so this loop currently only rebinds workingFolder.
for experiment in specificPathsSumary:
    for metState in specificPathsSumary[experiment]:
        for exptype in specificPathsSumary[experiment][metState]:
            for replicate in specificPathsSumary[experiment][metState][exptype]:
                workingFolder = specificPathsSumary[experiment][metState][exptype][replicate]
                # performIntersect(workingFolder, commonpeaksfile)
           

In [None]:
def calculationBowtieSummary(filepath):
    """Extract the read count and overall alignment rate from ``bowtie2stats.txt``.

    Parameters:
        filepath: folder that contains ``bowtie2stats.txt``.

    Returns:
        A tuple of two strings: the digits before ' reads; of these:' and the
        number before '% overall alignment rate', as they appear in the file.

    Raises:
        ValueError: if either line cannot be found in the file.
    """
    filename = os.path.join(filepath, 'bowtie2stats.txt')
    reads = None
    regular = None
    with open(filename, 'r') as bowstats:
        for line in bowstats:
            # Keep the first match of each pattern. Overwriting on every line left only the
            # result for the last line, so the read count was lost before the return.
            if reads is None:
                reads = re.search(r'([\d]+) reads; of these:', line)
            if regular is None:
                regular = re.search(r'([\d,\.]+)% overall alignment rate', line)
            if reads is not None and regular is not None:
                break

    if reads is None or regular is None:
        raise ValueError(
            '{} does not contain both the read count and the overall alignment rate'.format(filename)
        )
    return reads.group(1), regular.group(1)

In [None]:

def generateDfandNormaliceDataTPMs(folder, dataColName, scale=100000):
    """Load the box-totals CSV of `folder` and rescale its counts to sum to `scale`.

    The single file matching ``allThePossiblePeaksnine*.csv`` in `folder` is read,
    its columns are renamed to ['chr', 'star', 'end', 'id', dataColName], and every
    count is divided by ``total / scale`` (total = sum of the count column).

    Parameters:
        folder: directory that holds exactly one matching CSV file.
        dataColName: name given to the count column in the returned frame.
        scale: value the normalized column sums to (default 100000, the value
            previously hard-coded).

    Returns:
        The normalized DataFrame, or None (after printing a message) when the
        folder does not contain exactly one matching CSV file.
    """
    csvFile = glob.glob(f'{folder}/allThePossiblePeaksnine*.csv')
    if len(csvFile) != 1:
        print(folder, ' has a problem selecting File')
        return None

    # 'star' (sic) is the key name used by every merge in this notebook; keep it.
    fileDf = pd.read_csv(csvFile[0], header=0, names=['chr', 'star', 'end', 'id', dataColName])
    totalReads = fileDf[dataColName].sum()

    if totalReads == 0:
        # Nothing to rescale; avoid 0/0 producing NaN and keep the column as float.
        fileDf[dataColName] = fileDf[dataColName].astype(float)
        return fileDf

    scalingFactor = totalReads / scale
    fileDf[dataColName] = fileDf[dataColName] / scalingFactor
    return fileDf
    

In [None]:

def generateDfandNormaliceDataRPPMs(folder, dataColName, normalizationValue):
    """Load the box-totals CSV of `folder` and express counts per million of `normalizationValue`.

    Every count is divided by ``normalizationValue / 1000000``.

    To normalize by the number of aligned reads, compute the value before calling:
        reads, alingpercent = calculationBowtieSummary(workingFolder)
        totalReads = int(round(int(reads)*(float(alingpercent)/100),0))  # ---> normalizationValue

    Parameters:
        folder: directory that holds exactly one ``allThePossiblePeaksnine*.csv`` file.
        dataColName: name given to the count column in the returned frame.
        normalizationValue: positive number the counts are scaled against.

    Returns:
        The normalized DataFrame with columns ['chr', 'star', 'end', 'id', dataColName],
        or None (after printing a message) when the folder does not contain exactly
        one matching CSV file.

    Raises:
        ValueError: if normalizationValue is not greater than zero.
    """
    csvFile = glob.glob(f'{folder}/allThePossiblePeaksnine*.csv')
    if len(csvFile) != 1:
        print(folder, ' has a problem selecting File')
        return None

    if normalizationValue <= 0:
        raise ValueError('normalizationValue must be greater than zero')

    fileDf = pd.read_csv(csvFile[0], header=0, names=['chr', 'star', 'end', 'id', dataColName])
    # The scaling factor now comes from the caller-supplied value. Previously it was read
    # before being assigned (UnboundLocalError) and the parameter was ignored.
    scalingFactor = normalizationValue / 1000000
    fileDf[dataColName] = fileDf[dataColName] / scalingFactor
    return fileDf
    

In [None]:
# reduce new function https://stackoverflow.com/questions/44327999/python-pandas-merge-multiple-dataframes
def generateMeanReplicatesDf(tf):
    """Build one frame with the mean normalized signal per box for every matching condition.

    For each condition in specificPathsSumary whose name contains `tf`, and for each
    metState / exptype below it, the replicates are normalized with
    generateDfandNormaliceDataTPMs, outer-merged on ['chr', 'star', 'end', 'id']
    (values absent from a replicate are filled with 0) and averaged into a column
    named '<condition><metState><exptype>'. Columns whose name contains 'input' are
    left out. All resulting frames are then outer-merged into one.

    Parameters:
        tf: substring used to select conditions.

    Returns:
        DataFrame with the key columns plus one mean column per kept group.

    Raises:
        ValueError: if no data could be loaded for `tf`.
    """
    mergeKeys = ['chr', 'star', 'end', 'id']

    def _outerMerge(frames):
        # Outer-merge a non-empty list of frames on the box keys; missing values become 0.
        return reduce(
            lambda left, right: pd.merge(left, right, on=mergeKeys, how='outer'),
            frames,
        ).fillna(0)

    allNormalizedreplicates = []
    for experiment in specificPathsSumary:
        if tf not in experiment:
            continue
        for metState in specificPathsSumary[experiment]:
            for exptype in specificPathsSumary[experiment][metState]:
                mean_col_name = '{}{}{}'.format(experiment, metState, exptype)
                # Input groups are never part of the result, so skip them before
                # reading any file instead of loading and discarding them.
                if 'input' in mean_col_name:
                    continue

                listofdfs = []
                datacolnames = []
                replicateFolders = specificPathsSumary[experiment][metState][exptype]
                for replicate, workingFolder in replicateFolders.items():
                    datacolname = '{}{}{}{}'.format(experiment, metState, exptype, replicate)
                    normalizedDf = generateDfandNormaliceDataTPMs(workingFolder, datacolname)
                    # The loader prints a message and returns None when the CSV cannot be
                    # selected; leave that replicate out of the mean instead of crashing.
                    if normalizedDf is None:
                        continue
                    listofdfs.append(normalizedDf)
                    datacolnames.append(datacolname)

                # reduce() on an empty list raises TypeError; a group without data is skipped.
                if not listofdfs:
                    continue

                # Values that are not present in one of the replicates are filled with 0.
                df_merged = _outerMerge(listofdfs)
                df_merged[mean_col_name] = df_merged[datacolnames].mean(axis=1)
                allNormalizedreplicates.append(df_merged.drop(columns=datacolnames))

    if not allNormalizedreplicates:
        raise ValueError('no normalized data could be loaded for {!r}'.format(tf))
    return _outerMerge(allNormalizedreplicates)


In [None]:
# Mean normalized signal per box for every condition whose name contains 'MYCH7'.
# NOTE(review): the following cells read a global `df_merged`, not `mycH7df` — confirm which frame they expect.
mycH7df = generateMeanReplicatesDf('MYCH7')

Ratio of direct to amplified signal (direct/amplified) for each condition.

In [None]:
# (direct column, amplified column) pairs, grouped by treatment and then by line.
dfclasification = [
    ('{}{}directsample'.format(line, treatment), '{}{}amplifiedsample'.format(line, treatment))
    for treatment in ('Mock', 'JA', 'ACC')
    for line in ('MYC31', 'MYC36', 'MYC324')
]

# Keep only the signal columns, indexed by box id, and add a 0.5 pseudocount to every value.
df_plot = df_merged.drop(columns=["star", "chr", "end"]).set_index('id')
df_plot = df_plot + 0.5

# Replace each direct/amplified pair with its ratio, named after the condition
# (the column name without the trailing 'directsample').
for (direct, amplified) in dfclasification:
    ratioName = direct[:-len('directsample')]
    df_plot[ratioName] = df_plot[direct] / df_plot[amplified]
    df_plot = df_plot.drop(columns=[direct, amplified])

In [None]:
# Report the mean of every column as a percentage.
for col in df_plot:
    meanPercent = df_plot[col].mean() * 100
    print(col, meanPercent)

In [None]:
# NOTE(review): leftover scratch cell; it only evaluates a literal list and can probably be deleted.
[0]

In [None]:
# MYC3
import pandas as pd
import seaborn as sns
import math
df_plot = df_merged.drop(columns=["star", "chr", "end"])
df_plot = df_plot.set_index('id')
%matplotlib inline
df_plot =df_plot.applymap(lambda v: v+0.5)
df_plot = df_plot.applymap(lambda boxSum: math.log2(boxSum))
img = sns.heatmap(df_plot, cmap="YlGnBu", robust=True)
# df_merged.to_csv('MYC2_meanTPM100k_log2_+0.5.tsv',sep='\t')

In [None]:
df_merged.columns
# Reorder/select the columns of CGdf: direct and amplified samples for each line and treatment.
# NOTE(review): `CGdf` is not defined in any visible cell, so this statement depends on state
# created elsewhere — confirm where it comes from.
CGdf = CGdf[['MYC31Mockdirectsample', 'MYC31Mockamplifiedsample',
       'MYC31JAdirectsample', 'MYC31JAamplifiedsample', 'MYC31ACCdirectsample',
       'MYC31ACCamplifiedsample', 'MYC36Mockdirectsample',
       'MYC36Mockamplifiedsample', 'MYC36JAdirectsample',
       'MYC36JAamplifiedsample', 'MYC36ACCdirectsample',
       'MYC36ACCamplifiedsample', 'MYC324Mockdirectsample',
       'MYC324Mockamplifiedsample', 'MYC324JAdirectsample',
       'MYC324JAamplifiedsample', 'MYC324ACCdirectsample',
       'MYC324ACCamplifiedsample']]


In [None]:
import pandas as pd
import seaborn as sns
import math
df_merged = df_merged.drop(columns=["star", "chr", "end"])
df_merged = df_merged.set_index('id')
%matplotlib inline
df_merged = df_merged.applymap(lambda v: v+0.01)
df_merged = df_merged.applymap(lambda boxSum: math.log2(boxSum))
img = sns.heatmap(df_merged, cmap="YlGnBu", robust=True)

In [None]:
# Same heatmap of df_merged, this time without robust=True.
img = sns.heatmap(df_merged, cmap="YlGnBu")

In [None]:
# https://www.rna-seqblog.com/rpkm-fpkm-and-tpm-clearly-explained/
#                 This is only necessary if we want to calculate RPKMs. Right now we are normalizing by TPMs,
#                 but without dividing by the gene length, because it is constant. If RPKMs are wanted
#                 (the original note ends here).

#         ----------------------------------------------------------------------------------
# For every 'MYC3' condition and metState: compute sample/input per replicate, then average the
# replicate ratios into one column named '<condition><metState>'.
# NOTE(review): boxes with an input value of 0 give inf/NaN in the ratio, and fillna(0) does not
# remove inf — confirm whether a pseudocount is wanted here.
allNormalizedreplicates = []
for experiment in specificPathsSumary:
    if 'MYC3' not in experiment:
        continue
    for metState in ['direct', 'amplified']:
        listofdfNormalizedReplicates = []
        NormalizedReplicatesnames = []
        for replicate in ['replicate1', 'replicate2', 'replicate3']:
            skiptReplicate = False
            sampleAndControl = []
            sampleAndControlNames = []
            for exptype in ['sample', 'input']:
                try:
                    workingFolder = specificPathsSumary[experiment][metState][exptype][replicate]
                except KeyError:
                    # Replicate not available for this exptype: the ratio cannot be computed.
                    skiptReplicate = True
                    break
                datacolname = '{}{}{}{}'.format(experiment, metState, exptype, replicate)
                normalizedDf = generateDfandNormaliceDataTPMs(workingFolder, datacolname)
                if normalizedDf is None:
                    # The loader already printed why the CSV could not be selected; merging
                    # None would crash, so treat it like a missing replicate.
                    skiptReplicate = True
                    break
                sampleAndControlNames.append(datacolname)
                sampleAndControl.append(normalizedDf)

            if skiptReplicate:
                continue

            df_merged = reduce(lambda left, right: pd.merge(left, right, on=['chr', 'star', 'end', 'id'],
                                                            how='outer'), sampleAndControl).fillna(0)
            ratio_col_name = '{}{}{}'.format(experiment, metState, replicate)
            # sampleAndControlNames[0] is the sample column, [1] the input column.
            df_merged[ratio_col_name] = df_merged[sampleAndControlNames[0]] / df_merged[sampleAndControlNames[1]]

            df_merged = df_merged.drop(columns=sampleAndControlNames)
            listofdfNormalizedReplicates.append(df_merged)
            NormalizedReplicatesnames.append(ratio_col_name)

        if not listofdfNormalizedReplicates:
            # reduce() on an empty list raises TypeError; report and move on instead.
            print(experiment, metState, ' has no complete sample/input replicate')
            continue

        df_merged = reduce(lambda left, right: pd.merge(left, right, on=['chr', 'star', 'end', 'id'],
                                                        how='outer'), listofdfNormalizedReplicates).fillna(0)
        mean_col_name = '{}{}'.format(experiment, metState)
        df_merged[mean_col_name] = df_merged[NormalizedReplicatesnames].mean(axis=1)
        df_merged = df_merged.drop(columns=NormalizedReplicatesnames)
        allNormalizedreplicates.append(df_merged)


In [None]:
# Display the current df_merged; after the loop above this is the frame of the last
# condition/metState processed, not a merge of allNormalizedreplicates.
df_merged