https://crazyhottommy.github.io/reproduce_genomics_paper_figures/02_align_to_hg38.html

## Align the FASTQ files

In [1]:
from fig_remake.data_path import DATA_DIR, DATA

In [2]:
# bowtie2 argument list used as a template. The two 'fastq/...' entries are
# placeholders: later cells look them up with list.index() and replace them
# with the real per-sample input and output paths, so their text must not change.
command_template = [
    'bowtie2',
    '-x', 'G:/My Drive/BioData/GRCh38_noalt_as/GRCh38_noalt_as',  # index basename
    '-U', 'fastq/YAP.fastq.gz',  # placeholder: reads file for one sample
    '-S', 'fastq/YAP1.sam',      # placeholder: SAM output for one sample
    '--threads', '8',
    # NOTE(review): --no-mixed / --no-discordant are paired-end options in the
    # bowtie2 manual; input here is given with -U — confirm they are intended.
    '--no-mixed',
    '--no-discordant',
    '-k', '1',
]

In [3]:
# Sample labels; every processing loop in this notebook iterates over them
# in this order and uses each label to name its output files.
data_labels = [
    'IgG',
    'TAZ',
    'TEAD4',
    'YAP',
]

In [4]:
from copy import deepcopy

In [5]:
# Dry run: print the bowtie2 argument list built for each sample.
# Nothing is executed in this cell.
for label in data_labels:
    command = deepcopy(command_template)
    data = DATA(label)
    sam_data = DATA_DIR.joinpath(label + ".sam")

    # Swap each template placeholder for this sample's real path.
    # list.index() raises ValueError if a placeholder is missing from the template.
    for placeholder, actual in (
        ('fastq/YAP.fastq.gz', data),
        ('fastq/YAP1.sam', sam_data),
    ):
        slot = command.index(placeholder)
        command[slot] = actual

    print(command)

['bowtie2', '-x', 'G:/My Drive/BioData/GRCh38_noalt_as/GRCh38_noalt_as', '-U', WindowsPath('D:/src/fig_remake/data/SRR1810912.fastq.gz'), '-S', WindowsPath('D:/src/fig_remake/data/IgG.sam'), '--threads', '8', '--no-mixed', '--no-discordant', '-k', '1']
['bowtie2', '-x', 'G:/My Drive/BioData/GRCh38_noalt_as/GRCh38_noalt_as', '-U', WindowsPath('D:/src/fig_remake/data/SRR1810907.fastq.gz'), '-S', WindowsPath('D:/src/fig_remake/data/TAZ.sam'), '--threads', '8', '--no-mixed', '--no-discordant', '-k', '1']
['bowtie2', '-x', 'G:/My Drive/BioData/GRCh38_noalt_as/GRCh38_noalt_as', '-U', WindowsPath('D:/src/fig_remake/data/SRR1810918.fastq.gz'), '-S', WindowsPath('D:/src/fig_remake/data/TEAD4.sam'), '--threads', '8', '--no-mixed', '--no-discordant', '-k', '1']
['bowtie2', '-x', 'G:/My Drive/BioData/GRCh38_noalt_as/GRCh38_noalt_as', '-U', WindowsPath('D:/src/fig_remake/data/SRR1810900.fastq.gz'), '-S', WindowsPath('D:/src/fig_remake/data/YAP.sam'), '--threads', '8', '--no-mixed', '--no-discordant

In [6]:
# Imported once at the top of the cell rather than inside the loop body.
from fig_remake.timing import TimeSubprocess

# Run the bowtie2 command once per sample, with <label>.sam under DATA_DIR
# passed as the -S output argument.
for label in data_labels:
    # Built once and reused for both the command argument and the skip guard.
    sam_data = DATA_DIR.joinpath(label + ".sam")

    command = deepcopy(command_template)
    data = DATA(label)
    command[command.index('fastq/YAP.fastq.gz')] = data
    command[command.index('fastq/YAP1.sam')] = sam_data

    # Skip samples whose output file is already on disk.
    # NOTE(review): only existence is tested, so a partial .sam left behind by
    # an interrupted run would be skipped as well — confirm outputs are complete.
    if sam_data.exists():
        print("data already processed")
        continue

    with TimeSubprocess(command) as process:
        print(process.stdout)

# Timings noted from an earlier run. As the last expression in the cell, this
# string is also echoed as the cell's output value.
"""
Subprocess executed in 425.9841 seconds.

Subprocess executed in 435.5488 seconds.

Subprocess executed in 527.9717 seconds.

Subprocess executed in 413.6645 seconds.
"""

data already processed
data already processed
data already processed
data already processed


'\nSubprocess executed in 425.9841 seconds.\n\nSubprocess executed in 435.5488 seconds.\n\nSubprocess executed in 527.9717 seconds.\n\nSubprocess executed in 413.6645 seconds.\n'

## Convert SAM to BAM

In [7]:
# `samtools view` argument list used as a template. 'samfile' and 'bamfile'
# are placeholders that the conversion loop replaces with real paths.
sam2bam_template = [
    'samtools', 'view',
    '--bam',
    # NOTE(review): the samtools manual lists -S as ignored, kept only for
    # compatibility with older versions — confirm it is still wanted.
    '-S', 'samfile',
    '-o', 'bamfile',
]

In [8]:
# Imported once at the top of the cell rather than inside the loop body.
from fig_remake.timing import TimeSubprocess

# Convert each sample's <label>.sam into <label>.bam with `samtools view`.
for label in data_labels:
    command = deepcopy(sam2bam_template)
    data = DATA_DIR.joinpath(label + ".sam")
    bam_data = DATA_DIR.joinpath(label + ".bam")
    command[command.index('samfile')] = data
    command[command.index('bamfile')] = bam_data

    # Skip samples whose BAM file is already on disk.
    # NOTE(review): only existence is tested; a partial .bam from an
    # interrupted run would be skipped as well.
    if bam_data.exists():
        print("data already processed")
        continue

    with TimeSubprocess(command) as process:
        print(process.stdout)

# Timings noted from an earlier run. As the last expression in the cell, this
# string is also echoed as the cell's output value.
"""Subprocess executed in 169.4720 seconds.

Subprocess executed in 145.6439 seconds.

Subprocess executed in 167.3066 seconds.

Subprocess executed in 117.7698 seconds."""

data already processed
data already processed
data already processed
data already processed


'Subprocess executed in 169.4720 seconds.\n\nSubprocess executed in 145.6439 seconds.\n\nSubprocess executed in 167.3066 seconds.\n\nSubprocess executed in 117.7698 seconds.'

## Sort the BAM

In [9]:
# `samtools sort` argument list used as a template. 'bamfile' and 'sortedfile'
# are placeholders that the sorting loop replaces with real paths.
sort_command_template = [
    'samtools', 'sort',
    '-@', '4',           # thread-count option of samtools
    'bamfile',           # placeholder: unsorted input
    '-o', 'sortedfile',  # placeholder: sorted output
]

In [17]:
# Imported once at the top of the cell rather than inside the loop body.
from fig_remake.timing import TimeSubprocess

# Sort each sample's <label>.bam into <label>.sorted.bam with `samtools sort`.
for label in data_labels:
    command = deepcopy(sort_command_template)
    bam_data = DATA_DIR.joinpath(label + ".bam")
    sorted_bam_data = DATA_DIR.joinpath(label + ".sorted.bam")
    command[command.index('bamfile')] = bam_data
    command[command.index('sortedfile')] = sorted_bam_data

    # Skip samples whose sorted BAM is already on disk.
    # NOTE(review): only existence is tested; a partial file from an
    # interrupted run would be skipped as well.
    if sorted_bam_data.exists():
        print("data already processed")
        continue

    with TimeSubprocess(command) as process:
        print(process.stdout)



Subprocess executed in 1 minutes and 26.3590 seconds.
data already processed
data already processed
data already processed


## Index the sorted BAM

In [18]:
# Imported once at the top of the cell rather than inside the loop body.
from fig_remake.timing import TimeSubprocess

# Run `samtools index` on each sample's <label>.sorted.bam.
for label in data_labels:
    sorted_bam_data = DATA_DIR.joinpath(label + '.sorted.bam')
    # Path checked by the skip guard below; it is not passed to samtools.
    index_data = DATA_DIR.joinpath(label + '.sorted.bam.bai')
    command = ['samtools', 'index', sorted_bam_data]

    # Skip samples whose .bai file is already on disk.
    if index_data.exists():
        print('data already processed')
        continue

    with TimeSubprocess(command) as process:
        print(process.stdout)


Subprocess executed in 14.6908 seconds.
data already processed
data already processed
data already processed
