TAG to BED

In [20]:
import os
import csv

In [1]:
import os

def remove_leading_whitespace(directory, output_directory=None):
    """Strip stray whitespace from every column of the tab-separated files in a folder.

    Every regular file directly inside ``directory`` whose name ends in ``.txt``
    or ``.tsv`` is read, each line is stripped and split on tabs, each column is
    stripped again, and the columns are re-joined with single tabs.

    Because the whole line is stripped *before* it is split, leading and
    trailing tabs are removed too: an empty first (or last) column disappears
    and the remaining columns shift accordingly.

    Args:
        directory: Folder containing the files to clean.
        output_directory: Folder that receives the cleaned copies; it is created
            (including parents) when missing. When ``None`` or empty, every
            input file is overwritten in place.

    Returns:
        None. One ``Processed file: <name>`` line is printed per cleaned file.
    """
    if output_directory:
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(output_directory, exist_ok=True)

    for filename in os.listdir(directory):
        if not filename.endswith(('.txt', '.tsv')):  # Adjust file extensions if necessary
            continue

        file_path = os.path.join(directory, filename)
        if not os.path.isfile(file_path):
            # A sub-directory that merely looks like a data file cannot be opened.
            continue

        # Read everything first: the output may be the very file being read.
        with open(file_path, 'r') as source:
            cleaned_lines = [
                '\t'.join(column.strip() for column in line.strip().split('\t'))
                for line in source
            ]

        if output_directory:
            output_file_path = os.path.join(output_directory, filename)
        else:
            output_file_path = file_path  # Overwrite original file

        # One newline-terminated record per input line; an empty input stays empty
        # instead of gaining a blank line.
        with open(output_file_path, 'w') as cleaned_file:
            cleaned_file.writelines(f'{cleaned}\n' for cleaned in cleaned_lines)

        print(f'Processed file: {filename}')

# Usage example: clean every .txt/.tsv file found in input_directory and write
# the cleaned copies into output_directory.
# NOTE(review): both paths are absolute and specific to one machine; edit them
# before running this cell anywhere else.
input_directory = r'C:\Rishabh\IISER\Semester Projects\Aug 2024 Ramanathan\Datasets\TF ChIP-seq\MCF10A cell line\gr-d60-1\Tag files'  # Folder holding the raw tag files
output_directory = r'C:\Rishabh\IISER\Semester Projects\Aug 2024 Ramanathan\Datasets\TF ChIP-seq\MCF10A cell line\gr-d60-1\Tag files\cleaned'  # Optional: set to None to overwrite the input files in place
remove_leading_whitespace(input_directory, output_directory)


Processed file: chr1.tags.tsv
Processed file: chr10.tags.tsv
Processed file: chr11.tags.tsv
Processed file: chr12.tags.tsv
Processed file: chr13.tags.tsv
Processed file: chr14.tags.tsv
Processed file: chr15.tags.tsv
Processed file: chr16.tags.tsv
Processed file: chr17.tags.tsv
Processed file: chr18.tags.tsv
Processed file: chr19.tags.tsv
Processed file: chr2.tags.tsv
Processed file: chr20.tags.tsv
Processed file: chr21.tags.tsv
Processed file: chr22.tags.tsv
Processed file: chr3.tags.tsv
Processed file: chr4.tags.tsv
Processed file: chr5.tags.tsv
Processed file: chr6.tags.tsv
Processed file: chr7.tags.tsv
Processed file: chr8.tags.tsv
Processed file: chr9.tags.tsv
Processed file: chrX.tags.tsv
Processed file: chrY.tags.tsv


In [4]:
import os
import pandas as pd

# Convert one per-chromosome tag file into a three-column BED file.
def tag_file_to_bed(tag_file_path, output_file_path):
    """Write a chrom/start/end BED file derived from a single tag file.

    The chromosome name is whatever remains of the file name after removing
    ".tags.tsv"; it is used only to decide whether the file is processed and
    must be one of chr1..chr22, chrX or chrY. The chromosome written to the
    output comes from column 0 of the file itself.

    The input is read as a header-less, tab-separated table. When it has at
    least five columns, column 0 becomes ``chrom``, column 1 becomes ``start``
    and ``end`` is column 1 plus column 4.

    Args:
        tag_file_path: Path to a ``<chrom>.tags.tsv`` file.
        output_file_path: Destination of the BED file (tab-separated, no
            header, no index).

    Returns:
        None in every case. Progress and problems are reported with ``print``;
        errors deriving from ``Exception`` are caught rather than raised.

    NOTE(review): columns 2 and 3 are ignored and ``start`` is copied
    unchanged - confirm whether these tag coordinates need a strand-aware or
    0-based adjustment before being used as BED intervals.
    """
    file_name = os.path.basename(tag_file_path)
    chromosome = file_name.replace(".tags.tsv", "")

    # Autosomes 1-22 plus the two sex chromosomes.
    accepted = {f"chr{number}" for number in range(1, 23)} | {"chrX", "chrY"}
    if chromosome not in accepted:
        print(f"Invalid chromosome name: {chromosome}")
        return

    print(f"Processing {file_name}...")

    try:
        tags = pd.read_csv(tag_file_path, sep='\t', header=None)
        # Debug aid: preview of the table that was just loaded.
        print(f"Initial rows in {file_name}:\n{tags.head()}")
    except Exception as read_error:
        print(f"Failed to read {file_name}: {read_error}")
        return

    # Column 4 is needed below, so narrower tables cannot be converted.
    if tags.shape[1] < 5:
        print(f"Skipping {file_name} due to insufficient columns.")
        return

    try:
        bed = (
            tags[[0, 1, 4]]
            .rename(columns={0: 'chrom', 1: 'start', 4: 'tag_length'})
            .assign(end=lambda frame: frame['start'] + frame['tag_length'])
        )[['chrom', 'start', 'end']]

        bed.to_csv(output_file_path, sep='\t', header=False, index=False)
        print(f"Output written to {output_file_path}")
    except Exception as processing_error:
        print(f"Error processing {file_name}: {processing_error}")

# Example usage: convert the cleaned chrY tag file into its own BED file.
# NOTE(review): both paths are absolute and specific to one machine; the input
# is expected to sit in the "cleaned" folder produced by remove_leading_whitespace.
tag_file_path = r"C:\Rishabh\IISER\Semester Projects\Aug 2024 Ramanathan\Datasets\TF ChIP-seq\MCF10A cell line\gr-d60-1\Tag files\cleaned\chrY.tags.tsv"  # Cleaned tag file to convert
output_file_path = r"C:\Rishabh\IISER\Semester Projects\Aug 2024 Ramanathan\Datasets\TF ChIP-seq\MCF10A cell line\mcf10a_gr_chrY.bed"  # BED file to create (overwritten if present)

# Run the conversion; progress and any problems are printed, nothing is returned
tag_file_to_bed(tag_file_path, output_file_path)


Processing chrY.tags.tsv...
Initial rows in chrY.tags.tsv:
      0        1  2    3   4
0  chrY   493812  1  1.0  61
1  chrY   596253  0  1.0  61
2  chrY   765784  0  1.0  61
3  chrY   958430  0  1.0  61
4  chrY  1507133  0  1.0  43
Output written to C:\Rishabh\IISER\Semester Projects\Aug 2024 Ramanathan\Datasets\TF ChIP-seq\MCF10A cell line\mcf10a_gr_chrY.bed


Automated conversion: merge all tag files into a single BED file

In [5]:
import os
import pandas as pd

# Build the BED rows of one per-chromosome tag file as a DataFrame.
def tag_file_to_bed_df(tag_file_path):
    """Return the chrom/start/end table derived from a single tag file.

    The chromosome name is whatever remains of the file name after removing
    ".tags.tsv"; it is used only to decide whether the file is processed and
    must be one of chr1..chr22, chrX or chrY. The ``chrom`` values returned
    come from column 0 of the file itself.

    The input is read as a header-less, tab-separated table. When it has at
    least five columns, column 0 becomes ``chrom``, column 1 becomes ``start``
    and ``end`` is column 1 plus column 4.

    Args:
        tag_file_path: Path to a ``<chrom>.tags.tsv`` file.

    Returns:
        A DataFrame with the columns ``chrom``, ``start`` and ``end``, or
        ``None`` when the file name is not an accepted chromosome, the table
        has fewer than five columns, or reading/processing raised an error
        deriving from ``Exception`` (each case is reported with ``print``).

    NOTE(review): columns 2 and 3 are ignored and ``start`` is copied
    unchanged - confirm whether these tag coordinates need a strand-aware or
    0-based adjustment before being used as BED intervals.
    """
    file_name = os.path.basename(tag_file_path)
    chromosome = file_name.replace(".tags.tsv", "")

    # Autosomes 1-22 plus the two sex chromosomes.
    accepted = {f"chr{number}" for number in range(1, 23)} | {"chrX", "chrY"}
    if chromosome not in accepted:
        print(f"Invalid chromosome name: {chromosome}")
        return None

    print(f"Processing {file_name}...")

    try:
        tags = pd.read_csv(tag_file_path, sep='\t', header=None)

        # Column 4 is needed below, so narrower tables cannot be converted.
        if tags.shape[1] < 5:
            print(f"Skipping {file_name} due to insufficient columns.")
            return None

        with_end = (
            tags[[0, 1, 4]]
            .rename(columns={0: 'chrom', 1: 'start', 4: 'tag_length'})
            .assign(end=lambda frame: frame['start'] + frame['tag_length'])
        )
        return with_end[['chrom', 'start', 'end']]
    except Exception as error:
        # Any failure while loading or transforming is reported the same way.
        print(f"Failed to read {file_name}: {error}")
        return None

# Convert every tag file in a folder and merge the results into one BED file.
def process_all_tag_files_to_bed(input_directory, output_bed_file):
    """Merge the BED rows of all ``*.tags.tsv`` files in a folder into one file.

    Files are visited in sorted name order and converted with
    ``tag_file_to_bed_df``; files for which it returns ``None`` are left out.
    The combined rows are ordered by chromosome (chr1, chr2, ..., chr22, chrX,
    chrY) and then by ``start`` before being written.

    Args:
        input_directory: Folder containing the ``<chrom>.tags.tsv`` files.
        output_bed_file: Destination of the merged BED file (tab-separated, no
            header, no index).

    Returns:
        None. A confirmation line is printed after writing, or
        "No valid tag files found." when nothing could be converted (no file
        is written in that case).

    NOTE(review): ``chrom`` is turned into a categorical restricted to the 24
    names above; pandas turns labels outside that list into missing values -
    confirm the data never contains other chromosome labels.
    """
    tag_paths = (
        os.path.join(input_directory, name)
        for name in sorted(os.listdir(input_directory))
        if name.endswith('.tags.tsv')
    )
    # Keep only the files that produced a table.
    per_file_tables = [
        table for table in map(tag_file_to_bed_df, tag_paths) if table is not None
    ]

    if not per_file_tables:
        print("No valid tag files found.")
        return

    combined = pd.concat(per_file_tables)

    # An ordered categorical gives a natural chromosome order (chr2 before chr10).
    karyotype_order = [f"chr{number}" for number in range(1, 23)] + ["chrX", "chrY"]
    ordered = (
        combined
        .assign(chrom=pd.Categorical(combined['chrom'], categories=karyotype_order, ordered=True))
        .sort_values(by=['chrom', 'start'])
        .reset_index(drop=True)
    )

    ordered.to_csv(output_bed_file, sep='\t', header=False, index=False)
    print(f"Merged BED file written to {output_bed_file}")

# Example usage: convert every cleaned tag file and merge them into one BED file.
# NOTE(review): both paths are absolute and specific to one machine. This cell also
# rebinds input_directory, which an earlier cell set to the raw "Tag files" folder;
# re-running that earlier cell afterwards would change what this name points to.
input_directory = r"C:\Rishabh\IISER\Semester Projects\Aug 2024 Ramanathan\Datasets\TF ChIP-seq\MCF10A cell line\gr-d60-1\Tag files\cleaned"  # Folder holding the cleaned tag files
output_bed_file = r"C:\Rishabh\IISER\Semester Projects\Aug 2024 Ramanathan\Datasets\TF ChIP-seq\MCF10A cell line\MCF10A_GR_ChIP_seq.bed"  # Merged BED file to create (overwritten if present)

# Run the conversion for the whole folder; progress is printed per file
process_all_tag_files_to_bed(input_directory, output_bed_file)


Processing chr1.tags.tsv...
Processing chr10.tags.tsv...
Processing chr11.tags.tsv...
Processing chr12.tags.tsv...
Processing chr13.tags.tsv...
Processing chr14.tags.tsv...
Processing chr15.tags.tsv...
Processing chr16.tags.tsv...
Processing chr17.tags.tsv...
Processing chr18.tags.tsv...
Processing chr19.tags.tsv...
Processing chr2.tags.tsv...
Processing chr20.tags.tsv...
Processing chr21.tags.tsv...
Processing chr22.tags.tsv...
Processing chr3.tags.tsv...
Processing chr4.tags.tsv...
Processing chr5.tags.tsv...
Processing chr6.tags.tsv...
Processing chr7.tags.tsv...
Processing chr8.tags.tsv...
Processing chr9.tags.tsv...
Processing chrX.tags.tsv...
Processing chrY.tags.tsv...
Merged BED file written to C:\Rishabh\IISER\Semester Projects\Aug 2024 Ramanathan\Datasets\TF ChIP-seq\MCF10A cell line\MCF10A_GR_ChIP_seq.bed
