## Reformat genotype file into bed format

In [6]:
import pandas as pd
import numpy as np

In [3]:
# Directory that the cells below prepend to "simGenome_*.tsv" file names
# when reading the simulated genome inputs (note the trailing slash: paths
# are built by plain string concatenation).
simulated_genomes_dir = "/home/lbruce/teams/CSE284_SP21_A00/team3/Git/cse284_project/Data/"

In [36]:
def convert_tsv_to_beds(filename, genome_tsv_file, output_dir, chrom='21'):
    """
    Convert a genome tsv file (simulated or otherwise) into two bed-style files,
    one per chromosome copy.

    Input (tab-separated, with header):
    POS     A1      A2      POP1    POP2
    0       C       T       0       0

    Output (tab-separated, header row included):
    <stem>_copy1.bed: CHR    START_POS  STOP_POS  POP1
    <stem>_copy2.bed: CHR    START_POS  STOP_POS  POP2

    STOP_POS is always POS + 1, i.e. every row is a 1-bp interval.

    Parameters:
    filename        -- name of the input file; only used to derive <stem>,
                       which is the name with its final extension removed
    genome_tsv_file -- path of the tsv file that is actually read
    output_dir      -- directory the two bed files are written to
                       (with or without a trailing slash; it must exist)
    chrom           -- value written to the CHR column (default '21')
    """
    # Local import: os is not part of the notebook's import cell.
    import os

    df = pd.read_csv(genome_tsv_file, sep='\t')

    # Insert chromosome column
    df.insert(0, 'CHR', chrom)

    # Insert stop position for bed file, right after the start position
    df.insert(2, 'STOP_POS', df['POS'] + 1)
    df = df.rename(columns={'POS': 'START_POS'})

    # splitext drops only the last extension, so dotted names such as
    # "sim_0.5.tsv" keep their full stem instead of collapsing to "sim_0".
    stem = os.path.splitext(filename)[0]

    # Write one file per chromosome copy
    for copy_number, pop_column in enumerate(('POP1', 'POP2'), start=1):
        # os.path.join works whether or not output_dir ends with a separator
        out_path = os.path.join(output_dir, stem + '_copy' + str(copy_number) + '.bed')
        df[['CHR', 'START_POS', 'STOP_POS', pop_column]].to_csv(
            out_path, sep='\t', index=False
        )

In [37]:
# Split the simulated genome into one bed file per chromosome copy.
filename = "simGenome_100_0_0.tsv"
genome_file = simulated_genomes_dir + filename
output_dir = "/home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_21_files/"

convert_tsv_to_beds(
    filename=filename,
    genome_tsv_file=genome_file,
    output_dir=output_dir,
)

## Fake Admixed Data for testing my bedtools formatting

## Merge Regions with same predicted Population Labels

In [67]:
def merge_regions(bed_file, chrom, output_prefix, pop_header):
    """
    Merge positions where the adjacent population labels are the same.
    Output bed file formatted for chromosome painting processing.

    Consecutive rows (in file order) that carry the same label in
    `pop_header` are collapsed into one interval that runs from the first
    row's START_POS to the last row's START_POS + 1. Gaps between positions
    are bridged; only a change of label starts a new interval.

    Input (tab-separated, header row with START_POS and `pop_header` columns):
    21 1  2 0
    21 5  6 0
    21 8  9 1
    Output (written to <output_prefix>.merged.bed, no header):
    chr21 1 6 AFR
    chr21 8 9 EUR

    Parameters:
    bed_file      -- path of the per-position bed file to read
    chrom         -- chromosome name written in the first output column
    output_prefix -- output path without the ".merged.bed" suffix
    pop_header    -- name of the column holding the population label

    Labels 0 and 1 are written as AFR and EUR; any other label is written
    through unchanged rather than being guessed.
    """
    # Local import: itertools is not part of the notebook's import cell.
    from itertools import groupby

    df = pd.read_csv(bed_file, sep='\t')

    pop_map = {0: "AFR", 1: "EUR"}

    # (start position, label) pairs in file order; tolist() yields plain
    # Python values so the positions print without numpy formatting.
    rows = zip(df['START_POS'].tolist(), df[pop_header].tolist())

    # The context manager guarantees the merged file is flushed and closed.
    with open(output_prefix + ".merged.bed", 'w') as out:
        # groupby only groups *adjacent* equal labels, which is exactly a run.
        for label, run in groupby(rows, key=lambda pair: pair[1]):
            positions = [start for start, _ in run]
            output_line = [
                str(chrom),
                str(positions[0]),
                str(positions[-1] + 1),
                str(pop_map.get(label, label)),
            ]
            out.write('\t'.join(output_line) + '\n')

In [50]:
# Build a fake "admixed" genome for testing: the first 51 rows keep the
# simulated labels, every later row has both copies set to population 1.
filename = "MIXED_simGenome_100_0_0.tsv"
output_dir = "/home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_21_files/"
mixed_genome="/home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_21_files/MIXED_simGenome_100_0_0.tsv"

genome_file = simulated_genomes_dir + "simGenome_100_0_0.tsv"
df = pd.read_csv(genome_file, sep='\t')

# Copy of the genome with POP1 and POP2 overwritten by 1 on every row
df_opposite = df.assign(POP1=1, POP2=1)

# Stitch the head of the original onto the tail of the overwritten copy
df_mixed = pd.concat([df.iloc[:51], df_opposite.iloc[51:]])
df_mixed.to_csv(mixed_genome, sep='\t', index=False)

# Convert to split bed files
convert_tsv_to_beds(
    filename=filename,
    genome_tsv_file=mixed_genome,
    output_dir=output_dir,
)

In [68]:
# Run the merge on copy 1 of the fake mixed genome (used as a test input)
output_dir = "/home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_21_files/"

merge_regions(
    bed_file=output_dir + '/MIXED_simGenome_100_0_0_copy1.bed',
    chrom='chr21',
    output_prefix=output_dir + '/MIXED_simGenome_100_0_0_copy1',
    pop_header='POP1',
)

In [69]:
# Same merge for copy 2 of the fake mixed genome; relies on output_dir
# already being defined in the notebook session.
merge_regions(
    bed_file=output_dir + '/MIXED_simGenome_100_0_0_copy2.bed',
    chrom='chr21',
    output_prefix=output_dir + '/MIXED_simGenome_100_0_0_copy2',
    pop_header='POP2',
)

# Reformat chr 14/21 files to have AFR=0 and EUR=1

In [5]:
# Rewrite the HMM output so the population names in the third and fourth
# tab-separated fields become numeric codes (AFR -> 0, EUR -> 1); every
# other field is copied through unchanged.
hmm_file = "/home/lbruce/teams/CSE284_SP21_A00/team3/HMM_Log10_Test_Outputs/admixEUR_AFR_chr14_Rx1_b_recomb0.01_HMMoutput.tsv"
pop_codes = {'AFR': '0', 'EUR': '1'}

# Both files are opened in one `with` so the formatted output is flushed and
# closed even if a line fails; a bare open() left as a notebook global would
# keep buffered rows in memory for as long as the kernel holds the handle.
with open(hmm_file, 'r') as f, \
        open("/home/lbruce/teams/CSE284_SP21_A00/team3/HMM_Log10_Test_Outputs/admixEUR_AFR_chr14_Rx1_b_recomb0.01_HMMoutput_formatted.tsv", 'w') as hmm_formatted_file:
    for lines in f:
        fields = lines.rstrip().split('\t')

        # Only touch the label columns that exist, so blank or short lines
        # pass through instead of raising IndexError.
        for column in (2, 3):
            if column < len(fields):
                fields[column] = pop_codes.get(fields[column], fields[column])

        hmm_formatted_file.write('\t'.join(fields) + '\n')


## TO DELETE

Use Bedtools to Merge on Population

In [39]:
%%bash
output_dir="/home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_21_files/"
# Abandoned attempt, kept for reference only: the commands below are commented
# out, so this cell does nothing beyond setting output_dir.
# Merges regions based on POP1/POP2 column (column 4, reported as its distinct values)
# Doesn't work for our purpose because regions do not overlap :(
#bedtools merge -c 4 -o distinct -i $output_dir/simGenome_100_0_0_copy1.bed -header >  $output_dir/simGenome_100_0_0_copy1.merged.bed
#bedtools merge -c 4 -o distinct -i $output_dir/simGenome_100_0_0_copy2.bed -header >  $output_dir/simGenome_100_0_0_copy2.merged.bed

In [52]:
%%bash
# DO NOT USE: Doesn't work because regions not overlapping
# The commands below are commented out, so this cell does nothing beyond
# setting output_dir.

output_dir="/home/lbruce/teams/CSE284_SP21_A00/team3/chromosome_21_files/"
# Merges regions based on POP1/POP2 column (same attempt on the MIXED test files,
# reporting the distinct values of columns 1-4)
#bedtools merge -c 1,2,3,4 -o distinct -i $output_dir/MIXED_simGenome_100_0_0_copy1.bed -header >  $output_dir/MIXED_simGenome_100_0_0_copy1.merged.bed
#bedtools merge -c 1,2,3,4 -o distinct -i $output_dir/MIXED_simGenome_100_0_0_copy2.bed -header >  $output_dir/MIXED_simGenome_100_0_0_copy2.merged.bed