In [2]:
import os
import re
import numpy as np
import pandas as pd
import process_fasta
import process_fasta_scaffold
from collections import defaultdict

In [6]:
def _parse_field(element):
    """Convert one space-separated token to int or float, else return it unchanged."""
    if element.isdigit():
        return int(element)
    if '.' in element and element[: element.find('.')].isdigit():
        try:
            return float(element)
        except ValueError:
            # Digits before the first dot but not a valid number (e.g. '1.2.3'):
            # keep the raw text instead of aborting the whole file.
            return element
    return element


def read_in_df(path, seq_marker='QMGA', verbose=True):
    """Parse a multi-sequence .dat report into one DataFrame per sequence.

    The file is split at every line containing ``seq_marker``. Within each
    section, every line that splits on single spaces into exactly 15 fields
    becomes a row; all other lines are ignored.

    NOTE(review): the 15 column names look like Tandem Repeats Finder output
    -- confirm against the tool that produced the file.

    Parameters
    ----------
    path : str
        Path of the .dat file to read.
    seq_marker : str, optional
        Substring that identifies a sequence-header line. Defaults to
        ``'QMGA'``, the accession prefix used in the donor1 file.
    verbose : bool, optional
        When True (default) every matched header line is printed.

    Returns
    -------
    list of pandas.DataFrame
        One frame per header line, in file order, with columns ordered as
        Start_Index, End_Index, Motif, Tandem_Repeat, followed by the
        remaining numeric columns. A section without any 15-field line
        yields an empty frame; a file without any header yields ``[]``.
    """
    columns = ['Start_Index', 'End_Index', 'Period_Size', 'Copy_Number', 'Consensus_Size' , 'Percent_Matches', 'Percent_Indels', 'Score', 'A' , 'C' ,'G', 'T', 'Entropy', 'Motif', 'Tandem_Repeat']
    rearrange  = ['Start_Index', 'End_Index', 'Motif', 'Tandem_Repeat', 'Period_Size', 'Copy_Number', 'Consensus_Size' , 'Percent_Matches', 'Percent_Indels', 'Score', 'A' , 'C' ,'G', 'T', 'Entropy']

    # Data rows are read starting 7 lines after a header line and stopping
    # 2 lines before the next header.
    # NOTE(review): presumably this skips the blank/"Parameters" lines around
    # each header; the 15-field check below filters any stragglers anyway.
    header_to_data = 7
    gap_before_next_header = 2

    with open(path) as f:
        # NOTE(review): the final line of the file is always discarded, even
        # if it is a data row -- confirm the file ends with a blank line.
        lines = f.readlines()[:-1]

    scaffold_lines = []
    for i, l in enumerate(lines):
        if seq_marker in l:
            if verbose:
                print(l)
            scaffold_lines.append(i)

    dfs = []
    for i, header_idx in enumerate(scaffold_lines):
        start = header_idx + header_to_data
        if i != len(scaffold_lines) - 1:
            end = scaffold_lines[i + 1] - gap_before_next_header
        else:
            end = len(lines)

        rows = []
        for line in lines[start:end]:
            # split(' ') (not split()) on purpose: consecutive spaces create
            # empty fields, so such lines fail the 15-field check.
            row = [_parse_field(element) for element in line.replace('\n', '').split(' ')]
            if len(row) == 15:
                rows.append(row)

        df = pd.DataFrame(rows, columns=columns)
        dfs.append(df[rearrange])

    return dfs

## Read the .dat file of donor1

In [7]:
# Parse the donor1 .dat file. read_in_df returns a list of DataFrames,
# one per sequence-header line it finds in the file.
df_d1 = read_in_df("../Data/Donor1/GCA_003314715.1_Tur_tru_Illumina_hap_v1_genomic.fna.2.7.7.80.10.50.500.dat")

Sequence: QMGA01000001.1 Tursiops truncatus isolate Sample_04329 scaffold1, whole genome shotgun sequence

Sequence: QMGA01000010.1 Tursiops truncatus isolate Sample_04329 scaffold10, whole genome shotgun sequence

Sequence: QMGA01000099.1 Tursiops truncatus isolate Sample_04329 scaffold100, whole genome shotgun sequence

Sequence: QMGA01000100.1 Tursiops truncatus isolate Sample_04329 scaffold101, whole genome shotgun sequence

Sequence: QMGA01000101.1 Tursiops truncatus isolate Sample_04329 scaffold102, whole genome shotgun sequence

Sequence: QMGA01000102.1 Tursiops truncatus isolate Sample_04329 scaffold103, whole genome shotgun sequence

Sequence: QMGA01000103.1 Tursiops truncatus isolate Sample_04329 scaffold104, whole genome shotgun sequence

Sequence: QMGA01000104.1 Tursiops truncatus isolate Sample_04329 scaffold105, whole genome shotgun sequence

Sequence: QMGA01000105.1 Tursiops truncatus isolate Sample_04329 scaffold106, whole genome shotgun sequence

Sequence: QMGA01000106

In [None]:
# Row count of each per-sequence frame, in file order.
print(list(map(len, df_d1)))

In [None]:
# Total number of parsed rows across all frames.
sum(len(frame) for frame in df_d1)

## Read the FASTA file of donor1

In [3]:
# Load the donor1 FASTA through the project-local process_fasta_scaffold helper.
# Later cells look the result up by key and slice the values, so it is
# presumably a mapping from FASTA header line to sequence -- confirm against
# the helper's implementation.
d1_genome_dict = process_fasta_scaffold.process_fasta_file('../Data/Donor1/GCA_003314715.1_Tur_tru_Illumina_hap_v1_genomic.fna')

Reading the FASTA file...
Preprocessing the file...
Collecting scaffold genomes
DONE.


In [9]:
# Show the last key of d1_genome_dict. Key order matters because
# add_flanking_regions pairs the i-th frame with the i-th key by position.
list(d1_genome_dict.keys())[-1]

'>QMGA01000098.1 Tursiops truncatus isolate Sample_04329 scaffold99, whole genome shotgun sequence'

## Add flanking regions

In [None]:
def add_flanking_regions(df_d1, d1_genome_dict, fr_size):
    """Attach left/right flanking sequence to every repeat row.

    The i-th frame in ``df_d1`` is paired with the i-th key of
    ``d1_genome_dict`` (by position, not by name), so both inputs must list
    the sequences in the same order.

    Rows are kept only when ``Start_Index > fr_size`` and
    ``End_Index + fr_size < len(genome)``, i.e. when both flanks fit inside
    the sequence. For each kept row:

    * ``LFR`` = ``genome[Start_Index - fr_size : Start_Index]``
    * ``RFR`` = ``genome[End_Index : End_Index + fr_size]``

    NOTE(review): if Start_Index/End_Index are 1-based inclusive coordinates,
    the left flank as sliced here ends on the first base of the repeat rather
    than just before it -- confirm the coordinate convention against how
    ``d1_genome_dict`` stores the sequence.

    Parameters
    ----------
    df_d1 : list of pandas.DataFrame
        Frames with at least ``Start_Index`` and ``End_Index`` columns.
    d1_genome_dict : dict
        Ordered mapping whose values support ``len()`` and slicing.
    fr_size : int
        Length of each flanking region.

    Returns
    -------
    list of pandas.DataFrame
        New frames (the inputs are not modified), one per input frame, each
        with ``LFR`` and ``RFR`` columns appended. Frames in which no row
        survives the filter are empty but still carry both columns, so all
        outputs share one schema.

    Raises
    ------
    IndexError
        If ``df_d1`` has more frames than ``d1_genome_dict`` has keys.
    """
    # Build the positional key list once instead of once per frame.
    scaffold_keys = list(d1_genome_dict.keys())

    df_proc_d1 = []
    for i, repeats in enumerate(df_d1):
        genome = d1_genome_dict[scaffold_keys[i]]

        # Both flanks must fit inside the sequence.
        fits = (repeats['Start_Index'] > fr_size) & (repeats['End_Index'] + fr_size < len(genome))
        # Explicit copy: new columns are added below, and assigning onto a
        # filtered slice is what triggers pandas' SettingWithCopyWarning.
        temp = repeats[fits].copy()

        lfrs = [genome[start - fr_size:start] for start in temp['Start_Index']]
        rfrs = [genome[end:end + fr_size] for end in temp['End_Index']]

        # Always add the columns (object dtype even when empty) so that
        # empty and non-empty frames have identical columns.
        temp['LFR'] = pd.Series(lfrs, index=temp.index, dtype=object)
        temp['RFR'] = pd.Series(rfrs, index=temp.index, dtype=object)

        df_proc_d1.append(temp)

    return df_proc_d1

In [None]:
# Add flanking regions of length 100 on each side of every repeat; rows whose
# flanks would run past either end of the sequence are dropped by the function.
df_proc_d1 = add_flanking_regions(df_d1, d1_genome_dict, 100)

In [None]:
# Spot check: raw slice from the second entry of d1_genome_dict, to compare by
# eye with the Tandem_Repeat shown in the next cell.
# NOTE(review): 14466/14524 are hard-coded -- presumably the Start_Index and
# End_Index of that row; confirm before relying on this check.
d1_genome_dict[list(d1_genome_dict.keys())[1]][14466:14524]

In [None]:
# Tandem_Repeat of the row labelled 1 in the second processed frame.
# .loc is label-based, so this raises KeyError if that row was filtered out.
df_proc_d1[1].loc[1].Tandem_Repeat

In [None]:
# Write each processed frame to its own CSV in the working directory,
# numbered by its position in df_proc_d1.
for idx, frame in enumerate(df_proc_d1):
    out_name = 'df_proc_d1_'+str(idx)+'.csv'
    frame.to_csv(out_name, index = False)