# Summary
---

This notebook shows the process of creating a function to parse fasta files into dataframes. <br>
The function below is the best version so far (as of 2/2):

In [13]:
# function to read in fasta files
def readFasta(file):
    """Parse a FASTA file into a DataFrame with one row per record.

    The input is parsed a single time; record ids and sequence letters are
    both taken from that one pass. Besides avoiding the second read, this
    makes the function work with an already-open handle, which is exhausted
    after one pass and could not be parsed again to collect the ids.

    Parameters
    ----------
    file : str, path-like or text handle
        Passed unchanged as the first argument of ``SeqIO.parse``.

    Returns
    -------
    pandas.DataFrame
        Indexed by record id (index name ``'ID'``), with one integer-labelled
        column per sequence position (0, 1, 2, ...). Rows of sequences shorter
        than the longest one are padded with missing values.
    """
    # single pass: materialise the records so ids and letters stay aligned
    records = list(SeqIO.parse(file, 'fasta'))

    ids = pd.Index([record.id for record in records], name='ID')
    # one list of letters per record -> one column per sequence position
    letters = [list(record.seq) for record in records]

    return pd.DataFrame(letters, index=ids)

# Parsing Attempts:
---

In [2]:
# import useful libraries
from Bio import SeqIO
import pandas as pd

# allow multiple outputs: display the value of every top-level expression
# in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Parse attempt 1

In [12]:
# read in fasta files and make dataframes
# (each path is handed straight to SeqIO.parse instead of wrapping it in
#  open(), so no file handle is left open by this cell)

fasta_sequences = SeqIO.parse("data/mycovirus_genbank_all_refseq_nucleotide_unique.fasta",'fasta')
df_0 = pd.DataFrame(fasta_sequences)

fasta_sequences = SeqIO.parse("data/Sclerotinia_biocontrol_mycovirus_nucleotide.fasta",'fasta')
df_1 = pd.DataFrame(fasta_sequences)

In [165]:
# preview the first rows of both dataframes; both bare expressions are shown
# because ast_node_interactivity is set to "all" in the imports cell
df_0.head()

df_1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23625,23626,23627,23628,23629,23630,23631,23632,23633,23634
0,G,G,G,G,G,T,T,A,G,A,...,,,,,,,,,,
1,G,G,G,A,A,A,T,T,T,G,...,,,,,,,,,,
2,G,C,C,T,A,T,G,G,G,T,...,,,,,,,,,,
3,A,C,A,A,A,A,T,A,A,T,...,,,,,,,,,,
4,G,A,A,T,T,T,T,T,C,G,...,,,,,,,,,,


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14571,14572,14573,14574,14575,14576,14577,14578,14579,14580
0,C,T,T,T,A,G,T,A,A,A,...,,,,,,,,,,
1,G,G,G,G,T,G,A,T,G,G,...,,,,,,,,,,
2,T,T,T,T,T,G,G,G,G,A,...,A,A,A,A,A,A,A,A,A,A
3,T,T,G,G,C,T,C,C,T,G,...,,,,,,,,,,
4,G,C,A,A,T,A,A,A,A,A,...,,,,,,,,,,


#### Problem

The dataframes above don't list the virus names (record IDs), so rows cannot be matched to a virus.

## Parse attempt 2

In [9]:
# collect id -> sequence for every record in one pass over the file
d = {
    fasta.id: fasta.seq
    for fasta in SeqIO.parse("data/Sclerotinia_biocontrol_mycovirus_nucleotide.fasta", "fasta")
}
# wide view: a single row with one column per record id
pd.DataFrame([d])

# long view: one row per record, with columns ID and Sequence
s = pd.Series(d, name='Sequence').rename_axis('ID')
s.reset_index()

Unnamed: 0,NC_007415.1,NC_015939.1,NC_022896.1,NC_026510.1,NC_027138.1,NC_027139.1,NC_025383.1
0,"(C, T, T, T, A, G, T, A, A, A, T, A, C, C, A, ...","(G, G, G, G, T, G, A, T, G, G, G, A, A, A, C, ...","(T, T, T, T, T, G, G, G, G, A, T, G, G, T, A, ...","(T, T, G, G, C, T, C, C, T, G, G, A, G, A, C, ...","(G, C, A, A, T, A, A, A, A, A, G, C, A, C, A, ...","(G, C, A, A, T, A, A, A, A, A, G, C, A, C, A, ...","(A, G, G, G, A, G, T, A, T, T, A, T, A, A, T, ..."


Unnamed: 0,ID,Sequence
0,NC_007415.1,"(C, T, T, T, A, G, T, A, A, A, T, A, C, C, A, ..."
1,NC_015939.1,"(G, G, G, G, T, G, A, T, G, G, G, A, A, A, C, ..."
2,NC_022896.1,"(T, T, T, T, T, G, G, G, G, A, T, G, G, T, A, ..."
3,NC_026510.1,"(T, T, G, G, C, T, C, C, T, G, G, A, G, A, C, ..."
4,NC_027138.1,"(G, C, A, A, T, A, A, A, A, A, G, C, A, C, A, ..."
5,NC_027139.1,"(G, C, A, A, T, A, A, A, A, A, G, C, A, C, A, ..."
6,NC_025383.1,"(A, G, G, G, A, G, T, A, T, T, A, T, A, A, T, ..."


#### Problem

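The dataframes above don't display the virus sequences nicely: each sequence is packed into a single cell as a tuple of letters instead of being spread over one column per position.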

## Parse attempt 3
(combining methods 1 and 2)

In [3]:
# read in sequence and id separately (the file is parsed twice: once per piece)

fasta_sequences = SeqIO.parse("data/Sclerotinia_biocontrol_mycovirus_nucleotide.fasta",'fasta')
df_1 = pd.DataFrame(fasta_sequences)


# Rejected alternative, kept for reference -- collecting the ids in a set:
# a = {fasta.id for fasta in SeqIO.parse("data/Sclerotinia_biocontrol_mycovirus_nucleotide.fasta", "fasta")}
# print(a) and print(list(a)) showed the ids in a different order than the file.
# A set {} does not keep the order of the ids; a list [] does, so a list is used
# to keep each id aligned with its row in df_1.
df_1["ID"] = [fasta.id for fasta in SeqIO.parse("data/Sclerotinia_biocontrol_mycovirus_nucleotide.fasta", "fasta")]

# place id column at front of dataframe
# ("ID" was just appended, so it is the last column: move it to position 0)
cols = list(df_1.columns)
cols = [cols[-1]] + cols[:-1]
df_1 = df_1[cols]

df_1.head()

Unnamed: 0,ID,0,1,2,3,4,5,6,7,8,...,14571,14572,14573,14574,14575,14576,14577,14578,14579,14580
0,NC_007415.1,C,T,T,T,A,G,T,A,A,...,,,,,,,,,,
1,NC_015939.1,G,G,G,G,T,G,A,T,G,...,,,,,,,,,,
2,NC_022896.1,T,T,T,T,T,G,G,G,G,...,A,A,A,A,A,A,A,A,A,A
3,NC_026510.1,T,T,G,G,C,T,C,C,T,...,,,,,,,,,,
4,NC_027138.1,G,C,A,A,T,A,A,A,A,...,,,,,,,,,,


#### Problem

Each file has to be read (parsed) twice: once for the sequences and once for the IDs.

## Scratch Work

In [168]:
# def makeDF(input):
#     s1 = []
#     s2 = []
#     for fasta in input:
#         s1.append(fasta.id)
#         s2.append(fasta.seq)
#     s1 = pd.Series(s1)
#     #s2 = pd.Series(s2)
#     s3 = pd.DataFrame(s2)
#     #return pd.concat([s1, s2], axis=1)
#     return s3

# for fasta in fasta_sequences:
#         name, sequence = fasta.id, str(fasta.seq)