# Summary:
---

1. This notebook parses FASTA files into pandas DataFrames (one row per sequence, one column per sequence position)
2. Lists the number of viruses in each file
3. Lists the sequence length of each unaligned virus

Unaligned data:
1. Number of total viruses: 350
2. Number of fungus killing viruses: 7
3. Number of fungus non-killing viruses: 343

Aligned data:
1. Number of total viruses: 16
2. Number of fungus killing viruses: 7
3. Number of fungus non-killing viruses: 9

In [1]:
# import useful libraries
from Bio import SeqIO
import pandas as pd

# Useful Functions
---

In [2]:
# Read a FASTA file into a DataFrame with one character per column
def readFasta(file):
    """Parse a FASTA file into a DataFrame with one row per record.

    Parameters
    ----------
    file : str, path-like or open text handle
        Passed straight to ``SeqIO.parse(file, "fasta")``. The input is
        parsed exactly once, so an already-open handle is not exhausted
        half-way through.

    Returns
    -------
    pandas.DataFrame
        Indexed by record id (index name ``"ID"``). Column ``j`` holds the
        character at position ``j`` of each sequence; rows for sequences
        shorter than the longest one are padded with missing values.
    """
    # Materialise the records once so ids and sequences come from the same pass.
    records = list(SeqIO.parse(file, "fasta"))
    ids = pd.Index([record.id for record in records], name="ID")

    # Split every sequence into characters explicitly rather than relying on
    # pandas to iterate the record objects itself.
    residues = [list(str(record.seq)) for record in records]

    return pd.DataFrame(residues, index=ids)

# Creates a DataFrame describing a DataFrame made from a FASTA file
def dfDescriber(df):
    """Report the number of filled positions in every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sequence and one column per position, with missing
        values (``None``/``NaN``) in the padded positions.

    Returns
    -------
    pandas.DataFrame
        A single column ``"Length of Sequence"`` holding, for each row, the
        count of non-missing cells. The index carries the same labels as
        ``df`` and is named ``"ID"``; ``df`` itself is not modified.

    Notes
    -----
    Every non-missing cell is counted, whatever character it holds.
    """
    # Vectorised count of non-missing cells per row. The cast keeps an integer
    # dtype even when the frame has no columns.
    lengths = df.notna().sum(axis=1).astype(int)
    return pd.DataFrame(
        {"Length of Sequence": lengths.to_numpy()},
        # rename() returns a new Index, leaving df.index untouched.
        index=df.index.rename("ID"),
    )


# Row-level scan kept for existing callers; dfDescriber() no longer needs it
def rowSearch(row, key):
    """Count the entries of ``row`` whose string form differs from ``key``.

    Parameters
    ----------
    row : iterable
        Values to scan in order.
    key : str
        Marker for entries that must not be counted.

    Returns
    -------
    int
        Number of counted entries. The scan stops early only when an entry
        is equal to ``key`` itself. An entry that merely *prints* as ``key``
        (for example ``None`` with ``key="None"``) is skipped and the scan
        continues.
    """
    count = 0
    for value in row:
        if str(value) != key:
            count += 1
        elif value == key:
            # A literal key ends the scan; look-alikes fall through and are skipped.
            return count
    return count
    

# Unaligned Data
---

## Fungus Killers

In [6]:
# Fungus killers: parse the Sclerotinia biocontrol mycovirus FASTA file.
# readFasta returns one row per record (indexed by ID) and one column per
# sequence position.

virus1 = readFasta("data/Sclerotinia_biocontrol_mycovirus_nucleotide.fasta")
virus1.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,14571,14572,14573,14574,14575,14576,14577,14578,14579,14580
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NC_007415.1,C,T,T,T,A,G,T,A,A,A,...,,,,,,,,,,
NC_015939.1,G,G,G,G,T,G,A,T,G,G,...,,,,,,,,,,
NC_022896.1,T,T,T,T,T,G,G,G,G,A,...,A,A,A,A,A,A,A,A,A,A
NC_026510.1,T,T,G,G,C,T,C,C,T,G,...,,,,,,,,,,
NC_027138.1,G,C,A,A,T,A,A,A,A,A,...,,,,,,,,,,


#### Fungus Killers Described:

In [8]:
# Report how many fungus-killing records were loaded, then preview the
# per-record sequence lengths computed by dfDescriber.
print("Number of killing viruses:", len(virus1))
dfDescriber(virus1).head()

Number of killing viruses: 7


Unnamed: 0_level_0,Length of Sequence
ID,Unnamed: 1_level_1
NC_007415.1,5470
NC_015939.1,10438
NC_022896.1,14581
NC_026510.1,2530
NC_027138.1,6457


### Fungus Killers and Non-Killers Mixed

In [5]:
# Fungus killers and non-killers together: parse the combined mycovirus
# RefSeq FASTA file into one row per record.

virus01 = readFasta("data/mycovirus_genbank_all_refseq_nucleotide_unique.fasta")
virus01.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,23625,23626,23627,23628,23629,23630,23631,23632,23633,23634
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NC_000960.1,G,G,G,G,G,T,T,A,G,A,...,,,,,,,,,,
NC_001278.1,G,G,G,A,A,A,T,T,T,G,...,,,,,,,,,,
NC_001492.1,G,C,C,T,A,T,G,G,G,T,...,,,,,,,,,,
NC_001633.1,A,C,A,A,A,A,T,A,A,T,...,,,,,,,,,,
NC_001641.1,G,A,A,T,T,T,T,T,C,G,...,,,,,,,,,,


#### Mixed Described:

In [15]:
# Derive the counts from the loaded frames instead of hard-coding 350 - 7, so
# the report stays correct if either FASTA file changes. A record of the mixed
# frame counts as a killer when its ID also appears in the fungus-killer frame
# (virus1, loaded in the "Fungus Killers" section).
n_total = len(virus01)
n_killers = int(virus01.index.isin(virus1.index).sum())
print("Number of killing and non-killing viruses:", n_total)
print("Number of non-killing viruses ({} - {}):".format(n_total, n_killers), n_total - n_killers)
dfDescriber(virus01).head()

Number of killing and non-killing viruses: 350
Number of non-killing viruses (350 - 7): 343


Unnamed: 0_level_0,Length of Sequence
ID,Unnamed: 1_level_1
NC_000960.1,9799
NC_001278.1,4113
NC_001492.1,12734
NC_001633.1,4009
NC_001641.1,4615


# Aligned Data
---

## Nobiocontrol

In [11]:
# Aligned hypoviridae polyprotein sequences, non-biocontrol subset.
# The preview below shows '-' entries in many positions of these rows.
a_virus0 = readFasta("data/hypoviridae_aligned_polyprotein_nucleotide_nobiocontrol.fas")
print("Number of non-killing viruses:", len(a_virus0))
a_virus0.head()

Number of non-killing viruses: 9


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,12466,12467,12468,12469,12470,12471,12472,12473,12474,12475
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AB690372.1:379-9201_Valsa_ceratosperma_hypovirus_1_RNA_complete_genome_strain:_MVC86,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
KF537784.1:310-8856_Phomopsis_longicolla_hypovirus_isolate_ME711_complete_genome,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
KC330231.1:511-12543_Fusarium_graminearum_hypovirus_1_isolate_HN10_complete_genome,-,-,-,-,-,-,-,-,-,-,...,C,A,A,T,T,A,T,T,G,A
LC150612.1:462-12296_Fusarium_poae_hypovirus_1_genomic_RNA_complete_genome,A,T,G,A,C,T,G,G,C,T,...,T,T,A,G,G,T,T,A,A,-
KY120321.1:476-12331_Fusarium_langsethiae_hypovirus_1_isolate_FlHV1/AH32_complete_genome,-,-,-,-,-,-,-,-,-,-,...,T,T,A,G,G,T,T,A,A,-


## Biocontrol

In [13]:
# Aligned hypoviridae polyprotein sequences, biocontrol (fungus-killing) subset.
a_virus1 = readFasta("data/hypoviridae_aligned_polyprotein_nucleotide_biocontrol.fas")
print("Number of killing viruses:", len(a_virus1))
a_virus1.head()

Number of killing viruses: 7


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,14961,14962,14963,14964,14965,14966,14967,14968,14969,14970
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
JF781304.1:542-9388_Sclerotinia_sclerotiorum_hypovirus_1_strain_SZ-150_complete_genome,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
KF525367.1:314-14221_Sclerotinia_sclerotiorum_hypovirus_2_isolate_5472_complete_genome,A,T,G,A,T,C,C,C,C,T,...,C,A,A,T,G,G,T,T,A,G
KJ561218.1:468-14567_Sclerotinia_sclerotiorum_hypovirus_2_isolate_SX247_complete_genome,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
KP208178.1:458-12292_Fusarium_graminearum_hypovirus_2_isolate_FgHV2/JS16_complete_genome,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
NC_001492.1:496-11861_Cryphonectria_hypovirus_1_complete_genome,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-


## All

In [14]:
# Aligned hypoviridae polyprotein sequences, biocontrol and non-biocontrol together.
a_virus01 = readFasta("data/hypoviridae_aligned_polyprotein_nucleotide_all.fas")
print("Number of total viruses:", len(a_virus01))
a_virus01.head()

Number of total viruses: 16


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,15144,15145,15146,15147,15148,15149,15150,15151,15152,15153
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
JF781304.1:542-9388_Sclerotinia_sclerotiorum_hypovirus_1_strain_SZ-150_complete_genome,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
KF525367.1:314-14221_Sclerotinia_sclerotiorum_hypovirus_2_isolate_5472_complete_genome,A,T,G,A,T,C,C,C,C,T,...,C,A,A,T,G,G,T,T,A,G
KJ561218.1:468-14567_Sclerotinia_sclerotiorum_hypovirus_2_isolate_SX247_complete_genome,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
AB690372.1:379-9201_Valsa_ceratosperma_hypovirus_1_RNA_complete_genome_strain:_MVC86,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
KF537784.1:310-8856_Phomopsis_longicolla_hypovirus_isolate_ME711_complete_genome,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-


# Scratch Work
---

In [10]:
# Scratch check: look up record NC_027138.1 (it appears in the fungus-killer
# preview above) in the mixed killer/non-killer frame.
virus01.loc[virus01.index == "NC_027138.1"]

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,23625,23626,23627,23628,23629,23630,23631,23632,23633,23634
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NC_027138.1,G,C,A,A,T,A,A,A,A,A,...,,,,,,,,,,
