In [1]:
import pandas as pd
import bs4 as bs
import requests
import numpy as np
import progressbar
import os
import csv
from Bio import Entrez
import re
import time
import sys

## Data Acquisition

In [2]:
# Download the NCBI viral genome browser page and parse it into a BeautifulSoup tree
url = 'https://www.ncbi.nlm.nih.gov/genomes/GenomesGroup.cgi'
# A timeout keeps the cell from hanging forever if the server never answers
result = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of silently parsing an error page
result.raise_for_status()
main_soup = bs.BeautifulSoup(result.content, 'html.parser')

In [3]:
# Parse main_soup for the table of viruses (the <table class="tblTxt"> element)
table = main_soup.find('table', {'class': 'tblTxt'})
# find() returns None when nothing matches; stop here with a clear message instead of
# failing later with an AttributeError on None
if table is None:
    raise RuntimeError("No <table class='tblTxt'> element was found in the downloaded page")

In [4]:
# This table contains some subtables, which are included within single cells, called 'minitable' below
# The rows in the minitables are formatted differently, but contain the same info that I want, so this function 
# will convert them into the same format as the rest. A list comprehension didn't work because the 'minirows' have 
# variable lengths
def edit_minirow(row):
    """Convert a row scraped from a nested 'minitable' into the 11-cell layout of a main-table row.

    Minitable rows hold between one and five cells. Their cells are moved to fixed
    positions of an 11-element list and every other position is filled with the
    string 'None':

        row[0] -> index 0, row[1] -> index 4, row[2] -> index 1,
        row[3] -> index 5, row[4] -> index 6

    Parameters
    ----------
    row : list
        The cells of one minitable row (1 to 5 items).

    Returns
    -------
    list
        A new 11-element list in main-table layout.

    Raises
    ------
    ValueError
        If `row` is empty or has more than five cells. (The previous version printed
        a message and then failed with an UnboundLocalError in this case.)
    """
    n = len(row)
    if not 1 <= n <= 5:
        raise ValueError('error: cannot reformat a minitable row with {} cells: {!r}'.format(n, row))

    # Destination index for row[0], row[1], row[2], row[3], row[4] respectively
    destinations = (0, 4, 1, 5, 6)

    new_minirow = ['None'] * 11
    for value, destination in zip(row, destinations):
        new_minirow[destination] = value
    return new_minirow

# Top-level classification labels. A scraped row whose only cell is one of these strings
# is treated as a classification heading rather than as a genome record.
class_list = [
    'Deltavirus',
    'Genomoviridae',
    'Retro-transcribing viruses',
    'Satellites',
    'Virus families not assigned to an order',
    'Virus-associated RNAs',
    'dsDNA viruses, no RNA stage',
    'dsRNA viruses',
    'environmental samples',
    'ssDNA viruses',
    'ssRNA viruses',
    'unassigned viruses',
    'unclassified RNA viruses',
    'unclassified archaeal viruses',
    'unclassified bacterial viruses',
    'unclassified virophages',
    'unclassified viruses',
]

In [5]:
# Create an empty list of lists; each inner list is one scraped table row
viral_genome_array = []

# Every raw minitable row is recorded here so that the same row is not copied a second time when the
#  main-table loop reaches it (find_all() is recursive, so nested rows are visited again as ordinary rows).
#  The alternative would be to not include separate minitable code, but since the minitables are within
#  cells, those cells' text is added to the table
minirow_master_list = []

# The most recent non-minitable row; it is appended to each minitable row, as it contains relevant info
head_row = []

# This variable holds the subheader of the previous lighter colored row, which carries extra info for the
# light rows beneath it
sub_head = ''
sub_head_list = []

# Go through all of the rows in the table
for row in table.find_all('tr'):
    this_row = []
    # Go through each cell in each row
    for cell in row.find_all('td'):

        # If the cell contains a table (the pulldown rows on the main page) reformat those rows like the normal rows
        # NOTE(review): an AttributeError raised anywhere inside this try (not only by a missing nested table)
        # also falls through to the except branch below; kept as-is to preserve the scraped result.
        try:
            minitable = cell.find('table')
            for minirow in minitable.find_all('tr'):
                this_minirow = []
                for minicell in minirow.find_all('td'):
                    if 'NC_' in minicell.text:
                        this_minirow.append(minicell.find('a').attrs['href'])
                    else:
                        this_minirow.append(minicell.text.replace('\n','').replace('\xa0',''))
                minirow_master_list.append(this_minirow)
                edited_minirow = edit_minirow(this_minirow)
                # The parent row is stored as a nested list in the 12th position (index 11)
                edited_minirow.append(head_row)
                edited_minirow[0] = edited_minirow[0] + ' segment '
                viral_genome_array.append(edited_minirow)

        # If there isn't a table, grab the href link for cells with a 'NC_', otherwise grab the text
        except AttributeError:
            if 'NC_' in cell.text:
                this_row.append(cell.find('a').attrs['href'])
            else:
                this_row.append(cell.text.replace('\n','').replace('\xa0',''))

    # For the lighter rows, which share a subheader, either grab the subheader or append the current subheader in
    # the 12th position (index 11). Rows without a bgcolor attribute are simply not "light" rows, so .get() is
    # used instead of a bare try/except that would hide any other error.
    if row.attrs.get('bgcolor') == '#F8F8F8':
        if len(this_row) == 1:
            # Classifications also have this lighter color, but they are treated differently so that they can
            #  be added to every row
            if this_row[0] not in class_list:
                sub_head = this_row[0]
                sub_head_list.append(this_row)
        else:
            while len(this_row) < 11:
                this_row.append('None')
            this_row.append(sub_head)

    # Don't take rows that have already been taken down and reformatted, nor rows that include the text
    # of a whole minitable (which have the name repeated). Rows with no <td> cells are skipped as well,
    # which avoids an IndexError on this_row[0].
    if this_row and this_row not in minirow_master_list and this_row[0] not in this_row[1:]:
        head_row = this_row
        viral_genome_array.append(this_row)

print('done')

done


In [6]:
# The first row becomes the DataFrame header, so give it names for the two extra columns
viral_genome_array[0].extend(['Classification', 'Extra Info'])

# A row holding only a classification label applies to every row after it, until the next label appears.
# The loop walks a copy of the list (the slice), so removing label rows from the original is safe.
classification = 'None'
for row in viral_genome_array[1:]:
    if row[0] in class_list:
        # Label row: remember it and drop it from the data
        classification = row[0]
        viral_genome_array.remove(row)
        continue
    # Data row: pad to at least 11 cells, then store the current label at index 10
    row.extend(['None'] * (11 - len(row)))
    row[10] = classification

In [7]:
# Collect the distinct values stored in the 12th position (index 11) of the 12-cell rows:
# these are the head rows / subheaders that were attached as extra info
head_row_list = []
for row in viral_genome_array[1:]:
    if len(row) != 12:
        continue
    head = row[11]
    if head == 'None' or head in head_row_list:
        continue
    head_row_list.append(head)

# With the heads known, go through the array again (iterating over a copy) and remove every row that is
# itself one of those head rows or one of the subheader rows
rows_to_discard = head_row_list + sub_head_list
for row in viral_genome_array[1:]:
    if row in rows_to_discard:
        viral_genome_array.remove(row)

In [8]:
# Back-fill from the 'Extra Info' lists.
# When a row ends in a list, every cell of that row that is 'None', '' or '-' is replaced by the
#  value at the same position of that list; positions 3, 4, 5 and 6 are never touched
for row in viral_genome_array:
    if type(row[-1]) != list:
        continue
    extra_list = row[-1]
    for i, cell_value in enumerate(row):
        if i in (3, 4, 5, 6):
            continue
        if cell_value in ('None', '', '-'):
            row[i] = extra_list[i]

In [9]:
# Append a running number to every name that contains 'segment'.
# segment_counts maps the un-numbered name to how many times it has been seen so far.
segment_counts = {}

for row in viral_genome_array:
    name = row[0]
    if 'segment' not in name:
        continue
    segment_counts[name] = segment_counts.get(name, 0) + 1
    row[0] = name + str(segment_counts[name])

In [10]:
# Build the DataFrame: the first scraped row supplies the column names and the data starts at index 2.
# NOTE(review): the row at index 1 is skipped by the [2:] slice — confirm that this is intended.
viral_genomes = pd.DataFrame(viral_genome_array[2:], columns = viral_genome_array[0])
viral_genomes.head()

Unnamed: 0,Genome,Accession,Source information,Segm,Length,Protein,Neighbors,Host,Created,Updated,Classification,Extra Info
0,Hepatitis delta virus,/nuccore/13277517,,-,1682nt,2,320,"vertebrates, human",04/29/1993,02/10/2015,Deltavirus,
1,Badger associated gemykibivirus 1,/nuccore/807743872,strain:588t,-,2112nt,2,-,vertebrates,04/16/2015,11/24/2015,Genomoviridae,
2,Bemisia-associated genomovirus AdDF,/nuccore/1211677462,isolate:AdDF,-,2199nt,2,-,invertebrates,06/27/2017,07/19/2017,Genomoviridae,
3,Bemisia-associated genomovirus AdO,/nuccore/1211677465,isolate:AdO,-,2211nt,2,-,invertebrates,06/27/2017,07/19/2017,Genomoviridae,
4,Bemisia-associated genomovirus NfO,/nuccore/1211677468,isolate:NfO,-,2231nt,2,-,invertebrates,06/27/2017,07/19/2017,Genomoviridae,


In [13]:
#------------------------------------------------------------------------------------------------------------------
##### This cell gets the full html 'soup' from each virus' page, which includes the genome. My intention is to only run this once since it will take a while, and save the result into a csv file in this directory.
length = len(viral_genomes['Genome'])

def scrape_viruses(start = 0, end = 9460):
    """Download the NCBI page of every genome in rows [start, end) and save them to a CSV file.

    For each row i the page at 'https://www.ncbi.nlm.nih.gov' + viral_genomes['Accession'][i] is
    fetched and parsed with BeautifulSoup. One (genome name, page) line per row is then written to
    "virus_soups_<start>-<end>.csv".

    Parameters
    ----------
    start : int
        First row of viral_genomes to download (inclusive).
    end : int
        Row to stop at (exclusive).

    Returns
    -------
    None
    """
    # A list of [name, soup] pairs (not a dict keyed by name) keeps exactly one output line per
    # DataFrame row, even if two rows share a genome name
    soup_rows = []
    ###### Using a progress bar, go through the dataframe, getting a soup for each genome
    with progressbar.ProgressBar(max_value=length) as bar:
        for i in range(start, end):
            name = viral_genomes['Genome'][i]
            slug = viral_genomes['Accession'][i]
            result = requests.get('https://www.ncbi.nlm.nih.gov{}'.format(slug))
            soup = bs.BeautifulSoup(result.content, 'html.parser')
            soup_rows.append([name, soup])
            bar.update(i)

    os.system('say "download complete"')

    ###### Write the rows to a csv: "virus_soups_<start>-<end>.csv"
    # The with-block guarantees the file is flushed and closed; newline='' is what the csv module expects
    out_path = "virus_soups_{}.csv".format(str(start) + '-' + str(end))
    with open(out_path, "w", newline='', encoding='utf-8') as out_file:
        csv.writer(out_file).writerows(soup_rows)
    os.system('say "file saved"')


# Download in chunks. These calls sit at top level (inside the function body they would recurse
# forever and never be run by the cell). A chunk whose output file already exists is skipped so
# that re-running the notebook does not repeat the download.
for chunk_start, chunk_end in [(0, 1000), (1000, 2000), (2000, 3000), (3000, 4000), (4000, 5000),
                               (5000, 6000), (6000, 7000), (7000, 8000), (8000, 9460)]:
    if not os.path.exists("virus_soups_{}-{}.csv".format(chunk_start, chunk_end)):
        scrape_viruses(chunk_start, chunk_end)

In [28]:
# Load the nine saved chunks of scraped pages; the files have no header row, so name the columns here
soup_columns = ['Name', 'BeautifulSoup']
soup_files = [
    'virus_soups_0-1000.csv',
    'virus_soups_1000-2000.csv',
    'virus_soups_2000-3000.csv',
    'virus_soups_3000-4000.csv',
    'virus_soups_4000-5000.csv',
    'virus_soups_5000-6000.csv',
    'virus_soups_6000-7000.csv',
    'virus_soups_7000-8000.csv',
    'virus_soups_8000-9460.csv',
]
(soup0, soup1, soup2, soup3, soup4,
 soup5, soup6, soup7, soup8) = [pd.read_csv(path, header = None, names = soup_columns) for path in soup_files]

In [45]:
# Stack the nine chunks, in order, into one frame with a fresh 0..n-1 index
soup_chunks = [soup0, soup1, soup2, soup3, soup4, soup5, soup6, soup7, soup8]
soups = pd.concat(soup_chunks, ignore_index = True)

In [50]:
# Re-parse each saved page string into a BeautifulSoup object and attach it to the main DataFrame.
# The assignment aligns on the row index, so it relies on `soups` being in the same row order as viral_genomes.
viral_genomes['BeautifulSoup'] = soups['BeautifulSoup'].apply(lambda x: bs.BeautifulSoup(x, 'html.parser'))
viral_genomes.head()

Unnamed: 0,Genome,Accession,Source information,Segm,Length,Protein,Neighbors,Host,Created,Updated,Classification,Extra Info,BeautifulSoup
0,Hepatitis delta virus,/nuccore/13277517,,-,1682nt,2,320,"vertebrates, human",04/29/1993,02/10/2015,Deltavirus,,"<?xml version=""1.0"" encoding=""utf-8""?> <!DOCTY..."
1,Badger associated gemykibivirus 1,/nuccore/807743872,strain:588t,-,2112nt,2,-,vertebrates,04/16/2015,11/24/2015,Genomoviridae,,"<?xml version=""1.0"" encoding=""utf-8""?> <!DOCTY..."
2,Bemisia-associated genomovirus AdDF,/nuccore/1211677462,isolate:AdDF,-,2199nt,2,-,invertebrates,06/27/2017,07/19/2017,Genomoviridae,,"<?xml version=""1.0"" encoding=""utf-8""?> <!DOCTY..."
3,Bemisia-associated genomovirus AdO,/nuccore/1211677465,isolate:AdO,-,2211nt,2,-,invertebrates,06/27/2017,07/19/2017,Genomoviridae,,"<?xml version=""1.0"" encoding=""utf-8""?> <!DOCTY..."
4,Bemisia-associated genomovirus NfO,/nuccore/1211677468,isolate:NfO,-,2231nt,2,-,invertebrates,06/27/2017,07/19/2017,Genomoviridae,,"<?xml version=""1.0"" encoding=""utf-8""?> <!DOCTY..."


In [51]:
# Summarize the columns (non-null counts and dtypes) after adding the BeautifulSoup column
viral_genomes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9460 entries, 0 to 9459
Data columns (total 13 columns):
Genome                9460 non-null object
Accession             9460 non-null object
Source information    9460 non-null object
Segm                  9460 non-null object
Length                9460 non-null object
Protein               9460 non-null object
Neighbors             9460 non-null object
Host                  9460 non-null object
Created               9460 non-null object
Updated               9460 non-null object
Classification        9460 non-null object
Extra Info            3000 non-null object
BeautifulSoup         9460 non-null object
dtypes: object(13)
memory usage: 960.9+ KB


In [52]:
# This function returns the id that I need for each virus to access it in the NCBI Entrez database
def get_db_id (soup):
    """Return the id text found in a page's <p class="itemid"> element.

    The first 25 characters of the element's text are dropped and the remainder is returned.
    NOTE(review): presumably those 25 characters are a fixed label in front of the id — confirm
    against a live page.

    Returns np.nan when the lookup raises AttributeError (for example when the page has no such element).
    """
    try:
        item_text = soup.find('p', {'class': 'itemid'}).text
    except AttributeError:
        return np.nan
    return item_text[25:]

# Add the sequence type to the main DataFrame
def get_seq_type(soup):
    """Return the sequence type named in a page's title.

    The title is the text of the <h1> inside <div class="rprtheader">. The first run of lowercase
    letters, digits, commas and spaces that follows a comma is taken, and its first two characters
    are dropped before it is returned.

    Returns the string 'error' when the elements are missing or the title has no such run.
    """
    try:
        title = soup.find('div', {"class":"rprtheader"}).find('h1').text
        return re.search(',[a-z,0-9, ]+', title)[0][2:]
    except (AttributeError, TypeError):
        # AttributeError: an element was not found; TypeError: the regex did not match
        return 'error'
    
# Run both helpers on every parsed page, adding the 'ItemId' and 'SequenceType' columns
viral_genomes['ItemId'] = viral_genomes['BeautifulSoup'].apply(get_db_id)
viral_genomes['SequenceType'] = viral_genomes['BeautifulSoup'].apply(get_seq_type)

In [53]:
# Check to make sure we have an itemid for each virus/segment: the non-null count of the
# ItemId column should equal the number of rows
viral_genomes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9460 entries, 0 to 9459
Data columns (total 15 columns):
Genome                9460 non-null object
Accession             9460 non-null object
Source information    9460 non-null object
Segm                  9460 non-null object
Length                9460 non-null object
Protein               9460 non-null object
Neighbors             9460 non-null object
Host                  9460 non-null object
Created               9460 non-null object
Updated               9460 non-null object
Classification        9460 non-null object
Extra Info            3000 non-null object
BeautifulSoup         9460 non-null object
ItemId                9458 non-null object
SequenceType          9460 non-null object
dtypes: object(15)
memory usage: 1.1+ MB


In [54]:
# Print the row position of every missing id (a missing id is np.nan, which is a float)
for position, item_id in enumerate(viral_genomes['ItemId']):
    if type(item_id) is float:
        print(position)

624
625


In [58]:
# Show rows 624 and 625, the two positions printed by the missing-id check
viral_genomes[624:626]

Unnamed: 0,Genome,Accession,Source information,Segm,Length,Protein,Neighbors,Host,Created,Updated,Classification,Extra Info,BeautifulSoup,ItemId,SequenceType
624,Bacillus phage Bobb,/nuccore/682123955,,-,160281nt,247,-,bacteria,09/09/2014,09/09/2014,"dsDNA viruses, no RNA stage",,"<?xml version=""1.0"" encoding=""utf-8""?> <!DOCTY...",,error
625,Bacillus phage Bp8p-C,/nuccore/985761324,,-,151417nt,211,-,bacteria,02/05/2016,02/05/2016,"dsDNA viruses, no RNA stage",,"<?xml version=""1.0"" encoding=""utf-8""?> <!DOCTY...",,error


In [59]:
# Manually enter ids for rows 624 and 625.
# .loc writes straight into the DataFrame; chained indexing (df[col][row] = value) can end up
# assigning to a temporary copy and raises SettingWithCopyWarning.

viral_genomes.loc[624, 'ItemId'] = 'NC_024792.1'
viral_genomes.loc[625, 'ItemId'] = 'NC_029121'

In [60]:
# Re-check the column summary after the manual id entries
viral_genomes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9460 entries, 0 to 9459
Data columns (total 15 columns):
Genome                9460 non-null object
Accession             9460 non-null object
Source information    9460 non-null object
Segm                  9460 non-null object
Length                9460 non-null object
Protein               9460 non-null object
Neighbors             9460 non-null object
Host                  9460 non-null object
Created               9460 non-null object
Updated               9460 non-null object
Classification        9460 non-null object
Extra Info            3000 non-null object
BeautifulSoup         9460 non-null object
ItemId                9460 non-null object
SequenceType          9460 non-null object
dtypes: object(15)
memory usage: 1.1+ MB


In [61]:
# Count how many rows carry each extracted SequenceType value (including the 'error' fallback)
viral_genomes['SequenceType'].value_counts()

complete genome                                                                                                                                                     5528
complete sequence                                                                                                                                                   1683
complete cds                                                                                                                                                        1414
error                                                                                                                                                                202
hypothetical protein 2, and hypothetical protein 3 genes, complete cds                                                                                                93
                                                                                                                                                           

In [62]:
# Save the DataFrame, parsed pages included. index=False is not passed, so the row index is
# written as an extra first column.
viral_genomes.to_csv('viral_genomes_soups.csv')

## Get NCBI reports with Entrez

In [67]:
###### As with scraping the html for each virus above, I will attempt to only extract the report for each virus once,
###### I will save this to a df and thereafter I'll be importing that and not running this cell.

length = len(viral_genomes['Genome'])

def get_virus_reports(start = 0, end = length):
    """Fetch the GenBank report of every genome in rows [start, end) and save them to a CSV file.

    For each row i the record with id viral_genomes['ItemId'][i] is requested from the Entrez
    'nuccore' database as GenBank text. One (genome name, report) line per fetched row is then
    written to "virus_reports_<start>-<end>.csv".

    Parameters
    ----------
    start : int
        First row of viral_genomes to fetch (inclusive).
    end : int
        Row to stop at (exclusive). Defaults to the number of rows at the time this cell was run.

    Returns
    -------
    None
    """
    Entrez.email = 'qdupupet@umass.edu'
    # A list of [name, report] pairs (not a dict keyed by name) keeps one output line per fetched
    # row, even if two rows share a genome name
    report_rows = []
    with progressbar.ProgressBar(max_value=length) as bar:
        for i in range(start, end):
            name = viral_genomes['Genome'][i]
            try:
                handle = Entrez.efetch(db="nuccore", id=viral_genomes['ItemId'][i], rettype="gb", retmode="text")
                try:
                    report = handle.read()
                finally:
                    handle.close()
                report_rows.append([name, report])
            except AttributeError:
                # Previously misspelled 'AttributeErrror', which is itself a NameError as soon as
                # an exception reaches this handler. The row is still skipped, but no longer silently.
                print('skipping row', i, name)
            # Pause between requests so the NCBI servers are not hit too quickly
            time.sleep(0.5)
            bar.update(i)

    os.system('say "download complete"')

    ###### Write the rows to a csv: "virus_reports_<start>-<end>.csv"
    # The with-block guarantees the file is flushed and closed; newline='' is what the csv module expects
    out_path = "virus_reports_{}.csv".format(str(start) + '-' + str(end))
    with open(out_path, "w", newline='', encoding='utf-8') as out_file:
        csv.writer(out_file).writerows(report_rows)
    os.system('say "file saved"')


# Fetch in chunks. These calls sit at top level (inside the function body they would recurse
# forever and never be run by the cell). A chunk whose output file already exists is skipped so
# that re-running the notebook does not repeat the download.
for chunk_start, chunk_end in [(0, 2000), (2000, 4000), (4000, 6000), (6000, 8000), (8000, length)]:
    if not os.path.exists("virus_reports_{}-{}.csv".format(chunk_start, chunk_end)):
        get_virus_reports(chunk_start, chunk_end)

In [82]:
# Load the five saved chunks of reports; the files have no header row, so name the columns here
report_columns = ['Name', 'Report']
report_files = [
    'virus_reports_0-2000.csv',
    'virus_reports_2000-4000.csv',
    'virus_reports_4000-6000.csv',
    'virus_reports_6000-8000.csv',
    'virus_reports_8000-9460.csv',
]
report0, report1, report2, report3, report4 = [
    pd.read_csv(path, header = None, names = report_columns) for path in report_files
]

In [83]:
# Stack the five chunks, in order, into one frame with a fresh 0..n-1 index
report_chunks = [report0, report1, report2, report3, report4]
reports = pd.concat(report_chunks, ignore_index = True)

In [89]:
# Attach the reports to the main DataFrame. The assignment aligns on the row index, so it relies
# on `reports` being in the same row order as viral_genomes.
viral_genomes['Report'] = reports['Report']

In [90]:
# Save the DataFrame with pages and reports. The '.csv' extension was missing; every other
# output of this notebook has it, and the loader in the next section reads
# 'viral_genomes_soups_and_reports.csv'.
viral_genomes.to_csv('viral_genomes_soups_and_reports.csv')

## Extract genome sequences

In [None]:
## read in the full meta df with all reports.
#chunksize = 1000
#TextFileReader = pd.read_csv('viral_genomes_soups_and_reports.csv', chunksize=chunksize, iterator=True)
#viral_genomes = pd.concat(TextFileReader, ignore_index=True)

#viral_genomes.tail()

In [91]:
# Extract genome sequences from the reports, add them to viral_genomes
# Placeholder value; the column is overwritten by the .apply(get_genome) assignment at the end of this cell
viral_genomes['Sequence'] = 'None'

def get_genome(report):
    """Return the nucleotide sequence stored in the ORIGIN section of a GenBank report.

    The ORIGIN section is the text that follows the keyword 'ORIGIN' and consists of lowercase
    letters, digits, commas, spaces and newlines. Position numbers and whitespace are removed
    and the remaining letters are joined into one string.

    Every lowercase letter is kept, not only a/t/c/g: ambiguity codes such as 'n' are part of
    the sequence, and dropping them would shorten it and splice the flanking bases together.

    Parameters
    ----------
    report : str
        Full text of a GenBank report.

    Returns
    -------
    str or None
        The sequence, or None (after printing 'error at' and the report) when `report` is not
        a string or has no ORIGIN section.
    """
    # A missing report (e.g. NaN read back from a CSV) cannot be searched with a regex
    if not isinstance(report, str):
        print('error at', report)
        return None

    origin_match = re.search('ORIGIN[0-9,a-z, ,\n]+', report)
    if origin_match is None:
        print('error at', report)
        return None

    # The keyword itself is uppercase, so only sequence letters match here
    return ''.join(re.findall('[a-z]+', origin_match.group(0)))


# Run get_genome on every report and store the result in the 'Sequence' column
viral_genomes['Sequence'] = viral_genomes['Report'].apply(get_genome)

In [92]:
# Summarize the columns after adding the Report and Sequence columns
viral_genomes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9460 entries, 0 to 9459
Data columns (total 17 columns):
Genome                9460 non-null object
Accession             9460 non-null object
Source information    9460 non-null object
Segm                  9460 non-null object
Length                9460 non-null object
Protein               9460 non-null object
Neighbors             9460 non-null object
Host                  9460 non-null object
Created               9460 non-null object
Updated               9460 non-null object
Classification        9460 non-null object
Extra Info            3000 non-null object
BeautifulSoup         9460 non-null object
ItemId                9460 non-null object
SequenceType          9460 non-null object
Report                9460 non-null object
Sequence              9460 non-null object
dtypes: object(17)
memory usage: 1.2+ MB


In [94]:
# Display the last five rows to inspect the new columns
viral_genomes.tail()

Unnamed: 0,Genome,Accession,Source information,Segm,Length,Protein,Neighbors,Host,Created,Updated,Classification,Extra Info,BeautifulSoup,ItemId,SequenceType,Report,Sequence
9455,Wilkie partiti-like virus 1,/nuccore/1211677385,strain:mosWSCP36002,-,2100nt,1,-,invertebrates,06/27/2017,07/14/2017,unclassified viruses,,"<?xml version=""1.0"" encoding=""utf-8""?> <!DOCTY...",NC_035122.1,complete genome,LOCUS NC_035122 2100 bp ...,tatactacaatatatacaatatttttcactcataaaatgactgact...
9456,Wilkie partiti-like virus 2,/nuccore/1211677379,strain:mosWSCP53020,-,1819nt,1,1,eukaryotes,06/27/2017,07/14/2017,unclassified viruses,,"<?xml version=""1.0"" encoding=""utf-8""?> <!DOCTY...",NC_035119.1,complete genome,LOCUS NC_035119 1819 bp ...,ggcaatatgaattcctatgcacttaacaacattattcgtacttcac...
9457,Wolkberg virus segment 1,/nuccore/1197509331,isolate:2562_SA3,,(4461 nt),proteins: 1,neighbors: 10,invertebrates,05/24/2017,05/24/2017,unclassified viruses,"[Wolkberg virus, , isolate:2562_SA3, 3, 12312n...","<?xml version=""1.0"" encoding=""utf-8""?> <!DOCTY...",NC_034631.1,complete cds,LOCUS NC_034631 4461 bp ...,agtagtgtactaccaatattctaatattttcaatatatctttacaa...
9458,Wolkberg virus segment 2,/nuccore/1197509335,isolate:2562_SA3,,(6873 nt),proteins: 1,neighbors: 10,invertebrates,05/24/2017,05/24/2017,unclassified viruses,"[Wolkberg virus, , isolate:2562_SA3, 3, 12312n...","<?xml version=""1.0"" encoding=""utf-8""?> <!DOCTY...",NC_034633.1,complete cds,LOCUS NC_034633 6873 bp ...,agtagtgtactcctatactaacttcacttatcactaaaatggacga...
9459,Wolkberg virus segment 3,/nuccore/1197509333,isolate:2562_SA3,,(978 nt),proteins: 1,neighbors: 10,invertebrates,05/24/2017,05/24/2017,unclassified viruses,"[Wolkberg virus, , isolate:2562_SA3, 3, 12312n...","<?xml version=""1.0"" encoding=""utf-8""?> <!DOCTY...",NC_034632.1,complete cds,LOCUS NC_034632 978 bp ...,agtagtgtactccagaaaaaagacaataacaaatctcaatctacaa...


In [96]:
# Extract the molecule type (RNA or DNA) from the report
# Placeholder value; the column is overwritten by the .apply(extract_genome_type) assignment at the end of this cell
viral_genomes['Molecule Type'] = 'None'

def extract_genome_type(report):
    """Return the value of the first mol_type="..." entry in a GenBank report.

    The value must consist of ASCII letters and spaces only (for example 'genomic RNA').
    The character class is spelled [A-Za-z ] because the range A-z also covers the
    punctuation characters that sit between 'Z' and 'a' in ASCII.

    Parameters
    ----------
    report : str
        Full text of a GenBank report.

    Returns
    -------
    str or None
        The molecule type without its quotes, or None (after printing 'error at' and the
        report) when `report` is not a string or holds no such entry.
    """
    # A missing report (e.g. NaN read back from a CSV) cannot be searched with a regex
    if not isinstance(report, str):
        print('error at', report)
        return None

    mol_type_match = re.search('mol_type="([A-Za-z ]+)"', report)
    if mol_type_match is None:
        print('error at', report)
        return None
    return mol_type_match.group(1)
    
# Run extract_genome_type on every report and store the result in the 'Molecule Type' column
viral_genomes['Molecule Type'] = viral_genomes['Report'].apply(extract_genome_type)

In [97]:
# Extract the phylogeny from the report
# Placeholder value; the column is overwritten by the .apply(extract_phylogeny) assignment at the end of this cell
viral_genomes['Phylogeny'] = 'None'

def extract_phylogeny(report):
    """Return the taxonomic lineage of a GenBank report, starting at the word 'Viruses'.

    The lineage is read from the first occurrence of 'Viruses' up to (not including) the next
    period. It may wrap over several indented lines and may contain hyphens or digits (for
    example 'ssRNA positive-strand viruses, no DNA stage'). Line breaks and indentation are
    collapsed to single spaces in the result.

    Parameters
    ----------
    report : str
        Full text of a GenBank report.

    Returns
    -------
    str or None
        The lineage such as 'Viruses; Genomoviridae; Gemykibivirus', or None (after printing
        'error at' and the report) when `report` is not a string or does not contain 'Viruses'.
    """
    # A missing report (e.g. NaN read back from a CSV) cannot be searched with a regex
    if not isinstance(report, str):
        print('error at', report)
        return None

    # Take any character except '.' and newline, or a newline followed by indentation (a
    # continuation line). The match therefore stops at the closing period or at the first
    # line that starts in column 0.
    lineage_match = re.search(r'Viruses(?:[^.\n]|\n[ \t]+)*', report)
    if lineage_match is None:
        print('error at', report)
        return None
    return ' '.join(lineage_match.group(0).split())
    
# Run extract_phylogeny on every report and store the result in the 'Phylogeny' column
viral_genomes['Phylogeny'] = viral_genomes['Report'].apply(extract_phylogeny)

In [98]:
# Summarize the columns after adding the Molecule Type and Phylogeny columns
viral_genomes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9460 entries, 0 to 9459
Data columns (total 19 columns):
Genome                9460 non-null object
Accession             9460 non-null object
Source information    9460 non-null object
Segm                  9460 non-null object
Length                9460 non-null object
Protein               9460 non-null object
Neighbors             9460 non-null object
Host                  9460 non-null object
Created               9460 non-null object
Updated               9460 non-null object
Classification        9460 non-null object
Extra Info            3000 non-null object
BeautifulSoup         9460 non-null object
ItemId                9460 non-null object
SequenceType          9460 non-null object
Report                9460 non-null object
Sequence              9460 non-null object
Molecule Type         9460 non-null object
Phylogeny             9460 non-null object
dtypes: object(19)
memory usage: 1.4+ MB


In [99]:
# Save the full DataFrame; index = False leaves the row index out of the file
viral_genomes.to_csv('full_viral_genomes.csv', index = False)

In [2]:
## read in the full dataframe.
# The file is read in pieces of `chunksize` rows and the pieces are concatenated back into a
# single DataFrame with a fresh index
chunksize = 1000
TextFileReader = pd.read_csv('full_viral_genomes.csv', chunksize=chunksize, iterator=True)
viral_genomes = pd.concat(TextFileReader, ignore_index=True)

# Display the last five rows of the reloaded frame
viral_genomes.tail()

Unnamed: 0,Genome,Accession,Source information,Segm,Length,Protein,Neighbors,Host,Created,Updated,Classification,Extra Info,BeautifulSoup,ItemId,SequenceType,Report,Sequence,Molecule Type,Phylogeny
9455,Wilkie partiti-like virus 1,/nuccore/1211677385,strain:mosWSCP36002,-,2100nt,1,-,invertebrates,06/27/2017,07/14/2017,unclassified viruses,,"<?xml version=""1.0"" encoding=""utf-8""?>\n<!DOCT...",NC_035122.1,complete genome,LOCUS NC_035122 2100 bp ...,tatactacaatatatacaatatttttcactcataaaatgactgact...,genomic RNA,Viruses; unclassified viruses
9456,Wilkie partiti-like virus 2,/nuccore/1211677379,strain:mosWSCP53020,-,1819nt,1,1,eukaryotes,06/27/2017,07/14/2017,unclassified viruses,,"<?xml version=""1.0"" encoding=""utf-8""?>\n<!DOCT...",NC_035119.1,complete genome,LOCUS NC_035119 1819 bp ...,ggcaatatgaattcctatgcacttaacaacattattcgtacttcac...,genomic RNA,Viruses; unclassified viruses
9457,Wolkberg virus segment 1,/nuccore/1197509331,isolate:2562_SA3,,(4461 nt),proteins: 1,neighbors: 10,invertebrates,05/24/2017,05/24/2017,unclassified viruses,"['Wolkberg virus', '', 'isolate:2562_SA3', '3'...","<?xml version=""1.0"" encoding=""utf-8""?>\n<!DOCT...",NC_034631.1,complete cds,LOCUS NC_034631 4461 bp ...,agtagtgtactaccaatattctaatattttcaatatatctttacaa...,genomic RNA,Viruses; unclassified viruses
9458,Wolkberg virus segment 2,/nuccore/1197509335,isolate:2562_SA3,,(6873 nt),proteins: 1,neighbors: 10,invertebrates,05/24/2017,05/24/2017,unclassified viruses,"['Wolkberg virus', '', 'isolate:2562_SA3', '3'...","<?xml version=""1.0"" encoding=""utf-8""?>\n<!DOCT...",NC_034633.1,complete cds,LOCUS NC_034633 6873 bp ...,agtagtgtactcctatactaacttcacttatcactaaaatggacga...,genomic RNA,Viruses; unclassified viruses
9459,Wolkberg virus segment 3,/nuccore/1197509333,isolate:2562_SA3,,(978 nt),proteins: 1,neighbors: 10,invertebrates,05/24/2017,05/24/2017,unclassified viruses,"['Wolkberg virus', '', 'isolate:2562_SA3', '3'...","<?xml version=""1.0"" encoding=""utf-8""?>\n<!DOCT...",NC_034632.1,complete cds,LOCUS NC_034632 978 bp ...,agtagtgtactccagaaaaaagacaataacaaatctcaatctacaa...,genomic RNA,Viruses; unclassified viruses


In [5]:
# Drop the two bulky helper columns; errors='ignore' keeps this a no-op when they are already gone
viral_genomes = viral_genomes.drop(columns=['BeautifulSoup', 'Extra Info'], errors='ignore')

In [6]:
# Display the first five rows of the trimmed frame
viral_genomes.head()

Unnamed: 0,Genome,Accession,Source information,Segm,Length,Protein,Neighbors,Host,Created,Updated,Classification,ItemId,SequenceType,Report,Sequence,Molecule Type,Phylogeny
0,Hepatitis delta virus,/nuccore/13277517,,-,1682nt,2,320,"vertebrates, human",04/29/1993,02/10/2015,Deltavirus,NC_001653.2,complete genome,LOCUS NC_001653 1682 bp ss...,atgagccaagttccgaacaaggattcgcggggaggatagatcagcg...,genomic RNA,Viruses; Deltavirus
1,Badger associated gemykibivirus 1,/nuccore/807743872,strain:588t,-,2112nt,2,-,vertebrates,04/16/2015,11/24/2015,Genomoviridae,NC_026806.1,complete genome,LOCUS NC_026806 2112 bp ...,taatactatagccatctgggacacagagcacagtcgacgtgtccct...,genomic DNA,Viruses; Genomoviridae; Gemykibivirus
2,Bemisia-associated genomovirus AdDF,/nuccore/1211677462,isolate:AdDF,-,2199nt,2,-,invertebrates,06/27/2017,07/19/2017,Genomoviridae,NC_035137.1,complete genome,LOCUS NC_035137 2199 bp ...,taatgttatacacaaaccgtggtgtcagtgtcacctctatatagta...,genomic DNA,Viruses; Genomoviridae; unclassified Genomovir...
3,Bemisia-associated genomovirus AdO,/nuccore/1211677465,isolate:AdO,-,2211nt,2,-,invertebrates,06/27/2017,07/19/2017,Genomoviridae,NC_035138.1,complete genome,LOCUS NC_035138 2211 bp ...,taatattatagcccaggacacagggcacacctgtgtctatataaac...,genomic DNA,Viruses; Genomoviridae; unclassified Genomovir...
4,Bemisia-associated genomovirus NfO,/nuccore/1211677468,isolate:NfO,-,2231nt,2,-,invertebrates,06/27/2017,07/19/2017,Genomoviridae,NC_035139.1,complete genome,LOCUS NC_035139 2231 bp ...,taatattattctctctctcaggcagaggtagagggagttaatatag...,genomic DNA,Viruses; Genomoviridae; unclassified Genomovir...


In [8]:
# Save the trimmed DataFrame; index = False leaves the row index out of the file
viral_genomes.to_csv('final_viral_genomes.csv', index = False)