In [None]:
## NOTES -- change 'collection_date' to 'date'
## 

In [20]:
import pandas as pd
from Bio import SeqIO
from BCBio import GFF
from collections import defaultdict
import re
import numpy as np

### Load raw data

In [2]:
ictvSeqs = [i for i in SeqIO.parse(open('../raw/hcv-genotypes-ictv.fasta', 'r'), 'fasta')]
ncbiSeqs = [i for i in SeqIO.parse(open('../raw/hcv-context-ncbi.fasta', 'r'), 'fasta')]


ictvMetaDf = pd.read_csv('../raw/hcv-ictv-ref-metadata.csv')
ictvMetaDf['source'] = 'ictv'
ncbiMetaDf = pd.read_csv('../raw/hcv-context-ncbi-metadata.csv')
ncbiMetaDf['source'] = 'ncbi'

metaDf = pd.concat([ictvMetaDf, ncbiMetaDf.loc[~ncbiMetaDf['Accession'].isin(ictvMetaDf['Accession'])]])

exclude = []
include = []

### Helper format functions

In [3]:
def getGeoData(metaRow):
    if type(row['Geo_Location']) == str and len(row['Geo_Location'].split(':')) > 1:
        country, division = row['Geo_Location'].split(':')
    elif row['USA'] and type(row['USA']) == str:
        country, division = row['Country'], row['USA']
    else:
        country, division = row['Country'], ''
        
    if type(division) == str and len(division.split(',')) > 1:
        location, division = division.split(', ')
    else:
        location = ''
        
    return [country, division, location]

In [4]:
def makeNewName(metaRow, exclude): 
    redFlags = ['Mutant', 'Recombinant', 'treatment',  'Week', 'UNVERIFIED', 'Baseline', 'Post_Treatment', 'D0', 'failure', 'nonfunctional']
    regexReplace = ['[,.:;()\'"]', '-\b', '\B-', '\b[s]\b', '\bF\b']
    removeStrings = ['-like', 'from', 'USA', 'gp1', 'gp2','sequence', 'Hepacivirus C', 'Hepatitis C', 'virus', 'isolate', 'polyprotein', 
                     'gene', 'protein', 'POLY', 'and', 'strain', 'cds', 'partial', 'genomic', ' F ', ' s ', 'subtype', 
                     'genotype','RNA','complete', 'genome', '(POL)', 'Patient', 'precursor', 'for', ':']

    name = row['GenBank_Title']

    for rf in redFlags:
        if rf in name: 
            exclude.append(row['Accession'])
    for string in removeStrings: 
        name = name.replace(string, '').strip()
    for pattern in regexReplace:
        name = re.sub(pattern, '', name).strip()
    name = re.sub('\s+', '_', name)

    if '/' not in name:
        name = 'HCV/%s/%s'%(name, row['Accession'])
        if row['country'] and type(row['country']) == str:
                name = name+'/'+row['country'].replace(' ', '-').replace("'", "")
        else:
            name = name + '/unknown'
        if row['Collection_Date'] and type(row['Collection_Date']) == str:
            name = name+'/'+row['Collection_Date'].split('-')[0]

    return name.upper()

In [5]:
def fillDates(datestring):
    if len(datestring) == 4:
        return datestring+'-XX-XX'
    elif len(datestring) == 7:
        return datestring+'-XX'
    elif len(datestring) == 10:
        return datestring
    else:
        print(datestring)

## Apply cleanup functions to each row 
Yes, I know I can and should do this with map / apply / whatever, but I dont' remember how to do that right now and this will work

In [6]:
for (idx, row) in metaDf.iterrows():
    [country, division, location] = getGeoData(row)
    metaDf.loc[idx, 'country'] = country
    metaDf.loc[idx, 'division'] = division
    metaDf.loc[idx, 'location'] = location

metaDf.drop(['Geo_Location', 'Country', 'USA'], axis=1, inplace=True)

In [7]:
for (idx, row) in metaDf.iterrows():
    metaDf.loc[idx, 'strain'] = makeNewName(row, exclude)

In [8]:
metaDf['collection_date'] = metaDf['Collection_Date'].fillna('XXXX-XX-XX')
metaDf['collection_date'] = metaDf['collection_date'].map(fillDates)

In [9]:
subtypeRegex = '(subtype|genotype)\s[1-8]{1}[a-zA-Z]{1,2}'

for (idx, row) in metaDf.iterrows():
    subtype = re.search(subtypeRegex, row['GenBank_Title'])
    if subtype:
        metaDf.loc[idx, 'subtype'] = subtype.group().split()[1]

In [10]:
for seq in ictvSeqs:
    lineage, accession = seq.id.split('_')
    metaDf.loc[metaDf['Accession'] == accession, 'subtype'] = lineage
    name = metaDf.loc[metaDf['Accession']==accession, 'strain'].values[0]
    seq.id = name
    seq.description = name

In [11]:
metaDf.fillna('?', inplace=True)
metaDf.replace('', '?', inplace=True)

metaDf.drop(['Release_Date', 'Collection_Date', 'GenBank_Title', 'Isolate', 'Genotype'], axis=1, inplace=True)
metaDf.drop(metaDf.loc[metaDf['Accession'].isin(exclude)].index, axis=0, inplace=True)

In [12]:
metaDf.to_csv('../hcv-meta.tsv', sep='\t', index=False)

## Update sequence headers to match, remove duplicates

In [13]:
def strainFromAccession(accession, metaDf=metaDf):
    return metaDf.loc[metaDf['Accession'] == accession, 'strain'].values[0]
    
def accessionFromStrain(strain, metaDf=metaDf):
    try:
        return metaDf.loc[metaDf['strain'] == strain, 'Accession'].values[0]
    except:
        print(strain)
        return None


In [14]:
ictvSeqsClean = []
ictvSeqNames = [s.id for s in ictvSeqs]

for s in ictvSeqs:
    accession = accessionFromStrain(s.id)
    if accession and accession not in exclude:
        ictvSeqsClean.append(s)
        ictvSeqNames.append(s.id)

ncbiSeqsClean = []

for s in ncbiSeqs:
    accession = s.id.split('.')[0]
    if accession in exclude:
        continue
    name = strainFromAccession(accession, metaDf)

    if name not in ictvSeqNames:
        s.id = name
        s.description = name
        ncbiSeqsClean.append(s)

HCV/UNVERIFIED_ASMBLY_S553_20120314_FL_C05/OK552741/USA/2012
HCV/UNVERIFIED_S553_20131105_FL_A01/OK552863/USA/2013
HCV/UNVERIFIED_ASMBLY_S553_20090408_FL_C26/OK552705/USA/2009


In [15]:
SeqIO.write(ncbiSeqsClean, open('../hcv-ncbi-sequences.fasta', 'w'), 'fasta')
SeqIO.write(ictvSeqs, open('../hcv-ictv-sequences-aligned.fasta', 'w'), 'fasta')

238

In [16]:
with open('../include.txt', 'w') as f:
    for name in ictvSeqNames:
        f.write(name+'\n')
    f.close()

## Deal with reference genome

In [21]:
in_file = "../hcv-reference-NC_009824.gb"
out_file = "../hcv-reference-NC-009824.gff"
in_handle = open(in_file)
out_handle = open(out_file, "w")

GFF.write(SeqIO.parse(in_handle, "genbank"), out_handle)

in_handle.close()
out_handle.close()