# Compare IFO13350 (NBRC 13350) different annotation
* Genbank 20161007
* RefSeq 20170309

In [1]:
from Bio import SeqIO

In [2]:
refseq = SeqIO.read('/Users/durand.dc/Desktop/IFO13350/RefSeqAssembly/GCF_000010605.1_ASM1060v1_genomic.gbff', 'genbank')
genbank = SeqIO.read('/Users/durand.dc/Desktop/IFO13350/GenBankAssembly/GCA_000010605.1_ASM1060v1_genomic.gbff', 'genbank')

Make some general comparisons  
GCF = RefSeq  
GCA = GenBank

In [3]:
GCFfeats = len(refseq.features)
GCAfeats = len(genbank.features)
print(f'Features:\n\tGCF: {GCFfeats}\n\tGCA: {GCAfeats}')

GCFCDS = sum(1 for feat in refseq.features  if feat.type == 'CDS')
GCACDS = sum(1 for feat in genbank.features if feat.type == 'CDS')
print(f'CDS:\n\tGCF: {GCFCDS}\n\tGCA: {GCACDS}')

GCFgenes = sum(1 for feat in refseq.features  if feat.type == 'gene')
GCAgenes = sum(1 for feat in genbank.features if feat.type == 'gene')
print(f'Genes:\n\tGCF: {GCFgenes}\n\tGCA: {GCAgenes}')

Features:
	GCF: 14505
	GCA: 14449
CDS:
	GCF: 7154
	GCA: 7138
Genes:
	GCF: 7241
	GCA: 7224


Make data tables for storing necessary data:  
Compare genes only, should be enough info.  
* location  
* locus_tag  
* old_locus_tag  

Coordinate using old_locus_tag

In [4]:
print(genbank.features[1])

type: gene
location: [907:2791](+)
qualifiers:
    Key: locus_tag, Value: ['SGR_1t']



Some filters to check data integrity, eg. Does all 'gene' have 'locus_tag'?

In [5]:
GCFGeneList = list(feat for feat in refseq.features if feat.type == 'gene')
print(sum(1 for feat in GCFGeneList if 'locus_tag' not in feat.qualifiers))
print(sum(1 for feat in GCFGeneList if 'old_locus_tag' not in feat.qualifiers))

0
322


In [6]:
GCAGeneList = list(feat for feat in genbank.features if feat.type == 'gene')
print(sum(1 for feat in GCAGeneList if 'locus_tag' not in feat.qualifiers))

0


In [7]:
oldLocusTagInGCA = set(feat.qualifiers['locus_tag'][0] for feat in GCAGeneList)
oldLocusTagInGCF = set(feat.qualifiers['old_locus_tag'][0] for feat in GCFGeneList if 'old_locus_tag' in feat.qualifiers)
print(len(oldLocusTagInGCA.difference(oldLocusTagInGCF)))
print(len(oldLocusTagInGCF.difference(oldLocusTagInGCA)))
oldLocusTagInGCF <= oldLocusTagInGCA 

305
0


True

Thus use `oldLocusTagInGCA` as new index, then add `newLocusTagInGCF` in the table

In [8]:
print(list(feat for feat in refseq.features if 'locus_tag' not in feat.qualifiers)[1])

type: regulatory
location: [290024:290151](-)
qualifiers:
    Key: bound_moiety, Value: ['adenosylcobalamin']
    Key: db_xref, Value: ['RFAM:RF00174']
    Key: inference, Value: ['COORDINATES: nucleotide motif:Rfam:12.0:RF00174', 'COORDINATES: profile:INFERNAL:1.1.1']
    Key: note, Value: ['cobalamin riboswitch; Derived by automated computational analysis using gene prediction method: cmsearch.']
    Key: regulatory_class, Value: ['riboswitch']



In [9]:
columns = ['old_locus_tag','new_locus_tag','old_start','new_start','old_end','new_end','old_strand','new_strand']

In [91]:
import pandas as pd
combined = pd.DataFrame(index = range(len(oldLocusTagInGCA)), columns = columns)
combined.head()

Unnamed: 0,old_locus_tag,new_locus_tag,old_start,new_start,old_end,new_end,old_strand,new_strand
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,,,,,,,,
4,,,,,,,,


In [92]:
for idx, feat in enumerate(GCAGeneList):
    info = [feat.qualifiers['locus_tag'][0],feat.location.start, feat.location.end, feat.location.strand]
    combined.loc[idx,['old_locus_tag','old_start','old_end','old_strand']] = info
for feat in GCFGeneList:
    if 'old_locus_tag' in feat.qualifiers:
        idx = combined.loc[combined.old_locus_tag == feat.qualifiers['old_locus_tag'][0]].index[0]
        info = [feat.qualifiers['locus_tag'][0],feat.location.start, feat.location.end, feat.location.strand]
        combined.loc[idx,['new_locus_tag','new_start','new_end','new_strand']] = info
    else:
        newRow = pd.DataFrame([feat.qualifiers['locus_tag'][0],feat.location.start, feat.location.end, feat.location.strand]).T
        newRow.columns = ['new_locus_tag','new_start','new_end','new_strand']
        combined = pd.concat([combined, newRow], ignore_index=True)

In [93]:
combined = combined[columns]

In [94]:
sortedindex = combined.loc[:,['old_start','new_start']].mean(axis = 1, skipna = True).sort_values().index

In [95]:
combined = combined.reindex(sortedindex)

In [96]:
combined = combined.reset_index(drop=True)

In [97]:
combined

Unnamed: 0,old_locus_tag,new_locus_tag,old_start,new_start,old_end,new_end,old_strand,new_strand
0,SGR_1t,SGR_RS00015,907,907,2791,2791,1,1
1,,SGR_RS00020,,3400,,3919,,1
2,SGR_2t,,3400,,3919,,1,
3,SGR_3t,SGR_RS00025,4041,4041,4353,4353,-1,-1
4,SGR_4t,SGR_RS00030,4440,4440,5283,5283,-1,-1
5,SGR_5t,SGR_RS00035,5279,5279,5813,5813,-1,-1
6,SGR_6t,SGR_RS00040,5815,5815,6055,6019,-1,-1
7,SGR_7t,SGR_RS35770,7111,7111,7642,7642,-1,-1
8,SGR_8t,SGR_RS00045,7721,<8285,14138,14138,-1,-1
9,SGR_9t,SGR_RS00050,14289,14289,17832,17832,-1,-1


In [100]:
combined.to_excel('/Users/durand.dc/Documents/works/Resources/Resource_SGR/Genome/IDConversion.xlsx')