Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Annotation File Support #67

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
135 changes: 135 additions & 0 deletions static/js/input.js
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,12 @@ function updateDataFromText(text, data, cols, filter) {
parseAssembly(text, data, cols, filter);
}


// parse as an annotation file
else if (data.length > 0 && getAnnotationFormat(text)) {
parseAnnotation(text, data, cols);
}

// parse as a table
else {
parseTable(text, data, cols);
Expand Down Expand Up @@ -628,3 +634,132 @@ function parseContigTitle(line, format) {
}
return null;
}

/**
 * Parses an annotation file and appends the annotations it contains as new
 * column(s) of an existing dataset.
 * @function parseAnnotation
 * @param {String} text - annotation file content (multi-line string)
 * @param {Object} data - data object; `data[0]` is read as the list of contig
 * IDs (compared as strings) and the new column(s) are pushed onto `data`
 * @param {Object} cols - cols object; its `names` and `types` arrays are
 * extended to describe the new column(s)
 * @param {number} threshold - similarity threshold for annotation files
 * (fraction of existing contig IDs that must occur in the annotation file);
 * defaults to 0.7 when omitted or not a number
 * @throws {String} if the format is not recognized, or if too few contig IDs
 * of the existing data are found in the annotation file
 * @description GreenGenes files add eight taxonomic rank columns (Domain to
 * Species); KEGG files add one "KEGG" column whose annotated cells are arrays
 * of KO identifiers. Cells of contigs without annotation are empty strings.
 */
function parseAnnotation(text, data, cols, threshold) {
  // only fall back to the default when no usable number was given, so that an
  // explicit threshold of 0 is respected
  const isUsable = typeof threshold === 'number' && !Number.isNaN(threshold);
  threshold = isUsable ? threshold : 0.7;

  // detect the format once; the result also carries the ID-extracting regex
  const detected = getAnnotationFormat(text);
  if (!detected) {
    throw 'Annotation file format is not recognized.';
  }
  const format = detected[0];
  const idRegex = detected[1];
  const contigIds = data[0];

  // String.prototype.match returns null (not an empty array) without matches
  const annotationIds = text.match(idRegex) || [];
  if (annotationSimilarity(contigIds, annotationIds) < threshold) {
    // Number of contigs do not match enough of the number of rows in annotation file
    throw `Annotation File does not match ${threshold * 100}% of the existing data.`;
  }

  const lines = splitLines(text);
  const rowOf = indexAnnotationContigs(contigIds);
  const nRows = contigIds.length;

  if (format === 'greengenes') {
    for (const column of collectGreenGenesColumns(lines, rowOf, nRows)) {
      data.push(column);
    }
    cols.names = cols.names.concat(['Domain', 'Kingdom', 'Phylum', 'Class',
      'Order', 'Family', 'Genus', 'Species']);
    cols.types = cols.types.concat(['cat', 'cat', 'cat', 'cat',
      'cat', 'cat', 'cat', 'des']);
  } else if (format === 'kegg') {
    data.push(collectKeggColumn(lines, rowOf, nRows));
    cols.names = cols.names.concat(['KEGG']);
    cols.types = cols.types.concat(['fea']);
  }
}

/**
 * Calculates how much of the annotation file's contig IDs match the existing
 * data's contig IDs: distinct annotation IDs found among the contig IDs,
 * divided by the number of contig IDs (0 when there are no contig IDs).
 */
function annotationSimilarity(contigIds, annotationIds) {
  if (contigIds.length === 0) return 0;
  const known = new Set(contigIds);
  let hits = 0;
  for (const raw of new Set(annotationIds)) {
    // annotation IDs are zero-padded; Number() strips the padding
    if (known.has(Number(raw).toString())) hits++;
  }
  return hits / contigIds.length;
}

/**
 * Maps every contig ID to the index of its first occurrence, replacing a
 * linear indexOf search per annotation line.
 */
function indexAnnotationContigs(contigIds) {
  const rowOf = new Map();
  contigIds.forEach((id, row) => {
    if (!rowOf.has(id)) rowOf.set(id, row);
  });
  return rowOf;
}

/**
 * Builds the eight GreenGenes rank columns (d, k, p, c, o, f, g, s), each
 * holding one cell per contig.
 */
function collectGreenGenesColumns(lines, rowOf, nRows) {
  const regex = /G\d{9}|[dkpcofgs]__[-a-zA-Z0-9\.\f _]{2,}/g;
  const ranks = ['d', 'k', 'p', 'c', 'o', 'f', 'g', 's'];
  // every column is its own array, so cells never alias each other
  const columns = ranks.map(() => new Array(nRows).fill(''));

  for (const line of lines) {
    // extracting the taxon values from greengenes file
    const tokens = line.match(regex);
    if (!tokens) continue; // blank line or line without annotation

    // collected per line, so that a rank missing on this line stays empty
    // instead of inheriting the value of a previous line
    const taxa = {};
    let genomeId = null;
    for (const token of tokens) {
      const prefix = token.charAt(0);
      if (prefix === 'G') genomeId = token.substring(1);
      else taxa[prefix] = token.substring(3); // drop the "x__" prefix
    }
    if (genomeId === null) continue;

    // finding where the information needs to go in the existing data
    const row = rowOf.get(Number(genomeId).toString());
    if (row === undefined) continue;

    ranks.forEach((rank, c) => {
      columns[c][row] = taxa[rank] || '';
    });
  }
  return columns;
}

/**
 * Builds the KEGG column: for every annotated contig the array of KO
 * identifiers found for it, in file order.
 */
function collectKeggColumn(lines, rowOf, nRows) {
  const regex = /\d{12}|\d+|K\d{5}/g;
  const column = new Array(nRows).fill('');
  const hitsByContig = new Map();

  for (const line of lines) {
    // extracting the kegg gene values: contig ID, gene index, KO identifier
    const tokens = line.match(regex);
    if (!tokens) continue; // blank line or line without annotation

    const contigId = Number(tokens[0]).toString();
    let hits = hitsByContig.get(contigId);
    if (!hits) {
      hits = [];
      hitsByContig.set(contigId, hits);
    }
    // record the KO on the first sighting of a contig as well
    if (tokens.length === 3 && tokens[2].charAt(0) === 'K') {
      hits.push(tokens[2]);
    }

    // contigs absent from the existing data must not touch any row
    const row = rowOf.get(contigId);
    if (row !== undefined) column[row] = hits;
  }
  return column;
}

/**
 * Infer the format of an annotation file.
 * @function getAnnotationFormat
 * @param {String} text - file content (multi-line)
 * @returns {?Array} two-element array holding the format name ('greengenes'
 * or 'kegg') and a global regular expression that extracts the contig IDs of
 * that format, or null if the text starts with neither signature
 * @description This function checks whether the text begins with the unique
 * starting sequence of a supported annotation file format. Currently, it
 * supports GreenGenes and KEGG format.
 */
function getAnnotationFormat(text) {
  const signatures = [
    // GreenGenes sample line
    // e.g. G000712055 k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae;
    // g__Ruminococcus; s__Ruminococcus sp. HUN007
    { name: 'greengenes', start: /G\d{9}/g, ids: /\d{9}/g },

    // KEGG sample line
    // e.g. c_000000007179_9 K16922
    { name: 'kegg', start: /c_\d{12}_/g, ids: /\d{12}/g },
  ];

  let detected = null;
  for (const signature of signatures) {
    // the signature has to sit at the very beginning of the text
    if (text.search(signature.start) === 0) {
      detected = [signature.name, signature.ids];
    }
  }
  return detected;
}