qiyunlab · AbhinavChede · Jan 2, 2022 · Jan 13, 2022 · Jan 15, 2022 · Jan 15, 2022
diff --git a/static/js/input.js b/static/js/input.js
@@ -80,6 +80,12 @@ function updateDataFromText(text, data, cols, filter) {
       parseAssembly(text, data, cols, filter);
     }
 
+
+    // parse as an annotation file
+    else if (data.length > 0 && getAnnotationFormat(text)) {
+      parseAnnotation(text, data, cols);
+    }
+
     // parse as a table
     else {
       parseTable(text, data, cols);
@@ -628,3 +634,132 @@ function parseContigTitle(line, format) {
   }
   return null;
 }
+
+/**
+ * Parses an annotation file and appends its columns to the existing data.
+ * @function parseAnnotation
+ * @param {String} text - annotation file content (multi-line string)
+ * @param {Object} data - data object (column arrays; data[0] = contig IDs)
+ * @param {Object} cols - cols object (parallel "names" and "types" arrays)
+ * @param {number} [threshold=0.7] - minimum fraction of the existing contig
+ * IDs that must also occur in the annotation file
+ * @throws {String} if the format is not recognized, or if the annotation
+ * file covers less than the threshold fraction of the existing contigs
+ * @description IDs are compared after numeric normalization, e.g.
+ * "G000712055" is looked up as "712055". GreenGenes input appends eight
+ * taxonomy columns; KEGG input appends one column of K number lists.
+ */
+function parseAnnotation(text, data, cols, threshold) {
+  // only a missing threshold selects the default; an explicit 0 is honored
+  const minSimilarity = (threshold == null) ? 0.7 : threshold;
+  // plain strings are thrown here, like the similarity error further below
+  const detected = getAnnotationFormat(text);
+  if (!detected) throw 'Annotation file format is not recognized.';
+  const format = detected[0],
+        fregex = detected[1],
+        lines = splitLines(text),
+        contigIds = data[0],
+        nContig = contigIds.length;
+
+  // strip the zero padding of an ID found in the annotation file
+  const normalizeId = (raw) => Number(raw).toString();
+
+  // contig ID => row index, built once; first occurrence wins, as in indexOf
+  const rowOf = new Map();
+  for (let i = 0; i < nContig; i++) {
+    if (!rowOf.has(contigIds[i])) rowOf.set(contigIds[i], i);
+  }
+
+  // fraction of existing contigs covered by the distinct annotation IDs
+  const annIds = new Set((text.match(fregex) || []).map(normalizeId));
+  let matched = 0;
+  for (const id of annIds) if (rowOf.has(id)) matched++;
+
+  if (matched / nContig < minSimilarity) {
+    // too few of the existing contigs are present in the annotation file
+    throw `Annotation File does not match ${minSimilarity * 100}% of the existing data.`;
+  }
+
+  else if (format === 'greengenes') {
+    const regex = /G\d{9}|[dkpcofgs]__[-a-zA-Z0-9\.\f _]{2,}/g,
+          ranks = 'dkpcofgs';
+    // one independent row per contig (fill() with an array shares one row)
+    const arr2d = Array.from({length: nContig}, () => new Array(8).fill(''));
+
+    for (const line of lines) {
+      // extracting the genome ID and taxon values; blank lines give null
+      const tokens = line.match(regex);
+      if (!tokens) continue;
+
+      // ranks restart empty on every line so nothing leaks from earlier lines
+      const taxa = new Array(8).fill('');
+      let genome = '';
+      for (const token of tokens) {
+        const prefix = token.charAt(0);
+        if (prefix === 'G') genome = token.substring(1);
+        else taxa[ranks.indexOf(prefix)] = token.substring(3);
+      }
+
+      // a line without a genome ID cannot be assigned to any contig
+      if (genome === '') continue;
+
+      // inserting the parsed data into the proper place, if the contig exists
+      const row = rowOf.get(normalizeId(genome));
+      if (row !== undefined) arr2d[row] = taxa;
+    }
+
+    for (const arr of transpose(arr2d)) data.push(arr);
+    cols.names = cols.names.concat(['Domain', 'Kingdom', 'Phylum', 'Class',
+                                  'Order', 'Family', 'Genus', 'Species']);
+    cols.types = cols.types.concat(['cat', 'cat', 'cat', 'cat',
+                                  'cat', 'cat', 'cat', 'des']);
+  }
+
+  else if (format === 'kegg') {
+    const regex = /\d{12}|\d+|K\d{5}/g;
+    const column = new Array(nContig).fill('');
+
+    for (const line of lines) {
+      // extracting the numeric fields and the K number; blank lines give null
+      const tokens = line.match(regex);
+      if (!tokens) continue;
+
+      // contigs absent from the existing data must not overwrite any row
+      const row = rowOf.get(normalizeId(tokens[0]));
+      if (row === undefined) continue;
+
+      // the first line of a contig creates its list and already contributes
+      if (column[row] === '') column[row] = [];
+      if (tokens.length === 3) column[row].push(tokens[2]);
+    }
+
+    data.push(column);
+    cols.names = cols.names.concat(['KEGG']);
+    cols.types = cols.types.concat(['fea']);
+  }
+
+}
+
+/**
+ * Detect which supported annotation format a file is written in.
+ * @function getAnnotationFormat
+ * @param {String} text - file content (multi-line)
+ * @returns {?Array} [format name, global regex matching the digits of an
+ * ID], i.e. ['greengenes', /\d{9}/g] or ['kegg', /\d{12}/g]; null otherwise
+ * @description Only the very beginning of the text is inspected: it has to
+ * open with the ID pattern of GreenGenes or of KEGG.
+ */
+function getAnnotationFormat(text) {
+  // GreenGenes sample line:
+  // G000712055  k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales;
+  // f__Ruminococcaceae; g__Ruminococcus; s__Ruminococcus sp. HUN007
+  const isGreenGenes = text.search(/G\d{9}/g) === 0;
+
+  // KEGG sample line:
+  // c_000000007179_9  K16922
+  const isKegg = text.search(/c_\d{12}_/g) === 0;
+
+  if (isKegg) return ['kegg', /\d{12}/g];
+  if (isGreenGenes) return ['greengenes', /\d{9}/g];
+  return null;
+}