# Processing Protein data with Arabesque

### First, read SCOP keys

In [1]:
%%scala
import java.io._
import java.nio.file._
import scala.collection.mutable.MutableList
import scala.collection.mutable.Map
import sys.process._

val MIN_FIELDS = 3

/** Lists the regular files located directly inside `dir` (non-recursive).
  *
  * @param dir path of the directory to scan
  * @return the plain files found in `dir`; an empty list when `dir` does not
  *         exist, is not a directory, or cannot be listed
  */
def getListOfFiles(dir: String): List[File] = {
  // File.listFiles returns null (not an empty array) when the path is not a
  // directory or an I/O error occurs, so go through Option instead of
  // dereferencing the result directly.
  Option(new File(dir).listFiles)
    .map(_.filter(_.isFile).toList)
    .getOrElse(List.empty[File])
}

val scop_loc = "/export/arabesque_apps/pdb/data/structures/divided/scop-pdb/graph/"
val existing_keys_path = "/home/local/QCRI/abghanem/arabesque/pdb/scop-pdb/output/"

val p_class: MutableList[String] = MutableList()
val p_class_fold: MutableList[String] = MutableList()
val p_superfamily: MutableList[String] = MutableList()
val p_family: MutableList[String] = MutableList()
val ver_count = Map[String, String]()

val files_list = getListOfFiles(scop_loc).map(_.getName())

// Collect the SCOP keys encoded in the graph file names. A name is split on
// "_" into "<level>=<id>" components (levels cl, cf, sf, fa); only files that
// are not "all-pdb*", have at least MIN_FIELDS components and also exist under
// existing_keys_path contribute keys.
// foreach (not map) because the traversal is run purely for its side effects.
files_list.foreach { f =>
    
//     val cmd = "cat " + scop_loc + f
//     val v_count = cmd #| "wc -l" !!
//     ver_count(f) = v_count
//     println(v_count)
    val fname_components = f.replace(".graph", "").split("_")
    
    val exists = Files.exists(Paths.get(existing_keys_path + f))
        
    if ( !f.startsWith("all-pdb") && fname_components.length >= MIN_FIELDS && exists )
    {
        for( comp <- fname_components ) {
            comp.split("=") match {
                case Array("cl", key, _*) => p_class += key
                case Array("cf", key, _*) => p_class_fold += key
                case Array("sf", key, _*) => p_superfamily += key
                case Array("fa", key, _*) => p_family += key
                // A component with an unknown level or without "=<id>" used to
                // abort the whole cell (MatchError / ArrayIndexOutOfBounds);
                // such components carry no key, so they are skipped.
                case _ => ()
            }
        }
    }
}

// Renders a key collection as a de-duplicated, numerically sorted,
// comma-separated list of integers.
def toSortedKeyList(keys: scala.collection.Seq[String]): String =
    keys.distinct.map(_.toInt).sorted.mkString(", ")

val class_keys = toSortedKeyList(p_class)
val class_fold_keys = toSortedKeyList(p_class_fold)
val superfamily_keys = toSortedKeyList(p_superfamily)
val family_keys = toSortedKeyList(p_family)

// Quoted, sorted list of the graph files that also exist under existing_keys_path.
// NOTE(review): this filter counts underscores, so it keeps names with at least
// MIN_FIELDS + 1 components, while the key-collection loop above accepts names
// with at least MIN_FIELDS components and also excludes "all-pdb*" - confirm
// which threshold is intended.
val all_files = files_list
    .filter(f => f.count(_ == '_') >= MIN_FIELDS && Files.exists(Paths.get(existing_keys_path + f)))
    .map("\"" + _ + "\"")
    .sorted
    .mkString(", ")

// println("class keys: " + class_keys + "\n\n")
// println("class_fold_keys: " + class_fold_keys + "\n\n")
// println("superfamily_keys: " + superfamily_keys + "\n\n")
// println("family_keys: " + family_keys + "\n\n")
println(all_files)

"cl=46456_cf=101214_sf=101215_fa=101216.graph", "cl=46456_cf=101214_sf=101215_fa=109836.graph", "cl=46456_cf=101223_sf=101224_fa=101225.graph", "cl=46456_cf=101223_sf=101224_fa=227213.graph", "cl=46456_cf=101232_sf=101233_fa=101234.graph", "cl=46456_cf=101237_sf=101238_fa=101239.graph", "cl=46456_cf=101237_sf=101238_fa=254270.graph", "cl=46456_cf=101256_sf=101257_fa=101258.graph", "cl=46456_cf=101261_sf=101262_fa=101263.graph", "cl=46456_cf=101277_sf=101278_fa=101279.graph", "cl=46456_cf=101282_sf=101283_fa=101284.graph", "cl=46456_cf=101287_sf=101288_fa=101289.graph", "cl=46456_cf=101306_sf=101307_fa=101308.graph", "cl=46456_cf=101321_sf=101322_fa=101323.graph", "cl=46456_cf=101326_sf=101327_fa=101328.graph", "cl=46456_cf=101331_sf=101332_fa=101333.graph", "cl=46456_cf=101343_sf=101344_fa=101345.graph", "cl=46456_cf=101352_sf=101353_fa=101354.graph", "cl=46456_cf=101385_sf=101386_fa=101387.graph", "cl=46456_cf=101385_sf=101386_fa=116993.graph", "cl=46456_cf=101385_sf=101386_fa=140794.

### Draw the interface in order to select the SCOP key to work with

In [2]:
kernel.magics.html("""
<!-- Latest compiled and minified CSS -->
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" crossorigin="anonymous">
<!-- Optional theme -->
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap-theme.min.css" integrity="sha384-rHyoN1iRsVXV4nD0JutlnGaslCJuC7uwjduW9SVrLvRYooPp2bWYgmgJQIXwl/Sp" crossorigin="anonymous">
<!-- Latest compiled and minified JavaScript -->
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js" integrity="sha384-Tc5IQib027qvyjSMfHjOMaLkfuWVxZxUPnCJA7l2mCWNIpG9mGCD8wGNIcPD7Txa" crossorigin="anonymous"></script>
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js" integrity="sha384-Tc5IQib027qvyjSMfHjOMaLkfuWVxZxUPnCJA7l2mCWNIpG9mGCD8wGNIcPD7Txa" crossorigin="anonymous"></script>

<h1> Select SCOP key </h1>

<br><br>
<div class="row">
  <div class="col-sm-12">
  <strong>Select file: </strong>
    <select id="file_dropdown">
      <option value="select">--Select--</option>
    </select>
  </div>
</div>

<div class="row">
  <div class="col-sm-12">
    -------------------------------------------------------------------
  </div>
</div>

<div class="row">
  <div class="col-sm-3">
    <strong>Class: </strong>
    <select id="class_dropdown">
      <option value="select">--Select--</option>
    </select>
  </div>
  <div class="col-sm-3">
    <strong>Class Fold: </strong>
    <select id="class_fold_dropdown">
      <option value="select">--Select--</option>
    </select>
  </div>
  <div class="col-sm-3">
    <strong>Superfamily: </strong>
    <select id="superfamily_dropdown">
      <option value="select">--Select--</option>
    </select>
  </div>
  <div class="col-sm-3">
    <strong>Family: </strong>
    <select id="family_dropdown">
      <option value="select">--Select--</option>
    </select>
  </div>
</div>


<h2>Selected key file: <span id="selected_key_lbl" class="label label-default">---</span></h2>


<script type="text/Javascript">
    // Defines (or redefines) a variable in the notebook kernel by executing the
    // statement "var <var_name> = <var_value>" there.
    // var_value is spliced into the statement verbatim, so a string value has
    // to arrive already wrapped in quotes.
    function set_value(var_name, var_value){
        var command = "var " + var_name + " = " + var_value;
        console.log("Executing Command: " + command);
        
        var kernel = Jupyter.notebook.kernel;
        kernel.execute(command);
    }

    // Empties the select element with the given id and puts back the single
    // placeholder option.
    function reset_dropdown_list(name) {
        var dropdown = $("#" + name);
        dropdown.empty();
        dropdown.append("<option value=\"select\">--select--</option>");
    }

    // Replaces the options of the select element with the given id by the
    // placeholder followed by one option per entry of items.
    function fill_dropdown_list(name, items) {
        reset_dropdown_list(name);

        var dropdown = $("#" + name);
        $.each(items, function(index, value) {
            dropdown.append("<option value=\"" + value + "\">" + value + "</option>");
        });
    }
    
    var files_list = [%s];
    var class_keys = [%s];
    var class_fold_keys = [%s];
    var superfamily_keys = [%s];
    var family_keys = [%s];

    // Initial population of every dropdown with the complete key lists.
    fill_dropdown_list("class_dropdown", class_keys);
    fill_dropdown_list("class_fold_dropdown", class_fold_keys);
    // This call used to pass the undefined identifier superfamily_dropdown,
    // which raised a ReferenceError and aborted the rest of this script (the
    // remaining dropdowns stayed empty and no change handler was registered).
    fill_dropdown_list("superfamily_dropdown", superfamily_keys);
    fill_dropdown_list("family_dropdown", family_keys);
    fill_dropdown_list("file_dropdown", files_list);

    // Recomputes the selected key file name from the current dropdown values,
    // refreshes the label that displays it, publishes it to the kernel as
    // selected_key_file and returns it.
    function update_selected_key() {
        var selected_class_key = $("select[id=class_dropdown]").val();
        var selected_class_fold_key = $("select[id=class_fold_dropdown]").val();
        var selected_superfamily_key = $("select[id=superfamily_dropdown]").val();
        var selected_family_key = $("select[id=family_dropdown]").val();
        var selected_file = $("select[id=file_dropdown]").val();
        
        var selected_key = "";
        
        // A value containing "select" is the placeholder option, i.e. nothing chosen.
        // An explicit file choice wins over the per-level dropdowns.
        if ( !selected_file.includes("select") )
            selected_key = selected_file;
        else if ( !selected_class_key.includes("select") )
        {
            // Assemble "cl=..[_cf=..][_sf=..][_fa=..].graph" from the chosen levels.
            selected_key += "cl=" + selected_class_key;
            if ( !selected_class_fold_key.includes("select") )
                selected_key += "_cf=" + selected_class_fold_key;
            if ( !selected_superfamily_key.includes("select"))
                selected_key += "_sf=" + selected_superfamily_key;
            if ( !selected_family_key.includes("select"))
                selected_key += "_fa=" + selected_family_key;

            selected_key += ".graph";
        }
        
        // NOTE(review): 6 is presumably the length of ".graph", so the label text
        // is left untouched when no key was assembled - confirm.
        if ( selected_key.length > 6 )
            $("#selected_key_lbl").text(selected_key);
        
        // Success styling when the key names a known file, danger styling otherwise.
        if (files_list.indexOf(selected_key) >=0 )
            $("#selected_key_lbl").removeClass("label-danger label-default").addClass("label-success");
        else
            $("#selected_key_lbl").removeClass("label-default label-success").addClass("label-danger");
        
        // The key is sent wrapped in double quotes so the kernel receives a string literal.
        set_value("selected_key_file" , "\""+ selected_key + "\"");
        
        return selected_key;
    }
    
    $("select").change(function(){update_selected_key();});

    // Scans files_list for names starting with prefix and returns, in first-seen
    // order and without duplicates, the value part of the "_"-separated
    // component found at position depth.
    function collect_keys_at(prefix, depth) {
        var keys = [];

        $.each(files_list, function(indx, item) {
            if ( !item.startsWith(prefix) )
                return;

            var parts = item.replace(".graph", "").split("_");
            if ( parts.length > depth ) {
                var key = parts[depth].split("=")[1];
                if ( keys.indexOf(key) < 0 )
                    keys.push(key);
            }
        });

        return keys;
    }

    // Class changed: clear the file choice and list the folds found under that class.
    $("select[id=class_dropdown]").change(function() {
        $("select[id=file_dropdown]").val("select");

        var prefix = "cl=" + $("select[id=class_dropdown]").val();

        fill_dropdown_list("class_fold_dropdown", collect_keys_at(prefix, 1));
        update_selected_key();
    });

    // Fold changed: clear the file choice and list the superfamilies under class + fold.
    $("select[id=class_fold_dropdown]").change(function() {
        $("select[id=file_dropdown]").val("select");

        var prefix = "cl=" + $("select[id=class_dropdown]").val()
                   + "_cf=" + $("select[id=class_fold_dropdown]").val();

        fill_dropdown_list("superfamily_dropdown", collect_keys_at(prefix, 2));
        update_selected_key();
    });

    // Superfamily changed: clear the file choice and list the families under
    // class + fold + superfamily.
    $("select[id=superfamily_dropdown]").change(function() {
        $("select[id=file_dropdown]").val("select");

        var prefix = "cl=" + $("select[id=class_dropdown]").val()
                   + "_cf=" + $("select[id=class_fold_dropdown]").val()
                   + "_sf=" + $("select[id=superfamily_dropdown]").val();

        fill_dropdown_list("family_dropdown", collect_keys_at(prefix, 3));
        update_selected_key();
    });

</script>
""".format(all_files, class_keys, class_fold_keys, superfamily_keys, family_keys))


### Run Arabesque on the selected SCOP key

In [4]:
// Parameters handed to ag.fsm below.
val fsmSupport = 10
val fsmEmbeddingSize = 3

// Base directory of the SCOP/PDB data set on the local file system.
val pdb_dir = "/export/arabesque_apps/pdb/data/structures/divided/scop-pdb/"

// NOTE(review): this HDFS path is only printed in this cell; the graph itself
// is loaded from the local path passed to ac.textFile further down.
val inputPdbGraphPath = "hdfs:///input/pdb/graph/" + selected_key_file
println(inputPdbGraphPath)

import io.arabesque._

val ac: ArabesqueContext = new ArabesqueContext(sc)
// val ag: ArabesqueGraph = ac.textFile(inputPdbGraphPath)
// NOTE(review): the meaning of the second argument (true) is not visible here -
// presumably it marks the path as a local file; confirm against ArabesqueContext.textFile.
val ag: ArabesqueGraph = ac.textFile("/home/local/QCRI/abghanem/arabesque/pdb/scop-pdb/graph/" + selected_key_file, true)
val fsm = ag.fsm (fsmSupport, fsmEmbeddingSize)

// Request the aggregation registered under the name "support".
val fsmAgg = fsm.aggregation ("support")


hdfs:///input/pdb/graph/cl=46456_cf=101223_sf=101224_fa=227213.graph
Master computing
Aggregation Storage: PatternAggregationStorage{quick2CanonicalMap={[0,1-1,2]=[0,1-1,2], [0,1-1,7]=[0,1-1,7], [0,1-1,12]=[0,1-1,12], [0,1-1,14]=[0,1-1,14], [0,17-1,14]=[0,14-1,17], [0,17-1,17]=[0,17-1,17], [0,8-1,7]=[0,7-1,8], [0,8-1,8]=[0,8-1,8], [0,8-1,14]=[0,8-1,14], [0,16-1,16]=[0,16-1,16], [0,7-1,1]=[0,1-1,7], [0,7-1,2]=[0,2-1,7], [0,7-1,7]=[0,7-1,7], [0,7-1,8]=[0,7-1,8], [0,7-1,12]=[0,7-1,12], [0,7-1,13]=[0,7-1,13], [0,7-1,14]=[0,7-1,14], [0,7-1,22]=[0,7-1,22], [0,14-1,1]=[0,1-1,14], [0,14-1,7]=[0,7-1,14], [0,14-1,8]=[0,8-1,14], [0,14-1,12]=[0,12-1,14], [0,14-1,13]=[0,13-1,14], [0,22-1,7]=[0,7-1,22], [0,14-1,17]=[0,14-1,17], [0,14-1,22]=[0,14-1,22], [0,22-1,14]=[0,14-1,22], [0,13-1,4]=[0,4-1,13], [0,13-1,7]=[0,7-1,13], [0,13-1,13]=[0,13-1,13], [0,13-1,14]=[0,13-1,14], [0,12-1,1]=[0,1-1,12], [0,12-1,2]=[0,2-1,12], [0,4-1,13]=[0,4-1,13], [0,12-1,7]=[0,7-1,12], [0,12-1,14]=[0,12-1,14], [0,3-1,2]=[0,

### Define data structures used to post process Arabesque output

In [5]:
import io.arabesque.embedding._
import io.arabesque.pattern._
import java.io._
//import scala.collection.mutable.MutableList
//import scala.collection.mutable.Map
import scala.collection.mutable.ArrayBuffer
import scala.io.Source


/** Returns the `words` array of the given result embedding. */
def getEmbeddingVertices(emb: ResultEmbedding): Array[Int] = emb.words

/** A vertex identified by `vid` and carrying the label id `lbl`. */
class Vertex(var vid: Int, var lbl: Int) extends Serializable {

    /** Renders the vertex as "vid,lbl". */
    override def toString = Seq(vid, lbl).mkString(",")
}

/** One record of the labels file: a graph vertex together with the amino-acid,
  * position, chain and PDB information attached to it.
  */
@SerialVersionUID(123L)
class Atom(var vid: Int, 
            var lbl: Int, 
            var amino_3: String, 
            var amino_1: Char,
            var position: (Float, Float, Float),
            var chain: String,
            var chain_pos: Int,
            var pdbFile: String) extends Serializable {

    /** Bracketed, comma-separated dump of every field except `amino_1`. */
    override def toString =
        Seq(vid, lbl, amino_3, position, chain, chain_pos, pdbFile).mkString("[", ", ", "]")

    /** Space-separated "amino_3 chain chain_pos pdbFile" form. */
    def toPDBString = Seq(amino_3, chain, chain_pos, pdbFile).mkString(" ")

    /** Selector text of the form ":<chain_pos>.<chain>@ca". */
    def selectChimeraString = ":" + chain_pos + "." + chain + "@ca"

    /** Command text of the form "open <pdbFile>; " (note the trailing space). */
    def openPDBChimeraString = "open " + pdbFile + "; "
}

/** A pattern parsed from its textual form, e.g. "[0,14-1,7],[0,14-2,1]".
  *
  * After brackets and parentheses are removed, the text is split on "," and
  * "-"; every run of four tokens (vid, lbl, vid, lbl) describes one edge.
  *
  * @param pat  textual pattern
  * @param freq frequency value stored alongside the pattern
  */
class Pattern(pat: String, var freq: Int) extends Serializable {

    var vertices = scala.collection.mutable.Map[Int, Vertex]()
    var edges = scala.collection.mutable.MutableList[(Vertex, Vertex)]()

    // Drop every "(", ")", "[" and "]" in one pass, then tokenise.
    var comp = pat.replaceAll("[()\\[\\]]", "").split(",|\\-")

    // Walk the tokens four at a time: source vid/lbl followed by target vid/lbl.
    for (start <- 0 until comp.length by 4) {
        val source = new Vertex(comp(start).toInt, comp(start + 1).toInt)
        this.vertices(source.vid) = source
        val target = new Vertex(comp(start + 2).toInt, comp(start + 3).toInt)
        this.vertices(target.vid) = target

        this.edges += ((source, target))
    }

    /** Edges rendered as "[vid,lbl-vid,lbl]" and joined with ",". */
    override def toString = this.edges.map { case (a, b) => s"[$a-$b]" }.mkString(",")

    /** Same edges, with every label id replaced by its entry in `labelDict`. */
    def toPDBString(labelDict: scala.collection.mutable.Map[Int, String]) = {
        this.edges.map { case (a, b) =>
            s"[${a.vid} , ${labelDict(a.lbl)}<=>${b.vid} , ${labelDict(b.lbl)}]"
        }.mkString(" , ")
    }
}

/** An embedding parsed from its textual form.
  *
  * After brackets and parentheses are removed, the text is split on "," and
  * "-"; every pair of tokens (vid, vid) describes one edge. Vertices are
  * created with the label -1.
  *
  * @param embed textual embedding
  */
class Embedding(embed: String) extends Serializable {

    var vertices = scala.collection.mutable.Map[Int, Vertex]()
    var edges = scala.collection.mutable.MutableList[(Vertex, Vertex)]()

    // Drop every "(", ")", "[" and "]" in one pass, then tokenise.
    var comp = embed.replaceAll("[()\\[\\]]", "").split(",|\\-")

    // Walk the tokens two at a time: source vid followed by target vid.
    for (start <- 0 until comp.length by 2) {
        val source = new Vertex(comp(start).toInt, -1)
        this.vertices(source.vid) = source
        val target = new Vertex(comp(start + 1).toInt, -1)
        this.vertices(target.vid) = target

        this.edges += ((source, target))
    }

    /** Edges rendered as "[vid,lbl-vid,lbl]" and joined with ",". */
    override def toString = this.edges.map { case (a, b) => s"[$a-$b]" }.mkString(",")

    /** Edges rendered through the atoms that `atomsDict` holds for each vertex id. */
    def toPDBString(atomsDict: Map[Int, Atom]) = {
        this.edges.map { case (a, b) =>
            s"(${atomsDict(a.vid)}<=>${atomsDict(b.vid)})"
        }.mkString(" , ")
    }
}

### Reverse mapping Arabesque representation to Protein representation

In [6]:
val labelsFileName = pdb_dir + "labels/" + selected_key_file.replace(".graph", ".labels.txt")
println(labelsFileName)

val atomsDict = scala.collection.mutable.Map.empty[Int, Atom]
val labelsDict = scala.collection.mutable.Map.empty[Int, ArrayBuffer[Atom]]
val labels = scala.collection.mutable.Map.empty[Int, String]

// Fill atomsDict / labelsDict / labels from the labels file. Each line is split
// on single spaces and read positionally:
//   0 vid, 1 lbl, 2 amino_3, 3 amino_1 (first character only),
//   4 chain_pos (first character dropped), 5-7 position, 8 chain (single
//   quotes removed), 9 pdbFile (characters 3 to 6 of the field).
val labelsSource = Source.fromFile(labelsFileName)
try {
    for(line <- labelsSource.getLines()){
//     println(line)
        val line_comp = line.split(" ")
        val vid = line_comp(0).toInt
        val lbl = line_comp(1).toInt
        val amino_3 = line_comp(2)
        val amino_1 = line_comp(3)(0)
        val chain_pos = line_comp(4).substring(1).toInt
        val position = (line_comp(5).toFloat, line_comp(6).toFloat, line_comp(7).toFloat)
        val chain = line_comp(8).replace("'", "")
        val pdbFile = line_comp(9).substring(3, 7)
        
        val atom = new Atom(vid, lbl, amino_3, amino_1, position, chain, chain_pos, pdbFile)
        atomsDict(vid) = atom
        
        // Create the per-label bucket on first use, then append.
        labelsDict.getOrElseUpdate(lbl, new ArrayBuffer[Atom]()) += atom
        labels(lbl) = amino_3
        println(atomsDict(vid) + ",")
//     println(atomsDict(vid).vid + " , " + atomsDict(vid).lbl)
    }
} finally {
    // Source.fromFile keeps the underlying file handle open until it is closed
    // explicitly; release it even when a malformed line aborts the parse.
    labelsSource.close()
}


/export/arabesque_apps/pdb/data/structures/divided/scop-pdb/labels/cl=46456_cf=101223_sf=101224_fa=227213.labels.txt
[0, 7, GLU, (59.156,-4.407,2.359), A, 739, 2nog],
[1, 17, PRO, (58.966,-5.162,6.097), A, 740, 2nog],
[2, 14, LYS, (59.962,-8.599,7.357), A, 741, 2nog],
[3, 22, VAL, (56.491,-8.749,8.846), A, 742, 2nog],
[4, 17, PRO, (53.71,-6.926,6.969), A, 743, 2nog],
[5, 14, LYS, (51.955,-4.23,8.995), A, 744, 2nog],
[6, 1, ALA, (48.722,-5.704,10.35), A, 745, 2nog],
[7, 17, PRO, (45.506,-4.513,8.593), A, 746, 2nog],
[8, 2, ARG, (43.316,-1.617,9.779), A, 747, 2nog],
[9, 17, PRO, (39.668,-0.76,8.89), A, 748, 2nog],
[10, 17, PRO, (39.764,1.343,5.715), A, 749, 2nog],
[11, 14, LYS, (36.884,3.406,7.145), A, 750, 2nog],
[12, 8, GLN, (38.569,4.432,10.403), A, 751, 2nog],
[13, 17, PRO, (38.727,8.133,11.4), A, 752, 2nog],
[14, 3, ASN, (42.012,9.68,10.292), A, 753, 2nog],
[15, 22, VAL, (43.146,11.231,13.568), A, 754, 2nog],
[16, 8, GLN, (46.648,12.693,13.667), A, 755, 2nog],
[17, 4, ASP, (49.012,1

### Listing files generated by Arabesque for the selected SCOP key

In [7]:
// temporary cell to process the output of arabesque on a scop on hdfs
import scala.io.Source._
import scala.collection.mutable.ListBuffer

/** Lists every regular file found under `dir`, descending into sub-directories.
  *
  * The files located directly in a directory come first, followed by the
  * content of each of its sub-directories in listing order.
  *
  * @param dir path of the directory to scan
  * @return all plain files below `dir`; an empty list when `dir` does not
  *         exist, is not a directory, or cannot be listed
  */
def getListOfFilesRecursively(dir: String): List[File] = {
    // File.listFiles returns null for a missing path, a non-directory or an
    // I/O error; treat all of these as "nothing to list". The directory is
    // listed once and the entries are then split into files and directories.
    val entries = Option(new File(dir).listFiles).map(_.toList).getOrElse(Nil)
    val (subDirs, others) = entries.partition(_.isDirectory)

    others.filter(_.isFile) ++ subDirs.flatMap(d => getListOfFilesRecursively(d.getPath))
}

var files = getListOfFilesRecursively("/home/local/QCRI/abghanem/arabesque/pdb/scop-pdb/output/" + selected_key_file + "/")
files.foreach(println)

val patternEmbeddingDict = scala.collection.mutable.Map[String, ArrayBuffer[String]]()

// Group the embeddings of every output file by pattern. A usable line has the
// form "<embeddings> _ <pattern>"; lines that do not split into exactly two
// parts are ignored.
for(file <- files) {
    val outputSource = fromFile(file)
    try {
        for(line <- outputSource.getLines) {
            line.split(" _ ") match {
                case Array(embeddings, pattern) =>
                    patternEmbeddingDict.getOrElseUpdate(pattern.trim, new ArrayBuffer[String]()) += embeddings.trim
                case _ => // not an "<embeddings> _ <pattern>" line
            }
        }
    } finally {
        // One Source is opened per output file; close it so file handles are
        // not leaked while walking the whole output directory.
        outputSource.close()
    }
}

val peDictString = "{" + patternEmbeddingDict.map(pair => "\"" + pair._1 + "\"" + ": [\"" + pair._2.mkString("\", \"") + "\"]").mkString(", ") + "}"
println("peDictString:\n" + peDictString + "\n")

/home/local/QCRI/abghanem/arabesque/pdb/scop-pdb/output/cl=46456_cf=101223_sf=101224_fa=227213.graph/1/4
/home/local/QCRI/abghanem/arabesque/pdb/scop-pdb/output/cl=46456_cf=101223_sf=101224_fa=227213.graph/1/3
/home/local/QCRI/abghanem/arabesque/pdb/scop-pdb/output/cl=46456_cf=101223_sf=101224_fa=227213.graph/1/5
/home/local/QCRI/abghanem/arabesque/pdb/scop-pdb/output/cl=46456_cf=101223_sf=101224_fa=227213.graph/1/1
/home/local/QCRI/abghanem/arabesque/pdb/scop-pdb/output/cl=46456_cf=101223_sf=101224_fa=227213.graph/1/9
/home/local/QCRI/abghanem/arabesque/pdb/scop-pdb/output/cl=46456_cf=101223_sf=101224_fa=227213.graph/1/0
/home/local/QCRI/abghanem/arabesque/pdb/scop-pdb/output/cl=46456_cf=101223_sf=101224_fa=227213.graph/1/2
/home/local/QCRI/abghanem/arabesque/pdb/scop-pdb/output/cl=46456_cf=101223_sf=101224_fa=227213.graph/1/7
/home/local/QCRI/abghanem/arabesque/pdb/scop-pdb/output/cl=46456_cf=101223_sf=101224_fa=227213.graph/1/8
/home/local/QCRI/abghanem/arabesque/pdb/scop-pdb/output

### Aggregating & formatting the output for presentation

In [8]:
// val patternEmbeddingDict = Map[String, Array[String]](
//     "[0,13-1,19],[1,19-2,13]" -> Array("103-106 106-108", "172-176 176-180", "112-115 115-117"),
//     "[0,7-1,8],[0,7-2,12]" -> Array("120-122 120-123", "256-257 257-259", "301-304 304-307"),
//     "[0,8-1,8],[1,8-2,2]" -> Array("173-177 177-181", "88-135 135-137", "181-185 185-187"),
//     "[0,7-1,12],[1,12-2,7]" -> Array("202-204 203-204", "181-185 184-185", "169-171 171-172"),
//     "[0,22-1,12],[1,12-2,2]" -> Array("256-257 257-260", "278-281 281-284", "266-270 270-272")
// )

// mapping arabesque output back to atoms, amino acid and pdb representation
// Map every (pattern text -> embedding texts) entry to (Pattern -> atoms):
// an embedding text is a space-separated list of "vid-vid" edges, and each
// edge becomes the pair of atoms that atomsDict holds for its two vertex ids.
val reversedDict = patternEmbeddingDict.map { case (patternText, embeddingTexts) =>
    val atomsPerEmbedding = embeddingTexts.map { embeddingText =>
        val atomPairs = new ArrayBuffer[Array[Atom]]()

        embeddingText.split(" ").foreach { edge =>
            val vids = edge.split("-")
            atomPairs += Array(atomsDict(vids(0).toInt), atomsDict(vids(1).toInt))
        }

        atomPairs
    }

    // The number of embeddings is stored as the frequency of the pattern.
    new Pattern(patternText, atomsPerEmbedding.length) -> atomsPerEmbedding
}

val embedSelDict = Map.empty[String, ArrayBuffer[String]]
val embedOpenDict = Map.empty[String, ArrayBuffer[String]]
val patternOpenDict = Map.empty[Pattern, ArrayBuffer[String]]

// For every pattern, gather the selector strings and PDB file names of all its
// embeddings. Along the way the per-embedding lists are stored in embedSelDict /
// embedOpenDict (keyed by the rendered embedding) and the per-pattern PDB list
// in patternOpenDict.
val patternSelDict = reversedDict.map { case (pattern, embeddings) =>

    val patternOpenPDB = new ArrayBuffer[String]()
    val patternSel = new ArrayBuffer[String]()

    for ( edges <- embeddings ) {

        // Two entries per edge, in edge order: first atom, then second atom.
        val embedSel = edges.flatMap(pair => Seq(pair(0).selectChimeraString, pair(1).selectChimeraString))
        val embedOpenPDB = edges.flatMap(pair => Seq(pair(0).pdbFile, pair(1).pdbFile))

        val embed = edges.map(pair => "(" + pair(0).toPDBString + "<=>" + pair(1).toPDBString + ")").mkString(" , ")
        embedSelDict(embed) = embedSel.distinct
        embedOpenDict(embed) = embedOpenPDB.distinct

        patternSel ++= embedSel
        patternOpenPDB ++= embedOpenPDB
    }

    patternOpenDict(pattern) = patternOpenPDB.distinct

    pattern -> patternSel.distinct
}

// Renders one member of a JavaScript object literal: "key": ["v1<valueSep>v2"].
// valueSep is the text placed between two values and therefore has to contain
// the closing and the re-opening quote.
def jsMember(key: String, values: Iterable[String], valueSep: String): String =
    "\"" + key + "\": [\"" + values.mkString(valueSep) + "\"]"

val peDictString = "{" + patternEmbeddingDict.map(pair => jsMember(pair._1, pair._2, "\", \"")).mkString(", ") + "}"
println("peDictString:\n" + peDictString + "\n")

var revpeDictString = "{" + reversedDict.map(pair =>
    jsMember(
        pair._1.toPDBString(labels),
        pair._2.map(atomsArray => atomsArray.map(atoms => "(" + atoms(0).toPDBString + "<=>" + atoms(1).toPDBString + ")").mkString(" , ")),
        "\", \"")
).mkString(", ") + "}"
println("revpeDictString:\n" + revpeDictString + "\n")

// The pattern-level and embedding-level members are joined only when both
// parts are non-empty: unconditionally inserting ", " between them produced
// "{, }" or "{, ...}" (not a valid object literal) whenever a dictionary was empty.
val patternSelMembers = patternSelDict.map(pair => jsMember(pair._1.toPDBString(labels), pair._2, "\",\"")).mkString(",")
val embedSelMembers = embedSelDict.map(pair => jsMember(pair._1, pair._2, "\",\"")).mkString(",")
var combinedSelDictString = "{" + Seq(patternSelMembers, embedSelMembers).filter(_.nonEmpty).mkString(", ") + "}"
println("combinedSelDictString:\n" + combinedSelDictString + "\n")

val patternOpenMembers = patternOpenDict.map(pair => jsMember(pair._1.toPDBString(labels), pair._2, "\",\"")).mkString(",")
val embedOpenMembers = embedOpenDict.map(pair => jsMember(pair._1, pair._2, "\",\"")).mkString(",")
var combinedOpenDictString = "{" + Seq(patternOpenMembers, embedOpenMembers).filter(_.nonEmpty).mkString(", ") + "}"
println("combinedOpenDictString:\n" + combinedOpenDictString + "\n")

peDictString:
{"[0,14-1,7],[0,14-2,1]": ["220-223 220-224", "63-66 63-67", "31-32 31-67", "188-189 188-224"], "[0,2-1,12]": ["165-190", "154-155", "194-197", "37-40", "8-33", "37-64", "311-312", "194-221"], "[0,7-1,14],[0,7-2,13]": ["79-80 79-82", "79-80 79-81", "187-188 187-191", "77-78 77-81", "236-237 236-238", "30-31 30-34", "77-80 77-81", "234-237 234-238", "236-237 236-239", "234-235 234-238"], "[0,13-1,13],[0,13-2,7]": ["185-186 185-189", "28-29 28-32", "28-29 28-30", "185-186 185-187"], "[0,14-1,17]": ["43-45", "11-13", "119-121", "5-7", "200-202", "2-4", "276-278", "5-167", "162-164", "168-170", "159-161"], "[0,14-1,7],[1,7-2,14]": ["215-216 216-220", "99-102 102-103", "215-217 217-220", "58-60 60-63", "58-59 59-63", "78-79 79-80", "256-259 259-260", "235-236 236-237"], "[0,7-1,13],[0,7-2,13]": ["189-191 189-227", "184-185 184-186", "236-238 236-239", "32-34 32-70", "27-28 27-29", "79-81 79-82"], "[0,2-1,3]": ["249-252", "203-204", "291-293", "92-95", "46-47", "134-136"], "[0,

### Visualizing the output with Chimera

In [9]:
import sys.process._

val ipAddress = "10.161.200.131"
val userName = "ghanemabdo"

kernel.magics.html("""
<style>
.txtstuff {
    resize: none; /* remove this if you want the user to be able to resize it in modern browsers */
    overflow: hidden;
}

.hiddendiv {
    display: none;
    white-space: pre-wrap;
    word-wrap: break-word;
    overflow-wrap: break-word; /* future version of deprecated 'word-wrap' */
}

/* the styles for 'common' are applied to both the textarea and the hidden clone */
/* these must be the same for both */
.common {
    width: 500px;
    min-height: 50px;
    font-family: Arial, sans-serif;
    font-size: 13px;
    overflow: hidden;
}

.lbr {
    line-height: 3px;
}
</style>
<!-- Latest compiled and minified CSS -->
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" crossorigin="anonymous">
<!-- Optional theme -->
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap-theme.min.css" integrity="sha384-rHyoN1iRsVXV4nD0JutlnGaslCJuC7uwjduW9SVrLvRYooPp2bWYgmgJQIXwl/Sp" crossorigin="anonymous">
<link rel="stylesheet" href="https://b3nj4m.github.io/really-simple-bootstrap-color-picker/colorPicker.css" crossorigin="anonymous">

<label for="sel1"><h1> Select a pattern to visualize </h1></label>

<br><br>

<div id="info-lbl" class="alert alert-info" role="alert">...</div>

<div class="row">
  <div class="col-sm-5">
    <label for="pattern-select"><h3> Select pattern(s): </h3></label>
    <select multiple class="form-control" id="pattern-select"  style="min-height: 250px; max-width: 400px;">

      </select>
    <textarea id="selected-pattern" class="form-control common" placeholder="Selected patterns">---</textarea>
  </div>
  <div class="col-sm-7">
    <label for="embedding-select"><h3> Select embedding(s): </h3></label>
    <select multiple class="form-control" id="embedding-select" style="min-height: 250px; max-width: 550px;">

      </select>
    <textarea id="selected-embedding" class="form-control common" placeholder="Selected embeddings">---</textarea>
  </div>
</div>
<br>
<div class="row">
  <div class="col-sm-5">
    
  </div>
  <div class="col-sm-7">
    <div id="embeddings-count-lbl" class="alert alert-info" role="alert">Number of embeddings: ---</div>
  </div>
</div>

<label><h3> Select options: </h3></label>
<div class="row">
  <div class="col-sm-2">
    <input type="checkbox" name="select" value="select" checked> Select.
  </div>
  <div class="col-sm-2">
    <input type="checkbox" id="focus-checkbox" name="focus" value="focus" checked> Focus.
  </div>
  <div class="col-sm-2">
    <input type="checkbox" id="color-checkbox" name="color" value="color" > Color.
  </div>
  <div class="col-sm-6">
      <div class="control-group">
        <label class="control-label">Pick a Color</label>
        <div class="controls input-prepend">
          <input type="text" id="color-picker" name="color" class="color input-small" value="#FF0000" data-text="AG" />
        </div>
      </div>
  </div>
</div>
<div class="row">
    <div class="col-sm-2">
        <input type="checkbox" id="fadeout-checkbox" name="fadeout" value="fadeout" > Fadeout others.
    </div>
    <div class="col-sm-2">
        <input type="checkbox" id="open-pdb-checkbox" name="open-pdb" value="open-pdb" checked> Open pdb file.
    </div>
    <div class="col-sm-2">
          <input type="checkbox" id="additive-coloring" name="additive-coloring" value="additive-coloring" > Additive Coloring.
    </div>
    <div class="col-sm-2">
      
    </div>
    <div class="col-sm-2">
      
    </div>
    <div class="col-sm-2">
      <button id="reopen-btn" type="button" class="btn btn-danger btn-block">Reopen all</button>
    </div>
</div>
<div class="row">
    <div class="col-sm-12">
        <h3>Chimera Command: </h3>
        <br>
        <textarea id="pattern_cmd" class="form-control common">---</textarea>
    </div>
</div>

<br><br>
<div class="row">
    <div class="col-sm-2">
        
    </div>
    <div class="col-sm-2">
        
    </div>
    <div class="col-sm-2">
      
    </div>
    <div class="col-sm-2">
      
    </div>
    <div class="col-sm-2">
      
    </div>
    <div class="col-sm-2">
      <button id="submit-query-btn" type="button" class="btn btn-primary btn-block">Run Command</button>
    </div>
</div>


<!-- Latest compiled and minified JavaScript -->
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js" integrity="sha384-Tc5IQib027qvyjSMfHjOMaLkfuWVxZxUPnCJA7l2mCWNIpG9mGCD8wGNIcPD7Txa" crossorigin="anonymous"></script>
<script src="https://b3nj4m.github.io/really-simple-bootstrap-color-picker/jquery.colorPicker.js" crossorigin="anonymous"></script>

<script type="text/Javascript">

    // Defines (or redefines) a variable in the notebook kernel by executing the
    // statement "var <var_name> = <var_value>" there.
    // var_value is spliced into the statement verbatim, so a string value has
    // to arrive already wrapped in quotes.
    function set_value(var_name, var_value){
        var command = "var " + var_name + " = " + var_value;
        console.log("Executing Command: " + command);
        
        var kernel = Jupyter.notebook.kernel;
        kernel.execute(command);
    }
    
    // Empties the select element with the given id and puts back the single
    // "none" option.
    function reset_dropdown_list(name) {
        var list = $("#" + name);
        list.empty();
        list.append("<option style=\"min-height: 25px;\" value=\"none\"><b>none</b></option>");
    }

    // Replaces the options of the select element with the given id by the
    // "none" option followed by one option per entry of items, then selects "none".
    function fill_select_list(name, items) {
        reset_dropdown_list(name);

        var list = $("#" + name);
        $.each(items, function(index, value) {
            list.append("<option style=\"min-height: 25px;\" value=\"" + value + "\"><b>" + value + "</b></option>");
        });

        list.val("none");
    }
    
    var patterns = %s;
    var chimeraSel = %s;
    var chimeraOpen = %s;
    console.log(patterns);
    console.log(chimeraSel);
    console.log(chimeraOpen);

    fill_select_list("pattern-select", Object.keys(patterns).sort(function(a,b) { return a.length - b.length}));
    
    // Returns the de-duplicated list of chimeraSel entries for the current
    // selection: those of the selected patterns while the embedding list only
    // has "none" selected, otherwise those of the selected embeddings.
    function construct_selectors() {
        
        var selectors = [];
        
        // val() of a multiple select may be null when nothing is selected
        // (jQuery before 3.0); normalise so .length / [0] never hit null.
        var selected_patterns = $("#pattern-select").val() || [];
        var selected_embeddings = $("#embedding-select").val() || [];
        
        if( selected_embeddings.length <= 1 && selected_embeddings[0] === "none" ) {
            // Loop variables are declared explicitly; they used to leak as
            // implicit globals.
            for(var sel_pattern of selected_patterns) {
                if( sel_pattern !== "none" ){
                    // Unknown keys contribute nothing instead of an undefined entry.
                    selectors = selectors.concat(chimeraSel[sel_pattern] || []);
                }
            }
        } else {
            for(var sel_embed of selected_embeddings) {
                console.log("there: " + selected_embeddings);
                if( sel_embed !== "none" ){
                    selectors = selectors.concat(chimeraSel[sel_embed] || []);
                }
            }
        }
        
        // Keep the first occurrence of every selector.
        selectors = selectors.filter(function(item, i, ar){ return ar.indexOf(item) === i; });
        
        return selectors;
    }
    
    // Files that have already been placed in an "open" command; they are left out of
    // later commands so that the same file is not opened twice.
    var alreadyOpened = [];

    // Builds the de-duplicated list of files that still have to be opened for the current
    // UI selection. Selected embeddings take precedence; when no embedding is chosen
    // (nothing selected, or only the "none" placeholder) the selected patterns are used.
    // When the "open-pdb-checkbox" box is ticked the record of opened files is cleared,
    // so every file of the selection is returned again.
    // Returns an array of the values looked up in chimeraOpen (possibly empty).
    function construct_open() {

        // .val() may yield null (nothing selected) or a plain string (single select);
        // normalise both cases to an array so the code below never throws.
        var selected_patterns = [].concat($("#pattern-select").val() || []);
        var selected_embeddings = [].concat($("#embedding-select").val() || []);

        var no_embedding_chosen = selected_embeddings.length === 0 ||
            (selected_embeddings.length === 1 && selected_embeddings[0] === "none");
        var keys = no_embedding_chosen ? selected_patterns : selected_embeddings;

        var openCmds = [];
        for (var key of keys) {
            var openList = chimeraOpen[key];
            // Skip the placeholder and any name without an entry, so that
            // `undefined` never ends up inside the generated command.
            if (key !== "none" && openList != null) {
                openCmds = openCmds.concat(openList);
            }
        }

        if ($("#open-pdb-checkbox")[0].checked) {
            alreadyOpened = [];
        }

        // Keep the first occurrence of every file and drop those opened earlier.
        return openCmds.filter(function(item, i, ar){
            return ar.indexOf(item) === i && alreadyOpened.indexOf(item) < 0;
        });
    }
    
    // Assembles the Chimera command line for the current UI state, shows it in the
    // "pattern_cmd" element and stores it in the kernel variable `cmd`.
    // The command consists of, in order: one "open" per file still to be opened, a "sel"
    // for the selectors, and the optional colour / fade-out / focus commands chosen through
    // the checkboxes. Nothing is generated when there is neither a selector nor a file.
    // Returns the command string (empty when nothing is selected).
    function create_cmd() {
        var selectors = construct_selectors();
        var openCmds = construct_open();

        var cmd = "";

        if ( selectors.length > 0 || openCmds.length > 0 ){

            if ( openCmds.length > 0 ) {
                // `var` keeps the loop variable local instead of leaking it as a global
                for (var pdb_file of openCmds) {
                    cmd += "open " + pdb_file + "; ";
                }

                // remember these files so the next generated command does not open them again
                alreadyOpened = alreadyOpened.concat(openCmds);
            }

            if ( selectors.length > 0 )
                cmd += "sel " + selectors + "; ";    // the array is rendered comma-separated

            if ( $("#color-checkbox")[0].checked ){
                // unless colouring is additive, clear the previous colours first
                if ( !$("#additive-coloring")[0].checked )
                    cmd += "~color; ";

                cmd += "color " + $("#color-picker").val() + " sel; ";
            }

            if ( $("#fadeout-checkbox")[0].checked )
                cmd += "~transp; transp 80 ~sel; ";

            if( $("#focus-checkbox")[0].checked )
                cmd += "focus sel; ";
        }

        console.log("cmd: " + cmd);
        update_textarea("pattern_cmd", cmd);

        // The command is spliced into a double-quoted string literal that the kernel
        // evaluates, so backslashes and double quotes have to be escaped to keep that
        // literal well-formed.
        var escaped = cmd.replace(/\\/g, "\\\\").replace(/"/g, "\\\"");
        set_value("cmd", "\"" + escaped + "\"");

        return cmd;
    }
    
    // Asks the notebook kernel to run the current Chimera command on the remote machine.
    // The text sent to the kernel is a Seq(...) process expression that calls ssh, echoes
    // the command into cmd.com, hands that file to Chimera with --send and removes it again.
    // The user and host inside the text are filled in by the format call that wraps this page.
    // NOTE: the `cmd` that appears inside the string below is part of the text sent to the
    // kernel, i.e. it refers to the kernel-side variable stored through set_value("cmd", ...);
    // the JavaScript parameter `cmd` of this function is not used.
    function run_cmd(cmd) {
    
        // The trailing backslash continues the string literal on the next line.
        var command = "Seq(\"ssh\", \"%s@%s\" , \"echo\", \"'\" + cmd + \"' > cmd.com;\", \
            \"/Applications/Chimera.app/Contents/MacOS/chimera --send cmd.com; rm cmd.com; exit\") !";
        console.log("Running command: " + command);
        
        var kernel = Jupyter.notebook.kernel;
        kernel.execute(command);
    }

    // When the pattern selection changes: list the embeddings of all selected patterns in
    // the embedding select, update the counters / summary fields and rebuild the command.
    $("#pattern-select").change(function(){
        // .val() may yield null (nothing selected) or a plain string (single select);
        // normalise both cases to an array so the loop below never throws.
        var selected_patterns = [].concat($("#pattern-select").val() || []);

        var results = [];
        for (var sel_pattern of selected_patterns) {
            var embed_list = patterns[sel_pattern];
            // skip the placeholder and names without an entry in `patterns`
            if( sel_pattern !== "none" && embed_list != null ){
                results = results.concat(embed_list);
            }
        }

        fill_select_list("embedding-select", results);
        $("#embeddings-count-lbl").html("Number of embeddings: <strong>" + results.length + "</strong>");

        update_textarea("selected-pattern", "" + selected_patterns);
        update_textarea("selected-embedding", "---");

        create_cmd();
    });
    
    // When the embedding selection changes: show the chosen embeddings in the
    // "selected-embedding" field and rebuild the command.
    $("#embedding-select").change(function(){
        var chosen = $("#embedding-select").val();
        var summary = "" + chosen;

        update_textarea("selected-embedding", summary);
        create_cmd();
    });
    
    // Any checkbox toggle changes the generated command, so rebuild it.
    $(':checkbox').on('change', function() {
        create_cmd();
    });
    
    // Runs the command currently shown in "pattern_cmd", records every file it opens so the
    // next command does not open it again, and clears the "open-pdb-checkbox" box.
    $("#submit-query-btn").click(function() {
        var cmd = $("#pattern_cmd").text();
        run_cmd(cmd);

        // The command is a "; "-separated list; each file is opened by its own
        // "open <file>" statement, so everything after the keyword is the file.
        var statements = cmd.split(";");
        for (var i = 0; i < statements.length; i++) {
            var statement = statements[i].trim();
            if ( statement.indexOf("open ") === 0 ) {
                var opened = statement.substring("open ".length).trim();
                // push keeps alreadyOpened an array; skip entries recorded earlier
                if ( opened.length > 0 && alreadyOpened.indexOf(opened) < 0 )
                    alreadyOpened.push(opened);
            }
        }

        // .prop changes the live checked state (.attr only touches the attribute)
        $('#open-pdb-checkbox').prop('checked', false);
    });
    
    // Forget which files were opened, so the next command opens them again.
    $("#reopen-btn").on("click", function() {
        alreadyOpened = [];
    });
    
    // Total number of embeddings over all patterns, shown next to the pattern count.
    var embeddingsCount = Object.values(patterns).reduce(function(total, embeddings) {
        return total + embeddings.length;
    }, 0);

    var patternCount = Object.keys(patterns).length;
    $("#info-lbl").html("Number of patterns: " + patternCount + " <br>Number of embeddings: " + embeddingsCount);


    // Tag every textarea and add an off-screen helper div to the page;
    // update_textarea() measures text in that div to size the fields.
    var txtArea = $('textarea').addClass('txtstuff');
    var hiddenDiv = $(document.createElement('div')).addClass('hiddendiv common');

    $('body').append(hiddenDiv);

    // Puts `filltext` into the element with id `textarea` and resizes that element to the
    // height the same text takes up in the hidden helper div (plus 10).
    function update_textarea(textarea, filltext) {

        var field = $("#" + textarea);
        field.html(filltext);

        // mirror the text into the helper div, with line breaks turned into <br> tags
        var content = filltext.replace(/\n/g, '<br>');
        console.log("content is: " + content);
        hiddenDiv.html(content + '<br class="lbr">');

        var measuredHeight = hiddenDiv.height();
        field.css('height', measuredHeight + 10);
    }


    // Once the document is ready, turn the "color-picker" input into a colour picker widget.
    jQuery(function($) {

        var pickerOptions = {
            pickerDefault: "ffffff",
            transparency: true
        };

        $('#color-picker').colorPicker(pickerOptions);
    });
</script>

""".format(revpeDictString, combinedSelDictString, combinedOpenDictString, userName, ipAddress))

In [24]:
// Runs ssh with the arguments below: the remote side echoes the contents of `cmd`
// into cmd.com, passes that file to Chimera with --send and removes it afterwards.
// The exit code of the process is the value of this expression.
Seq(
  "ssh",
  "ghanemabdo@10.161.200.101",
  "echo",
  s"'$cmd' > cmd.com;",
  "/Applications/Chimera.app/Contents/MacOS/chimera --send cmd.com; rm cmd.com; exit"
).!
