# Preprocessing to create Path Contexts

- Prerequisite: Scala Kernel installed

In [None]:
/* javaparser */
import $ivy.`com.github.javaparser:javaparser-symbol-solver-core:3.6.17`

/* betterfiles */
import $ivy.`com.github.pathikrit::better-files:3.6.0`

In [None]:
import com.github.javaparser._
import com.github.javaparser.ast._
import com.github.javaparser.ast.body._

// Sample compilation unit used by the demo cells below. The Java text only has to
// parse: it mixes for-each loops, lambdas, try/catch, try-with-resources, labels,
// a switch, generics, arrays and body-less methods so that the later cells have a
// variety of node types to work on.
// Declared as `var`, presumably so that the commented-out alternative further down
// can reassign it.
var cu = JavaParser.parse("""class A {
    boolean f(Object target) {
        for (Object elem : this.elements) {
            if (elem.equals(target)) {
                return true;
            }
        }
        return false;
    }
    
    @Override
    public void f(final boolean mayInterruptIfRunning) {
        this.mayInterruptIfRunning = mayInterruptIfRunning;
    }

    public String getHashKey();

    /**
     * @return Returns the cacheMgr.
     */
    ICompositeCacheManager getCacheManager();

    public void isOk() {
        return ok;
    }
    
    public void getFoo() {
        return this.foo;
    }
    
    public void isNg(boolean ng) {
        return ng;
    }
    
    public void getBar(String bar) {
        this.flag = false;
        return this.bar;
    }

    
    @Override
    public String toString() {
        return "Hello";
    }
    
    public <T extends Foo<U>> T id(List<T> x, T y) {
        return y;
    }
    
    public boolean bar(final boolean mayInterruptIfRunning) {
        Int y = 1;
        return (int x) -> x + y;
    }
    
    public boolean baz(final boolean x) {
        try {
            Int y = 1;
            return (int x) -> x + y;
        } catch (Exception1 e) {
            throw e;
        } catch (Exception2 e) {
            throw e;
        }
        
        try (InputStream is = null) {
        end:
            while (true) {
                is.read();
                break end;
            }
        }
    }

    public boolean hoge(int x) {
        return hoge(x + 1);
    }

    public String[] array(String[] str) {
        return new String[] { "A", "B" };
    }

    public void labeled() {
        foo: 
        while (true) {
            break foo;
        }
    }

    /**
     * comment
     */
    public void isOne(int x) {
        switch(len) {
          case 7:
            /* nothing */
          default:
            /* nothing */
        }
    }

}""");

// cu = JavaParser.parse("""
// public abstract class AbstractCollectionMapper<T> implements PropertyMapper {
//     private static Object processValue(int precision, Object value)
//     {
//         if (value instanceof Double || value instanceof Float) {
//             return new ApproximateDouble(((Number) value).doubleValue(), precision);
//         }
//         if (value instanceof Number) {
//             return ((Number) value).longValue();
//         }
//         if (value instanceof List) {
//             return ((List<?>) value).stream()
//                     .map(element -> processValue(precision, element))
//                     .collect(toList());
//         }
//         if (value instanceof Map) {
//             Map<Object, Object> map = new HashMap<>();
//             for (Entry<Object, Object> entry : ((Map<Object, Object>) value).entrySet()) {
//                 map.put(processValue(precision, entry.getKey()), processValue(precision, entry.getValue()));
//             }
//             return map;
//         }
//         return value;
//     }
// }
// """)

In [None]:
import scala.collection.JavaConversions._

// Pretty-printer settings used by traverse() when it prints the text of a leaf node:
// Javadoc and ordinary comments are both switched off.
val prettyPrinterConfig = new com.github.javaparser.printer.PrettyPrinterConfiguration
prettyPrinterConfig.setPrintJavadoc(false)
prettyPrinterConfig.setPrintComments(false)

/**
 * Debug helper: prints the JavaParser tree rooted at `node`.
 *
 * Every node is printed as its simple class name, indented by two spaces per
 * level. A node without children additionally gets its source text (printed
 * with `prettyPrinterConfig`) on the next line, one level deeper.
 *
 * @param node   root of the subtree to print
 * @param indent nesting level of `node`
 */
def traverse(node: Node, indent: Int): Unit = {
    val className = node.getClass.getSimpleName
    println("  " * indent + className)

    val childNodes = node.getChildNodes()
    for (child <- childNodes) {
        traverse(child, indent + 1)
    }
    if (childNodes.isEmpty()) {
        // leaf: show the text the node stands for
        println("  " * (indent + 1) + node.toString(prettyPrinterConfig))
    }
}
traverse(cu, 0)

In [None]:
// Method names that are always skipped, whatever their body looks like.
val OBJECT_METHODS = Set("clone", "equals", "finalize", "hashCode", "toString")

/**
 * Decides whether a method should be left out of the dataset.
 *
 * A method is ignorable when
 *  - it has no body,
 *  - its name is listed in OBJECT_METHODS,
 *  - its name starts with "set", it takes exactly one parameter and its body is
 *    a single expression statement that is an assignment, or
 *  - its name starts with "get" or "is", it takes no parameter and its body is
 *    a single return statement.
 *
 * @param method the method declaration to classify
 * @return true when the method should be skipped
 */
def isIgnorableMethod(method: MethodDeclaration): Boolean = {
    val name = method.getNameAsString
    val maybeBody = method.getBody

    if (!maybeBody.isPresent) {
        // no body to learn from
        true
    } else if (OBJECT_METHODS.contains(name)) {
        true
    } else {
        val statements = maybeBody.get.getStatements
        val parameterCount = method.getParameters.size
        val hasSingleStatement = statements.size == 1

        if (name.startsWith("set")) {
            parameterCount == 1 && hasSingleStatement && (statements.get(0) match {
                case e: com.github.javaparser.ast.stmt.ExpressionStmt => e.getExpression.isAssignExpr
                case _ => false
            })
        } else if (name.startsWith("get") || name.startsWith("is")) {
            parameterCount == 0 && hasSingleStatement && statements.get(0).isReturnStmt
        } else {
            false
        }
    }
}

// Show the verdict for every method of the sample compilation unit.
for (m <- cu.findAll(classOf[MethodDeclaration])) {
    println(s"${m.getNameAsString}: ${isIgnorableMethod(m)}")
}

In [None]:
/**
 * Namespace an identifier belongs to. `name` is the short tag that Env embeds in
 * the ids it generates ("@" + name + "_" + index).
 *
 * NOTE(review): `name` is declared as `var`, so the tag of these singleton objects
 * can be reassigned from anywhere; nothing in the code visible here does so.
 */
sealed abstract class NameSpace(var name: String)
case object VarNameSpace extends NameSpace("var")  // variable name
case object TVarNameSpace extends NameSpace("tvar")  // type-variable name (not referenced by the code visible here)
case object ClassNameNameSpace extends NameSpace("class")  // class name (not referenced by the code visible here)
case object MethodNameSpace extends NameSpace("method")  // method name
case object LabelNameSpace extends NameSpace("label")  // label

/**
 * An identifier (variable, parameter, method or label) together with the alias
 * that replaces it in the extracted AST.
 */
case class Variable(
    /** alias of this identifier; Env generates it as "@" + namespace tag + "_" + counter */
    val id: String,
    /** name of this identifier as written in the source code */
    val name: String,
    /** namespace of this identifier */
    val space: NameSpace
)

/**
 * Identifiers that are visible at one point of the traversal.
 *
 * The context is immutable: declaring an identifier means building a new
 * ParseContext with that identifier prepended to `vars`, and leaving a scope
 * means going back to the context that was in use before.
 */
case class ParseContext(vars: List[Variable]) {
    /**
     * Resolves `name` within `space`.
     *
     * @return the id of the first matching entry of `vars`, or `name` itself
     *         when no entry matches
     */
    def lookup(space: NameSpace, name: String): String =
        vars.collectFirst {
            case candidate if candidate.space == space && candidate.name == name => candidate.id
        }.getOrElse(name)
}

/**
 * Node of the simplified AST produced by the extraction.
 *
 * @param name     name of the node
 * @param terminal text of the terminal symbol, when the node carries one
 * @param children child nodes, in source order
 */
case class AstNode(
    name: String,
    terminal: Option[String] = Option.empty,
    children: List[AstNode] = List.empty
) {
    /**
     * Renders the subtree as indented text, two spaces per level: the node name,
     * then the terminal (if any) one level deeper, then every child one level
     * deeper. Each line ends with a newline.
     */
    def toString(indent: Int): String = {
        val nameLine = "  " * indent + name + "\n"
        val terminalLine = terminal.map { text => "  " * (indent + 1) + text + "\n" }.getOrElse("")
        val childLines = children.map(_.toString(indent + 1)).mkString("")
        nameLine + terminalLine + childLines
    }
}

In [None]:
import com.github.javaparser.ast.expr._
import com.github.javaparser.ast.stmt._ 
import com.github.javaparser.ast.`type`._

// Placeholder terminals that extractAST emits instead of the literal's own text
// when the corresponding ExtractConfig flag is enabled.
val STRING_LITERAL_TERMINAL = "@string_literal"
val CHAR_LITERAL_TERMINAL = "@char_literal"
val INT_LITERAL_TERMINAL = "@int_literal"
val DOUBLE_LITERAL_TERMINAL = "@double_literal"

// Configuration of the extraction: each flag decides whether literals of that kind
// are replaced by a fixed placeholder terminal. extractAST applies the integer flag
// to both integer and long literals. By default every kind except integers is
// normalized.
case class ExtractConfig(
    val isNormalizeStringLiteral: Boolean = true, 
    val isNormalizeCharLiteral: Boolean = true, 
    val isNormalizeIntLiteral: Boolean = false, 
    val isNormalizeDoubleLiteral: Boolean = true)

/**
 * Alias generator and registry for the identifiers of one namespace.
 *
 * `varsIndex` is the number that the next alias will carry; `variables` holds
 * every alias handed out so far, most recent first.
 */
class Env(space: NameSpace) {
    var varsIndex = 0
    var variables = List.empty[Variable]

    /**
     * Creates the next alias ("@" + namespace tag + "_" + counter) for
     * `originalName`, records it and advances the counter.
     */
    def getVarAndIncrement(originalName: String): Variable = {
        val alias = Variable(id = s"@${space.name}_${varsIndex}", name = originalName, space = space)
        varsIndex += 1
        variables = alias :: variables
        alias
    }
}

// Bundle of the per-namespace environments used while one method is processed:
// one alias counter/registry each for variables (and parameters), methods and labels.
class VarEnv {
    val vars = new Env(VarNameSpace)
    val methods = new Env(MethodNameSpace)
    val labels = new Env(LabelNameSpace)
}

// Pretty-printer settings used by extractAST when it turns a childless node into a
// terminal: Javadoc and ordinary comments are left out of the printed text.
val prettyPrintConfig = new com.github.javaparser.printer.PrettyPrinterConfiguration
prettyPrintConfig.setPrintJavadoc(false)
prettyPrintConfig.setPrintComments(false)

/**
 * Converts a list of sibling nodes, threading the parse context through them.
 *
 * The nodes are processed in order. Each one is handed to `handler` together
 * with the context produced by its predecessor, so that a declaration made by
 * one node is visible to the nodes after it. Comment nodes are skipped.
 *
 * @param nodes   nodes to convert, in evaluation order
 * @param context context in which the first node is evaluated
 * @param env     alias environments shared by the whole extraction
 * @param config  extraction settings
 * @param handler converts one node in a given context and returns its AST
 *                together with the context for the next node; defaults to
 *                extractAST
 * @return the converted nodes in their original order, and the context that
 *         results after the last one
 */
def extractAstList(
        nodes: java.util.List[Node], 
        context: ParseContext,
        env: VarEnv,
        config: ExtractConfig,
        handler: (Node, ParseContext, VarEnv, ExtractConfig) => (AstNode, ParseContext) = 
            (c, context, env, config) => extractAST(c, context, env, config)
    ): (List[AstNode], ParseContext) = {

    import com.github.javaparser.ast.comments._

    val (reversedAsts, finalContext) = nodes
        .filterNot(_.isInstanceOf[Comment])
        .foldLeft((List.empty[AstNode], context)) {
            case ((collected, current), child) =>
                val (ast, next) = handler(child, current, env, config)
                // prepend now, restore source order once at the end
                (ast :: collected, next)
        }
    (reversedAsts.reverse, finalContext)
}

/**
 * Converts one JavaParser node and its subtree into an AstNode, replacing
 * declared identifiers by generated aliases.
 *
 * Handling by node type, in match order:
 *  - literals: replaced by a fixed placeholder terminal when the matching
 *    ExtractConfig flag is set; otherwise they reach the generic case.
 *  - Parameter, VariableDeclarator, MethodDeclaration, LabeledStmt: a fresh
 *    alias is taken from `env` and the name child is emitted as that alias.
 *  - NameExpr, unscoped or `this`-scoped MethodCallExpr, labeled break/continue:
 *    the name is looked up in `context`; names that are not found are kept as
 *    written.
 *  - scope-like nodes (blocks, lambdas, type declarations, try/catch, ...):
 *    children are converted, but the incoming context is returned, so
 *    declarations made inside do not reach the following siblings.
 *  - anything else: children are converted and the resulting context is
 *    returned; a childless node becomes a terminal holding its printed text.
 *
 * NOTE(review): node types that are not in the scope-like list (for example
 * `for` statements) and LabeledStmt hand declarations made inside them on to
 * their following siblings. Confirm that this is intended.
 *
 * @param node    node from which information is extracted
 * @param context identifiers visible when `node` is evaluated
 * @param env     alias environments; updated whenever a declaration is met
 * @param config  extraction settings
 * @return the AST of `node` and the context for the node evaluated after it
 * @throws IllegalStateException for a childless node of a type that is neither
 *         a known terminal kind nor a known empty statement
 */
def extractAST(
        node: Node, 
        context: ParseContext, 
        env: VarEnv, 
        config: ExtractConfig
    ): (AstNode, ParseContext) = {
    val nodeName = node.getClass().getSimpleName()

    // "SimpleName" leaf carrying an alias (or an unresolved name).
    def aliasNode(text: String): AstNode = AstNode(name="SimpleName", terminal=Some(text))

    // Unary/binary/assignment expressions: the operator becomes part of the node name.
    def operatorNode(operator: Any): (AstNode, ParseContext) = {
        val (children, newContext) = extractAstList(node.getChildNodes(), context, env, config)
        (AstNode(name=nodeName + ":" + operator, children=children), newContext)
    }

    // break/continue: the only child is the resolved label, when there is one.
    def jumpNode(labelName: Option[String]): (AstNode, ParseContext) = {
        val children = labelName.map { l => aliasNode(context.lookup(LabelNameSpace, l)) }.toList
        (AstNode(name=nodeName, children=children), context)
    }

    node match {
        // TODO support type parameters
        case _: StringLiteralExpr if config.isNormalizeStringLiteral => {
            // Replace the string literal value with a constant "@string_literal".
            (AstNode(name=nodeName, terminal=Some(STRING_LITERAL_TERMINAL)), context)
        }
        case _: CharLiteralExpr if config.isNormalizeCharLiteral => {
            // Replace the character literal value with a constant "@char_literal".
            (AstNode(name=nodeName, terminal=Some(CHAR_LITERAL_TERMINAL)), context)
        }
        case _: IntegerLiteralExpr if config.isNormalizeIntLiteral => {
            // Replace the integer literal value with a constant "@int_literal".
            (AstNode(name=nodeName, terminal=Some(INT_LITERAL_TERMINAL)), context)
        }
        case _: LongLiteralExpr if config.isNormalizeIntLiteral => {
            // Replace the long literal value with a constant "@int_literal".
            (AstNode(name=nodeName, terminal=Some(INT_LITERAL_TERMINAL)), context)
        }
        case _: DoubleLiteralExpr if config.isNormalizeDoubleLiteral => {
            // Replace the floating-point literal value with a constant "@double_literal".
            (AstNode(name=nodeName, terminal=Some(DOUBLE_LITERAL_TERMINAL)), context)
        }
        case p: Parameter => {
            // Anonymize parameter name
            val parameterName = p.getNameAsString
            val alias = env.vars.getVarAndIncrement(parameterName)
            val astName = aliasNode(alias.id)
            val newContext = ParseContext(alias :: context.vars)
            // The parameter's own children never see the parameter: the handler
            // keeps handing the current context on for the name and the type.
            val (children, _) = extractAstList(
                p.getChildNodes, 
                context, 
                env,
                config,
                (child, curContext, env, config) => child match {
                    case _: SimpleName => (astName, curContext)
                    case t: Type => {
                        val (astType, _) = extractAST(t, curContext, env, config)
                        (if (p.isVarArgs) AstNode(name="VarArgs", children=List(astType)) else astType, curContext)
                    }
                    case _ => extractAST(child, curContext, env, config)
                }
            )
            // Whatever is evaluated after the parameter sees it.
            (AstNode(name=nodeName, children=children), newContext)
        }
        case e: UnaryExpr => operatorNode(e.getOperator)
        case e: BinaryExpr => operatorNode(e.getOperator)
        case e: AssignExpr => operatorNode(e.getOperator)
        case e: VariableDeclarator => {
            // Anonymize variable name
            val varName = e.getNameAsString
            val alias = env.vars.getVarAndIncrement(varName)
            val astName = aliasNode(alias.id)
            val newContext = ParseContext(alias :: context.vars)
            // Children listed before the name are evaluated in the incoming context.
            // At the name the handler switches to newContext, so children listed
            // after it (presumably the initializer - confirm the child order) are
            // evaluated with the variable already visible.
            val (children, _) = extractAstList(
                e.getChildNodes, 
                context, 
                env,
                config,
                (child, curContext, env, config) => child match {
                    case _: SimpleName => (astName, newContext)
                    case _ => extractAST(child, curContext, env, config)
                }
            )

            (AstNode(name=nodeName, children=children), newContext)
        }
        case e: NameExpr => {
            // reference to a variable name
            val name = e.getNameAsString
            val children = List(aliasNode(context.lookup(VarNameSpace, name)))
            (AstNode(name=nodeName, children=children), context)
        }
        case m: MethodDeclaration => {
            // Anonymize method name
            val methodName = m.getNameAsString
            val alias = env.methods.getVarAndIncrement(methodName)
            val astName = aliasNode(alias.id)
            val newContext = ParseContext(alias :: context.vars)
            // At the name the handler switches to newContext, so children listed
            // after it are evaluated with the method alias visible (this is what
            // lets a recursive call resolve to the alias).
            val (children, _) = extractAstList(
                m.getChildNodes, 
                context, 
                env,
                config,
                (child, curContext, env, config) => child match {
                    case _: SimpleName => (astName, newContext)
                    case _ => extractAST(child, curContext, env, config)
                }
            )
            (AstNode(name=nodeName, children=children), context) // close scope
        }
        case e: MethodCallExpr => {
            val scope = e.getScope
            val methodName = e.getNameAsString
            val astName = if (!scope.isPresent || scope.get.toString == "this") {
                // The possibility of self-recursion is considered.
                aliasNode(context.lookup(MethodNameSpace, methodName))
            } else {
                extractAST(e.getName, context, env, config)._1
            }
            
            val (children, _) = extractAstList(
                e.getChildNodes, 
                context, 
                env,
                config,
                (child, curContext, env, config) => child match {
                    case _: SimpleName => (astName, curContext)
                    case _ => extractAST(child, curContext, env, config)
                }
            )

            val ast = AstNode(name=nodeName, children=children)
            (ast, context) // close scope
        }
        case s: LabeledStmt => {
            // Anonymize label name
            val label = s.getLabel.asString
            val alias = env.labels.getVarAndIncrement(label)
            val astName = aliasNode(alias.id)
            val newContext = ParseContext(alias :: context.vars)
            val (children, newContext2) = extractAstList(
                s.getChildNodes, 
                context, 
                env,
                config,
                (child, curContext, env, config) => child match {
                    case _: SimpleName => (astName, newContext)
                    case _ => extractAST(child, curContext, env, config)
                }
            )
            (AstNode(name=nodeName, children=children), newContext2)
        }
        case s: BreakStmt => {
            val label = s.getLabel
            jumpNode(if (label.isPresent) Some(label.get.asString) else None)
        }
        case s: ContinueStmt => {
            val label = s.getLabel
            jumpNode(if (label.isPresent) Some(label.get.asString) else None)
        }
        case e: ConditionalExpr => {
            // The condition gets a wrapper node; the two branches are direct children.
            (AstNode(name=nodeName, children=List(
                AstNode(name="Condition", children=List(extractAST(e.getCondition, context, env, config)._1)),
                extractAST(e.getThenExpr, context, env, config)._1,
                extractAST(e.getElseExpr, context, env, config)._1
            )), context)
        }
        // Scope-like nodes. MethodDeclaration is not listed: it is always handled
        // by its own case above.
        case _: BlockStmt |
             _: LambdaExpr |
             _: ConstructorDeclaration |
             _: ClassOrInterfaceDeclaration |
             _: EnumDeclaration |
             _: EnumConstantDeclaration |
             _: AnnotationDeclaration |
             _: AnnotationMemberDeclaration |
             _: TryStmt |
             _: CatchClause => {
                    
            val (children, _) = extractAstList(node.getChildNodes(), context, env, config)
            val ast = AstNode(name=nodeName, children=children)
            (ast, context) // close scope
        }
        case _ => {
            val (children, newContext) = extractAstList(node.getChildNodes(), context, env, config)
            val ast = if (!node.getChildNodes().isEmpty) {
                AstNode(name=nodeName, children=children)
            } else {
                node match {
                    case _: Expression | _: Name | _: SimpleName | _: Type | _: ArrayCreationLevel =>
                        // terminal: printed text of the node, with comments suppressed
                        AstNode(name=nodeName, terminal=Some(node.toString(prettyPrintConfig)))
                    // Statements that may legitimately have no children: "return;",
                    // "default:" and ";". break/continue never get here, they are
                    // handled by their own cases above.
                    case _: ReturnStmt | _: SwitchEntryStmt | _: EmptyStmt =>
                        AstNode(name=nodeName, children=children)
                    case _ =>
                        throw new IllegalStateException(node.getClass.getName)
                }
            }
            (ast, newContext)
        }
    }
}

In [None]:
import scala.collection.mutable.{LinkedHashMap => MMap}

/**
 * Vocabularies built up during extraction. Each one maps a string to the index
 * it was given the first time it was seen; indices start at 1 and the maps keep
 * insertion order.
 *
 *  - `terminals`: terminal symbols, stored in lower case
 *  - `paths`: path strings
 *  - `nodes`: AST node names
 */
class Vocabs() {
    val terminals = MMap.empty[String, Int]
    val paths = MMap.empty[String, Int]
    val nodes = MMap.empty[String, Int]
    var nextTerminalIndex = 1
    var nextPathIndex = 1
    var nextNodeIndex = 1
    
    /**
     * Returns the index of `terminal`, registering it on first use.
     *
     * Terminals are lower-cased to reduce the vocabulary size. The conversion
     * uses Locale.ROOT so that the resulting vocabulary does not depend on the
     * default locale of the JVM (in a Turkish locale "I" would otherwise become
     * a dotless "ı").
     */
    def getTerminalIndex(terminal: String): Int = {
        val key = terminal.toLowerCase(java.util.Locale.ROOT)
        terminals.getOrElseUpdate(key, {
            val index = nextTerminalIndex
            nextTerminalIndex += 1
            index
        })
    }
    
    /** Returns the index of `path`, registering it on first use. */
    def getPathIndex(path: String): Int = {
        paths.getOrElseUpdate(path, {
            val index = nextPathIndex
            nextPathIndex += 1
            index
        })
    }
    
    /** Returns the index of the node name `node`, registering it on first use. */
    def getNodeIndex(node: String): Int = {
        nodes.getOrElseUpdate(node, {
            val index = nextNodeIndex
            nextNodeIndex += 1
            index
        })
    }

}

In [None]:
// Path from a node up to the root: (node, index of the node among its siblings),
// nearest node first.
type PathToRoot = List[(AstNode, Int)]

/**
 * Collects the terminals of the subtree rooted at `ast`, left to right.
 *
 * Every visited node name is registered in `vocabs.nodes` and every terminal
 * in `vocabs.terminals`. A node that carries a terminal is reported as is; its
 * children, if any, are not visited.
 *
 * The (node, index) entry of a child is created once and shared by the paths
 * of all terminals below it; getPath relies on that when it compares path
 * entries with `eq`.
 *
 * @param ast        root of the subtree to search
 * @param pathToRoot path from `ast` up to the root, `ast` first
 * @param vocabs     vocabularies to update
 * @return for each terminal: the node, its path to the root and its terminal index
 */
def findTerminal(ast: AstNode, pathToRoot: PathToRoot, vocabs: Vocabs): List[(AstNode, PathToRoot, Int)] = {
    vocabs.getNodeIndex(ast.name)
    ast.terminal match {
        case Some(text) =>
            List((ast, pathToRoot, vocabs.getTerminalIndex(text)))
        case None =>
            for {
                (child, position) <- ast.children.zipWithIndex
                found <- findTerminal(child, (child, position) :: pathToRoot, vocabs)
            } yield found
    }
}

In [None]:
// Direction taken when leaving a node on a path between two terminals. `name` is
// the symbol appended to the node name when the path is rendered as a string.
object Direction {
    sealed abstract class Direction(val name: String)
    // moving from a node towards the root
    case object Up extends Direction("↑")
    // moving from a node towards a leaf
    case object Down extends Direction("↓")
    // final node of the path: nothing is appended
    case object Last extends Direction("")
}

// Path from the root down to a node: (node, index of the node among its siblings),
// root first.
type PathFromRoot = List[(AstNode, Int)]

/**
 * Builds the path that connects two terminals through their lowest common
 * ancestor (the "hinge").
 *
 * The result lists the nodes from the start terminal up to just below the
 * hinge (Direction.Up), the hinge itself and the nodes below it on the way to
 * the end terminal (Direction.Down), and finally the end terminal
 * (Direction.Last).
 *
 * Path entries are compared by reference (`eq`), so both paths have to share
 * the entries of their common ancestors, as the paths produced by
 * findTerminal do.
 *
 * @param startPathFromRoot path of the start terminal, root first
 * @param endPathFromRoot   path of the end terminal, root first
 * @param maxLength         maximum number of nodes on the path
 * @param maxWidth          maximum distance between the sibling indices of the
 *                          two branches directly below the hinge
 * @return the path, or an empty list when it exceeds `maxLength` or `maxWidth`
 */
def getPath(
        startPathFromRoot: PathFromRoot, 
        endPathFromRoot: PathFromRoot,
        maxLength: Int,
        maxWidth: Int
    ): List[(AstNode, Direction.Direction)] = {

    assert(!startPathFromRoot.isEmpty)
    assert(!endPathFromRoot.isEmpty)
    assert(startPathFromRoot.head eq endPathFromRoot.head)

    var hinge = startPathFromRoot.head
    var startPath = startPathFromRoot.tail
    var endPath = endPathFromRoot.tail
    
    // Strip the common prefix. The emptiness checks keep `.head` from throwing
    // when one path is a prefix of the other (or both are the same); that case
    // is then reported by the assertions below.
    while (startPath.nonEmpty && endPath.nonEmpty && (startPath.head eq endPath.head)) {
        hinge = startPath.head
        startPath = startPath.tail
        endPath = endPath.tail
    }
    
    assert(!startPath.isEmpty)
    assert(!endPath.isEmpty)

    val tooWide =
        startPath.head._2 - endPath.head._2 > maxWidth ||
        endPath.head._2 - startPath.head._2 > maxWidth
    // + 1 accounts for the hinge
    val tooLong = startPath.size + endPath.size + 1 > maxLength

    if (tooWide || tooLong) {
        List.empty
    } else {
        startPath.reverse.map{ case (n, _) => (n, Direction.Up) } ++ 
        List((hinge._1, Direction.Down)) ++ 
        endPath.init.map{ case (n, _) => (n, Direction.Down) } ++
        List((endPath.last._1, Direction.Last))
    }
}

In [None]:
// One path context: terminal index of the start, path index, terminal index of the end.
case class Feature(start: Int, path: Int, end: Int)

/**
 * Extracts the path contexts of the methods of `cu` that are selected by
 * `methodName`.
 *
 * A method is selected when `methodName` is "*" or equals its name ignoring
 * case, and isIgnorableMethod does not reject it. For each selected method
 * every pair of terminals (in left-to-right order) that getPath accepts yields
 * one Feature; terminals and paths are registered in `vocabs` on the way.
 *
 * @param cu         compilation unit to search
 * @param methodName method name to look for, or "*" for every method
 * @param vocabs     vocabularies to update
 * @param maxLength  maximum path length, passed to getPath
 * @param maxWidth   maximum path width, passed to getPath
 * @param config     extraction settings
 * @return for each selected method: its features, the alias environment used
 *         for it, its name and its declaration
 */
def extractFeature(cu: CompilationUnit, methodName: String, vocabs: Vocabs, maxLength: Int, maxWidth: Int, config: ExtractConfig): 
        List[(List[Feature], VarEnv, String, MethodDeclaration)] = {
    val wantedName = methodName.toLowerCase

    def isSelected(m: MethodDeclaration): Boolean =
        (methodName == "*" || m.getNameAsString.toLowerCase == wantedName) && !isIgnorableMethod(m)

    cu.findAll(classOf[MethodDeclaration]).filter(isSelected _).map { declaration =>
        val label = declaration.getNameAsString
        val env = new VarEnv
        val (ast, _) = extractAST(declaration, ParseContext(List.empty), env, config)

        // (path from the root, terminal index) of every terminal, left to right
        val terminals = findTerminal(ast, List((ast, 0)), vocabs).map {
            case (_, pathToRoot, terminalIndex) => (pathToRoot.reverse, terminalIndex)
        }

        // pair every terminal with each terminal to its right
        val features = terminals.zipWithIndex.flatMap {
            case ((startPath, startTermIndex), position) =>
                terminals.drop(position + 1).flatMap {
                    case (endPath, endTermIndex) =>
                        val path = getPath(startPath, endPath, maxLength, maxWidth)
                        if (path.isEmpty) {
                            List.empty
                        } else {
                            val pathSymbol = path.map{ case (node, dir) => s"${node.name}${dir.name}" }.mkString
                            List(Feature(startTermIndex, vocabs.getPathIndex(pathSymbol), endTermIndex))
                        }
                }
        }
        (features, env, label, declaration)
    }.toList
}

// Demo: extract the path contexts of the method "hoge" of the sample compilation
// unit and print them together with the aliases and the vocabularies.
val vocabs = new Vocabs
val methodName = "hoge"
extractFeature(cu, methodName, vocabs, maxLength=6, maxWidth=3, config=ExtractConfig()).foreach {
    case (features, env, methodName, methodDeclaration) =>
    println(s"method:${methodDeclaration}")
    println(s"label:${methodName}")
    println("paths:")
    println(features.map{ case Feature(s, p, e) => s"${s} ${p} ${e}" }.mkString("\n"))
    println("vars:")
    println(env.vars.variables.map{ case Variable(id,name,_) => s"${name}\t${id}" }.mkString("\n"))
}

// terminal vocabulary: index <TAB> terminal
vocabs.terminals.foreach { case (k, v) => println(s"${v}\t${k}") }

// path vocabulary: index <TAB> path
vocabs.paths.foreach { case (k, v) => println(s"${v}\t${k}") }


In [None]:
import better.files._
import File._

/**
 * Builds the dataset files in `datasetDir` from the methods listed in
 * `datasetDir/methods.txt`.
 *
 * Each line of methods.txt is "<java file>\t<method name>", the file being
 * relative to `sourceCodeDir`. For every listed method that extractFeature
 * returns, one entry is written to corpus.txt and one line to
 * actual_methods.txt (and, when requested, the declaration to the method
 * declarations file). Afterwards the terminal and path vocabularies are
 * written to terminal_idxs.txt and path_idxs.txt, and a summary to params.txt.
 *
 * Lines that cannot be processed are reported on stdout and skipped: malformed
 * lines, missing source files, sources that do not parse and methods that are
 * not found. Any other exception aborts the run; the output files are closed
 * in that case too.
 *
 * @param datasetDir                 directory holding methods.txt; receives the output
 * @param sourceCodeDir              root directory of the Java sources
 * @param methodDeclarationsFileName name of the optional file (inside `datasetDir`)
 *                                   that receives the source of each extracted method
 * @param maxLength                  maximum path length, passed to extractFeature
 * @param maxWidth                   maximum path width, passed to extractFeature
 * @param extractConfig              extraction settings
 */
def createDataset(datasetDir: File, sourceCodeDir: File, methodDeclarationsFileName: Option[String], maxLength: Int, maxWidth: Int, extractConfig: ExtractConfig): Unit = {
    import scala.collection.mutable.{LinkedHashMap => MMap}

    // Runs `body` with `resource` and closes the resource afterwards, also when `body` throws.
    def closing[W <: java.io.Closeable, R](resource: W)(body: W => R): R =
        try body(resource) finally resource.close()

    val methodListFile = datasetDir / "methods.txt"
    val corpusFileName = datasetDir / "corpus.txt"
    val pathIndexFileName = datasetDir / "path_idxs.txt"
    val terminalIndexFileName = datasetDir / "terminal_idxs.txt"
    val paramsFileName = datasetDir / "params.txt"
    val actualMethodListFile = datasetDir / "actual_methods.txt"
    val methodDeclarationsPath = methodDeclarationsFileName.map{ filename => datasetDir / filename }

    val lines = methodListFile.lines
    println(s"method list:${lines.size}")

    var idCounter = 0

    val vocabs = new Vocabs
    // number of extracted methods per method name
    val methodNames = MMap.empty[String, Int]

    // Consecutive lines usually refer to the same file: keep the last parsed one.
    var lastParsed: Option[(String, CompilationUnit)] = None
    def compilationUnitOf(javaFileName: String): CompilationUnit = lastParsed match {
        case Some((cachedName, cachedUnit)) if cachedName == javaFileName => cachedUnit
        case _ =>
            val content = (sourceCodeDir / javaFileName).contentAsString
            val cu = JavaParser.parse(content)
            lastParsed = Some((javaFileName, cu))
            cu
    }

    closing(corpusFileName.newBufferedWriter) { corpusWriter =>
        closing(actualMethodListFile.newBufferedWriter) { actualMethodListWriter =>
            val methodDeclarationsWriter = methodDeclarationsPath.map(_.newBufferedWriter)
            try {
                for (line <- lines) {
                    line.split("\t") match {
                        case Array(javaFileName, methodName) =>
                            try {
                                val cu = compilationUnitOf(javaFileName)

                                val featureEnvPairs = extractFeature(cu, methodName.toLowerCase, vocabs, 
                                                                     maxLength=maxLength, maxWidth=maxWidth, config=extractConfig)
                                featureEnvPairs.foreach {
                                    case (features, env, methodName, methodDeclaration) =>
                                    val corpusId = idCounter
                                    idCounter += 1

                                    corpusWriter.write(s"#${corpusId}\n")

                                    corpusWriter.write(s"label:${methodName}\n")
                                    corpusWriter.write(s"class:${javaFileName}\n")
                                    corpusWriter.write("paths:\n")
                                    features.foreach{ case Feature(s, p, e) => corpusWriter.write(s"${s}\t${p}\t${e}\n") }
                                    corpusWriter.write("vars:\n")
                                    env.vars.variables.foreach{ case Variable(id,name,_) => corpusWriter.write(s"${name}\t${id}\n") }
                                    env.labels.variables.foreach{ case Variable(id,name,_) => corpusWriter.write(s"${name}\t${id}\n") }
                                    corpusWriter.write("\n")

                                    actualMethodListWriter.write(s"${javaFileName}\t${methodName}\t${corpusId}\t${features.size}\n")

                                    methodDeclarationsWriter.foreach {
                                        writer => 
                                        writer.write(s"#${corpusId}\t${javaFileName}#${methodName}\n${methodDeclaration}\n\n")
                                    }

                                    // first occurrence counts as 1
                                    methodNames(methodName) = methodNames.getOrElse(methodName, 0) + 1
                                }
                                if (featureEnvPairs.isEmpty && methodName != "*") {
                                    println("WARNING: method not found. " + line)
                                }
                            } catch {
                                case c: java.nio.file.NoSuchFileException => {
                                    println("WARNING: file not found. " + javaFileName)
                                }
                                case e: com.github.javaparser.ParseProblemException => {
                                    println("ERROR: parse error. " + line)
                                }
                            }
                        case _ =>
                            // e.g. a blank line or a line with too many fields
                            println("WARNING: malformed line. " + line)
                    }
                }
            } finally {
                methodDeclarationsWriter.foreach{ _.close() }
            }
        }
    }

    // Index 0 is written as the padding entry; Vocabs hands out indices from 1.
    closing(terminalIndexFileName.newBufferedWriter) { termIndexWriter =>
        termIndexWriter.write("0\t<PAD/>\n")
        vocabs.terminals.foreach { case (k, v) => termIndexWriter.write(s"${v}\t${k}\n") }
    }

    closing(pathIndexFileName.newBufferedWriter) { pathIndexWriter =>
        pathIndexWriter.write("0\t<PAD/>\n")
        vocabs.paths.foreach { case (k, v) => pathIndexWriter.write(s"${v}\t${k}\n") }
    }

    // NOTE(review): the "nomalize_*" keys are misspelled, but are kept as they are
    // because whatever reads params.txt presumably expects exactly these keys.
    val params = s"""max_length: ${maxLength}
max_width: ${maxWidth}
nomalize_string_literal: ${extractConfig.isNormalizeStringLiteral}
nomalize_char_literal: ${extractConfig.isNormalizeCharLiteral}
nomalize_int_literal: ${extractConfig.isNormalizeIntLiteral}
nomalize_double_literal: ${extractConfig.isNormalizeDoubleLiteral}
terminal_vocab_count: ${vocabs.terminals.size}
path_vocab_count: ${vocabs.paths.size}
method_count: ${idCounter}
method_name_vocab_count: ${methodNames.size}
"""
    paramsFileName.writeText(params)
    println(params)
}

In [None]:
// Driver cell: builds the dataset.
// NOTE(review): both directories are absolute paths on the author's machine and
// have to be adapted before this cell is run elsewhere.
val datasetDir = File("/Users/isao/Documents/Development/MySoftware/NLP/code2vec-preprocess/test-dataset")
val sourceCodeDir = File("/Users/isao/Documents/Development/MySoftware/NLP/code2vec/raw_data/apache")
// The source of every extracted method is additionally written to this file inside datasetDir.
val methodDeclarationsFileName = Some("method_declarations.txt")

val MAX_PATH_LENGTH = 8
val MAX_PATH_WIDTH = 3
// Only string and character literals are normalized here; unlike the ExtractConfig
// default, floating-point literals are kept as written.
val extractConfig = ExtractConfig(
    isNormalizeStringLiteral = true,
    isNormalizeCharLiteral = true,
    isNormalizeIntLiteral = false,
    isNormalizeDoubleLiteral = false
)

createDataset(datasetDir, sourceCodeDir, methodDeclarationsFileName, 
              maxLength=MAX_PATH_LENGTH, maxWidth=MAX_PATH_WIDTH, extractConfig=extractConfig)