sourcerer-io · yaronskaya · Oct 16, 2017 · Sep 20, 2017 · Oct 13, 2017 · Oct 16, 2017
diff --git a/build.gradle b/build.gradle
@@ -64,6 +64,7 @@ dependencies {
     compile group: 'org.eclipse.jgit', name: 'org.eclipse.jgit',
             version: '4.8.0.201706111038-r'
     compile "org.slf4j:slf4j-nop:1.7.2"
+    compile 'org.jpmml:pmml-evaluator:1.3.9'
 
     testCompile 'org.jetbrains.kotlin:kotlin-test'
     testCompile 'org.jetbrains.spek:spek-api:1.1.4'

diff --git a/src/main/kotlin/app/extractors/CSharpExtractor.kt b/src/main/kotlin/app/extractors/CSharpExtractor.kt
@@ -9,9 +9,10 @@ import app.model.DiffFile
 
 class CSharpExtractor : ExtractorInterface {
     companion object {
-        val LANGUAGE_NAME = "cs"
+        val LANGUAGE_NAME = "csharp"
         val FILE_EXTS = listOf("cs")
         val LIBRARIES = ExtractorInterface.getLibraries("cs")
+        val evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME)
     }
 
     override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -37,4 +38,18 @@ class CSharpExtractor : ExtractorInterface {
 
         return imports.toList()
     }
+
+    /**
+     * Tokenizes a C# source line after removing import and comment text.
+     *
+     * Everything from the start of the input through a `using <namespace>`
+     * directive is removed (text after the namespace, such as `;`, is kept).
+     * If the remaining text contains `//`, that whole line is removed. The
+     * result is passed to the default tokenizer.
+     */
+    override fun tokenize(line: String): List<String> {
+        // A namespace is `\w+` followed by any number of `.\w+` segments.
+        // Written as a group on purpose: inside a character class the `+`
+        // would be a literal plus sign instead of a quantifier.
+        val importRegex = Regex("""^.*using\s+(\w+(?:\.\w+)*)""")
+        val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
+        var newLine = importRegex.replace(line, "")
+        newLine = commentRegex.replace(newLine, "")
+        return super.tokenize(newLine)
+    }
+
+    /**
+     * Delegates to the four-argument overload, supplying this extractor's
+     * companion [evaluator] and [LANGUAGE_NAME].
+     */
+    override fun getLineLibraries(line: String,
+                                  fileLibraries: List<String>): List<String> =
+        super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
 }
diff --git a/src/main/kotlin/app/extractors/CppExtractor.kt b/src/main/kotlin/app/extractors/CppExtractor.kt
@@ -11,6 +11,7 @@ class CppExtractor : ExtractorInterface {
     companion object {
         val LANGUAGE_NAME = "cpp"
         val FILE_EXTS = listOf("cc", "cpp", "cxx", "c++")
+        val evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME)
     }
 
     override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -32,4 +33,18 @@ class CppExtractor : ExtractorInterface {
 
         return imports.toList()
     }
+
+    /**
+     * Tokenizes a C++ source line after blanking include and comment lines.
+     *
+     * A line containing `#include` followed by a whitespace character, or
+     * containing `//`, is removed entirely before the default tokenizer runs.
+     */
+    override fun tokenize(line: String): List<String> {
+        val includePattern = Regex("""^([^\n]*#include)\s[^\n]*""")
+        val slashCommentPattern = Regex("""^([^\n]*//)[^\n]*""")
+        val cleaned = line.replace(includePattern, "")
+            .replace(slashCommentPattern, "")
+        return super.tokenize(cleaned)
+    }
+
+    /**
+     * Delegates to the four-argument overload, supplying this extractor's
+     * companion [evaluator] and [LANGUAGE_NAME].
+     */
+    override fun getLineLibraries(line: String,
+                                  fileLibraries: List<String>): List<String> =
+        super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
 }
diff --git a/src/main/kotlin/app/extractors/Extractor.kt b/src/main/kotlin/app/extractors/Extractor.kt
@@ -10,7 +10,8 @@ import app.model.DiffFile
 class Extractor : ExtractorInterface {
     companion object {
         val TYPE_LANGUAGE = 1
-        val TYPE_KEYWORD = 2
+        val TYPE_LIBRARY = 2
+        val TYPE_KEYWORD = 3
         val SEPARATOR = ">"
     }
 

diff --git a/src/main/kotlin/app/extractors/ExtractorInterface.kt b/src/main/kotlin/app/extractors/ExtractorInterface.kt
@@ -6,6 +6,12 @@ package app.extractors
 
 import app.model.DiffFile
 import app.model.CommitStats
+import org.dmg.pmml.FieldName
+import org.jpmml.evaluator.Evaluator
+import org.jpmml.evaluator.FieldValue
+import org.jpmml.evaluator.ModelEvaluatorFactory
+import org.jpmml.evaluator.ProbabilityDistribution
+import org.jpmml.model.PMMLUtil
 
 interface ExtractorInterface {
     companion object {
@@ -14,6 +20,14 @@ interface ExtractorInterface {
                 .getResourceAsStream("data/libraries/${name}_libraries.txt")
                 .bufferedReader().readLines().toSet()
         }
+        /**
+         * Loads the PMML model stored on the classpath at
+         * `data/models/<name>.pmml` and builds an [Evaluator] for it.
+         *
+         * @param name model file name without the `.pmml` extension.
+         * @return evaluator created by a new [ModelEvaluatorFactory].
+         * @throws IllegalArgumentException if the model resource is missing.
+         */
+        fun getLibrariesModelEvaluator(name: String): Evaluator {
+            val path = "data/models/$name.pmml"
+            val stream = requireNotNull(
+                ExtractorInterface::class.java.classLoader
+                    .getResourceAsStream(path)) {
+                "PMML model resource not found: $path"
+            }
+            // `use` closes the resource stream once the model is unmarshalled,
+            // including when unmarshalling throws.
+            val pmml = stream.use { PMMLUtil.unmarshal(it) }
+            return ModelEvaluatorFactory.newInstance().newModelEvaluator(pmml)
+        }
     }
 
     fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -23,6 +37,57 @@ interface ExtractorInterface {
             file
         }
 
+        // Every import that appears on the old / new side of any file.
+        val oldFilesImports =
+            files.flatMapTo(mutableSetOf<String>()) { it.old.imports }
+        val newFilesImports =
+            files.flatMapTo(mutableSetOf<String>()) { it.new.imports }
+
+        // Start every known import at zero so that each one has an entry.
+        val oldLibraryToCount =
+            oldFilesImports.associateTo(mutableMapOf<String, Int>()) { it to 0 }
+        val newLibraryToCount =
+            newFilesImports.associateTo(mutableMapOf<String, Int>()) { it to 0 }
+
+        files.filter { file -> file.language.isNotBlank() }
+            .forEach { file ->
+                // Libraries reported by getLineLibraries for each old line.
+                val oldFileLibraries = file.old.content.flatMap { line ->
+                    getLineLibraries(line, file.old.imports)
+                }
+                file.old.imports.forEach { import ->
+                    val numLines = oldFileLibraries.count { it == import }
+                    oldLibraryToCount[import] =
+                        oldLibraryToCount.getOrDefault(import, 0) + numLines
+                }
+
+                // Same accounting for the new side of the file.
+                val newFileLibraries = file.new.content.flatMap { line ->
+                    getLineLibraries(line, file.new.imports)
+                }
+                file.new.imports.forEach { import ->
+                    val numLines = newFileLibraries.count { it == import }
+                    newLibraryToCount[import] =
+                        newLibraryToCount.getOrDefault(import, 0) + numLines
+                }
+            }
+
+        val allImports = oldFilesImports + newFilesImports
+
+        // Lines attributed to a library in the new content are additions;
+        // lines attributed in the old content are deletions.
+        val libraryStats = allImports.map {
+            CommitStats(
+                numLinesAdded = newLibraryToCount.getOrDefault(it, 0),
+                numLinesDeleted = oldLibraryToCount.getOrDefault(it, 0),
+                type = Extractor.TYPE_LIBRARY,
+                tech = it)
+        }
+
+
         return files.filter { file -> file.language.isNotBlank() }
                     .groupBy { file -> file.language }
                     .map { (language, files) -> CommitStats(
@@ -31,12 +96,59 @@ interface ExtractorInterface {
                         numLinesDeleted = files.fold(0) { total, file ->
                             total + file.getAllDeleted().size },
                         type = Extractor.TYPE_LANGUAGE,
-                        tech = language)}
+                        tech = language)} + libraryStats
     }
 
     fun extractImports(fileContent: List<String>): List<String> {
         return listOf()
     }
 
+    /**
+     * Splits a source line into tokens.
+     *
+     * Text between a pair of double or single quotes is removed first; the
+     * rest is split on whitespace and punctuation. Blank pieces, pieces that
+     * still contain a quote character, and the pieces "-" and "@" are dropped.
+     */
+    fun tokenize(line: String): List<String> {
+        val quotedLiteral = Regex("""(".+?"|'.+?')""")
+        //TODO(lyaronskaya): multiline comment regex
+        val delimiters =
+            Regex("""\s|,|;|\*|\n|\(|\)|\[|]|\{|}|\+|=|&|\$|!=|\.|>|<|#|@|:|\?|!""")
+        return line.replace(quotedLiteral, "")
+            .split(delimiters)
+            .filterNot { piece ->
+                piece.isBlank() || '"' in piece || '\'' in piece ||
+                    piece == "-" || piece == "@"
+            }
+    }
+
+    /**
+     * Default implementation: reports no libraries for any line.
+     * Extractors in this file override it to delegate to the overload that
+     * takes an evaluator.
+     */
+    fun getLineLibraries(line: String, fileLibraries: List<String>): List<String> =
+        listOf()
+
+    /**
+     * Picks the libraries from [fileLibraries] that the model associates
+     * with [line].
+     *
+     * The line is tokenized, the tokens are joined with spaces and fed to
+     * every input field of [evaluator]. The first target field's result is
+     * read as a probability distribution over categories.
+     *
+     * @param line source line to classify.
+     * @param fileLibraries libraries imported by the file containing the line.
+     * @param evaluator model evaluator used for the prediction.
+     * @param languageLabel category that stands for "plain language code".
+     * @return the entries of [fileLibraries] whose category has a probability
+     *         of at least 10% of the highest one; an empty list when the most
+     *         probable category is [languageLabel] or when the model reports
+     *         no categories.
+     */
+    fun getLineLibraries(line: String,
+                          fileLibraries: List<String>,
+                          evaluator: Evaluator,
+                          languageLabel: String): List<String> {
+        // Tokenize once: every input field receives the same prepared text.
+        val tokenizedLine = tokenize(line).joinToString(separator = " ")
+        val arguments = LinkedHashMap<FieldName, FieldValue>()
+        for (inputField in evaluator.inputFields) {
+            arguments.put(inputField.name, inputField.prepare(tokenizedLine))
+        }
+        val result = evaluator.evaluate(arguments)
+
+        val targetFieldName = evaluator.targetFields[0].name
+        val targetFieldValue = result[targetFieldName] as ProbabilityDistribution
+
+        val categoryValues = targetFieldValue.categoryValues.toList()
+        if (categoryValues.isEmpty()) {
+            // Nothing to choose from; avoids taking the maximum of nothing.
+            return emptyList()
+        }
+        val probabilities = categoryValues.map { targetFieldValue.getProbability(it) }
+
+        // Index of the first category that holds the highest probability.
+        var bestIndex = 0
+        for (index in 1 until probabilities.size) {
+            if (probabilities[index] > probabilities[bestIndex]) {
+                bestIndex = index
+            }
+        }
+        if (categoryValues[bestIndex] == languageLabel) {
+            return emptyList()
+        }
+
+        val maxProbability = probabilities[bestIndex]
+        val selectedCategories = categoryValues.filterIndexed { index, _ ->
+            probabilities[index] >= 0.1 * maxProbability
+        }
+        return fileLibraries.filter { it in selectedCategories }
+    }
 }
diff --git a/src/main/kotlin/app/extractors/GoExtractor.kt b/src/main/kotlin/app/extractors/GoExtractor.kt
@@ -11,6 +11,7 @@ class GoExtractor : ExtractorInterface {
     companion object {
         val LANGUAGE_NAME = "go"
         val FILE_EXTS = listOf("go")
+        val evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME)
     }
 
     override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -41,4 +42,18 @@ class GoExtractor : ExtractorInterface {
 
         return imports.toList()
     }
+
+    /**
+     * Tokenizes a Go source line after blanking import and comment lines.
+     *
+     * A line containing `import` followed by a whitespace character, or
+     * containing `//`, is removed entirely before the default tokenizer runs.
+     */
+    override fun tokenize(line: String): List<String> {
+        val importPattern = Regex("""^(.*import)\s[^\n]*""")
+        val slashCommentPattern = Regex("""^([^\n]*//)[^\n]*""")
+        val cleaned = line.replace(importPattern, "")
+            .replace(slashCommentPattern, "")
+        return super.tokenize(cleaned)
+    }
+
+    /**
+     * Delegates to the four-argument overload, supplying this extractor's
+     * companion [evaluator] and [LANGUAGE_NAME].
+     */
+    override fun getLineLibraries(line: String,
+                                  fileLibraries: List<String>): List<String> =
+        super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
 }
diff --git a/src/main/kotlin/app/extractors/JavaExtractor.kt b/src/main/kotlin/app/extractors/JavaExtractor.kt
@@ -20,6 +20,7 @@ class JavaExtractor : ExtractorInterface {
             "extends", "int", "short", "try", "char", "final", "interface",
             "static", "void", "class", "finally", "long", "strictfp",
             "volatile", "const", "float", "native", "super", "while")
+        val evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME)
     }
 
     override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -72,4 +73,18 @@ class JavaExtractor : ExtractorInterface {
 
         return imports.toList()
     }
+
+    /**
+     * Tokenizes a Java source line after blanking import and comment lines.
+     *
+     * A line containing `import` followed by a whitespace character, or
+     * containing `//`, is removed entirely before the default tokenizer runs.
+     */
+    override fun tokenize(line: String): List<String> {
+        val importPattern = Regex("""^(.*import)\s[^\n]*""")
+        val slashCommentPattern = Regex("""^([^\n]*//)[^\n]*""")
+        val cleaned = line.replace(importPattern, "")
+            .replace(slashCommentPattern, "")
+        return super.tokenize(cleaned)
+    }
+
+    /**
+     * Delegates to the four-argument overload, supplying this extractor's
+     * companion [evaluator] and [LANGUAGE_NAME].
+     */
+    override fun getLineLibraries(line: String,
+                                  fileLibraries: List<String>): List<String> =
+        super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
 }
diff --git a/src/main/kotlin/app/extractors/JavascriptExtractor.kt b/src/main/kotlin/app/extractors/JavascriptExtractor.kt
@@ -9,9 +9,10 @@ import app.model.DiffFile
 
 class JavascriptExtractor : ExtractorInterface {
     companion object {
-        val LANGUAGE_NAME = "js"
+        val LANGUAGE_NAME = "javascript"
         val FILE_EXTS = listOf("js")
         val LIBRARIES = ExtractorInterface.getLibraries("js")
+        val evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME)
     }
 
     override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -30,4 +31,10 @@ class JavascriptExtractor : ExtractorInterface {
 
         return imports.toList()
     }
+
+    /**
+     * Delegates to the four-argument overload, supplying this extractor's
+     * companion [evaluator] and [LANGUAGE_NAME].
+     */
+    override fun getLineLibraries(line: String,
+                                  fileLibraries: List<String>): List<String> =
+        super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
 }
diff --git a/src/main/kotlin/app/extractors/ObjectiveCExtractor.kt b/src/main/kotlin/app/extractors/ObjectiveCExtractor.kt
@@ -11,6 +11,7 @@ class ObjectiveCExtractor : ExtractorInterface {
     companion object {
         val LANGUAGE_NAME = "objectivec"
         val FILE_EXTS = listOf("m", "mm")
+        val evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME)
     }
 
     override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -36,4 +37,18 @@ class ObjectiveCExtractor : ExtractorInterface {
 
         return imports.toList()
     }
+
+    /**
+     * Tokenizes an Objective-C source line after blanking import and comment
+     * lines.
+     *
+     * A line containing `import` or `include` prefixed by `#` or `@` and
+     * followed by a whitespace character, or containing `//`, is removed
+     * entirely before the default tokenizer runs.
+     */
+    override fun tokenize(line: String): List<String> {
+        val importPattern = Regex("""^([^\n]*[#@](import|include))\s[^\n]*""")
+        val slashCommentPattern = Regex("""^([^\n]*//)[^\n]*""")
+        val cleaned = line.replace(importPattern, "")
+            .replace(slashCommentPattern, "")
+        return super.tokenize(cleaned)
+    }
+
+    /**
+     * Delegates to the four-argument overload, supplying this extractor's
+     * companion [evaluator] and [LANGUAGE_NAME].
+     */
+    override fun getLineLibraries(line: String,
+                                  fileLibraries: List<String>): List<String> =
+        super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
 }
diff --git a/src/main/kotlin/app/extractors/PhpExtractor.kt b/src/main/kotlin/app/extractors/PhpExtractor.kt
@@ -11,6 +11,7 @@ class PhpExtractor : ExtractorInterface {
     companion object {
         val LANGUAGE_NAME = "php"
         val FILE_EXTS = listOf("php", "phtml", "php4", "php3", "php5", "phps")
+        val evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME)
     }
 
     override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -34,4 +35,18 @@ class PhpExtractor : ExtractorInterface {
 
         return imports.toList()
     }
+
+    /**
+     * Tokenizes a PHP source line after blanking import and comment lines.
+     *
+     * A line is removed entirely when it contains `require`, `require_once`,
+     * `include` or `include_once` followed by a whitespace character, when
+     * it starts (after optional indentation) with `use` followed by a
+     * whitespace character, or when it contains `//`. The remaining text is
+     * passed to the default tokenizer.
+     */
+    override fun tokenize(line: String): List<String> {
+        // The keyword alternatives are grouped so the leading `.*` applies to
+        // the whole require/include family, not only to `require`. `use` is
+        // matched at the start of the line only (indentation allowed), so a
+        // closure's `use (...)` clause in the middle of a line is kept.
+        val importRegex =
+            Regex("""^(.*(require|include)(_once)?|\s*use)\s[^\n]*""")
+        val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
+        var newLine = importRegex.replace(line, "")
+        newLine = commentRegex.replace(newLine, "")
+        return super.tokenize(newLine)
+    }
+
+    /**
+     * Delegates to the four-argument overload, supplying this extractor's
+     * companion [evaluator] and [LANGUAGE_NAME].
+     */
+    override fun getLineLibraries(line: String,
+                                  fileLibraries: List<String>): List<String> =
+        super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
 }
diff --git a/src/main/kotlin/app/extractors/PythonExtractor.kt b/src/main/kotlin/app/extractors/PythonExtractor.kt
@@ -11,6 +11,7 @@ class PythonExtractor : ExtractorInterface {
     companion object {
         val LANGUAGE_NAME = "python"
         val FILE_EXTS = listOf("py", "py3")
+        val evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME)
     }
 
     override fun extract(files: List<DiffFile>): List<CommitStats> {
@@ -33,5 +34,20 @@ class PythonExtractor : ExtractorInterface {
         }
 
         return imports.toList()
+
+    }
+
+    /**
+     * Tokenizes a Python source line after blanking comment, docstring and
+     * import lines.
+     *
+     * A line is removed entirely when it contains `#`, or when it starts
+     * (after optional whitespace) with three double quotes, `import` or
+     * `from`. The remaining text is passed to the default tokenizer.
+     */
+    override fun tokenize(line: String): List<String> {
+        val docOrImportPattern =
+            Regex("""^([^\n]*#|\s*\"\"\"|\s*import|\s*from)[^\n]*""")
+        val hashCommentPattern = Regex("""^(.*#).*""")
+        val cleaned = line.replace(docOrImportPattern, "")
+            .replace(hashCommentPattern, "")
+        return super.tokenize(cleaned)
+    }
+
+    /**
+     * Delegates to the four-argument overload, supplying this extractor's
+     * companion [evaluator] and [LANGUAGE_NAME].
+     */
+    override fun getLineLibraries(line: String,
+                                  fileLibraries: List<String>): List<String> {
+        val modelEvaluator = evaluator
+        return super.getLineLibraries(
+            line, fileLibraries, modelEvaluator, LANGUAGE_NAME)
     }
 }