diff --git a/build.gradle b/build.gradle index 865c0621..2c8d80b6 100644 --- a/build.gradle +++ b/build.gradle @@ -64,6 +64,7 @@ dependencies { compile group: 'org.eclipse.jgit', name: 'org.eclipse.jgit', version: '4.8.0.201706111038-r' compile "org.slf4j:slf4j-nop:1.7.2" + compile 'org.jpmml:pmml-evaluator:1.3.9' testCompile 'org.jetbrains.kotlin:kotlin-test' testCompile 'org.jetbrains.spek:spek-api:1.1.4' diff --git a/src/main/kotlin/app/extractors/CSharpExtractor.kt b/src/main/kotlin/app/extractors/CSharpExtractor.kt index 95395eba..81ebd847 100644 --- a/src/main/kotlin/app/extractors/CSharpExtractor.kt +++ b/src/main/kotlin/app/extractors/CSharpExtractor.kt @@ -9,9 +9,10 @@ import app.model.DiffFile class CSharpExtractor : ExtractorInterface { companion object { - val LANGUAGE_NAME = "cs" + val LANGUAGE_NAME = "csharp" val FILE_EXTS = listOf("cs") val LIBRARIES = ExtractorInterface.getLibraries("cs") + val evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME) } override fun extract(files: List): List { @@ -37,4 +38,18 @@ class CSharpExtractor : ExtractorInterface { return imports.toList() } + + override fun tokenize(line: String): List { + val importRegex = Regex("""^.*using\s+(\w+[.\w+]*)""") + val commentRegex = Regex("""^([^\n]*//)[^\n]*""") + var newLine = importRegex.replace(line, "") + newLine = commentRegex.replace(newLine, "") + return super.tokenize(newLine) + } + + override fun getLineLibraries(line: String, + fileLibraries: List): List { + + return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME) + } } diff --git a/src/main/kotlin/app/extractors/CppExtractor.kt b/src/main/kotlin/app/extractors/CppExtractor.kt index 29828cbd..c50f1332 100644 --- a/src/main/kotlin/app/extractors/CppExtractor.kt +++ b/src/main/kotlin/app/extractors/CppExtractor.kt @@ -11,6 +11,7 @@ class CppExtractor : ExtractorInterface { companion object { val LANGUAGE_NAME = "cpp" val FILE_EXTS = listOf("cc", "cpp", "cxx", "c++") + val 
evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME) } override fun extract(files: List): List { @@ -32,4 +33,18 @@ class CppExtractor : ExtractorInterface { return imports.toList() } + + override fun tokenize(line: String): List { + val importRegex = Regex("""^([^\n]*#include)\s[^\n]*""") + val commentRegex = Regex("""^([^\n]*//)[^\n]*""") + var newLine = importRegex.replace(line, "") + newLine = commentRegex.replace(newLine, "") + return super.tokenize(newLine) + } + + override fun getLineLibraries(line: String, + fileLibraries: List): List { + + return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME) + } } diff --git a/src/main/kotlin/app/extractors/Extractor.kt b/src/main/kotlin/app/extractors/Extractor.kt index d90b537f..88d16060 100644 --- a/src/main/kotlin/app/extractors/Extractor.kt +++ b/src/main/kotlin/app/extractors/Extractor.kt @@ -10,7 +10,8 @@ import app.model.DiffFile class Extractor : ExtractorInterface { companion object { val TYPE_LANGUAGE = 1 - val TYPE_KEYWORD = 2 + val TYPE_LIBRARY = 2 + val TYPE_KEYWORD = 3 val SEPARATOR = ">" } diff --git a/src/main/kotlin/app/extractors/ExtractorInterface.kt b/src/main/kotlin/app/extractors/ExtractorInterface.kt index 165c923d..1ff4d2c9 100644 --- a/src/main/kotlin/app/extractors/ExtractorInterface.kt +++ b/src/main/kotlin/app/extractors/ExtractorInterface.kt @@ -6,6 +6,12 @@ package app.extractors import app.model.DiffFile import app.model.CommitStats +import org.dmg.pmml.FieldName +import org.jpmml.evaluator.Evaluator +import org.jpmml.evaluator.FieldValue +import org.jpmml.evaluator.ModelEvaluatorFactory +import org.jpmml.evaluator.ProbabilityDistribution +import org.jpmml.model.PMMLUtil interface ExtractorInterface { companion object { @@ -14,6 +20,14 @@ interface ExtractorInterface { .getResourceAsStream("data/libraries/${name}_libraries.txt") .bufferedReader().readLines().toSet() } + fun getLibrariesModelEvaluator(name: String): Evaluator { + val pmml = 
PMMLUtil.unmarshal( + ExtractorInterface::class.java.classLoader + .getResourceAsStream("data/models/$name.pmml")) + val evaluator = ModelEvaluatorFactory.newInstance() + .newModelEvaluator(pmml) + return evaluator + } } fun extract(files: List): List { @@ -23,6 +37,57 @@ interface ExtractorInterface { file } + val oldLibraryToCount = mutableMapOf() /* import -> number of old-revision lines attributed to it */ + val newLibraryToCount = mutableMapOf() /* import -> number of new-revision lines attributed to it */ + val oldFilesImports = files.fold(mutableSetOf()) { acc, file -> + acc.addAll(file.old.imports) + acc + } + val newFilesImports = files.fold(mutableSetOf()) { acc, file -> + acc.addAll(file.new.imports) + acc + } + + oldFilesImports.forEach { oldLibraryToCount[it] = 0} + newFilesImports.forEach { newLibraryToCount[it] = 0} + + + files.filter { file -> file.language.isNotBlank() } + .forEach { file -> + val oldFileLibraries = mutableListOf() + file.old.content.forEach { + val lineLibs = getLineLibraries(it, file.old.imports) + oldFileLibraries.addAll(lineLibs) + } + file.old.imports.forEach { import -> + val numLines = oldFileLibraries.count { it == import } + oldLibraryToCount[import] = + oldLibraryToCount[import] as Int + numLines + } + + val newFileLibraries = mutableListOf() + file.new.content.forEach { + val lineLibs = getLineLibraries(it, file.new.imports) + newFileLibraries.addAll(lineLibs) + } + file.new.imports.forEach { import -> + val numLines = newFileLibraries.count { it == import } + newLibraryToCount[import] = + newLibraryToCount[import] as Int + numLines + } + } + + val allImports = mutableSetOf() + allImports.addAll(oldFilesImports + newFilesImports) + + val libraryStats = allImports.map { + CommitStats( + numLinesAdded = newLibraryToCount.getOrDefault(it, 0), /* added = usages counted in the new revision */ + numLinesDeleted = oldLibraryToCount.getOrDefault(it, 0), /* deleted = usages counted in the old revision */ + type = Extractor.TYPE_LIBRARY, + tech = it) + } + return files.filter { file -> file.language.isNotBlank() } .groupBy { file -> file.language } .map { (language, files) -> CommitStats( @@ -31,12 +96,59 @@ interface ExtractorInterface { 
numLinesDeleted = files.fold(0) { total, file -> total + file.getAllDeleted().size }, type = Extractor.TYPE_LANGUAGE, - tech = language)} + tech = language)} + libraryStats } fun extractImports(fileContent: List): List { return listOf() } + fun tokenize(line: String): List { + val stringRegex = Regex("""(".+?"|'.+?')""") + val newLine = stringRegex.replace(line, "") + //TODO(lyaronskaya): multiline comment regex + val splitRegex = + Regex("""\s|,|;|\*|\n|\(|\)|\[|]|\{|}|\+|=|&|\$|!=|\.|>|<|#|@|:|\?|!""") + val tokens = splitRegex.split(newLine) + .filter { it.isNotBlank() && !it.contains('"') && !it.contains('\'') + && it != "-" && it != "@"} + return tokens + } + + fun getLineLibraries(line: String, fileLibraries: List): List { + return listOf() + } + + fun getLineLibraries(line: String, + fileLibraries: List, + evaluator: Evaluator, + languageLabel: String): List { + val arguments = LinkedHashMap() + val tokenizedLine = tokenize(line).joinToString(separator = " ") /* same text for every input field, so tokenize once */ + + for (inputField in evaluator.inputFields) { + val inputFieldName = inputField.name + val inputFieldValue = inputField.prepare(tokenizedLine) + arguments.put(inputFieldName, inputFieldValue) + } + val result = evaluator.evaluate(arguments) + + val targetFieldName = evaluator.targetFields[0].name + val targetFieldValue = result[targetFieldName] as ProbabilityDistribution + + val categoryValues = targetFieldValue.categoryValues.toList() + val probabilities = categoryValues.map { targetFieldValue.getProbability(it) } + val maxProbability = probabilities.max() ?: return emptyList() /* no categories reported: nothing to attribute */ + val maxProbabilityCategory = categoryValues[probabilities.indexOf(maxProbability)] + val selectedCategories = categoryValues.filter { + targetFieldValue.getProbability(it) >= 0.1 * maxProbability + } + + if (maxProbabilityCategory == languageLabel) { + return emptyList() + } + val lineLibraries = fileLibraries.filter { it in selectedCategories } + return lineLibraries + } } diff --git a/src/main/kotlin/app/extractors/GoExtractor.kt 
b/src/main/kotlin/app/extractors/GoExtractor.kt index bd2862a4..5b9b91de 100644 --- a/src/main/kotlin/app/extractors/GoExtractor.kt +++ b/src/main/kotlin/app/extractors/GoExtractor.kt @@ -11,6 +11,7 @@ class GoExtractor : ExtractorInterface { companion object { val LANGUAGE_NAME = "go" val FILE_EXTS = listOf("go") + val evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME) } override fun extract(files: List): List { @@ -41,4 +42,18 @@ class GoExtractor : ExtractorInterface { return imports.toList() } + + override fun tokenize(line: String): List { + val importRegex = Regex("""^(.*import)\s[^\n]*""") + val commentRegex = Regex("""^([^\n]*//)[^\n]*""") + var newLine = importRegex.replace(line, "") + newLine = commentRegex.replace(newLine, "") + return super.tokenize(newLine) + } + + override fun getLineLibraries(line: String, + fileLibraries: List): List { + + return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME) + } } diff --git a/src/main/kotlin/app/extractors/JavaExtractor.kt b/src/main/kotlin/app/extractors/JavaExtractor.kt index f7a57e58..34e53fd0 100644 --- a/src/main/kotlin/app/extractors/JavaExtractor.kt +++ b/src/main/kotlin/app/extractors/JavaExtractor.kt @@ -20,6 +20,7 @@ class JavaExtractor : ExtractorInterface { "extends", "int", "short", "try", "char", "final", "interface", "static", "void", "class", "finally", "long", "strictfp", "volatile", "const", "float", "native", "super", "while") + val evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME) } override fun extract(files: List): List { @@ -72,4 +73,18 @@ class JavaExtractor : ExtractorInterface { return imports.toList() } + + override fun tokenize(line: String): List { + val importRegex = Regex("""^(.*import)\s[^\n]*""") + val commentRegex = Regex("""^([^\n]*//)[^\n]*""") + var newLine = importRegex.replace(line, "") + newLine = commentRegex.replace(newLine, "") + return super.tokenize(newLine) + } + + override fun getLineLibraries(line: 
String, + fileLibraries: List): List { + + return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME) + } } diff --git a/src/main/kotlin/app/extractors/JavascriptExtractor.kt b/src/main/kotlin/app/extractors/JavascriptExtractor.kt index a5d3a2df..db7fd17b 100644 --- a/src/main/kotlin/app/extractors/JavascriptExtractor.kt +++ b/src/main/kotlin/app/extractors/JavascriptExtractor.kt @@ -9,9 +9,10 @@ import app.model.DiffFile class JavascriptExtractor : ExtractorInterface { companion object { - val LANGUAGE_NAME = "js" + val LANGUAGE_NAME = "javascript" val FILE_EXTS = listOf("js") val LIBRARIES = ExtractorInterface.getLibraries("js") + val evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME) } override fun extract(files: List): List { @@ -30,4 +31,10 @@ class JavascriptExtractor : ExtractorInterface { return imports.toList() } + + override fun getLineLibraries(line: String, + fileLibraries: List): List { + + return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME) + } } diff --git a/src/main/kotlin/app/extractors/ObjectiveCExtractor.kt b/src/main/kotlin/app/extractors/ObjectiveCExtractor.kt index 4eda9282..48f1b331 100644 --- a/src/main/kotlin/app/extractors/ObjectiveCExtractor.kt +++ b/src/main/kotlin/app/extractors/ObjectiveCExtractor.kt @@ -11,6 +11,7 @@ class ObjectiveCExtractor : ExtractorInterface { companion object { val LANGUAGE_NAME = "objectivec" val FILE_EXTS = listOf("m", "mm") + val evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME) } override fun extract(files: List): List { @@ -36,4 +37,18 @@ class ObjectiveCExtractor : ExtractorInterface { return imports.toList() } + + override fun tokenize(line: String): List { + val importRegex = Regex("""^([^\n]*[#@](import|include))\s[^\n]*""") + val commentRegex = Regex("""^([^\n]*//)[^\n]*""") + var newLine = importRegex.replace(line, "") + newLine = commentRegex.replace(newLine, "") + return super.tokenize(newLine) + } + + override 
fun getLineLibraries(line: String, + fileLibraries: List): List { + + return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME) + } } diff --git a/src/main/kotlin/app/extractors/PhpExtractor.kt b/src/main/kotlin/app/extractors/PhpExtractor.kt index 66832489..914bc5cc 100644 --- a/src/main/kotlin/app/extractors/PhpExtractor.kt +++ b/src/main/kotlin/app/extractors/PhpExtractor.kt @@ -11,6 +11,7 @@ class PhpExtractor : ExtractorInterface { companion object { val LANGUAGE_NAME = "php" val FILE_EXTS = listOf("php", "phtml", "php4", "php3", "php5", "phps") + val evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME) } override fun extract(files: List): List { @@ -34,4 +35,18 @@ class PhpExtractor : ExtractorInterface { return imports.toList() } + + override fun tokenize(line: String): List { + val importRegex = Regex("""^\s*(.*require|require_once|include|include_once|use)\s[^\n]*""") /* leading \s* lets indented require_once/include/use lines match too */ + val commentRegex = Regex("""^([^\n]*//)[^\n]*""") + var newLine = importRegex.replace(line, "") + newLine = commentRegex.replace(newLine, "") + return super.tokenize(newLine) + } + + override fun getLineLibraries(line: String, + fileLibraries: List): List { + + return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME) + } } diff --git a/src/main/kotlin/app/extractors/PythonExtractor.kt b/src/main/kotlin/app/extractors/PythonExtractor.kt index 139cc9f2..d5afe71e 100644 --- a/src/main/kotlin/app/extractors/PythonExtractor.kt +++ b/src/main/kotlin/app/extractors/PythonExtractor.kt @@ -11,6 +11,7 @@ class PythonExtractor : ExtractorInterface { companion object { val LANGUAGE_NAME = "python" val FILE_EXTS = listOf("py", "py3") + val evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME) } override fun extract(files: List): List { @@ -33,5 +34,20 @@ class PythonExtractor : ExtractorInterface { } return imports.toList() + + } + + override fun tokenize(line: String): List { + val docImportRegex = 
Regex("""^([^\n]*#|\s*\"\"\"|\s*import|\s*from)[^\n]*""") + val commentRegex = Regex("""^(.*#).*""") + var newLine = docImportRegex.replace(line, "") + newLine = commentRegex.replace(newLine, "") + return super.tokenize(newLine) + } + + override fun getLineLibraries(line: String, + fileLibraries: List): List { + + return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME) } } diff --git a/src/main/kotlin/app/extractors/RubyExtractor.kt b/src/main/kotlin/app/extractors/RubyExtractor.kt index 95ec6a81..e2953306 100644 --- a/src/main/kotlin/app/extractors/RubyExtractor.kt +++ b/src/main/kotlin/app/extractors/RubyExtractor.kt @@ -11,6 +11,7 @@ class RubyExtractor : ExtractorInterface { companion object { val LANGUAGE_NAME = "ruby" val FILE_EXTS = listOf("rb", "rbw") + val evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME) } override fun extract(files: List): List { @@ -32,4 +33,18 @@ class RubyExtractor : ExtractorInterface { return imports.toList() } + + override fun tokenize(line: String): List { + val importRegex = Regex("""(require\s+'(\w+)'|load\s+'(\w+)\.\w+')""") + val commentRegex = Regex("""^([^\n]*#)[^\n]*""") + var newLine = importRegex.replace(line, "") + newLine = commentRegex.replace(newLine, "") + return super.tokenize(newLine) + } + + override fun getLineLibraries(line: String, + fileLibraries: List): List { + + return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME) + } } diff --git a/src/main/kotlin/app/extractors/SwiftExtractor.kt b/src/main/kotlin/app/extractors/SwiftExtractor.kt index 742f2c29..55c68a3b 100644 --- a/src/main/kotlin/app/extractors/SwiftExtractor.kt +++ b/src/main/kotlin/app/extractors/SwiftExtractor.kt @@ -11,6 +11,7 @@ class SwiftExtractor : ExtractorInterface { companion object { val LANGUAGE_NAME = "swift" val FILE_EXTS = listOf("swift") + val evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME) } override fun extract(files: List): List { @@ -32,4 
+33,18 @@ class SwiftExtractor : ExtractorInterface { return imports.toList() } + + override fun tokenize(line: String): List { + val importRegex = Regex("""^(.*import)\s[^\n]*""") + val commentRegex = Regex("""^([^\n]*//)[^\n]*""") + var newLine = importRegex.replace(line, "") + newLine = commentRegex.replace(newLine, "") + return super.tokenize(newLine) + } + + override fun getLineLibraries(line: String, + fileLibraries: List): List { + + return super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME) + } } diff --git a/src/main/resources/data/models.zip b/src/main/resources/data/models.zip new file mode 100644 index 00000000..231846d1 Binary files /dev/null and b/src/main/resources/data/models.zip differ diff --git a/src/test/kotlin/test/tests/extractors/ExtractorTest.kt b/src/test/kotlin/test/tests/extractors/ExtractorTest.kt new file mode 100644 index 00000000..065310ca --- /dev/null +++ b/src/test/kotlin/test/tests/extractors/ExtractorTest.kt @@ -0,0 +1,146 @@ +// Copyright 2017 Sourcerer Inc. All Rights Reserved. 
+// Author: Liubov Yaronskaya (lyaronskaya@sourcerer.io) + +package test.tests.extractors + +import app.extractors.* +import org.jetbrains.spek.api.Spek +import org.jetbrains.spek.api.dsl.given +import org.jetbrains.spek.api.dsl.it +import kotlin.test.assertEquals + +fun assertExtractsLineLibraries(expectedLibrary: String, actualLine: String, + extractor: ExtractorInterface) { + val actualLineLibraries = + extractor.getLineLibraries(actualLine, listOf(expectedLibrary)) + kotlin.test.assertTrue(expectedLibrary in actualLineLibraries, "expected '$expectedLibrary' in $actualLineLibraries") /* kotlin.assert() is skipped unless the JVM runs with -ea */ +} + +fun assertExtractsNoLibraries(actualLine: String, + extractor: ExtractorInterface) { + val actualLineLibraries = + extractor.getLineLibraries(actualLine, listOf()) + assertEquals(listOf(), actualLineLibraries) +} + +class ExtractorTest : Spek({ + given(" code line contains library code" ) { + it("python extractor extracts the library") { + val line = "with tf.Session() as sess" + assertExtractsLineLibraries("tensorflow", + line, PythonExtractor()) + } + + it("java extractor extracts the library") { + val line = "private JdbcTemplate jdbcTemplate=new JdbcTemplate();" + assertExtractsLineLibraries("org.springframework", + line, JavaExtractor()) + } + + it("javascript extractor extracts the library") { + val line = "new Vue({" + assertExtractsLineLibraries("vue", + line, JavascriptExtractor()) + } + + it("ruby extractor extracts the library") { + val line1 = "img = Magick::Image.read_inline(Base64.encode64(image)).first" + assertExtractsLineLibraries("RMagick", + line1, RubyExtractor()) + val line2 = "fximages << {image: img.adaptive_threshold(3, 3, 0), name: \"Adaptive Threshold\"}" + assertExtractsLineLibraries("RMagick", + line2, RubyExtractor()) + } + + it("go extractor extracts the library") { + val line = "if DB, found = revel.Config.String(\"bloggo.db\"); !found {" + assertExtractsLineLibraries("revel", + line, GoExtractor()) + } + + it("objectiveC extractor extracts the library") { + val line = "[[NSFileManager defaultManager] 
removeItemAtURL:[RLMRealmConfiguration defaultConfiguration].fileURL error:nil];" + assertExtractsLineLibraries("Realm", + line, ObjectiveCExtractor()) + } + + it("swift extractor extracts the library") { + val line = "class City: RLMObject {" + assertExtractsLineLibraries("Realm", + line, SwiftExtractor()) + } + + it("cpp extractor extracts the library") { + val line1 = "leveldb::Options options;" + assertExtractsLineLibraries("leveldb", + line1, CppExtractor()) + val line2 = "leveldb::Status status = leveldb::DB::Open(options, \"./testdb\", &tmp);" + assertExtractsLineLibraries("leveldb", + line2, CppExtractor()) + } + + it("csharp extractor extracts the library") { + val line = "Algorithm = (h, v, i) => new ContrastiveDivergenceLearning(h, v)" + assertExtractsLineLibraries("Accord", + line, CSharpExtractor()) + } + + it("php extractor extracts the library") { + val line = "public function listRepos(string \$user, int \$limit): Call;" + assertExtractsLineLibraries("Tebru\\Retrofit", + line, PhpExtractor()) + } + } + + given("code line doesn't use libraries" ) { + it("python extractor returns empty list") { + val line = "from collections import Counter" + assertExtractsNoLibraries(line, PythonExtractor()) + } + + it("java extractor returns empty list") { + val line = "throw new RuntimeException(e);" + assertExtractsNoLibraries(line, JavaExtractor()) + } + + it("javascript extractor returns empty list") { + val line = "console.log(self.commits[0].html_url)" + assertExtractsNoLibraries(line, JavascriptExtractor()) + } + + it("ruby extractor returns empty list") { + val line = "require \"RMagick\"" + assertExtractsNoLibraries(line, RubyExtractor()) + } + + it("go extractor returns empty list") { + val line = "var found bool" + assertExtractsNoLibraries(line, GoExtractor()) + } + + it("objectivec extractor returns empty list") { + val line = "@end" + assertExtractsNoLibraries(line, ObjectiveCExtractor()) + } + + it("php extractor returns empty list") { + val line = "