Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ dependencies {
compile group: 'org.eclipse.jgit', name: 'org.eclipse.jgit',
version: '4.8.0.201706111038-r'
compile "org.slf4j:slf4j-nop:1.7.2"
compile 'org.jpmml:pmml-evaluator:1.3.9'

testCompile 'org.jetbrains.kotlin:kotlin-test'
testCompile 'org.jetbrains.spek:spek-api:1.1.4'
Expand Down
17 changes: 16 additions & 1 deletion src/main/kotlin/app/extractors/CSharpExtractor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@ import app.model.DiffFile

class CSharpExtractor : ExtractorInterface {
companion object {
val LANGUAGE_NAME = "cs"
val LANGUAGE_NAME = "csharp"
val FILE_EXTS = listOf("cs")
val LIBRARIES = ExtractorInterface.getLibraries("cs")
val evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME)
}

override fun extract(files: List<DiffFile>): List<CommitStats> {
Expand All @@ -37,4 +38,18 @@ class CSharpExtractor : ExtractorInterface {

return imports.toList()
}

/**
 * Tokenizes a C# line after removing `using` clauses and `//` comments.
 *
 * The `using` pattern removes everything from the start of the line through
 * the name that follows `using`; the rest of the line is kept. The comment
 * pattern is anchored at the line start, so a line containing `//` is
 * blanked in full, including any code in front of the comment.
 */
override fun tokenize(line: String): List<String> {
    // Applied in order: the `using` clause first, then comment stripping.
    val strippers = listOf(
        Regex("""^.*using\s+(\w+[.\w+]*)"""),
        Regex("""^([^\n]*//)[^\n]*"""))
    val cleaned = strippers.fold(line) { text, pattern ->
        pattern.replace(text, "")
    }
    return super.tokenize(cleaned)
}

/**
 * Resolves the libraries used on [line] by delegating to the shared
 * model-based lookup with this extractor's evaluator and language label.
 */
override fun getLineLibraries(line: String,
                              fileLibraries: List<String>): List<String> =
    super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
}
15 changes: 15 additions & 0 deletions src/main/kotlin/app/extractors/CppExtractor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class CppExtractor : ExtractorInterface {
// Constants and the shared model evaluator for the C++ extractor.
companion object {
// Language label; also used as the model name when building the evaluator.
val LANGUAGE_NAME = "cpp"
// NOTE(review): presumably the file extensions routed to this extractor — confirm against the caller.
val FILE_EXTS = listOf("cc", "cpp", "cxx", "c++")
// Evaluator obtained from ExtractorInterface for LANGUAGE_NAME; initialized with the companion.
val evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME)
}

override fun extract(files: List<DiffFile>): List<CommitStats> {
Expand All @@ -32,4 +33,18 @@ class CppExtractor : ExtractorInterface {

return imports.toList()
}

/**
 * Tokenizes a C++ line after blanking `#include` directives and `//` comments.
 *
 * Both patterns are anchored at the line start and consume the rest of the
 * line, so a matching line is reduced to an empty string before tokenizing.
 */
override fun tokenize(line: String): List<String> {
    val withoutInclude =
        Regex("""^([^\n]*#include)\s[^\n]*""").replace(line, "")
    val withoutComment =
        Regex("""^([^\n]*//)[^\n]*""").replace(withoutInclude, "")
    return super.tokenize(withoutComment)
}

/**
 * Resolves the libraries used on [line] by delegating to the shared
 * model-based lookup with this extractor's evaluator and language label.
 */
override fun getLineLibraries(line: String,
                              fileLibraries: List<String>): List<String> =
    super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
}
3 changes: 2 additions & 1 deletion src/main/kotlin/app/extractors/Extractor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ import app.model.DiffFile
class Extractor : ExtractorInterface {
companion object {
val TYPE_LANGUAGE = 1
val TYPE_KEYWORD = 2
val TYPE_LIBRARY = 2
val TYPE_KEYWORD = 3
val SEPARATOR = ">"
}

Expand Down
114 changes: 113 additions & 1 deletion src/main/kotlin/app/extractors/ExtractorInterface.kt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ package app.extractors

import app.model.DiffFile
import app.model.CommitStats
import org.dmg.pmml.FieldName
import org.jpmml.evaluator.Evaluator
import org.jpmml.evaluator.FieldValue
import org.jpmml.evaluator.ModelEvaluatorFactory
import org.jpmml.evaluator.ProbabilityDistribution
import org.jpmml.model.PMMLUtil

interface ExtractorInterface {
companion object {
Expand All @@ -14,6 +20,14 @@ interface ExtractorInterface {
.getResourceAsStream("data/libraries/${name}_libraries.txt")
.bufferedReader().readLines().toSet()
}
/**
 * Builds an [Evaluator] from the PMML model bundled on the classpath at
 * `data/models/<name>.pmml`.
 *
 * @param name model name, used as the resource file name without extension.
 * @return evaluator created by [ModelEvaluatorFactory] for the parsed model.
 * @throws IllegalArgumentException if the model resource is not on the
 * classpath.
 */
fun getLibrariesModelEvaluator(name: String): Evaluator {
    val resourcePath = "data/models/$name.pmml"
    // getResourceAsStream returns null for a missing resource; fail with a
    // message that names the path instead of an opaque error further down.
    val modelStream = requireNotNull(
        ExtractorInterface::class.java.classLoader
            .getResourceAsStream(resourcePath)) {
        "PMML model resource not found: $resourcePath"
    }
    // `use` closes the stream even when unmarshalling throws.
    val pmml = modelStream.use { PMMLUtil.unmarshal(it) }
    return ModelEvaluatorFactory.newInstance().newModelEvaluator(pmml)
}
}

fun extract(files: List<DiffFile>): List<CommitStats> {
Expand All @@ -23,6 +37,57 @@ interface ExtractorInterface {
file
}

val oldLibraryToCount = mutableMapOf<String, Int>()
val newLibraryToCount = mutableMapOf<String, Int>()
val oldFilesImports = files.fold(mutableSetOf<String>()) { acc, file ->
acc.addAll(file.old.imports)
acc
}
val newFilesImports = files.fold(mutableSetOf<String>()) { acc, file ->
acc.addAll(file.new.imports)
acc
}

oldFilesImports.forEach { oldLibraryToCount[it] = 0}
newFilesImports.forEach { newLibraryToCount[it] = 0}


files.filter { file -> file.language.isNotBlank() }
.forEach { file ->
val oldFileLibraries = mutableListOf<String>()
file.old.content.forEach {
val lineLibs = getLineLibraries(it, file.old.imports)
oldFileLibraries.addAll(lineLibs)
}
file.old.imports.forEach { import ->
val numLines = oldFileLibraries.count { it == import }
oldLibraryToCount[import] =
oldLibraryToCount[import] as Int + numLines
}

val newFileLibraries = mutableListOf<String>()
file.new.content.forEach {
val lineLibs = getLineLibraries(it, file.new.imports)
newFileLibraries.addAll(lineLibs)
}
file.new.imports.forEach { import ->
val numLines = newFileLibraries.count { it == import }
newLibraryToCount[import] =
newLibraryToCount[import] as Int + numLines
}
}

val allImports = mutableSetOf<String>()
allImports.addAll(oldFilesImports + newFilesImports)

// One CommitStats entry per imported library. Usage counted in the new
// revision of the files is reported as added lines, and usage counted in the
// old revision as deleted lines.
val libraryStats = allImports.map {
    CommitStats(
        numLinesAdded = newLibraryToCount.getOrDefault(it, 0),
        numLinesDeleted = oldLibraryToCount.getOrDefault(it, 0),
        type = Extractor.TYPE_LIBRARY,
        tech = it)
}

return files.filter { file -> file.language.isNotBlank() }
.groupBy { file -> file.language }
.map { (language, files) -> CommitStats(
Expand All @@ -31,12 +96,59 @@ interface ExtractorInterface {
numLinesDeleted = files.fold(0) { total, file ->
total + file.getAllDeleted().size },
type = Extractor.TYPE_LANGUAGE,
tech = language)}
tech = language)} + libraryStats
}

/**
 * Default implementation: reports no imports for the given file content.
 */
fun extractImports(fileContent: List<String>): List<String> = emptyList()

/**
 * Splits a source line into tokens.
 *
 * Quoted string literals are removed first, then the line is split on
 * whitespace and punctuation. Blank pieces, pieces that still contain a
 * quote character, and the bare tokens `-` and `@` are discarded.
 *
 * @param line source line to tokenize.
 * @return the remaining tokens in their original order.
 */
fun tokenize(line: String): List<String> {
    // Drop quoted literals so their contents never become tokens.
    val withoutStrings = Regex("""(".+?"|'.+?')""").replace(line, "")
    //TODO(lyaronskaya): multiline comment regex
    val delimiters =
        Regex("""\s|,|;|\*|\n|\(|\)|\[|]|\{|}|\+|=|&|\$|!=|\.|>|<|#|@|:|\?|!""")
    return delimiters.split(withoutStrings).filterNot { piece ->
        piece.isBlank() || piece.contains('"') || piece.contains('\'') ||
            piece == "-" || piece == "@"
    }
}

/**
 * Default implementation: attributes no libraries to the line.
 */
fun getLineLibraries(line: String, fileLibraries: List<String>): List<String> =
    emptyList()

/**
 * Predicts which of the file's libraries are used on a single source line.
 *
 * The tokenized line is passed to every input field of [evaluator], and the
 * target field's probability distribution is read from the result. If the
 * most probable category is [languageLabel], the line is attributed to no
 * library. Otherwise every category whose probability is at least one tenth
 * of the maximum is selected, and the entries of [fileLibraries] among them
 * are returned.
 *
 * @param line source line to classify.
 * @param fileLibraries libraries imported by the file containing the line.
 * @param evaluator PMML evaluator whose first target field yields a
 * [ProbabilityDistribution].
 * @param languageLabel category that stands for plain language code.
 * @return the libraries from [fileLibraries] attributed to the line, in the
 * order they appear in [fileLibraries].
 */
fun getLineLibraries(line: String,
                     fileLibraries: List<String>,
                     evaluator: Evaluator,
                     languageLabel: String): List<String> {
    // Tokenization does not depend on the input field, so do it once.
    val tokenizedLine = tokenize(line).joinToString(separator = " ")

    val arguments = LinkedHashMap<FieldName, FieldValue>()
    for (inputField in evaluator.inputFields) {
        arguments.put(inputField.name, inputField.prepare(tokenizedLine))
    }
    val result = evaluator.evaluate(arguments)

    val targetFieldName = evaluator.targetFields[0].name
    val targetFieldValue = result[targetFieldName] as ProbabilityDistribution

    val categoryValues = targetFieldValue.categoryValues.toList()
    // Without categories there is no maximum to compare against.
    if (categoryValues.isEmpty()) {
        return emptyList()
    }
    val probabilities = categoryValues.map { targetFieldValue.getProbability(it) }

    // Index of the first category with the highest probability.
    var maxIndex = 0
    for (index in 1 until probabilities.size) {
        if (probabilities[index] > probabilities[maxIndex]) {
            maxIndex = index
        }
    }

    if (categoryValues[maxIndex] == languageLabel) {
        return emptyList()
    }

    // Keep every category within a factor of ten of the best one.
    val threshold = 0.1 * probabilities[maxIndex]
    val selectedCategories = categoryValues
        .filterIndexed { index, _ -> probabilities[index] >= threshold }
        .toSet()

    return fileLibraries.filter { it in selectedCategories }
}
}
15 changes: 15 additions & 0 deletions src/main/kotlin/app/extractors/GoExtractor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class GoExtractor : ExtractorInterface {
// Constants and the shared model evaluator for the Go extractor.
companion object {
// Language label; also used as the model name when building the evaluator.
val LANGUAGE_NAME = "go"
// NOTE(review): presumably the file extensions routed to this extractor — confirm against the caller.
val FILE_EXTS = listOf("go")
// Evaluator obtained from ExtractorInterface for LANGUAGE_NAME; initialized with the companion.
val evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME)
}

override fun extract(files: List<DiffFile>): List<CommitStats> {
Expand Down Expand Up @@ -41,4 +42,18 @@ class GoExtractor : ExtractorInterface {

return imports.toList()
}

/**
 * Tokenizes a Go line after blanking `import` lines and `//` comments.
 *
 * Both patterns are anchored at the line start and consume the rest of the
 * line, so a line containing `import` followed by whitespace, or containing
 * `//`, is reduced to an empty string before tokenizing.
 */
override fun tokenize(line: String): List<String> {
    val cleaned = Regex("""^(.*import)\s[^\n]*""").replace(line, "")
        .let { Regex("""^([^\n]*//)[^\n]*""").replace(it, "") }
    return super.tokenize(cleaned)
}

/**
 * Resolves the libraries used on [line] by delegating to the shared
 * model-based lookup with this extractor's evaluator and language label.
 */
override fun getLineLibraries(line: String,
                              fileLibraries: List<String>): List<String> =
    super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
}
15 changes: 15 additions & 0 deletions src/main/kotlin/app/extractors/JavaExtractor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class JavaExtractor : ExtractorInterface {
"extends", "int", "short", "try", "char", "final", "interface",
"static", "void", "class", "finally", "long", "strictfp",
"volatile", "const", "float", "native", "super", "while")
val evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME)
}

override fun extract(files: List<DiffFile>): List<CommitStats> {
Expand Down Expand Up @@ -72,4 +73,18 @@ class JavaExtractor : ExtractorInterface {

return imports.toList()
}

/**
 * Tokenizes a Java line after blanking `import` lines and `//` comments.
 *
 * Both patterns are anchored at the line start and consume the rest of the
 * line, so a line containing `import` followed by whitespace, or containing
 * `//`, is reduced to an empty string before tokenizing.
 */
override fun tokenize(line: String): List<String> {
    // Applied in order: import statements first, then comment stripping.
    val strippers = listOf(
        Regex("""^(.*import)\s[^\n]*"""),
        Regex("""^([^\n]*//)[^\n]*"""))
    val cleaned = strippers.fold(line) { text, pattern ->
        pattern.replace(text, "")
    }
    return super.tokenize(cleaned)
}

/**
 * Resolves the libraries used on [line] by delegating to the shared
 * model-based lookup with this extractor's evaluator and language label.
 */
override fun getLineLibraries(line: String,
                              fileLibraries: List<String>): List<String> =
    super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
}
9 changes: 8 additions & 1 deletion src/main/kotlin/app/extractors/JavascriptExtractor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@ import app.model.DiffFile

class JavascriptExtractor : ExtractorInterface {
companion object {
val LANGUAGE_NAME = "js"
val LANGUAGE_NAME = "javascript"
val FILE_EXTS = listOf("js")
val LIBRARIES = ExtractorInterface.getLibraries("js")
val evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME)
}

override fun extract(files: List<DiffFile>): List<CommitStats> {
Expand All @@ -30,4 +31,10 @@ class JavascriptExtractor : ExtractorInterface {

return imports.toList()
}

/**
 * Resolves the libraries used on [line] by delegating to the shared
 * model-based lookup with this extractor's evaluator and language label.
 */
override fun getLineLibraries(line: String,
                              fileLibraries: List<String>): List<String> =
    super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
}
15 changes: 15 additions & 0 deletions src/main/kotlin/app/extractors/ObjectiveCExtractor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class ObjectiveCExtractor : ExtractorInterface {
// Constants and the shared model evaluator for the Objective-C extractor.
companion object {
// Language label; also used as the model name when building the evaluator.
val LANGUAGE_NAME = "objectivec"
// NOTE(review): presumably the file extensions routed to this extractor — confirm against the caller.
val FILE_EXTS = listOf("m", "mm")
// Evaluator obtained from ExtractorInterface for LANGUAGE_NAME; initialized with the companion.
val evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME)
}

override fun extract(files: List<DiffFile>): List<CommitStats> {
Expand All @@ -36,4 +37,18 @@ class ObjectiveCExtractor : ExtractorInterface {

return imports.toList()
}

/**
 * Tokenizes an Objective-C line after blanking `#import`/`#include`/
 * `@import`/`@include` directives and `//` comments.
 *
 * Both patterns are anchored at the line start and consume the rest of the
 * line, so a matching line is reduced to an empty string before tokenizing.
 */
override fun tokenize(line: String): List<String> {
    val withoutDirective =
        Regex("""^([^\n]*[#@](import|include))\s[^\n]*""").replace(line, "")
    val withoutComment =
        Regex("""^([^\n]*//)[^\n]*""").replace(withoutDirective, "")
    return super.tokenize(withoutComment)
}

/**
 * Resolves the libraries used on [line] by delegating to the shared
 * model-based lookup with this extractor's evaluator and language label.
 */
override fun getLineLibraries(line: String,
                              fileLibraries: List<String>): List<String> =
    super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
}
15 changes: 15 additions & 0 deletions src/main/kotlin/app/extractors/PhpExtractor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class PhpExtractor : ExtractorInterface {
// Constants and the shared model evaluator for the PHP extractor.
companion object {
// Language label; also used as the model name when building the evaluator.
val LANGUAGE_NAME = "php"
// NOTE(review): presumably the file extensions routed to this extractor — confirm against the caller.
val FILE_EXTS = listOf("php", "phtml", "php4", "php3", "php5", "phps")
// Evaluator obtained from ExtractorInterface for LANGUAGE_NAME; initialized with the companion.
val evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME)
}

override fun extract(files: List<DiffFile>): List<CommitStats> {
Expand All @@ -34,4 +35,18 @@ class PhpExtractor : ExtractorInterface {

return imports.toList()
}

/**
 * Tokenizes a PHP line after blanking import-like statements and `//`
 * comments.
 *
 * A line is blanked in full when it contains `require`, `require_once`,
 * `include` or `include_once` as a standalone keyword followed by
 * whitespace, or when it starts (after optional indentation) with `use`
 * followed by whitespace. `use` is only matched at the start of the line so
 * closure `use (...)` clauses are left alone. A line containing `//` is
 * also blanked in full.
 */
override fun tokenize(line: String): List<String> {
    // The keyword alternatives are grouped explicitly so the leading `.*`
    // applies to each of them; otherwise indented `require_once`/`include`
    // statements slip through. The lookbehind keeps identifiers and
    // variables such as `$include` from matching.
    val importRegex = Regex(
        """^(.*(?<![\w$])(require|include)(_once)?|\s*use)\s[^\n]*""")
    val commentRegex = Regex("""^([^\n]*//)[^\n]*""")
    val withoutImports = importRegex.replace(line, "")
    val withoutComments = commentRegex.replace(withoutImports, "")
    return super.tokenize(withoutComments)
}

/**
 * Resolves the libraries used on [line] by delegating to the shared
 * model-based lookup with this extractor's evaluator and language label.
 */
override fun getLineLibraries(line: String,
                              fileLibraries: List<String>): List<String> =
    super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
}
16 changes: 16 additions & 0 deletions src/main/kotlin/app/extractors/PythonExtractor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class PythonExtractor : ExtractorInterface {
// Constants and the shared model evaluator for the Python extractor.
companion object {
// Language label; also used as the model name when building the evaluator.
val LANGUAGE_NAME = "python"
// NOTE(review): presumably the file extensions routed to this extractor — confirm against the caller.
val FILE_EXTS = listOf("py", "py3")
// Evaluator obtained from ExtractorInterface for LANGUAGE_NAME; initialized with the companion.
val evaluator = ExtractorInterface.getLibrariesModelEvaluator(LANGUAGE_NAME)
}

override fun extract(files: List<DiffFile>): List<CommitStats> {
Expand All @@ -33,5 +34,20 @@ class PythonExtractor : ExtractorInterface {
}

return imports.toList()

}

/**
 * Tokenizes a Python line after blanking comments, docstring delimiters and
 * import statements.
 *
 * A line is blanked in full when it contains `#`, or when it starts (after
 * optional indentation) with `"""`, `import` or `from`. The keywords must
 * be whole words, so identifiers such as `from_date` or `imported` are
 * still tokenized.
 */
override fun tokenize(line: String): List<String> {
    // The `[^\n]*#` alternative already blanks any line containing `#`, so
    // no separate comment pass is needed afterwards.
    val docImportRegex =
        Regex("""^([^\n]*#|\s*\"\"\"|\s*import\b|\s*from\b)[^\n]*""")
    val cleaned = docImportRegex.replace(line, "")
    return super.tokenize(cleaned)
}

/**
 * Resolves the libraries used on [line] by delegating to the shared
 * model-based lookup with this extractor's evaluator and language label.
 */
override fun getLineLibraries(line: String,
                              fileLibraries: List<String>): List<String> =
    super.getLineLibraries(line, fileLibraries, evaluator, LANGUAGE_NAME)
}
Loading