In [1]:
%use kandy
%use dataframe

import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.*
import org.jetbrains.kotlinx.dataframe.io.readJson
import kotlinx.serialization.json.*
import java.nio.file.Path
import java.nio.file.Paths
import kotlin.io.path.*

In [2]:
/**
 * Loads every `pgp_eval_*.json` file found one level below [experimentsPath] into a single data frame.
 *
 * Each immediate sub-directory of [experimentsPath] is scanned for files matching `pgp_eval_*.json`.
 * Every file whose top-level JSON value is an object or an array is read with `DataFrame.readJson`
 * and tagged with two extra columns: `source_file` (the file name) and `source_folder` (the name of
 * the containing sub-directory). Problems with a single file or folder are reported on stdout and
 * that file/folder is skipped.
 *
 * Folders and files are visited in sorted order so the row order of the result is reproducible.
 *
 * @param experimentsPath directory that contains one sub-directory per experiment run
 * @return the concatenation of all loaded frames, or an empty frame when nothing could be loaded
 */
fun createExperimentDataFrame(experimentsPath: String): DataFrame<*> {
    val providerPath = Paths.get(experimentsPath)

    if (!providerPath.exists()) {
        println("Provider path does not exist: $providerPath")
        return DataFrame.empty()
    }
    // A regular file would otherwise make listDirectoryEntries() throw.
    if (!providerPath.isDirectory()) {
        println("Provider path is not a directory: $providerPath")
        return DataFrame.empty()
    }

    val dataframes = mutableListOf<DataFrame<*>>()

    for (folder in providerPath.listDirectoryEntries().sorted()) {
        if (!folder.isDirectory()) {
            continue
        }

        // Find all files matching the pattern pgp_eval_*.json
        val jsonFiles = try {
            folder.listDirectoryEntries("pgp_eval_*.json").sorted()
        } catch (e: Exception) {
            println("Error reading files in $folder: ${e.message}")
            continue
        }

        if (jsonFiles.isEmpty()) {
            println("No pgp_eval_*.json files found in $folder")
            continue
        }

        for (jsonPath in jsonFiles) {
            try {
                // Parse once up front only to reject top-level primitives / null.
                val jsonElement = Json.parseToJsonElement(jsonPath.readText())
                if (jsonElement !is JsonObject && jsonElement !is JsonArray) {
                    println("Unsupported JSON format in ${jsonPath.fileName}")
                    continue
                }

                // Objects and arrays are both handled by the same reader.
                val enrichedDf = DataFrame.readJson(jsonPath.toString())
                    .add("source_file") { jsonPath.fileName.toString() }
                    .add("source_folder") { folder.fileName.toString() }

                dataframes.add(enrichedDf)
            } catch (e: Exception) {
                println("Error processing file $jsonPath: ${e.message}")
            }
        }
    }

    // Combine all dataframes in a single pass.
    return if (dataframes.isNotEmpty()) dataframes.concat() else DataFrame.empty()
}

/**
 * Deletes each listed folder together with everything inside it, logging every deletion to stdout.
 *
 * Folders that do not exist or are not directories are reported and skipped. A failure while
 * deleting one folder is reported and does not stop the processing of the remaining folders.
 *
 * @param folderPaths folder paths, interpreted relative to [workdir]
 * @param workdir base directory the folder paths are resolved against; a trailing separator is
 *   optional. The default (empty string) leaves [folderPaths] as they are.
 */
@OptIn(ExperimentalPathApi::class)
fun removeAllFilesFromFolders(folderPaths: List<String>, workdir: String = "") {
    for (folderPath in folderPaths) {
        // Paths.get joins the segments with the platform separator, so "dir" and "dir/" behave the same.
        val folder = Paths.get(workdir, folderPath)

        if (!folder.exists()) {
            println("Folder does not exist: $folder")
            continue
        }

        if (!folder.isDirectory()) {
            println("Path is not a directory: $folder")
            continue
        }

        try {
            for (entry in folder.listDirectoryEntries()) {
                when {
                    entry.isRegularFile() -> {
                        entry.deleteExisting()
                        println("Deleted file: $entry")
                    }
                    entry.isDirectory() -> {
                        entry.deleteRecursively()
                        println("Deleted subdirectory: $entry")
                    }
                    else -> {
                        // Broken symlinks and special files: remove them too, otherwise the
                        // folder itself cannot be deleted below.
                        entry.deleteExisting()
                        println("Deleted entry: $entry")
                    }
                }
            }
            folder.deleteExisting()
            println("Deleted folder: $folder")
        } catch (e: Exception) {
            println("Error processing folder $folder: ${e.message}")
        }
    }
}

/**
 * Stratified sampling for model file paths like
 * 'anthropic/claude-sonnet-4/claude-sonnet-4-5-prompt_with_hints_and_remarks-0.1/pgp_eval_beautiful_volhard.json'.
 *
 * Paths are grouped by provider/model name only (the first two `/`-separated segments; a path
 * without a `/` forms its own group). Groups are visited in order of first appearance in [data].
 * From each group with at least [samplesPerModel] entries, [samplesPerModel] entries are drawn at
 * random; smaller groups are taken whole, in their original order. The size of every group is
 * printed to stdout.
 *
 * @param data List of model file paths
 * @param samplesPerModel Number of samples to take from each model type; must not be negative
 * @param random Source of randomness; pass a seeded instance to make the sample reproducible
 * @return List of sampled file paths with at most [samplesPerModel] entries per model
 * @throws IllegalArgumentException if [samplesPerModel] is negative
 */
fun stratifiedSampleModels(
    data: List<String>,
    samplesPerModel: Int = 10,
    random: kotlin.random.Random = kotlin.random.Random.Default,
): List<String> {
    require(samplesPerModel >= 0) { "samplesPerModel must not be negative, was $samplesPerModel" }

    // Group by provider/model name only (ignore everything after the second slash).
    val grouped = data.groupBy { item -> item.split("/").take(2).joinToString("/") }

    // Sample equally from each group.
    return grouped.flatMap { (modelType, items) ->
        println("Model type '$modelType': ${items.size} items")
        if (items.size >= samplesPerModel) {
            items.shuffled(random).take(samplesPerModel)
        } else {
            items
        }
    }
}

/**
 * Builds a faceted bar chart of one column against another.
 *
 * @param df data to plot
 * @param xCol name of the column mapped to the x axis; it also drives the bar fill colour
 * @param yCol name of the column mapped to the y axis
 * @param facet column used to split the chart into horizontal facets
 * @return the plot produced by `df.plot { ... }`
 */
fun plotSingle(
    df: DataFrame<*>,
    xCol: String,
    yCol: String,
    facet: ColumnReference<*>
) = df.plot {
    bars {
        layout {
            // Fixed 600x400 canvas; plotAll uses the same numbers as grid offsets.
            size = 600 to 400
            style {
                // The fill colour repeats the x axis, so the legend is hidden.
                legend.position = LegendPosition.None
            }
        }
        x(xCol) {
            axis.name = xCol
        }
        y(yCol) {
            axis.name = yCol
            // Start the y axis at zero.
            axis.min = 0.0
        }
        fillColor(xCol)
    }
    facetGridX(facet)
}

/**
 * Arranges one [plotSingle] bar chart per metric on a grid that is at most two charts wide.
 *
 * Every chart plots the metric against the "Model" column and is placed at an offset of
 * 600 px per grid column and 400 px per grid row.
 *
 * @param df data to plot
 * @param metrics names of the columns to chart, laid out row by row in the given order
 * @param dfFacet column forwarded to [plotSingle] as the facet column
 * @return the bunch of plots produced by `plotBunch { ... }`
 */
fun plotAll(df: DataFrame<*>, metrics: List<String>, dfFacet: ColumnReference<*>) =
    plotBunch {
        val gridColumns = minOf(2, metrics.size)
        val gridRows = minOf(999, ceil(metrics.size.toDouble() / gridColumns).toInt())

        for ((position, metricName) in metrics.take(gridRows * gridColumns).withIndex()) {
            val chart = plotSingle(df, "Model", metricName, dfFacet)
            add(chart, (position % gridColumns) * 600, (position / gridColumns) * 400)
        }
    }

In [3]:
// Output directory for the CSV summaries and the whitelist. It is created up front so that the
// later File("$data/...").writeText(...) calls do not fail when the directory does not exist yet.
val data = "data".also { java.io.File(it).mkdirs() }

In [4]:
// One frame per provider directory under ../metrics ...
val anthropic = createExperimentDataFrame("../metrics/anthropic")
val deepseek = createExperimentDataFrame("../metrics/deepseek")
val openai = createExperimentDataFrame("../metrics/openai")
val google = createExperimentDataFrame("../metrics/google")

// ... stacked in this order into the frame every later cell starts from.
val allModels = listOf(anthropic, deepseek, openai, google).concat()

Number of generation attempts that failed, likely due to network errors.

In [5]:
// Source folders of the rows whose generation config has no temperature.
val invalidFolders = allModels.filter { generationConfig.temperature == null }.source_folder.toList()
// Destructive clean-up, left disabled on purpose: it would delete those folders on disk.
// removeAllFilesFromFolders(invalidFolders, workdir = "../metrics/deepseek/")
invalidFolders.count()

2

In [6]:
// Discard rows with missing values (NOTE(review): dropNA() without arguments is assumed to check every column — confirm).
val all = allModels.dropNA()

In [7]:
// Top-level columns of the cleaned frame.
all.columnNames()

[masId, agentId, pgpId, parsedPlans, amountGeneratedPlans, averageAmountBeliefs, averageAmountOperations, amountGeneralPlan, amountInventedGoals, amountInventedBeliefs, amountUselessPlans, amountNotParseablePlans, amountInadequateUsageGoals, amountInadequateUsageBeliefs, amountInadequateUsageActions, timeUntilCompletion, executable, achievesGoal, generationData, generationConfig, source_file, source_folder]

In [8]:
// Fields nested under the generationConfig column group.
all.generationConfig.columnNames()

[type, modelId, temperature, maxTokens, lmServerUrl, contextFiltersNames, systemPromptBuilderName, userPromptBuilderName, remarks, requestTimeout, connectTimeout, socketTimeout]

In [9]:
// Fields nested under the generationData column group.
all.generationData.columnNames()

[id, total_cost, latency, generation_time, tokens_prompt, tokens_completion, native_tokens_reasoning, native_tokens_prompt, native_tokens_completion]

Max, min, average, etc. of the input and output token counts (normalized using the GPT-4 tokenizer).

In [10]:
// Summary statistics of the prompt (input) token counts.
all.generationData.tokens_prompt.describe()

name,type,count,unique,nulls,top,freq,mean,std,min,median,max
tokens_prompt,Int,372,14,0,808,119,994.045699,139.017386,808,1051,1196


In [11]:
// Runs with fewer than 100 completion tokens: which model produced them and whether the goal was achieved.
all.filter {
    generationData.tokens_completion < 100
}.select {
    generationConfig.modelId and achievesGoal
}

modelId,achievesGoal
google/gemini-2.5-flash,False


In [12]:
// Summary statistics of the completion (output) token counts.
all.generationData.tokens_completion.describe()

name,type,count,unique,nulls,top,freq,mean,std,min,median,max
tokens_completion,Int,372,169,0,130,19,222.543011,76.31606,77,211,553


Max, min, average, etc. latency when making requests to OpenRouter.

In [13]:
// Summary statistics of the request latency (unit not stated in this notebook).
all.generationData.latency.describe()

name,type,count,unique,nulls,top,freq,mean,std,min,median,max
latency,Int,372,333,0,353,4,1471.16129,945.899883,288,1320,6120


Price in USD to run the experiments.

In [14]:
// Total of the per-request total_cost values.
all.generationData.total_cost.sum()

1.0688588000000008

In [15]:
// Number of rows (PGP invocations) per (masId, agentId) pair.
val allGrouped = all
    .groupBy("masId", "agentId").aggregate {
        count() into "pgpCount"
    }

Number of instances in which the PGP was invoked more than once.

In [16]:
// How many (masId, agentId) pairs have more than one row.
allGrouped.filter { pgpCount > 1 }.count()

8

Max, min, average, etc. of the number of PGP invocations in the instances where the PGP was invoked more than once.

In [17]:
// Distribution of the row count among the pairs that have more than one row.
allGrouped.filter { pgpCount > 1 }.select { pgpCount }.describe()

name,type,count,unique,nulls,top,freq,mean,std,min,median,max
pgpCount,Int,8,3,0,2,6,3.0,2.44949,2,2,9


In [18]:
// Working frame for the rest of the analysis: rows of `all` whose (masId, agentId) pair is NOT
// among the pairs with more than one PGP invocation (presumably excludeJoin drops every matching row — confirm).
val df = all.excludeJoin(allGrouped.filter { pgpCount > 1 }) { masId and agentId }

Why is the time until completion (GAT) so high?

In [30]:
// Binned distribution of timeUntilCompletion over the runs that achieved the goal (all models).
df.filter {
    achievesGoal == true //&& timeUntilCompletion <= 55
}.plot {
    statBin(timeUntilCompletion) {
        bars {
            // Bin position on x, number of runs in the bin on y.
            x(Stat.x)
            y(Stat.count)
        }
    }
}

In [36]:
// Same distribution as above, excluding every model whose id contains "deepseek".
df.filter {
    achievesGoal == true && !generationConfig.modelId.contains("deepseek")//&& timeUntilCompletion <= 55
}.plot {
    statBin(timeUntilCompletion) {
        bars {
            // Bin position on x, number of runs in the bin on y.
            x(Stat.x)
            y(Stat.count)
        }
    }
}

In [37]:
// Successful runs with timeUntilCompletion of at most 55 (all models).
df.filter {
    achievesGoal == true && timeUntilCompletion <= 55
}.count()

76

In [35]:
// Successful runs with timeUntilCompletion of at most 55, excluding "deepseek" models.
df.filter {
    achievesGoal == true && !generationConfig.modelId.contains("deepseek") && timeUntilCompletion <= 55
}.count()

38

In [39]:
// Successful runs with timeUntilCompletion above 55, excluding "deepseek" models.
df.filter {
    achievesGoal == true && !generationConfig.modelId.contains("deepseek") && timeUntilCompletion > 55
}.count()

29

How many useless plans were generated?

In [57]:
// Total number of useless plans across all rows of df.
df.amountUselessPlans.sum()

20

In [58]:
// Shortens the three known user-prompt builder names by dropping their "UserMessage" prefix;
// any other value is returned unchanged.
val mapper: (String) -> String = { builderName ->
    val knownBuilders = setOf(
        "UserMessageWithHintsAndRemarks",
        "UserMessageWithHints",
        "UserMessageNoHints",
    )
    if (builderName in knownBuilders) builderName.removePrefix("UserMessage") else builderName
}

// Maps a model id to the short label used in tables and plots. Ids without a dedicated label
// fall back to the text after their last '/' (or the whole id if it contains no '/').
val modelMapper: (String) -> String = { value ->
    when (value) {
        "deepseek/deepseek-chat-v3-0324:free" -> "Deepseek"
        "anthropic/claude-sonnet-4" -> "Claude"
        "google/gemini-2.5-flash" -> "Gemini"
        "openai/gpt-4.1" -> "GPT 4.1"
        else -> value.substringAfterLast("/")
    }
}

In [59]:
// Analysis frame: metric columns renamed to their short acronyms, Temperature / Model / PromptType
// lifted out of generationConfig, and every column not used in the statistics removed.
val renamedDf = df
    .add("Temperature") { generationConfig.temperature }
    .rename { amountGeneratedPlans }.into("PC") // Plan Count
    .rename { averageAmountBeliefs }.into("CC") // Context Complexity
    .rename { averageAmountOperations }.into("PBC") // Plan Body Complexity
    .rename { amountGeneralPlan }.into("GC") // Generalization Count
    .rename { amountUselessPlans }.into("RR") // Redundancy Count
    .rename { amountInventedGoals }.into("NGC") // Novel Goal Count
    .rename { amountInventedBeliefs }.into("NBC") // Novel Belief Count
    .rename { amountInadequateUsageGoals }.into("GSA") // Goal Semantic Alignment
    .rename { amountInadequateUsageBeliefs }.into("BSA") // Belief Semantic Alignment
    .rename { achievesGoal }.into("TSR") // Task Success Rate
    .rename { timeUntilCompletion }.into("GAT") // Goal Achievement Time
    .add("Model") { modelMapper(generationConfig.modelId) }
    .add("PromptType") { mapper(generationConfig.userPromptBuilderName) }
    // Model and PromptType must be derived before generationConfig is removed here.
    .remove { generationConfig and generationData and source_file and source_folder }
    .remove { executable and masId and agentId and pgpId }
    .remove { parsedPlans and amountNotParseablePlans and amountInadequateUsageActions }

In [60]:
// Columns before renaming, for comparison with renamedDf.
df.columnNames()

[masId, agentId, pgpId, parsedPlans, amountGeneratedPlans, averageAmountBeliefs, averageAmountOperations, amountGeneralPlan, amountInventedGoals, amountInventedBeliefs, amountUselessPlans, amountNotParseablePlans, amountInadequateUsageGoals, amountInadequateUsageBeliefs, amountInadequateUsageActions, timeUntilCompletion, executable, achievesGoal, generationData, generationConfig, source_file, source_folder]

In [61]:
// Columns after renaming and pruning.
renamedDf.columnNames()

[PC, CC, PBC, GC, NGC, NBC, RR, GSA, BSA, GAT, TSR, Temperature, Model, PromptType]

In [62]:
// Mean GAT per model over the successful runs only, lowest mean first.
val gat = renamedDf.filter { TSR == true }.groupBy { Model }
gat.aggregate { mean { GAT } into "GAT" }.sortBy { GAT }

Model,GAT
Deepseek,37.710526
Claude,247.52381
Gemini,506.333333
GPT 4.1,1033.162791


In [63]:
// Per-model means of every metric, computed over the successful runs only and sorted by mean GAT.
// No TSR column is produced: after filtering on TSR == true it would be 100 % for every model.
val overallStats = renamedDf
//    .filter { PromptType != "WithoutHints" }
//    .filter { PromptType == "WithHintsAndRemarks" && Temperature == 0.1 }
    .filter { TSR == true }
    .groupBy { Model }
    .aggregate {
        mean { PC } into "PC"
        mean { CC } into "CC"
        mean { PBC } into "PBC"
        mean { GC } into "GC"
        mean { RR } into "RR"
        mean { NGC } into "NGC"
        mean { NBC } into "NBC"
        mean { GSA } into "GSA"
        mean { BSA } into "BSA"
        mean { GAT } into "GAT"
    }
    .sortBy { GAT }

// Locale.ROOT keeps '.' as the decimal separator whatever the machine's default locale is.
File("$data/overall.csv").writeText(
    overallStats.convert { colsOf<Double>() }.with { "%.2f".format(java.util.Locale.ROOT, it) }.toCsv()
)

In [64]:
// Per (Model, Temperature) means of the plan metrics plus the task success rate (TSR) in percent.
val tempStats = renamedDf.groupBy {
    Model and Temperature
}.aggregate {
    count() into "count"
    // Successful runs in the group; anything other than `true` counts as a failure.
    count { it["TSR"] == true } into "sum"
    mean { PC } into "PC"
    mean { CC } into "CC"
    mean { PBC } into "PBC"
    mean { GC } into "GC"
    mean { RR } into "RR"
    mean { NGC } into "NGC"
    mean { NBC } into "NBC"
    mean { GSA } into "GSA"
    mean { BSA } into "BSA"
}.add("TSR") {
    (it["sum"] as Int).toDouble() / (it["count"] as Int) * 100
}
    // The helper columns are only needed to derive TSR.
    .remove("sum")
    .remove("count")

// Locale.ROOT keeps '.' as the decimal separator whatever the machine's default locale is.
File("$data/temperatureStats.csv").writeText(
    tempStats.convert { colsOf<Double>() }.with { "%.2f".format(java.util.Locale.ROOT, it) }.toCsv()
)
tempStats.sortByDesc("Model", "Temperature")

Model,Temperature,PC,CC,PBC,GC,RR,NGC,NBC,GSA,BSA,TSR
Gemini,0.9,3.615385,1.44359,1.498718,3.615385,0.038462,0.538462,0.461538,0.461538,1.961538,0.0
Gemini,0.5,4.206897,1.458621,1.47069,4.137931,0.206897,0.517241,0.413793,0.241379,1.758621,10.344828
Gemini,0.1,4.1,1.348889,1.201111,4.066667,0.4,0.666667,1.333333,0.333333,3.0,0.0
GPT 4.1,0.9,4.3,2.319365,1.663889,4.233333,0.033333,1.433333,1.0,0.166667,1.533333,36.666667
GPT 4.1,0.5,4.166667,2.145317,1.605079,4.133333,0.0,1.533333,0.5,0.133333,1.1,43.333333
GPT 4.1,0.1,3.766667,2.246667,1.702222,3.766667,0.0,1.5,0.5,0.2,0.933333,63.333333
Deepseek,0.9,3.758621,1.888013,1.855911,0.37931,0.0,0.689655,1.103448,0.482759,2.517241,24.137931
Deepseek,0.5,3.464286,1.596429,2.067262,0.285714,0.0,0.571429,1.25,0.392857,3.142857,46.428571
Deepseek,0.1,3.517241,1.600575,2.146552,0.137931,0.0,0.344828,1.137931,0.103448,3.482759,62.068966
Claude,0.9,4.466667,1.67,1.523968,1.866667,0.0,1.3,0.533333,0.233333,1.966667,26.666667


In [65]:
// Replace the metric acronyms with their full names so they can serve as axis titles.
val plotReadyTempStats = tempStats
    .rename { PC }.into("Plan Count")
    .rename { CC }.into("Context Complexity")
    .rename { PBC }.into("Plan Body Complexity")
    .rename { GC }.into("Generalization Count")
    .rename { RR }.into("Redundancy Count")
    .rename { NGC }.into("Novel Goal Count")
    .rename { NBC }.into("Novel Belief Count")
    .rename { GSA }.into("Goal Semantic Alignment")
    .rename { BSA }.into("Belief Semantic Alignment")
    .rename { TSR }.into("Task Success Rate")

// Every column except the two grouping keys is a metric to plot.
val metrics = plotReadyTempStats.columnNames()
    .minus("Model")
    .minus("Temperature")

// First figure: all metrics except the last two, faceted by temperature.
val res = plotAll(
    plotReadyTempStats,
    metrics.dropLast(2),
    plotReadyTempStats.Temperature
)

// Second figure: the last two metrics, faceted by temperature.
val res2 = plotAll(
    plotReadyTempStats,
    metrics.takeLast(2),
    plotReadyTempStats.Temperature
)
res.save("temperature-stats.svg")
res2.save("temperature-stats2.svg")

/home/rbattistini/Progetti/master-thesis-rbattistini/jakta-playground/notebooks/lets-plot-images/temperature-stats2.svg

In [66]:
// Per (Model, PromptType) means of the plan metrics plus the task success rate (TSR) in percent.
val promptStats = renamedDf.groupBy {
    Model and PromptType
}.aggregate {
    count() into "count"
    // Successful runs in the group; anything other than `true` counts as a failure.
    count { it["TSR"] == true } into "sum"
    mean { PC } into "PC"
    mean { CC } into "CC"
    mean { PBC } into "PBC"
    mean { GC } into "GC"
    mean { RR } into "RR"
    mean { NGC } into "NGC"
    mean { NBC } into "NBC"
    mean { GSA } into "GSA"
    mean { BSA } into "BSA"
}.add("TSR") {
    (it["sum"] as Int).toDouble() / (it["count"] as Int) * 100
}.remove("sum").remove("count") // helper columns are only needed to derive TSR

// Locale.ROOT keeps '.' as the decimal separator whatever the machine's default locale is.
File("$data/promptStats.csv").writeText(
    promptStats.convert { colsOf<Double>() }.with { "%.2f".format(java.util.Locale.ROOT, it) }.toCsv()
)
promptStats.sortByDesc("Model", "PromptType")

Model,PromptType,PC,CC,PBC,GC,RR,NGC,NBC,GSA,BSA,TSR
Gemini,WithHintsAndRemarks,4.035714,1.127976,1.127381,3.964286,0.0,0.857143,0.178571,0.178571,1.75,10.714286
Gemini,WithHints,4.821429,1.478571,1.530357,4.785714,0.642857,0.714286,1.785714,0.535714,2.857143,0.0
Gemini,NoHints,3.137931,1.631609,1.490805,3.137931,0.034483,0.172414,0.310345,0.310345,2.172414,0.0
GPT 4.1,WithHintsAndRemarks,4.466667,1.995873,1.359524,4.366667,0.0,1.566667,0.066667,0.0,0.566667,80.0
GPT 4.1,WithHints,4.233333,2.06127,1.705317,4.233333,0.033333,1.7,0.9,0.133333,1.133333,63.333333
GPT 4.1,NoHints,3.533333,2.654206,1.906349,3.533333,0.0,1.2,1.033333,0.366667,1.866667,0.0
Deepseek,WithHintsAndRemarks,3.566667,1.399524,2.079603,0.3,0.0,0.366667,1.033333,0.466667,3.466667,80.0
Deepseek,WithHints,3.535714,1.85119,1.973214,0.0,0.0,0.25,1.214286,0.25,3.357143,50.0
Deepseek,NoHints,3.642857,1.858929,2.01131,0.5,0.0,1.0,1.25,0.25,2.285714,0.0
Claude,WithHintsAndRemarks,3.413793,1.396552,1.614943,0.034483,0.0,0.034483,0.0,0.37931,1.724138,65.517241


In [67]:
// Replace the metric acronyms with their full names so they can serve as axis titles.
val plotReadyPromptStats = promptStats
    .rename { PC }.into("Plan Count")
    .rename { CC }.into("Context Complexity")
    .rename { PBC }.into("Plan Body Complexity")
    .rename { GC }.into("Generalization Count")
    .rename { RR }.into("Redundancy Count")
    .rename { NGC }.into("Novel Goal Count")
    .rename { NBC }.into("Novel Belief Count")
    .rename { GSA }.into("Goal Semantic Alignment")
    .rename { BSA }.into("Belief Semantic Alignment")
    .rename { TSR }.into("Task Success Rate")

// Every column except the two grouping keys is a metric to plot.
val metrics = plotReadyPromptStats.columnNames()
    .minus("Model")
    .minus("PromptType")

// First figure: all metrics except the last two, faceted by prompt type.
val res = plotAll(
    plotReadyPromptStats,
    metrics.dropLast(2),
    plotReadyPromptStats.PromptType
)

// Second figure: the last two metrics, faceted by prompt type.
val res2 = plotAll(
    plotReadyPromptStats,
    metrics.takeLast(2),
    plotReadyPromptStats.PromptType
)
res.save("prompt-stats.svg")
res2.save("prompt-stats2.svg")

/home/rbattistini/Progetti/master-thesis-rbattistini/jakta-playground/notebooks/lets-plot-images/prompt-stats2.svg

In [68]:
// Task success rate (TSR, in percent) for every (Model, PromptType, Temperature) combination,
// best combinations first.
val comboStats = renamedDf.groupBy {
    Model and PromptType and Temperature
}.aggregate {
    count() into "count"
    // Successful runs in the group; anything other than `true` counts as a failure.
    count { it["TSR"] == true } into "sum"
}.add("TSR") {
    (it["sum"] as Int).toDouble() / (it["count"] as Int) * 100
}
    .select("Model", "PromptType", "Temperature", "TSR")
    .sortByDesc("TSR", "Model")

// Locale.ROOT keeps '.' as the decimal separator whatever the machine's default locale is.
File("$data/promptTemperatureStats.csv")
    .writeText(comboStats.convert { colsOf<Double>() }.with { "%.2f".format(java.util.Locale.ROOT, it) }.toCsv())
comboStats

Model,PromptType,Temperature,TSR
GPT 4.1,WithHintsAndRemarks,0.1,100.0
Deepseek,WithHintsAndRemarks,0.1,100.0
Deepseek,WithHintsAndRemarks,0.5,100.0
GPT 4.1,WithHints,0.1,90.0
Deepseek,WithHints,0.1,88.888889
Claude,WithHintsAndRemarks,0.5,77.777778
GPT 4.1,WithHintsAndRemarks,0.9,70.0
GPT 4.1,WithHintsAndRemarks,0.5,70.0
Claude,WithHintsAndRemarks,0.9,70.0
GPT 4.1,WithHints,0.5,60.0


In [69]:
// "<modelId>/<source_folder>" for every run that achieved its goal.
val successfullPgps = df.filter { achievesGoal == true }
    .select { generationConfig.modelId and source_folder }
    .map { (it["modelId"] as String) + "/" + (it["source_folder"] as String) }

// Up to five runs per model; each entry is then reduced to "<provider>/<source_folder>",
// dropping the redundant model name in between.
val sampledModels = stratifiedSampleModels(successfullPgps, samplesPerModel = 5)
    .map { "${it.substringBefore("/")}/${it.substringAfterLast("/")}" }

println("\nTotal sampled: ${sampledModels.size}")
println("Sampled models:")
sampledModels
    .sorted()
    .forEach { model -> println("  - $model") }

val whitelistFile = java.io.File("$data/whitelist.txt")
whitelistFile.writeText(sampledModels.joinToString("\n"))
// Report the real location: the file lives under the data directory, not next to the notebook.
println("\nWhitelist written to ${whitelistFile.path}")

Model type 'anthropic/claude-sonnet-4': 21 items
Model type 'deepseek/deepseek-chat-v3-0324:free': 38 items
Model type 'openai/gpt-4.1': 43 items
Model type 'google/gemini-2.5-flash': 3 items

Total sampled: 18
Sampled models:
  - anthropic/claude-sonnet-4-2-prompt_with_hints_and_remarks-0.5
  - anthropic/claude-sonnet-4-3-prompt_with_hints_and_remarks-0.5
  - anthropic/claude-sonnet-4-5-prompt_with_hints_and_remarks-0.5
  - anthropic/claude-sonnet-4-6-prompt_with_hints_and_remarks-0.9
  - anthropic/claude-sonnet-4-9-prompt_with_hints_and_remarks-0.9
  - deepseek/deepseek-chat-v3-0324:free-0-prompt_with_hints_and_remarks-0.1
  - deepseek/deepseek-chat-v3-0324:free-1-prompt_with_hints_and_remarks-0.5
  - deepseek/deepseek-chat-v3-0324:free-2-prompt_with_hints_and_remarks-0.1
  - deepseek/deepseek-chat-v3-0324:free-6-prompt_with_hints_and_remarks-0.9
  - deepseek/deepseek-chat-v3-0324:free-9-prompt_with_hints_and_remarks-0.5
  - google/gemini-2.5-flash-1-prompt_with_hints_and_remarks-0.5