In [None]:
%use serialization
%use koog
%use lets-plot

#### Разбиение на блоки

In [None]:
import kotlinx.serialization.Serializable
import kotlinx.serialization.Serializer

/**
 * One timestamped subtitle row.
 *
 * @property sourceId identifier of the source the row belongs to (the loader in this
 *   notebook passes the subtitle file name).
 * @property startTime integer timestamp at which the row starts (the loader parses it from
 *   the token before the first space of a raw row).
 * @property text the subtitle text of the row.
 */
@Serializable
data class Line(
    val sourceId: String,
    val startTime: Int,
    val text: String
)

/**
 * A run of consecutive subtitle rows glued into a single text fragment.
 *
 * @property sourceId identifier of the source the fragment is attributed to.
 * @property startSec start timestamp of the first row in the fragment.
 * @property endSec start timestamp of the last row in the fragment (row end times are not
 *   available, so a single-row fragment has `startSec == endSec`).
 * @property text the concatenated text of the rows.
 */
@Serializable
data class Block(
    val sourceId: String,
    val startSec: Int,
    val endSec: Int,
    val text: String
)

// TODO: blocks longer than 25 get split (translated from the original note)

/**
 * Groups consecutive subtitle lines into [Block]s.
 *
 * A new block is started when any of the following holds for the next line:
 *  - it belongs to a different source than the previous line;
 *  - it starts more than [maxGapSec] after the previous line;
 *  - it starts more than [maxBlockSec] after the first line of the current block.
 *
 * `endSec` of a block is the start time of its last line, because line end times are not known.
 *
 * @param lines subtitle lines in playback order; lines of one source are expected to be adjacent.
 * @param maxGapSec largest pause between two neighbouring lines that still keeps them together.
 * @param maxBlockSec largest distance between the first and the last line start of one block.
 * @return the blocks in input order; an empty list for empty input.
 */
fun buildBlocks(
    lines: List<Line>,
    maxGapSec: Int = 4,
    maxBlockSec: Int = 25,
): List<Block> {
    if (lines.isEmpty()) return emptyList()

    val blocks = mutableListOf<Block>()
    var currentStart = lines.first().startTime
    var lastSec = currentStart
    var lastSourceId = lines.first().sourceId
    val buffer = StringBuilder()

    for (line in lines) {
        // Timestamps restart with every source, so a source switch must always close the block:
        // the time-based checks alone would see a negative gap and keep appending.
        val sourceChanged = line.sourceId != lastSourceId
        val gapTooLarge = line.startTime - lastSec > maxGapSec
        val blockTooLong = line.startTime - currentStart > maxBlockSec

        if (sourceChanged || gapTooLarge || blockTooLong) {
            blocks += Block(
                // The finished block belongs to the lines already buffered, not to the incoming one.
                sourceId = lastSourceId,
                startSec = currentStart,
                endSec = lastSec,
                text = buffer.toString().trim(),
            )
            buffer.clear()
            currentStart = line.startTime
        }

        buffer.append(" ").append(line.text)
        lastSec = line.startTime
        lastSourceId = line.sourceId
    }

    // Flush whatever is left after the last line.
    if (buffer.isNotEmpty()) {
        blocks += Block(
            sourceId = lastSourceId,
            startSec = currentStart,
            endSec = lastSec,
            text = buffer.toString().trim(),
        )
    }

    return blocks
}

In [None]:
@file:Suppress("RECEIVER_NULLABILITY_MISMATCH_BASED_ON_JAVA_ANNOTATIONS")

import kotlinx.serialization.json.Json
import java.io.File


// Loads every subtitle row of the channel: one sub-folder per video, one ".txt" file per folder,
// one "<startTime> <text>" row per line of that file.
val lines: List<Line> = run {
    val root = File("notebooks/subtitles/UCSQGkViib9XLwjDfIahtLdw")

    // listFiles() returns null when the path is missing or is not a directory.
    val videoFolders = root.listFiles()
        ?: error("Subtitles directory not found or not readable: ${root.absolutePath}")

    videoFolders
        // Directory listing order is unspecified; sort so repeated runs yield the same sequence.
        .sortedBy { it.name }
        // Entries that are not directories or hold no ".txt" file are skipped.
        .mapNotNull { folder ->
            folder.listFiles()
                ?.sortedBy { it.name }
                ?.firstOrNull { it.name.contains(".txt") }
        }
        .flatMap { file ->
            file.readLines().mapNotNull { rawLine ->
                // Rows without a leading integer timestamp (blank lines, headers) are skipped.
                val startTime = rawLine.substringBefore(" ").toIntOrNull()
                    ?: return@mapNotNull null
                Line(
                    sourceId = file.name,
                    startTime = startTime,
                    text = rawLine.substringAfter(" "),
                )
            }
        }
}
lines.size

In [None]:

// Group the loaded subtitle lines into blocks and display them as pretty-printed JSON.
val result = buildBlocks(lines)
val json = Json { prettyPrint = true }
val encoded = json.encodeToString(result)
encoded



#### Фильтр мусора

In [None]:
// Cleans the text of every block: removes technical markers, collapses immediately repeated
// words and strips leading whitespace. Timing and source of the blocks are left untouched.
val filtered = run {
    // (?U) turns on Unicode character classes: without it \w and \b only recognise ASCII
    // letters on the JVM, so repeated Cyrillic words would never be collapsed.
    val repeatedWord = Regex("(?U)\\b(\\w+)(\\s+\\1\\b)+")
    val leadingWhitespace = Regex("^\\s+")

    result.map { b ->
        // "&gt;&gt;" marks a change of speaker; "[&nbsp;__&nbsp;]" is replaced with "[мат]".
        val withoutTechnical = b.text.replace("&gt;&gt;", "").replace("[&nbsp;__&nbsp;]", "[мат]")

        val withoutDuplicates = withoutTechnical.replace(repeatedWord, "$1")

        val withoutIndent = withoutDuplicates.replace(leadingWhitespace, "")

        b.copy(text = withoutIndent)
    }
}
val encodedFiltered = json.encodeToString(filtered)
encodedFiltered


#### Склейка "коротких" фрагментов с соседними блоками

In [None]:
// FIXME: some reactions may last less than 4 seconds. Their value is questionable,
// but they may still be content.
/**
 * Appends every "short" block to the block that precedes it.
 *
 * A block is short when `endSec - startSec` is below [minDurationSec]. A short block is merged
 * only into a preceding block of the same source; the merged block keeps the start of the
 * preceding one, takes the end of the short one and joins both texts with a space. A short block
 * with no suitable predecessor is kept unchanged.
 *
 * Every input block ends up in exactly one output block, whatever the size of the input.
 *
 * @param blocks blocks in playback order.
 * @param minDurationSec duration below which a block is considered short.
 * @return the merged blocks in input order.
 */
fun mergeShortBlocksToPrevious(blocks: List<Block>, minDurationSec: Int = 4): List<Block> {
    val merged = mutableListOf<Block>()

    for (block in blocks) {
        val previous = merged.lastOrNull()
        val isShort = block.endSec - block.startSec < minDurationSec

        if (isShort && previous != null && previous.sourceId == block.sourceId) {
            // Replace the previous block with its extended version instead of adding a new one.
            merged[merged.lastIndex] = previous.copy(
                text = previous.text + " " + block.text,
                endSec = block.endSec,
            )
        } else {
            merged += block
        }
    }

    return merged
}

In [None]:
// The merge is applied twice: the second pass runs on the output of the first one.
val mergeShortBlocksToPrevious1 = mergeShortBlocksToPrevious(filtered)
val mergeShortBlocksToPrevious2 = mergeShortBlocksToPrevious(mergeShortBlocksToPrevious1)

// FIXME: the last block ends up tripled

json.encodeToString(mergeShortBlocksToPrevious2)

#### Сохранение файлов на жёсткий диск

In [None]:
// Target folder for the block files; mkdirs() creates it if it is missing.
val outputDir = File("./blocks").apply { mkdirs() }

// One text file per block, named "<sourceId>_<startSec>-<endSec>.txt".
for (block in mergeShortBlocksToPrevious2) {
    val fileName = "${block.sourceId}_${block.startSec}-${block.endSec}.txt"
    File(outputDir, fileName).writeText(block.text)
}


In [None]:
import ai.koog.embeddings.local.LLMEmbedder
import ai.koog.embeddings.local.OllamaEmbeddingModels
import ai.koog.rag.base.mostRelevantDocuments
import ai.koog.rag.vector.EmbeddingBasedDocumentStorage
import ai.koog.rag.vector.InMemoryVectorStorage
import ai.koog.rag.vector.JVMTextDocumentEmbedder
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.MutableSharedFlow
import kotlinx.coroutines.runBlocking
import java.nio.file.Path
import kotlinx.coroutines.flow.*

// Create an embedder using Ollama
// NOTE(review): OllamaClient is not among the imports of this cell — presumably it is brought
// in by `%use koog`; confirm.
val embedder = LLMEmbedder(OllamaClient(), OllamaEmbeddingModels.NOMIC_EMBED_TEXT)

// Create a JVM-specific document embedder
val documentEmbedder = JVMTextDocumentEmbedder(embedder)

// Create a ranked document storage using in-memory vector storage
val rankedDocumentStorage = EmbeddingBasedDocumentStorage(documentEmbedder, InMemoryVectorStorage())

// Store every regular file found under outputDir (directories are filtered out) in the storage.
// Each store() call is wrapped in its own runBlocking and issued from a parallel stream.
// NOTE(review): whether the in-memory storage tolerates concurrent store() calls is not visible
// from this cell — confirm before relying on the parallel stream.
outputDir.walkTopDown()
    .toList()
    .parallelStream()
    .filter { it.isFile }
    .forEach { file ->
        runBlocking {
            rankedDocumentStorage.store(Path.of(file.path))
        }
    }

In [None]:
import java.net.URL
import kotlin.io.path.readText

// Looks up the stored block files most similar to the query and prints, for each of them,
// a YouTube link that opens the video at the start of the block.
val query = "нормально нормально нормально"
runBlocking {
    val relevantFiles = rankedDocumentStorage.mostRelevantDocuments(query, similarityThreshold = 0.85)
    if (!relevantFiles.iterator().hasNext()) println("No relevant files found for query '$query'")
    relevantFiles
        .sortedByDescending { file -> file.fileName }
        .forEach { file ->
            // Block files are named "<videoId>.ru.txt_<startSec>-<endSec>.txt".
            val name = file.fileName.toString()
            val parts = name.split(".ru.txt_")
            val sourceId = parts[0]
            val startTime = parts.getOrNull(1)?.substringBefore("-")
            if (startTime == null) {
                // A file without the ".ru.txt_" marker cannot be mapped to a video position;
                // report it instead of failing on a missing list element.
                println("Skipping '$name': file name does not contain '.ru.txt_'")
                return@forEach
            }
            println(
                "https://www.youtube.com/watch?v=${sourceId}&t=${startTime}s"
            )
        }
}


### Работа со звуковой дорожкой

#### Парсинг лога

In [None]:
/**
 * Audio level measurements attached to one timestamp.
 *
 * @property timeSec timestamp of the measurement, in whole seconds.
 * @property rms RMS level, or `null` when no value is available.
 * @property peak peak level, or `null` when no value is available.
 */
data class AudioFrame(val timeSec: Int, val rms: Double?, val peak: Double?)

In [None]:
import java.io.File
import kotlin.math.floor

/**
 * Reads a text log and turns it into a list of [AudioFrame]s.
 *
 * The log is scanned line by line:
 *  - a line containing `pts_time:` opens a new frame; the number after the marker is floored
 *    to whole seconds and becomes the frame time;
 *  - a line containing `lavfi.astats.Overall.RMS_level` sets the RMS level of the open frame
 *    to the number after `=`;
 *  - a line containing `lavfi.astats.Overall.Peak_level` sets its peak level the same way.
 *
 * Values that cannot be parsed as a number are stored as `null`. A frame whose time could not
 * be parsed is dropped.
 *
 * @param file the log file to read.
 * @return the frames in the order they appear in the log.
 */
fun parseRmsPeakLog(file: File): List<AudioFrame> {
    val parsed = mutableListOf<AudioFrame>()

    var pendingSecond: Int? = null
    var pendingRms: Double? = null
    var pendingPeak: Double? = null

    // Emits the frame collected so far, provided a timestamp was parsed for it.
    fun emitPending() {
        pendingSecond?.let { second ->
            parsed += AudioFrame(timeSec = second, rms = pendingRms, peak = pendingPeak)
        }
    }

    // Extracts the numeric value that follows "=" on a metadata line.
    fun levelOf(row: String): Double? = row.substringAfter("=").trim().toDoubleOrNull()

    file.useLines { rows ->
        for (row in rows) {
            if ("pts_time:" in row) {
                // A new timestamp closes the previous frame and resets the collected levels.
                emitPending()
                pendingSecond = row.substringAfter("pts_time:")
                    .trim()
                    .toDoubleOrNull()
                    ?.let { floor(it).toInt() }
                pendingRms = null
                pendingPeak = null
            } else if ("lavfi.astats.Overall.RMS_level" in row) {
                pendingRms = levelOf(row)
            } else if ("lavfi.astats.Overall.Peak_level" in row) {
                pendingPeak = levelOf(row)
            }
        }
    }

    // The last frame is not followed by another timestamp line, so emit it explicitly.
    emitPending()

    return parsed
}


In [None]:
// Parse the audio log and arrange the frames as columns for plotting.
val target = File("../rms+peak.log")
val parsedTarget = parseRmsPeakLog(target)
val plotData = mapOf(
    "time" to parsedTarget.map { frame -> frame.timeSec },
    "rms" to parsedTarget.map { frame -> frame.rms },
    "peak" to parsedTarget.map { frame -> frame.peak },
)

In [None]:
import org.jetbrains.letsPlot.*
import org.jetbrains.letsPlot.geom.*

// Line chart of the "rms" column of plotData against its "time" column.
letsPlot(plotData) +
        geomLine {
            x = "time"
            y = "rms"
        } +
        labs(
            title = "RMS level over time",
            x = "Time (sec)",
            y = "RMS (dB)"
        )

In [None]:
/**
 * Scores how loud the audio is within the time span of a block.
 *
 * Only frames whose `timeSec` lies in `block.startSec..block.endSec` are considered. The score is
 * `0.7 * (maxRms - baselineRms) + 0.3 * (maxPeak + 60)`, where `maxRms` and `maxPeak` are the
 * largest non-null values among those frames.
 *
 * @param block the block whose time span is scored.
 * @param frames audio measurements to pick the block's frames from.
 * @param baselineRms reference RMS level that is subtracted from the block's maximum RMS.
 * @return the score, or `0.0` when the span holds no frame with an RMS value or no frame with
 *   a peak value.
 */
fun audioEmotionScore(
    block: Block,
    frames: List<AudioFrame>,
    baselineRms: Double
): Double {
    val window = frames.filter { it.timeSec in block.startSec..block.endSec }

    // The maxima must come from the block's own frames: taking them over the whole recording
    // would give every block the same score.
    val maxRms = window.mapNotNull { it.rms }.maxOrNull() ?: return 0.0
    val maxPeak = window.mapNotNull { it.peak }.maxOrNull() ?: return 0.0

    val rmsBoost = maxRms - baselineRms
    // Normalization (dB) — presumably shifts the peak level by an assumed -60 dB floor; confirm.
    val peakBoost = maxPeak + 60

    return rmsBoost * 0.7 + peakBoost * 0.3
}

In [None]:
// Pair every filtered block with its audio score (score first, block second).
val audioEmotionFromBlocks = filtered.map { block ->
    audioEmotionScore(block = block, frames = parsedTarget, baselineRms = 10.0) to block
}

In [None]:
// Columns for plotting: the start second of each block and the score computed for it.
val emotionPlotData = mapOf(
    "time" to audioEmotionFromBlocks.map { (_, block) -> block.startSec },
    "emotion" to audioEmotionFromBlocks.map { (score, _) -> score },
)

// Line chart of the score against the block start time.
letsPlot(emotionPlotData) +
        geomLine {
            x = "time"
            y = "emotion"
        } +
        labs(
            title = "Emotion Score level over time",
            x = "Time (sec)",
            y = "Emotion (score)"
        )