# Analyzing the Data

In this notebook, we'll analyze the enriched events from the previous notebook. We'll use a combination of techniques to analyze the data:

1. Querying Redis for trending topics
2. Semantic search using embeddings
3. Summarization of posts using a Large Language Model (LLM)
4. Creating a simple query router to handle different types of user queries

This notebook demonstrates how to build a simple question-answering system on top of the enriched data.

In [1]:
import dev.raphaeldelio.*

In [2]:
%use ktor-client
%use serialization
%use coroutines

In [3]:
val trendingTopicsRoute = listOf(
    "What are the most mentioned topics?",
    "What's trending right now?",
    "What’s hot in the network",
    "Top topics?",
    "What are people talking about?",
    "What people are discussing?",
)

In [4]:
@file:DependsOn("org.springframework.ai:spring-ai-redis-store:1.0.0-RC1")

In [5]:
import org.springframework.ai.transformers.TransformersEmbeddingModel

val embeddingModel = TransformersEmbeddingModel()
embeddingModel.setModelResource("file:resources/model/bge-large-en-v1.5/model.onnx")
embeddingModel.setTokenizerResource("file:resources/model/bge-large-en-v1.5/tokenizer.json")
embeddingModel.afterPropertiesSet()

In [44]:
import org.springframework.ai.vectorstore.redis.RedisVectorStore
import org.springframework.ai.vectorstore.redis.RedisVectorStore.MetadataField
import redis.clients.jedis.search.Schema.FieldType

val redisVectorStore = RedisVectorStore.builder(jedisPooled, embeddingModel)
    .indexName("routeIdx")
    .contentFieldName("text")
    .embeddingFieldName("textEmbedding")
    .metadataFields(
        MetadataField("route", FieldType.TEXT),
        MetadataField("minThreshold", FieldType.NUMERIC),
    )
    .prefix("route:")
    .initializeSchema(true)
    .vectorAlgorithm(RedisVectorStore.Algorithm.FLAT)
    .build()
redisVectorStore.afterPropertiesSet()

In [38]:
import org.springframework.ai.document.Document
import java.util.UUID

fun createRouteDocument(route: String, text: String, minThreshold: Double): Document {
    return Document(
        UUID.randomUUID().toString(),
        text,
        mapOf(
            "route" to route,
            "text" to text,
            "minThreshold" to minThreshold,
        )
    )
}

fun storeRouteDocumentsInRedis(routeName: String, minThreshold: Double, routeSamples: List<String>) {
    val trendingTopicDocuments = routeSamples.map { text ->
        createRouteDocument(routeName, text, minThreshold)
    }

    redisVectorStore.add(trendingTopicDocuments)
}

storeRouteDocumentsInRedis("trending_topics", 0.9, trendingTopicsRoute)

In [29]:
import org.springframework.ai.vectorstore.SearchRequest

redisVectorStore.similaritySearch(
    SearchRequest.builder()
        .topK(1)
        .query("Hey Dev Bubble. What's trending today? Excited to hear the news!")
        .build()
)

[Document{id='e6031c4f-a313-4fe2-93af-f3290b8e96a0', text='What's trending right now?', media='null', metadata={vector_score=0.09470183, minThreshold=0.9, route=trending_topics, distance=0.09470183}, score=0.9052981734275818}]

In [32]:
import redis.clients.jedis.search.FTSearchParams
import redis.clients.jedis.search.Query

fun breakSentenceIntoClauses(sentence: String): List<String> {
    return sentence.split(Regex("""[!?,.:;()"\[\]{}]+"""))
        .filter { it.isNotBlank() }.map { it.trim() }
}

fun matchRoute(query: String): Set<String> {
    return breakSentenceIntoClauses(query).flatMap { clause ->
        val result = redisVectorStore.similaritySearch(
            SearchRequest.builder()
                .topK(1)
                .query(clause)
                .build()
        )

        val route = result?.firstOrNull()?.metadata?.get("route") as String
        val minThreshold = result.firstOrNull()?.metadata?.get("minThreshold") as String

        result.forEach {
            println(clause)
            println(route)
            println(it.score ?: 0.0)
            println(minThreshold)
            println()
        }

        result.filter { (it?.score ?: 0.0) > minThreshold.toDouble() }.map {
            it?.metadata?.get("route") as String
        }
    }.toSet()
}

In [33]:
matchRoute("Hey DevBubble, what's trending today? Excited to hear the news!")

Hey DevBubble
trending_topics
0.7936376333236694
0.9

what's trending today
trending_topics
0.979021430015564
0.9

Excited to hear the news
trending_topics
0.8192511796951294
0.9



[trending_topics]

In [34]:
import org.springframework.ai.chat.messages.SystemMessage
import org.springframework.ai.chat.messages.UserMessage
import org.springframework.ai.chat.prompt.Prompt
import java.time.LocalDateTime

fun trendingTopics(): Set<String> {
    val currentMinute = LocalDateTime.now().withSecond(0).withNano(0).toString()
    return jedisPooled.smembers("topics")
        .map { it to jedisPooled.cmsQuery("topics-cms:$currentMinute", it).first() }
        .sortedByDescending { it.second }
        .take(10)
        .map { it.first }
        .toSet()
}

In [12]:
trendingTopics()

redis.clients.jedis.exceptions.JedisDataException: CMS: key does not exist

In [65]:
import dev.raphaeldelio.*

val trendingTopicsHandler: (String, String) -> Iterable<String> = { route, query ->
    when (route) {
        "trending_topics" -> trendingTopics()
        else -> emptyList()
    }
}

fun processUserRequest(
    query: String,
    handler: (String, String) -> Iterable<String>
): String {
    val routes = matchRoute(query)
    println(routes)

    val enrichedData = routes.map { route -> handler(route, query) }

    val systemPrompt = "You are a bot that helps users analyse posts about politics. You may be given a data set to help you answer questions. Answer in a max od 300 chars. I MEAN IT. It's a TWEET. Don't write more than 300 chars. Respond in only ONE paragraph. Be as concise as possible"

    return ollamaChatModel.call(
        Prompt(
            SystemMessage(systemPrompt),
            SystemMessage("Enriching data: $enrichedData"),
            UserMessage("User query: $query")
        )
    ).result.output.text ?: ""
}

In [14]:
processUserRequest("What are people talking about?", trendingTopicsHandler)

What are people talking about
trending_topics
0.9953664541244507

[trending_topics]


redis.clients.jedis.exceptions.JedisDataException: CMS: key does not exist

In [36]:
val summarizationRoute = listOf(
    "What are people saying about {topics}?",
    "What’s the buzz around {topics}?",
    "Any chatter about {topics}?",
    "What are folks talking about regarding {topics}?",
    "What’s being said about {topics} lately?",
    "What have people been posting about {topics}?",
    "What's trending in conversations about {topics}?",
    "What’s the latest talk on {topics}?",
    "Any recent posts about {topics}?",
    "What's the sentiment around {topics}?",
    "What are people saying about {topic1} and {topic2}?",
    "What are folks talking about when it comes to {topic1}, {topic2}, or both?",
    "What’s being said about {topic1}, {topic2}, and others?",
    "Is there any discussion around {topic1} and {topic2}?",
    "How are people reacting to both {topic1} and {topic2}?",
    "What’s the conversation like around {topic1}, {topic2}, or related topics?",
    "Are {topic1} and {topic2} being discussed together?",
    "Any posts comparing {topic1} and {topic2}?",
    "What's trending when it comes to {topic1} and {topic2}?",
    "What are people saying about the relationship between {topic1} and {topic2}?"
)

In [39]:
storeRouteDocumentsInRedis("summarization", 0.8, summarizationRoute)

In [60]:
import org.springframework.ai.chat.messages.SystemMessage
import org.springframework.ai.chat.messages.UserMessage
import org.springframework.ai.chat.prompt.Prompt

fun summarization(userQuery: String): List<String> {
    val existingTopics = jedisPooled.smembers("topics").joinToString { ", " }
    val queryTopics = topicModeling(userQuery, existingTopics).replace("\"", "").split(", ")
    println(queryTopics)

    return queryTopics.map { topic ->
        val query = Query("@topics:{'$topic'}")
            .returnFields("text")
            .setSortBy("time_us", false)
            .dialect(2)
            .limit(0, 10)

        val result = jedisPooled.ftSearch(
            "postIdx",
            query
        )

        result.documents.map {
            document -> document.get("text").toString()
        }
    }.flatten()
}

In [61]:
summarization("What's being said about Trump and Angela Merkel?")

[ Donald Trump, Angela Merkel, US Politics, German Politics]


[I have many problems with Biden, including some big ones like enabling genocide and putting corporate profits over public safety during a pandemic. But if you’re a news organization and you’re leading with a Biden story while Trump systematically destroys our republic, you’re criminally negligent, Living in the past is how we got into this shit show to start with. The 2024 election is over, the Biden presidency is over. What happened during that time, no longer matters. We don't have the same country we had then., The idea this is a legitimate government, or that it had been this entire time, is actually what's been destroyed already in the minds of tens of millions of Americans that weren't so inclined before. Now they see the truth that thought leaders like you end at these brick walls, Dan Moynihan.
Cheers, And when a handful of Republicans met at a steakhouse in DC on the day of his inauguration to plot to derail Obama's agenda, the Republican party officially began strategizing h

In [62]:
val multiHandler: (String, String) -> Iterable<String> = { route, query ->
    when (route) {
        "trending_topics" -> trendingTopics()
        "summarization" -> summarization(query)
        else -> emptyList()
    }
}

In [66]:
processUserRequest("What's being said about Trump and Angela Merkel?", multiHandler)

What's being said about Trump and Angela Merkel
summarization
0.8182173371315002
0.8

[summarization]
[ Donald Trump, Angela Merkel, US Politics, German Politics]


 The tweets highlight concerns about Biden, particularly his handling of the genocide issue and pandemic response. They criticize news organizations for focusing too much on Biden while ignoring Trump's actions that allegedly threaten the republic. Some comments also express frustration with Democrats and their approach to politics, suggesting a need for better solutions and strategies. There's an underlying theme of disappointment in current political figures and a call for more effective leadership.

In [None]:
@Serializable
data class LoginResponse(
    @SerialName("accessJwt") val accessJwt: String,
    @SerialName("refreshJwt") val refreshJwt: String,
    @SerialName("handle") val handle: String,
    @SerialName("did") val did: String,
    @SerialName("didDoc") val didDoc: DidDoc?,
    @SerialName("email") val email: String?,
    @SerialName("emailConfirmed") val emailConfirmed: Boolean?,
    @SerialName("emailAuthFactor") val emailAuthFactor: Boolean?,
    @SerialName("active") val active: Boolean,
    @SerialName("status") val status: String? = null
)

@Serializable
data class DidDoc(
    @SerialName("id") val id: String?
)

In [None]:
import io.ktor.client.HttpClient
import io.ktor.client.engine.cio.CIO
import io.ktor.client.plugins.contentnegotiation.ContentNegotiation
import io.ktor.serialization.kotlinx.json.json

val client = HttpClient(CIO) {
    install(ContentNegotiation) {
        json(Json {
            ignoreUnknownKeys = true
        })
    }
}

val API_URL = "https://bsky.social/xrpc"

val USERNAME = "devbubble.bsky.social"
val PASSWORD = System.getenv("DEVBUBBLE_TOKEN")

In [14]:
fun processUserRequest(query: String): String {
    val routes = matchRoute(query)
    println(routes)

    val enrichedData = routes.map { routes ->
        when (routes) {
            "trending_topics" -> trendingTopics()
            "summarization" -> summarization(query)
            else -> ""
        }
    }

    val systemPrompt = "You are a bot that helps users analyse posts about politics. You may be given a data set to help you answer questions. Answer in a max od 300 chars. I MEAN IT. It's a TWEET. Don't write more than 300 chars. Respond in only ONE paragraph. Be as concise as possible"

    return ollamaChatModel.call(
        Prompt(
            SystemMessage(systemPrompt),
            SystemMessage("Enriching data: $enrichedData"),
            UserMessage("User query: $query")
        )
    ).result.output.text ?: ""
}

org.jetbrains.kotlinx.jupyter.exceptions.ReplCompilerException: at Cell In[14], line 5, column 37: Name shadowed: routes
at Cell In[14], line 8, column 32: Unresolved reference: summarization

In [None]:
import io.ktor.client.call.body
import io.ktor.client.request.HttpRequestBuilder
import io.ktor.client.request.headers
import io.ktor.client.request.post
import io.ktor.client.request.setBody
import io.ktor.http.ContentType
import io.ktor.http.HeadersBuilder
import io.ktor.http.HttpStatusCode
import io.ktor.http.contentType

suspend fun getAccessToken(): String {
    val response = client.post("$API_URL/com.atproto.server.createSession") {
        contentType(ContentType.Application.Json)
        setBody(
            mapOf(
                "identifier" to USERNAME,
                "password" to PASSWORD
            )
        )
    }

    return if (response.status == HttpStatusCode.OK) {
        val result: LoginResponse = response.body()
        jedis.set("mainDid", result.did)
        println("✅ Login successful. DID: ${result.did}")
        result.accessJwt
    } else {
        println("⚠️ Authentication failed: ${response.status}")
        ""
    }
}

In [None]:
lateinit var blueskyToken: String;
runBlocking {
    blueskyToken = getAccessToken()
}

In [None]:
@Serializable
data class SearchResponse(
    @SerialName("cursor") val cursor: String? = null,
    @SerialName("hitsTotal") val hitsTotal: Int? = null,
    @SerialName("posts") val posts: List<Post>
)

@Serializable
data class Post(
    @SerialName("uri") val uri: String,
    @SerialName("cid") val cid: String,
    @SerialName("author") val author: Author,
    @SerialName("indexedAt") val indexedAt: String,
    @SerialName("record") val record: Record?,
    @SerialName("replyCount") val replyCount: Int? = null,
    @SerialName("repostCount") val repostCount: Int? = null,
    @SerialName("likeCount") val likeCount: Int? = null,
    @SerialName("quoteCount") val quoteCount: Int? = null,

    )

@Serializable
data class Author(
    @SerialName("did") val did: String,
    @SerialName("handle") val handle: String,
    @SerialName("displayName") val displayName: String? = null,
    @SerialName("avatar") val avatar: String? = null
)

@Serializable
data class Record(
    @SerialName("text") val text: String? = null,
    @SerialName("embed") val embed: Embed? = null,
    @SerialName("createdAt") val createdAt: String
)

@Serializable
data class Embed(
    @SerialName("images") val images: List<Image>? = null
)

@Serializable
data class Image(
    @SerialName("thumb") val thumb: String? = null, // Nullable to handle missing values
    @SerialName("fullsize") val fullsize: String? = null,
    @SerialName("alt") val alt: String? = null // Alt text is also optional
)

In [None]:
import io.ktor.client.*
import io.ktor.client.request.*
import io.ktor.client.statement.*
import io.ktor.client.call.*
import io.ktor.http.*

import java.time.Instant
import java.time.temporal.ChronoUnit

suspend fun searchPosts(sinceTime: String, tag: String): List<Post> {
    val allPosts = mutableListOf<Post>()
    var cursor: String? = null

    println("🔍 Searching posts with tag: $tag since: $sinceTime")
    do {
        val response: HttpResponse = client.get("$API_URL/app.bsky.feed.searchPosts") {
            headers {
                append("Authorization", "Bearer $blueskyToken")
            }
            parameter("q", tag)
            parameter("sort", "latest")
            parameter("limit", 100)
            parameter("since", sinceTime)
            if (cursor != null) {
                parameter("cursor", cursor)
            }
        }

        if (response.status == HttpStatusCode.OK) {
            val result: SearchResponse = response.body()
            val posts = result.posts
            println("✅ Retrieved ${posts.size} posts. Total so far: ${allPosts.size + posts.size}.")
            allPosts.addAll(posts)
            cursor = result.cursor
        } else {
            println("⚠️ Failed to fetch posts. Status: ${response.status}")
            println(response.bodyAsText())
            break
        }
    } while (cursor != null)

    println("🎉 Finished fetching posts. Total retrieved: ${allPosts.size}.")
    return allPosts
}

In [None]:
import kotlinx.coroutines.runBlocking

runBlocking {
    val sinceTime = Instant.now().minus(15, ChronoUnit.HOURS).toString()
    searchPosts(sinceTime, "java")
}

In [None]:
processUserRequest("What are people saying about Trump or Angela Merkel?")

In [None]:
matchRoute("What are folks talking about tpics and Angela Merkel?")

In [None]:
val embeddingModel = TransformersEmbeddingModel()
embeddingModel.setModelResource("file:model/bge-large-en-v1.5/model.onnx")
embeddingModel.setTokenizerResource("file:model/bge-large-en-v1.5/tokenizer.json")
embeddingModel.afterPropertiesSet()

In [None]:
fun createEmbedding(input: String): ByteArray {
    val embedding = embeddingModel.embed(input)
    val embeddingBytes = ByteArray(Float.BYTES * embedding.size)
    ByteBuffer.wrap(embeddingBytes).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer().put(embedding)
    return embeddingBytes
}

val redisVectorStore = RedisVectorStore.builder(jedis, embeddingModel)
    .indexName("routeIdx")
    .contentFieldName("text")
    .embeddingFieldName("textEmbedding")
    .metadataFields(
        MetadataField("route", FieldType.TEXT)
    )
    .prefix("route:")
    .initializeSchema(true)
    .vectorAlgorithm(RedisVectorStore.Algorithm.FLAT)
    .build()
redisVectorStore.afterPropertiesSet()

summarizationRoute.forEach { text ->
    redisVectorStore.add(
        listOf(
            org.springframework.ai.document.Document(
                UUID.randomUUID().toString(),
                text,
                mapOf(
                    "route" to "summarization",
                    "text" to text,
                )
            )
        )
    )
}

trendingTopicsRoute.forEach { text ->
    redisVectorStore.add(
        listOf(
            org.springframework.ai.document.Document(
                UUID.randomUUID().toString(),
                text,
                mapOf(
                    "route" to "trending_topics",
                    "text" to text,
                )
            )
        )
    )
}

In [None]:
matchRoute("What are folks saying regarding Trump and Angela Merkel?")

In [None]:
processUserRequest("What have people been posting about Trump or Angela Merkel?")

In [None]:
processUserRequest("What are people talking about?")