In [None]:
// Kotlin Regex Basics — quick tutorial with runnable examples
// This notebook shows the essentials of using regular expressions in Kotlin
// using the Regex class and top-level regex helpers.


In [None]:
// 1) Building a Regex and the basic yes/no checks

// A Regex is compiled from a pattern string; this one means "one or more digits".
val digits = Regex("\\d+")

// containsMatchIn: is there a match anywhere inside the input?
println(digits.containsMatchIn("abc123xyz")) // true

// matches: does the pattern cover the entire input?
for (candidate in listOf("123", "a123")) {
    println(digits.matches(candidate)) // true for "123", false for "a123"
}

// matchEntire: same whole-input rule, but it hands back a MatchResult? instead of a Boolean
for (candidate in listOf("123", "a123")) {
    println(digits.matchEntire(candidate) != null) // true for "123", false for "a123"
}


In [None]:
// 2) Pattern literals: escaped strings vs raw strings

// Ordinary string literal: every regex backslash has to be doubled ("\\w").
val wordNormal = Regex("\\w+")

// Raw (triple-quoted) literal: backslashes are taken as-is, so no doubling is needed.
val wordRaw = Regex("""\w+""")

// Both literals compile to the same pattern, \w+ (letters, digits, underscore).
listOf(wordNormal to "hello_world", wordRaw to "hello-world").forEach { (regex, sample) ->
    println(regex.containsMatchIn(sample)) // true, true — "hello" alone already satisfies \w+
}


In [None]:
// 3) Character classes and ranges
// Square brackets list the allowed characters; a hyphen inside them forms a range.
val vowels = Regex("[aeiou]")
println(vowels.findAll("education").toList().size) // 5 — e, u, a, i, o

// Shorthand classes (Java-style):
// \d digit, \D non-digit, \w word char [A-Za-z0-9_], \W non-word, \s whitespace, \S non-whitespace
println("a\t b\n c".replace(Regex("\\s+"), " ")) // "a b c" — each whitespace run collapses to one space


In [None]:
// 4) Quantifiers
// ? 0 or 1, * 0 or more, + 1 or more, {m} exactly m, {m,} at least m, {m,n} between m and n

// find returns only the first match (or null), so a plain safe call reads its value.
println(Regex("colou?r").find("color, colour")?.value) // color
// findAll walks every non-overlapping match from left to right.
println(Regex("colou?r").findAll("color, colour").map(MatchResult::value).toList()) // [color, colour]

// {2,4} is greedy: in "aaaaa" it consumes four a's, and the single leftover 'a' is too short to match.
println(Regex("a{2,4}").findAll("a aa aaa aaaa aaaaa").map(MatchResult::value).toList()) // [aa, aaa, aaaa, aaaa]


In [None]:
// 5) Anchors and boundaries
// ^ start of input, $ end of input, \b word boundary, \B not a word boundary
println("hello world".contains(Regex("^hello"))) // true — the input starts with "hello"
println("hello world".contains(Regex("world$"))) // true — the input ends with "world"

// \bcat\b accepts "cat" only as a standalone word.
listOf("the cat sat", "concatenate").forEach { phrase ->
    println(Regex("\\bcat\\b").containsMatchIn(phrase)) // true, then false ("cat" sits inside a longer word)
}


In [None]:
// 6) Finding matches and inspecting groups
// Two capture groups: (\w+) before the hyphen and (\d+) after it.
val sku = Regex("(\\w+)-(\\d+)")
val text = "item-123 and tool-42"

// find returns the first match, or null when nothing matches — hence the safe calls below.
val first = sku.find(text)
println(first?.value)                 // item-123
println(first?.groups?.get(1)?.value) // group 1: item
println(first?.groups?.get(2)?.value) // group 2: 123

// Iterate all matches
for (m in sku.findAll(text)) {
    println("${m.value} -> g1=${m.groups[1]?.value}, g2=${m.groups[2]?.value}")
}

// Destructure captured groups conveniently.
// find can return null, so fail with a descriptive message rather than a bare !! NullPointerException.
val (name, id) = (sku.find("gadget-9000") ?: error("input does not match the sku pattern")).destructured
println("name=$name id=$id")


In [None]:
// 7) Named groups: (?<name>...) captures can be fetched by name
val person = Regex("(?<first>[A-Z][a-z]+)\\s+(?<last>[A-Z][a-z]+)")
val pm = person.find("Ada Lovelace")
// Look up each capture by the name declared in the pattern.
for (groupName in listOf("first", "last")) {
    println(pm?.groups?.get(groupName)?.value) // Ada, then Lovelace
}


In [None]:
// 8) Replacing text
val sentence = "Contact me at john.doe@example.com or jane@example.org"
val email = Regex("[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}")

// Fixed replacement: every match becomes the same string
println(sentence.replace(email, "<redacted>"))

// Computed replacement: the lambda receives each MatchResult and returns its substitute.
// Here the first two characters of the local part are kept and the rest is starred out.
val masked = email.replace(sentence) { match ->
    val address = match.value
    val visiblePrefix = address.substringBefore('@').take(2)
    val host = address.substringAfter('@')
    "$visiblePrefix***@$host"
}
println(masked)


In [None]:
// 9) Splitting strings with regex
// Any run of commas, semicolons, or whitespace counts as a single separator.
println("a, b; c   d".split(Regex("[,;\\s]+"))) // [a, b, c, d]


In [None]:
// 10) Flags (RegexOption)
// IGNORE_CASE: case-insensitive matching; MULTILINE: ^/$ also match at line starts/ends;
// DOT_MATCHES_ALL: . matches newline as well
val ci = Regex("hello", RegexOption.IGNORE_CASE)
println("HeLLo world".contains(ci)) // true

val multiline = Regex("^\\w+", RegexOption.MULTILINE)
println(multiline.findAll("alpha\nbeta\ngamma").map(MatchResult::value).toList()) // [alpha, beta, gamma]

val dotAll = Regex("a.+z", RegexOption.DOT_MATCHES_ALL)
println("a\n\n\nz".contains(dotAll)) // true


In [None]:
// 11) Backreferences
// \1 re-matches whatever group 1 captured, so (\w+)\s+\1 finds a word followed by itself.
val dupWord = Regex("(\\w+)\\s+\\1")
for (phrase in listOf("hello hello world", "hello world")) {
    println(dupWord.containsMatchIn(phrase)) // true, then false
}


In [None]:
// 12) Common pitfalls
// • Remember to escape special chars like . + * ? ( ) [ ] { } | ^ $ when you mean literals.
//   For a literal dot, use \. or a raw string """\.""".
// • Use raw strings to avoid double-escaping: Regex("""\d{4}-\d{2}-\d{2}""") instead of Regex("\\d{4}-\\d{2}-\\d{2}").
// • Prefer matchEntire() or matches() when you need full-string validation.
// • Use destructured for clean access to capture groups.

// Small validation demo: a yyyy-mm-dd shaped date (the pattern checks digit counts only)
val date = Regex("""\d{4}-\d{2}-\d{2}""")
for (candidate in listOf("2025-09-29", "29-09-2025")) {
    println(date.matches(candidate)) // true, then false
}


In [None]:
// 13) Dynamically creating Regex at runtime

// String templates let runtime values flow into the pattern (templates work inside raw strings too).
val n = 3
val exactlyNdigits = Regex("""\d{${n}}""")
listOf("123", "1234").forEach { println(exactlyNdigits.matches(it)) } // true, then false

// String.toRegex() is shorthand for Regex(pattern)
val dynamic = "\\w+-\\d+".toRegex()
println(dynamic.find("ref-42")?.value) // ref-42

// Regex.escape turns arbitrary text into a pattern that matches that text literally
val userInput = "a+b?c"
val safe = Regex.escape(userInput)
val literal = safe.toRegex()
println("xx a+b?c yy".contains(literal)) // true

// Building alternation from a collection (properly escaped)
val keywords = listOf("C++", "C#", "Go", "Rust")
val alternation = keywords.joinToString("|") { Regex.escape(it) }
// \b is not usable around these keywords: it needs a word character on one side, so it can
// never match right after the '+' of "C++" or the '#' of "C#" when a space, comma, or the end
// of the input follows. Lookarounds that merely forbid an adjacent word character treat every
// keyword uniformly: "I like Go, Rust, and C++" yields [Go, Rust, C++].
val kwRegex = Regex("""(?<!\w)(?:$alternation)(?!\w)""")
println(kwRegex.findAll("I like Go, Rust, and C++").map(MatchResult::value).toList()) // prints the list of matched keywords

// Conditional flags at runtime
/**
 * Compiles the pattern `^\w+` with options selected by the two switches.
 *
 * @param ignoreCase adds [RegexOption.IGNORE_CASE] when true
 * @param multiline adds [RegexOption.MULTILINE] when true, so `^` also matches at line starts
 * @return the compiled [Regex]
 */
fun makeWordRegex(ignoreCase: Boolean, multiline: Boolean): Regex =
    Regex(
        """^\w+""",
        listOfNotNull(
            RegexOption.IGNORE_CASE.takeIf { ignoreCase },
            RegexOption.MULTILINE.takeIf { multiline }
        ).toSet()
    )
println(makeWordRegex(ignoreCase = true, multiline = true).findAll("Alpha\nbeta").map(MatchResult::value).toList()) // [Alpha, beta]

// (Optional) Memoizing compiled Regex objects that are requested repeatedly
val cache = mutableMapOf<String, Regex>()

/**
 * Returns the [Regex] stored in [cache] for this pattern/options combination,
 * compiling and storing it on the first request.
 *
 * The map key is the pattern, a '#', then the option names in alphabetical order joined by commas.
 *
 * @param pattern the regex pattern text
 * @param opts options to compile with; defaults to none
 */
fun cached(pattern: String, opts: Set<RegexOption> = emptySet()): Regex {
    val optionNames = opts.map { it.name }.sorted().joinToString(",")
    val key = "$pattern#$optionNames"
    return cache.getOrPut(key) { Regex(pattern, opts) }
}
println(cached(pattern = """\d+""").find(input = "abc123")?.value) // 123



In [None]:
// 14) Negated character classes: [^...]
// [^...] matches one character that is NOT listed. It is unrelated to negative lookarounds.

// Consonant runs, found by excluding the (ASCII, lowercase) vowels
val notVowel = Regex("[^aeiou]+")
println(notVowel.findAll("education").map(MatchResult::value).toList()) // [d, c, t, n]

// Two spellings of "not a digit": an explicit negated range and the \D shorthand
val nonDigitsClass = Regex("[^0-9]+")
val nonDigitsShort = Regex("\\D+")
for (nonDigits in listOf(nonDigitsClass, nonDigitsShort)) {
    println(nonDigits.findAll("a1b22c333").map(MatchResult::value).toList()) // [a, b, c] both times
}

// Replace every run of non-letters (ASCII) with a space; unlike \W, this class also matches underscores and digits
val nonLetters = Regex("[^A-Za-z]+")
println("abc_DEF-123".replace(nonLetters, " ")) // "abc DEF " (note the trailing space)

// Newline-aware: [^\n]+ matches each maximal run of characters that are not a newline.
// Because + requires at least one character, the blank line between "beta" and "gamma"
// yields no match at all — this pattern never produces an empty string.
val lineChunk = Regex("[^\\n]+")
println(lineChunk.findAll("alpha\nbeta\n\ngamma").map { it.value }.toList()) // [alpha, beta, gamma]

// Position matters for ^ and - inside a class:
// • ^ negates only when it comes first; anywhere else (or escaped) it is a literal caret.
println("^ - _".contains(Regex("[\\^_-]+"))) // true — the class holds literal ^, _ and -
// • - forms a range except when it is first, last, or escaped.
println(Regex("[A-Z-]+") matches "ABC-XYZ") // true

// Reminder: [^...] excludes single characters only.
// Ruling out a multi-character substring takes a negative lookahead instead:
for (sample in listOf("bar baz", "bar foo baz")) {
    println(Regex("^(?!.*foo).*$").matches(sample)) // true (no 'foo'), then false
}
