# Example: Character with the longest consecutive subsequence of a string

In [1]:
def check(implementation):
    """Run *implementation* against a fixed table of example strings.

    For each case the input and the expected ``(character, count)`` pair are
    printed, then ``implementation(input)`` is compared to the expected pair.
    An ``AssertionError`` is raised at the first case that does not match.
    """
    # Each entry is (input string, expected (character, run length)).
    cases = (
        ("aaaabb", ('a', 4)),
        ("bbbaaabaaaa", ('a', 4)),
        ("bbbaaaabaaa", ('a', 4)),
        ("cbdeuuu900", ('u', 3)),
        ("abbbbb", ('b', 5)),
        ("aabb", ('a', 2)),  # tie: the earlier run is expected
        ("ba", ('b', 1)),
        ("", ('', 0)),  # empty input
    )
    for text, expected in cases:
        print(f"Input: {repr(text)} => Expected output: {expected}")
        actual = implementation(text)
        assert actual == expected, f"{actual} does not match expected result."

## Method 0: "One-at-a-time" approach

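Scan the string from left to right, one character at a time. Keep track of the current run of identical characters and of the longest run seen so far. Whenever the character changes (and once more at the end of the string), compare the run that just ended against the longest one and replace it if the new run is strictly longer.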

In [2]:
s = "bbbaaaabaaa"

In [3]:
def lcs0(s):
    """Return ``(character, length)`` for the longest run of equal characters.

    The input is scanned once from left to right. When several runs share the
    maximum length, the earliest one is returned. An empty input yields
    ``('', 0)``.
    """
    best_char, best_len = '', 0
    # Invariant: run_len is the number of consecutive copies of run_char
    # ending at the character just processed.
    run_char, run_len = '', 0
    for ch in s:
        if ch != run_char:  # a different character starts a fresh run
            run_char, run_len = ch, 0
        run_len += 1
        # Strict comparison keeps the earliest run when lengths tie.
        if run_len > best_len:
            best_char, best_len = run_char, run_len
    return best_char, best_len

lcs0(s)

('a', 4)

In [4]:
check(lcs0)

Input: 'aaaabb' => Expected output: ('a', 4)
Input: 'bbbaaabaaaa' => Expected output: ('a', 4)
Input: 'bbbaaaabaaa' => Expected output: ('a', 4)
Input: 'cbdeuuu900' => Expected output: ('u', 3)
Input: 'abbbbb' => Expected output: ('b', 5)
Input: 'aabb' => Expected output: ('a', 2)
Input: 'ba' => Expected output: ('b', 1)
Input: '' => Expected output: ('', 0)


## Method 1: A "data-parallel" approach

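For all consecutive pairs of characters, detect where changes occur. Each such position is, in effect, the index of the last letter of a consecutive subsequence (a run of identical characters). Putting `-1` in front of these positions and the final index `len(s)-1` after them gives the complete list of cut points, and the differences between neighboring cut points are the lengths of the subsequences.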

In [5]:
s = "bbbaaaabaaa"

```python
                    1
0 1 2 3 4 5 6 7 8 9 0
b b b a a a a b a a a
b b a a a a b a a a
```

```python
-1, 2, 6, 7, 10  # <-- look at difference between pairs
 3, 4, 1, 3
```

In [6]:
list(zip(s[:-1], s[1:]))

[('b', 'b'),
 ('b', 'b'),
 ('b', 'a'),
 ('a', 'a'),
 ('a', 'a'),
 ('a', 'a'),
 ('a', 'b'),
 ('b', 'a'),
 ('a', 'a'),
 ('a', 'a')]

In [7]:
# Print the index of every adjacent pair whose two characters differ.
# k is the position of the left character of the pair, i.e. the index of
# the last character of a run.
k = 0  # hand-maintained pair index
for left, right in zip(s[:-1], s[1:]):
    if left != right:
        print(k, left, right)
    k += 1

2 b a
6 a b
7 b a


In [8]:
list(enumerate(zip(s[:-1], s[1:])))

[(0, ('b', 'b')),
 (1, ('b', 'b')),
 (2, ('b', 'a')),
 (3, ('a', 'a')),
 (4, ('a', 'a')),
 (5, ('a', 'a')),
 (6, ('a', 'b')),
 (7, ('b', 'a')),
 (8, ('a', 'a')),
 (9, ('a', 'a'))]

In [9]:
# Collect the index of the last character of every run in s.
# The leading -1 acts as the "end" of an imaginary run before the string, so
# that the first real run's length is (its last index) - (-1).
cut_points = [-1]

# k is the index of `left`; a mismatch means a run ends at position k.
for k, (left, right) in enumerate(zip(s[:-1], s[1:])):
    if left != right:
        cut_points.append(k)

# The final run always ends at the last index of the string.
cut_points.append(len(s)-1)
print(cut_points)

[-1, 2, 6, 7, 10]


In [10]:
# Same cut points as the explicit loop, built with a list comprehension:
# sentinel -1, then every index k where s[k] != s[k+1], then the last index.
cut_points = [-1]
cut_points += [k for k, (left, right) in enumerate(zip(s[:-1], s[1:])) if left != right]
cut_points += [len(s)-1]
print(cut_points)

[-1, 2, 6, 7, 10]


In [11]:
# The difference between each pair of neighboring cut points is the length
# of the run that ends at the second cut point.
lengths = [y-x for x, y in zip(cut_points[:-1], cut_points[1:])]
lengths

[3, 4, 1, 3]

In [12]:
# Every cut point after the -1 sentinel is the last index of a run, so s[k]
# is that run's character. NOTE: for an empty s the only such k is -1 and
# s[k] raises IndexError.
letters = [s[k] for k in cut_points[1:]]
letters

['b', 'a', 'b', 'a']

In [13]:
# Pair each run's character with its length, then select the pair with the
# largest length; max() returns the first such pair when several runs tie.
letter_count_pairs = zip(letters, lengths)
max(letter_count_pairs, key=lambda t: t[1])

('a', 4)

In [14]:
def lcs1(s):
    """Return ``(character, length)`` for the longest run of equal characters.

    Works from "cut points": the indices at which a run ends. Neighboring
    cut points delimit one run each, so their difference is the run length.
    When several runs share the maximum length the earliest one is returned,
    and an empty input yields ``('', 0)``.
    """
    # Index i ends a run whenever s[i] differs from s[i+1]; zip stops at the
    # shorter argument, so only adjacent pairs are compared.
    inner = [i for i, (a, b) in enumerate(zip(s, s[1:])) if a != b]
    run_starts = [-1] + inner          # cut point just before each run
    run_ends = inner + [len(s) - 1]    # last index of each run
    # Slice with e:e+1 rather than index with e: for an empty input e is -1
    # and the slice gives '' where indexing would raise IndexError.
    runs = [(s[e:e + 1], e - b) for b, e in zip(run_starts, run_ends)]
    return max(runs, key=lambda run: run[1])

s, lcs1(s)

('bbbaaaabaaa', ('a', 4))

In [15]:
check(lcs1)

Input: 'aaaabb' => Expected output: ('a', 4)
Input: 'bbbaaabaaaa' => Expected output: ('a', 4)
Input: 'bbbaaaabaaa' => Expected output: ('a', 4)
Input: 'cbdeuuu900' => Expected output: ('u', 3)
Input: 'abbbbb' => Expected output: ('b', 5)
Input: 'aabb' => Expected output: ('a', 2)
Input: 'ba' => Expected output: ('b', 1)
Input: '' => Expected output: ('', 0)


In [16]:
('')[-1]  # Fails! A reasonable output would have been an empty string

IndexError: string index out of range

In [17]:
s[-1:0] # The slice pattern `i:i+1` with i = -1 returns '' instead of raising; note that -1:0 is an empty slice for any s, not only for an empty one

''

In [18]:
[][-1:0] # It works for lists, too

[]