In [29]:
import pandas as pd
from os import listdir

In [30]:
# App constants
ASHLEY = 'ASHLEY:'
LAURA = 'LAURA:'
LIAM = 'LIAM:'
MARISHA = 'MARISHA:'
MATT = 'MATT:'
SAM = 'SAM:'
TALIESIN = 'TALIESIN:'
TRAVIS = 'TRAVIS:'

# What I want to find out

- Who spoke most/least?
- Who has largest vocabulary
- Who spoke the longest/shortest?
- Who spoke the fastest/slowest?
- Did people's speaking habbits change over time?
    - Did Fjord changing his accent make him speak more/less, faster/slower?
- Who went the longest without speaking? (Excluding absentees)
- Longest sentences?
- Compare and contrast Season 1 and Season 2

## Todo

- Remove words in parenthesis
- Remove quote marks, especially for Matt

## Other Notes

- [Starting in Season 2 Episode 54, Critical Role transitioned to a professional transcription service](https://critrole.com/cr-transcript-closed-captions-update/)

In [31]:
data_path = './data/'
rows = []
for file_name in listdir('./data'):
    episode_num = file_name.split('.')[0]
    with open(data_path + file_name) as f:
        row = {}
        for line in f:

            # Detect just the number
            if line.rstrip('\n').isnumeric():
                sanitized_line = line.replace('\n', '')
                row['index'] = f'{episode_num}x{sanitized_line}'
            # Detect the timestamp
            elif line.find('-->') != -1:
                row['timestamp'] = line.rstrip('\n')
            # Detect the next section
            elif line == '\n':
                rows.append(row)
                row = {}
            # Everything else is spoken text
            else:
                if 'text' not in row:
                    row['text'] = line
                else:
                    row['text'] += ' ' + line
    break

In [44]:
# Get a long string of all the texts
text = ''
for row in rows:
    try:
        text += row['text']
    except:
        print(row)
text = text.replace('\n', ' ')
text = text.replace('.', '')
text = text.replace(',', '')
text = text.replace('?', '')
text = text.replace('!', '')
text = text.replace('"', '')
print(text)

y snaps and opens and it snaps back into place the torso locking  back in The second strike does how much damage TALIESIN: Four points of damage MATT: Yeah The second one hits again and you  watch as the torso spins off It looks like only the spinal column is holding it in place but it's  managing to maintain its structure as it (snarls) That ends your turn TALIESIN: Someone anyone Yep MATT: All right ending Mollymauk's turn that  brings us to Beau LAURA: Beau you got this MARISHA: In response to him I say I thought you  said you didn't let in the sick TALIESIN: Just kill it MARISHA: I come up and I want to do a flying  Superman (impact) MATT: Wham Go for it MARISHA: First one Okay 16 MATT: That'll hit Damage MARISHA: I had my staff-- oh I had my staff but  I said-- LIAM: You said Superman punch man MARISHA: It's fine it's fine Six damage MATT: Six points of damage Natural 20 You watch  as you sock it in the face and the jaw breaks and then snaps back into place As you land pulling  it

In [33]:
def parse_texts(text):
    '''
    Read through the combined text and sort them by voice actor
    '''
    voice_actors_words = {
        'ASHLEY:': [],
        'LAURA:': [],
        'LIAM:': [],
        'MARISHA:': [],
        'MATT:': [],
        'SAM:': [],
        'TALIESIN:': [],
        'TRAVIS:': [],
    }
    words = []
    current_speaker = ''
    

    # Find first speaker:
    for word in text.split(' '):
        if word in voice_actors_words:
            current_speaker = word

    # Sort all the rest of the words
    for word in text.split(' '):
        if word in voice_actors_words:
            voice_actors_words[current_speaker] += words
            words = []
            current_speaker = word
        else:
            words.append(word)
    return voice_actors_words
words_per_actor = parse_texts(text)

In [43]:
def sanitize_words(words):    
    '''
    Removes words in parenthesis. Removes double quotes, commas, question marks

    param {string} words - a list of words to sanitize    
    '''
    cleaned_words = []

    for word in words:
        if word.find('(') != -1 or word.find(')') != -1:
            continue
        word = word.replace(',', '')
        word = word.replace('.', '')
        word = word.replace('"', '')
        word = word.replace('--', '')
        word = word.replace('?', '')
        word = word.replace('?', '')
        cleaned_words += word
    return cleaned_words


for actor in words_per_actor:
    words = words_per_actor[actor]
    
    cleaned_words = sanitize_words(words)
    words_per_actor[actor] = cleaned_words

m', 'a', 'g', 'e', 'a', 'n', 'd', 'o', 'n', 'e', 'p', 'o', 'i', 'n', 't', 'o', 'f', 'i', 'c', 'e', 'd', 'a', 'm', 'a', 'g', 'e', 'N', 'o', 't', 'h', 'a', 't', 'w', 'a', 's', 'm', 'y', 'b', 'o', 'n', 'u', 's', 'Y', 'e', 'a', 'h', 'n', 'o', 'I', "'", 'm', 'g', 'o', 'i', 'n', 'g', 't', 'o', 'u', 's', 'e', 'B', 'l', 'o', 'o', 'd', 'M', 'a', 'l', 'a', 'd', 'i', 'c', 't', 'O', 'h', 't', 'h', 'a', 't', "'", 's', 'r', 'i', 'g', 'h', 't', 'b', 'e', 'c', 'a', 'u', 's', 'e', 'I', "'", 'm', 'n', 'o', 't', 'd', 'o', 'u', 'b', 'l', 'i', 'n', 'g', 'i', 't', 'u', 'p', 'I', "'", 'm', 's', 't', 'i', 'l', 'l', 'e', 'n', 'g', 'a', 'g', 'e', 'd', 'w', 'i', 't', 'h', 'i', 't', 'r', 'i', 'g', 'h', 't', 'O', 'k', 'a', 'y', 'I', "'", 'm', 'g', 'o', 'i', 'n', 'g', 't', 'o', 't', 'a', 'k', 'e', 't', 'w', 'o', 's', 'w', 'i', 'p', 'e', 's', 'P', 'u', 'r', 'p', 'l', 'e', "'", 's', 't', 'h', 'e', 'i', 'c', 'e', 'T', 'h', 'a', 't', "'", 's', 't', 'h', 'e', 'p', 'u', 'r', 'p', 'l', 'e', 'i', 's', '1', '2', 't', 'o', '

# Number of words said by each nerdy-ass voice actor

In [36]:
print(f'ASHLEY said {len(words_per_actor[ASHLEY])} words.')
print(f'LAURA said {len(words_per_actor[LAURA])} words.')
print(f'LIAM said {len(words_per_actor[LIAM])} words.')
print(f'MARISHA said {len(words_per_actor[MARISHA])} words.')
print(f'MATT said {len(words_per_actor[MATT])} words.')
print(f'SAM said {len(words_per_actor[SAM])} words.')
print(f'TALIESIN said {len(words_per_actor[TALIESIN])} words.')
print(f'TRAVIS said {len(words_per_actor[TRAVIS])} words.')

ASHLEY said 3707 words.
LAURA said 10879 words.
LIAM said 7649 words.
MARISHA said 8107 words.
MATT said 58923 words.
SAM said 8793 words.
TALIESIN said 6090 words.
TRAVIS said 2605 words.


# Size of vocab by each nerdy-ass voice actor

In [37]:
print(f'ASHLEY\'s vocabulary consists of {len(set(words_per_actor[ASHLEY]))} words.')
print(f'LAURA\'s vocabulary consists of {len(set(words_per_actor[LAURA]))} words.')
print(f'LIAM\'s vocabulary consists of {len(set(words_per_actor[LIAM]))} words.')
print(f'MARISHA\'s vocabulary consists of {len(set(words_per_actor[MARISHA]))} words.')
print(f'MATT\'s vocabulary consists of {len(set(words_per_actor[MATT]))} words.')
print(f'SAM\'s vocabulary consists of {len(set(words_per_actor[SAM]))} words.')
print(f'TALIESIN\'s vocabulary consists of {len(set(words_per_actor[TALIESIN]))} words.')
print(f'TRAVIS\'s vocabulary consists of {len(set(words_per_actor[TRAVIS]))} words.')

ASHLEY's vocabulary consists of 54 words.
LAURA's vocabulary consists of 62 words.
LIAM's vocabulary consists of 63 words.
MARISHA's vocabulary consists of 63 words.
MATT's vocabulary consists of 74 words.
SAM's vocabulary consists of 62 words.
TALIESIN's vocabulary consists of 56 words.
TRAVIS's vocabulary consists of 53 words.


In [38]:
set(words_per_actor[MATT])

{'$',
 '%',
 '&',
 "'",
 '-',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 '<',
 '>',
 '@',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}