In [23]:
import os
import pandas as pd
import yaml
from IPython.display import display
from utils import CheckboxSelector, get_line_files, analyze_characters, analyze_points, \
    analyze_width
# Root folder holding one sub-directory per dataset; later cells list its
# sub-directories and read/write f'{root_dir}/<dataset>/config.yaml'.
root_dir = './data'

In [24]:
"""
Select directories to analyze
"""

# What data sources to format
data_sources = [d for d in os.listdir('./data') if os.path.isdir(f'{root_dir}/{d}') and d != '__pycache__']
data_selector = CheckboxSelector(data_sources, 'Select data sources to format')
data_selector.display()

Select data sources to format


Checkbox(value=False, description='twelve_numbers')

Checkbox(value=False, description='pangram_jordan')

Checkbox(value=False, description='fliff')

Checkbox(value=False, description='fortune_coins')

Checkbox(value=False, description='player_name')

Checkbox(value=False, description='speech_jordan')

Checkbox(value=False, description='high5')

Checkbox(value=False, description='speech')

Checkbox(value=False, description='special_characters')

Checkbox(value=False, description='number')

Checkbox(value=False, description='IAM')

Checkbox(value=False, description='chumba_v2_jordan')

Checkbox(value=False, description='address')

Checkbox(value=False, description='chumba_v2')

Checkbox(value=False, description='chumba')

Checkbox(value=False, description='pangram')

Checkbox(value=False, description='email')

In [25]:
"""
Set selections to variables
"""

data_sources = data_selector.get_selected_items()

In [26]:
# One zeroed statistics record per selected dataset; the following cells
# overwrite these placeholders with the measured values.
data = {}
for ds in data_sources:
    data[ds] = dict(
        lines=0,
        characters=0,
        spaces=0,
        clean_characters=0,
        character_dict={},  # a fresh dict per dataset, never shared
        points=0,
        width=0,
    )

In [27]:
"""
Count the lines in each dataset
"""

for ds in data_sources:
    lines = get_line_files(root_dir, ds)
    data[ds]['lines'] = len(lines)

In [28]:
"""
Analyze character counts
"""
for ds in data_sources:
    
    # analyze the characters in the transcript
    characters, spaces, clean_characters, character_dict = analyze_characters(root_dir, ds)
            
    # update the data dictionary
    data[ds]['characters'] = characters
    data[ds]['spaces'] = spaces
    data[ds]['clean_characters'] = clean_characters
    # convert defaultdict to dict
    character_dict = dict(character_dict)
    data[ds]['character_dict'] = character_dict

In [29]:
"""
Analyze points per character
"""
for ds in data_sources:
    
    # analyze the points in the strokes
    points = analyze_points(root_dir, ds)           

    # update the data dictionary
    data[ds]['points'] = points

In [30]:
"""
Analyse the width per character
"""
for ds in data_sources:
    
    # analyze the width of the strokes
    width = analyze_width(root_dir, ds)
    
    # update the data dictionary
    data[ds]['width'] = width

In [31]:
"""
Basic Calcs on the data
"""

for ds in data_sources:
    data[ds]['points_per_char'] = data[ds]['points'] / data[ds]['characters']
    data[ds]['points_per_clean_char'] = data[ds]['points'] / data[ds]['clean_characters']
    data[ds]['width_per_char'] = data[ds]['width'] / data[ds]['characters']
    data[ds]['width_per_clean_char'] = data[ds]['width'] / data[ds]['clean_characters']
    data[ds]['clean_chars_per_char'] = data[ds]['clean_characters'] / data[ds]['characters']
    data[ds]['spaces_per_char'] = data[ds]['spaces'] / data[ds]['characters']

In [32]:
"""
Save the data 
"""

for ds in data_sources:
    config_file = f'{root_dir}/{ds}/config.yaml'
    with open(config_file, 'r') as f:
        config = yaml.safe_load(f)
    config['analysis'] = data[ds]
    with open(config_file, 'w') as f:
        yaml.dump(config, f)
        

In [33]:
"""
View Analysis
"""          
# exclude the character_dict from the data dictionary
for ds in data_sources:
    data[ds].pop('character_dict')
df = pd.DataFrame.from_dict(data, orient='index')
sum_row = df.sum(axis=0).to_frame().T
sum_row.index = ['Total']
df = pd.concat([df, sum_row])
display(df)

Unnamed: 0,lines,characters,spaces,clean_characters,points,width,points_per_char,points_per_clean_char,width_per_char,width_per_clean_char,clean_chars_per_char,spaces_per_char
twelve_numbers,150.0,1800.0,0.0,1800.0,39812.0,218745.0,22.117778,22.117778,121.525,121.525,1.0,0.0
pangram_jordan,110.0,3116.0,440.0,2605.0,89167.0,377222.0,28.615854,34.229175,121.059692,144.80691,0.836008,0.141207
fliff,486.0,11025.0,1458.0,9318.0,169044.0,1033932.0,15.332789,18.141661,93.78068,110.960721,0.84517,0.132245
fortune_coins,582.0,9592.0,1164.0,8229.0,150991.0,923670.0,15.741347,18.348645,96.295872,112.245716,0.857902,0.121351
player_name,36.0,828.0,108.0,720.0,12178.0,85067.0,14.707729,16.913889,102.737923,118.148611,0.869565,0.130435
speech_jordan,377.0,7904.0,1122.0,6580.0,203734.0,816578.0,25.776063,30.962614,103.311994,124.1,0.83249,0.141953
high5,246.0,3635.0,492.0,3094.0,56312.0,335553.0,15.491609,18.200388,92.311692,108.452812,0.851169,0.135351
speech,992.0,20630.0,2951.0,17170.0,307880.0,2450780.0,14.923897,17.931275,118.796898,142.736168,0.832283,0.143044
special_characters,300.0,6591.0,626.0,5562.0,118095.0,719726.0,17.917615,21.23247,109.198301,129.400575,0.843878,0.094978
number,1800.0,9000.0,0.0,9000.0,202045.0,1196175.0,22.449444,22.449444,132.908333,132.908333,1.0,0.0
