In [1]:
import os
import pickle
import numpy as np
import shutil
import pandas as pd
import yaml
from IPython.display import display
import ipywidgets as widgets
from utils import CheckboxSelector, unpickle, clean_transcript_characters, analyze_characters, analyze_points, \
    analyze_width
root_dir = './data'

2024-07-24 13:33:34,561 - uim - INFO - Completed configuring logger()!
2024-07-24 13:33:34,624 - uim.model.ink - INFO - Completed configuring logger()!
2024-07-24 13:33:34,624 - uim.model.ink - INFO - Completed configuring logger()!


In [2]:
"""
Select directories to package, how to modify data, and name of the package
"""

# What data sources to format
data_sources = [d for d in os.listdir(root_dir) if os.path.isdir(f'{root_dir}/{d}') and d != '__pycache__']
data_selector = CheckboxSelector(data_sources, 'Select data sources to format')
data_selector.display()

# How to modify data
data_modifications = ['interpolate', 'scale']
modifications_selector = CheckboxSelector(data_modifications, 'Select data modifications')
modifications_selector.display()

# Remove long lines
lines_to_remove = ['No adjustments', '700', '1000']
lines_to_remove_selector = CheckboxSelector(lines_to_remove, 'Select data modifications')
lines_to_remove_selector.display()

# Create a text input widget
input_field = widgets.Text(
    value='',
    placeholder='Type something',
    description='Name for packaged data:',
    disabled=False
)
display(input_field)

Select data sources to format


Checkbox(value=False, description='twelve_numbers')

Checkbox(value=False, description='pangram_jordan')

Checkbox(value=False, description='fliff')

Checkbox(value=False, description='fortune_coins')

Checkbox(value=False, description='player_name')

Checkbox(value=False, description='speech_jordan')

Checkbox(value=False, description='high5')

Checkbox(value=False, description='speech')

Checkbox(value=False, description='special_characters')

Checkbox(value=False, description='number')

Checkbox(value=False, description='IAM')

Checkbox(value=False, description='chumba_v2_jordan')

Checkbox(value=False, description='address')

Checkbox(value=False, description='chumba_v2')

Checkbox(value=False, description='chumba')

Checkbox(value=False, description='pangram')

Checkbox(value=False, description='email')

Select data modifications


Checkbox(value=False, description='interpolate')

Checkbox(value=False, description='scale')

Select data modifications


Checkbox(value=False, description='No adjustments')

Checkbox(value=False, description='700')

Checkbox(value=False, description='1000')

Text(value='', description='Name for packaged data:', placeholder='Type something')

In [3]:
"""
Set selections to variables
"""

data_sources = data_selector.get_selected_items()
data_modifications = modifications_selector.get_selected_items()
lines_to_remove = lines_to_remove_selector.get_selected_items()
package_root = '../prepared_data'
package_name = input_field.value

In [4]:
"""
Calculate Interpolation Constant, Define Interpolation Function
"""

iam_config = yaml.safe_load(open(f'{root_dir}/IAM/config.yaml', 'r'))
iam_points_per_clean_character = iam_config['analysis']['points_per_clean_char']

points = 0
clean_characters = 0
for ds in [d for d in data_sources if d != 'IAM']:
    config = yaml.safe_load(open(f'{root_dir}/{ds}/config.yaml', 'r'))
    points += config['analysis']['points']
    clean_characters += config['analysis']['clean_characters']
    
points_per_clean_character = points / clean_characters
interpolation_constant = iam_points_per_clean_character / points_per_clean_character
print(f'Points per clean character: {points_per_clean_character}')
print(f'Interpolation constant: {interpolation_constant}')

def interpolate_strokes(strokes, constant):
    interpolated_strokes = []
    for stroke in strokes:
        original_points = np.array(stroke)
        original_num_points = len(stroke)
        new_num_points = int(np.ceil(constant * original_num_points))
        new_indices = np.linspace(0, original_num_points - 1, new_num_points)
        x = original_points[:, 0]
        new_x = np.interp(new_indices, np.arange(original_num_points), x).astype(int)
        y = original_points[:, 1]
        new_y = np.interp(new_indices, np.arange(original_num_points), y).astype(int)
        new_points = np.column_stack((new_x, new_y))
        interpolated_strokes.append(new_points)
        
    return interpolated_strokes

ZeroDivisionError: division by zero

In [5]:
"""
Calculate Scaling Constant, Define Scaling Function
"""

iam_config = yaml.safe_load(open(f'{root_dir}/IAM/config.yaml', 'r'))
iam_width_per_char = iam_config['analysis']['width_per_char']

width = 0
characters = 0
for ds in [d for d in data_sources if d != 'IAM']:
    config = yaml.safe_load(open(f'{root_dir}/{ds}/config.yaml', 'r'))
    width += config['analysis']['width']
    characters += config['analysis']['characters']
    
width_per_character = width / characters
scaling_constant = iam_width_per_char / width_per_character
print(f'Width per character: {width_per_character}')
print(f'Scaling constant: {scaling_constant}')


def scale_strokes(strokes, constant):
    new_strokes = []
    for stroke in strokes:
        new_stroke = []
        for point in stroke:
            scaled_point = [p * constant for p in point]
            new_stroke.append(scaled_point)
        new_strokes.append(new_stroke)
        
    return new_strokes

ZeroDivisionError: division by zero

In [6]:
"""
Create new package directory
"""

if not os.path.exists(f'{package_root}/{package_name}/lines'):
    os.makedirs(f'{package_root}/{package_name}/lines')
else:
    print('Package already exists.')

In [7]:
"""
Copy IAM Lines to package
"""

if 'IAM' in data_sources:
    iam_files = os.listdir(f'{root_dir}/IAM/lines')
    for file in iam_files:
        src = f'{root_dir}/IAM/lines/{file}'
        dst = f'{package_root}/{package_name}/lines/{file}_IAM.pkl'
        shutil.copy(src, dst)
        

In [8]:
"""
Interpolate, scale, and copy data to package
"""

for ds in [d for d in data_sources if d != 'IAM']:

    
    for file in os.listdir(f'{root_dir}/{ds}/lines'):
        line = unpickle(f'{root_dir}/{ds}/lines/{file}')
        strokes = line['strokes']
        
        # Interpolate and scale strokes
        if 'interpolate' in data_modifications:
            strokes = interpolate_strokes(strokes, interpolation_constant)
        if 'scale' in data_modifications:
            strokes = scale_strokes(strokes, scaling_constant)

        # Save strokes to new file
        with open(f'{package_root}/{package_name}/lines/{file}_{ds}.pkl', 'wb') as f:
            pickle.dump({'transcript': line['transcript'], 'strokes': strokes}, f)
        

In [9]:
"""
Remove long lines
"""

if not 'No adjustments' in lines_to_remove and len(lines_to_remove) == 1:
    all_files = os.listdir(f'{package_root}/{package_name}/lines')
    for file in all_files:
        # open the file and see how many points are in the file
        line = unpickle(f'{package_root}/{package_name}/lines/{file}')
        points = 0
        for stroke in line['strokes']:
            points += len(stroke)
        if points > int(lines_to_remove[0]):
            os.remove(f'{package_root}/{package_name}/lines/{file}')

In [10]:
"""
Shuffle and rename files in package
"""

files = os.listdir(f'{package_root}/{package_name}/lines')
np.random.shuffle(files)
for i, file in enumerate(files):
    os.rename(f'{package_root}/{package_name}/lines/{file}', f'{package_root}/{package_name}/lines/{i}.pkl')

In [11]:
"""
Analyze new package
"""

# Initialize the dictionary to store the data of each dataset

# analyze the characters in the transcript
characters, spaces, clean_characters, char_dict = analyze_characters(package_root, package_name)
points = analyze_points(package_root, package_name)
width = analyze_width(package_root, package_name)
data = {'characters': characters, 'spaces': spaces, 'clean_characters': clean_characters, 'points': points, 'width': width, 'char_dict': dict(char_dict)}



In [12]:
"""
Basic Calcs on the data
"""

data['points_per_char'] = data['points'] / data['characters']
data['points_per_clean_char'] = data['points'] / data['clean_characters']
data['width_per_char'] = data['width'] / data['characters']
data['width_per_clean_char'] = data['width'] / data['clean_characters']
data['clean_chars_per_char'] = data['clean_characters'] / data['characters']
data['spaces_per_char'] = data['spaces'] / data['characters']

In [13]:
"""
Add analysis to a metadata file
"""

# add analysis to metadata file
metadata = {'analysis': data}
with open(f'{package_root}/{package_name}/config.yaml', 'w') as f:
    yaml.dump(metadata, f)

In [14]:
"""
View Analysis
"""          

# get analysis from IAM
iam_config = yaml.safe_load(open(f'{root_dir}/IAM/config.yaml', 'r'))
iam_analysis = iam_config['analysis']

# display the analysis
df = pd.DataFrame.from_dict({'IAM': iam_analysis, 'New': data}, orient='index')
display(df)

Unnamed: 0,character_dict,characters,clean_characters,clean_chars_per_char,lines,points,points_per_char,points_per_clean_char,spaces,spaces_per_char,width,width_per_char,width_per_clean_char,char_dict
IAM,"{' ': 53115, '!': 166, '""': 1840, '#': 90, '%'...",351868,286741,0.814911,12187.0,7651948,21.746644,26.685922,53115,0.150951,63809067,181.343762,222.532066,
New,,215516,174926,0.811661,,4312692,20.011006,24.65438,32864,0.15249,41128909,190.839237,235.12176,"{'A': 465, ' ': 32864, 's': 10734, 'p': 2922, ..."


In [15]:
"""
Length of the data (training, validation)
"""

# get the length of the data
length = len(os.listdir(f'{package_root}/{package_name}/lines'))
train_length = int(length * 0.85)
val_length = length - train_length
print(f'Training Length: {train_length}')
print(f'Validation Length: {val_length}')

Training Length: 6977
Validation Length: 1232
