Idea of this notebook:
- separate keys with process_audio_files function
- ideally for both datasets (practical_dl and MKA)
- remember the optimal thresholds and do some majority voting in a future data loader

save separated audio in folder structure:
- processed data
    - train/val/test
        - label 
            - segmented recordings
- later, also add the files from https://www.kaggle.com/datasets/nguyncaoduy/keystroke-noiseless-final there

Idea for DataLoader:
- Split the audio into 10 ms frames
- Calculate the energy level of each frame; if the threshold is exceeded, extend the frame with the subsequent 90 ms or so

In [2]:
from tqdm import tqdm
import os
from src.utils.data_engineering_utils import process_audio_files

# Practical_dl Dataset

In [18]:
# Destination folders handed to process_audio_files below: one for the
# isolated keystroke clips, one for the (presumably plot) images.
OUTPUT_DIR = "./data/isolated_keystrokes/"
OUTPUT_DIR_IMG = "./data/isolated_keystrokes_img/"

# Create both folders up front; exist_ok keeps the cell re-runnable.
for _out_dir in (OUTPUT_DIR, OUTPUT_DIR_IMG):
    os.makedirs(_out_dir, exist_ok=True)

In [None]:
# Root folder of the practical_dl recordings and the prefix handed to the helper.
AUDIO_FILE = './data/practical_dl/'
PREFIX = 'practical_dl'

# Separate the individual keystrokes of every recording (see the notebook intro).
# NOTE(review): the captured progress output lists `.git` sub-folders, so the
# helper appears to descend into every directory below AUDIO_FILE — consider
# pointing it at the recording folders only.
process_audio_files(AUDIO_FILE, OUTPUT_DIR, OUTPUT_DIR_IMG, PREFIX, num_keystrokes=25, save_plots=True, show_plots=False)

Processing files in : 100%|██████████| 1/1 [00:00<00:00, 16980.99it/s]
Processing files in MBPWavs:   0%|          | 0/36 [00:00<?, ?it/s]

Processing files in MBPWavs: 100%|██████████| 36/36 [00:15<00:00,  2.40it/s]
Processing files in Zoom: 100%|██████████| 36/36 [00:10<00:00,  3.36it/s]
Processing files in .git: 100%|██████████| 5/5 [00:00<00:00, 183960.70it/s]
Processing files in objects: 0it [00:00, ?it/s]
Processing files in pack: 100%|██████████| 3/3 [00:00<00:00, 157286.40it/s]
Processing files in info: 0it [00:00, ?it/s]
Processing files in info: 100%|██████████| 1/1 [00:00<00:00, 65536.00it/s]
Processing files in logs: 100%|██████████| 1/1 [00:00<00:00, 71089.90it/s]
Processing files in refs: 0it [00:00, ?it/s]
Processing files in heads: 100%|██████████| 1/1 [00:00<00:00, 65536.00it/s]
Processing files in remotes: 0it [00:00, ?it/s]
Processing files in origin: 100%|██████████| 1/1 [00:00<00:00, 65536.00it/s]
Processing files in hooks: 100%|██████████| 14/14 [00:00<00:00, 947100.90it/s]
Processing files in refs: 0it [00:00, ?it/s]
Processing files in heads: 100%|██████████| 1/1 [00:00<00:00, 71089.90it/s]
Processi

### Trim the leading silence from data

In [None]:
from src.utils.data_engineering_utils import get_audio_lengths, trim_silence_in_directory
import shutil
TRIMMED_DATA_DIR = './data/isolated_keystrokes_trimmed/'

In [None]:
# Length statistics of the isolated clips before trimming
# (presumably in seconds, judging by the printed values — TODO confirm).
maximum, minimum, average, count = get_audio_lengths(OUTPUT_DIR)
print(f"Maximum: {maximum}, Minimum: {minimum}, Average: {average}, Count: {count}")

Maximum: 0.36363636363636365, Minimum: 0.18095454545454545, Average: 0.36211071787016785, Count: 1721


In [None]:
# Trim the clips from OUTPUT_DIR into TRIMMED_DATA_DIR.
# NOTE(review): min_length=180 is presumably milliseconds and the returned
# count presumably the number of clips that were changed — confirm in
# src.utils.data_engineering_utils.
count = trim_silence_in_directory(OUTPUT_DIR, TRIMMED_DATA_DIR, min_length=180)
count

118

In [5]:
# Same statistics after trimming, to check the effect on clip lengths.
maximum, minimum, average, count = get_audio_lengths(TRIMMED_DATA_DIR)
print(f"Maximum: {maximum}, Minimum: {minimum}, Average: {average}, Count: {count}")

Maximum: 0.364, Minimum: 0.18, Average: 0.3591702498547322, Count: 1721


In [8]:
# Distinct key labels found under TRIMMED_DATA_DIR: the label is the third
# underscore-separated token of every .wav file name.
labels_in_dir = {
    wav_name.split('_')[2]
    for _, _, wav_names in os.walk(TRIMMED_DATA_DIR)
    for wav_name in wav_names
    if wav_name.endswith('.wav')
}
labels = list(labels_in_dir)

In [None]:
FINAL_DATA_DIR = "./data/final_keystrokes/"

# One staging folder per key label. exist_ok replaces the former
# check-then-create pair, matching how the other cells create folders.
for label in labels:
    os.makedirs(os.path.join(FINAL_DATA_DIR, label), exist_ok=True)

# Move every trimmed clip into the staging folder of its label; the label is
# the third underscore-separated token of the file name.
for root, _, files in os.walk(TRIMMED_DATA_DIR):
    for filename in files:
        if filename.endswith('.wav'):
            label = filename.split('_')[2]
            src_path = os.path.join(root, filename)
            dst_path = os.path.join(FINAL_DATA_DIR, label, filename)
            shutil.move(src_path, dst_path)

In [10]:
from sklearn.model_selection import train_test_split

# Pre-create the <split>/<label> folder grid below FINAL_DATA_DIR.
split_label_dirs = [
    os.path.join(FINAL_DATA_DIR, split_name, label_name)
    for split_name in ['train', 'val', 'test']
    for label_name in labels
]
for split_label_dir in split_label_dirs:
    os.makedirs(split_label_dir, exist_ok=True)

# Function to move files to the appropriate directory
def move_files(file_paths, subset_name, base_dir=None):
    """Move files into ``<base_dir>/<subset_name>/<label>/``.

    The label of a file is the name of its parent directory. It is derived
    with ``os.path`` rather than by splitting on ``'/'`` so that it also works
    with platform-specific path separators.

    Args:
        file_paths: Iterable of paths laid out as ``.../<label>/<file>``.
        subset_name: Name of the split folder, e.g. ``'train'``, ``'val'`` or
            ``'test'``.
        base_dir: Root folder that holds the split folders. Defaults to the
            notebook-level ``FINAL_DATA_DIR`` when omitted.
    """
    if base_dir is None:
        base_dir = FINAL_DATA_DIR
    for file_path in file_paths:
        label = os.path.basename(os.path.dirname(file_path))
        dest_dir = os.path.join(base_dir, subset_name, label)
        # Harmless if the folder already exists; avoids a failed move otherwise.
        os.makedirs(dest_dir, exist_ok=True)
        dest_path = os.path.join(dest_dir, os.path.basename(file_path))
        shutil.move(file_path, dest_path)

In [11]:
# Split the clips of every label into train / val / test (70 % / 15 % / 15 %).
for label in labels:
    label_dir = os.path.join(FINAL_DATA_DIR, label)
    # os.listdir returns entries in arbitrary order; sorting makes the seeded
    # split (random_state=42) reproducible across runs and machines.
    files = sorted(
        os.path.join(label_dir, f)
        for f in os.listdir(label_dir)
        if os.path.isfile(os.path.join(label_dir, f))
    )
    train_files, temp_files = train_test_split(files, test_size=0.3, random_state=42)
    val_files, test_files = train_test_split(temp_files, test_size=0.5, random_state=42)

    move_files(train_files, 'train')
    move_files(val_files, 'val')
    move_files(test_files, 'test')

In [12]:
# Remove the per-label staging folders that the split has emptied.
for label in labels:
    staging_dir = os.path.join(FINAL_DATA_DIR, label)
    if len(os.listdir(staging_dir)) == 0:
        os.rmdir(staging_dir)

## Noiseless Dataset

In [16]:
SOURCE_DIR = './data/data_noiseless_final/'

# Distinct key labels of the noiseless dataset: the label is the second
# dash-separated token of every .wav file name.
labels_in_dir_new = {
    wav_name.split('-')[1]
    for _, _, wav_names in os.walk(SOURCE_DIR)
    for wav_name in wav_names
    if wav_name.endswith('.wav')
}
labels_new = list(labels_in_dir_new)

In [None]:
# One folder per key label inside the noiseless dataset.
for label in labels_new:
    os.makedirs(os.path.join(SOURCE_DIR, label), exist_ok=True)

# Snapshot the file list BEFORE moving anything. os.walk scans sub-folders
# lazily, so moving files into SOURCE_DIR/<label>/ while walking would make
# the walk revisit the already renamed files, whose names no longer contain
# a '-' separated label.
wav_files = [
    (root, filename)
    for root, _, files in os.walk(SOURCE_DIR)
    for filename in files
    if filename.endswith('.wav')
]

for root, filename in wav_files:
    parts = filename.split('-')
    if len(parts) < 2:
        # Not in the original dash-separated form (e.g. already renamed by an
        # earlier run of this cell) — leave the file where it is.
        continue
    label = parts[1]
    idx = int(parts[-1].split('.')[0])
    if label in labels:
        # NOTE(review): presumably shifts the index past the practical_dl
        # clips that already exist for this label — confirm the offset of 24.
        idx += 24
    filename_new = 'mac_live_' + label + '_' + str(idx) + '.wav'
    src_path = os.path.join(root, filename)
    dst_path = os.path.join(SOURCE_DIR, label, filename_new)
    shutil.move(src_path, dst_path)

In [None]:
def move_files(file_paths, subset_name, base_dir=None):
    """Move files into ``<base_dir>/<subset_name>/<label>/``, creating it if needed.

    The label of a file is the name of its parent directory. It is derived
    with ``os.path`` rather than by splitting on ``'/'`` so that it also works
    with platform-specific path separators.

    Args:
        file_paths: Iterable of paths laid out as ``.../<label>/<file>``.
        subset_name: Name of the split folder, e.g. ``'train'``, ``'val'`` or
            ``'test'``.
        base_dir: Root folder that holds the split folders. Defaults to the
            notebook-level ``FINAL_DATA_DIR`` when omitted.
    """
    if base_dir is None:
        base_dir = FINAL_DATA_DIR
    for file_path in file_paths:
        label = os.path.basename(os.path.dirname(file_path))

        # exist_ok already covers the "folder is there" case, so no separate
        # existence check is needed.
        dest_dir = os.path.join(base_dir, subset_name, label)
        os.makedirs(dest_dir, exist_ok=True)

        file_name = os.path.basename(file_path)
        dest_path = os.path.join(dest_dir, file_name)

        shutil.move(file_path, dest_path)

# Split the clips of every key folder into train / val / test (70 % / 15 % / 15 %).
# os.listdir returns entries in arbitrary order; sorting makes the seeded
# split (random_state=42) reproducible across runs and machines.
for key in sorted(os.listdir(SOURCE_DIR)):
    key_dir = os.path.join(SOURCE_DIR, key)
    if os.path.isdir(key_dir):
        files = sorted(os.path.join(key_dir, f) for f in os.listdir(key_dir) if f.endswith('.wav'))
        train_files, temp_files = train_test_split(files, test_size=0.3, random_state=42)
        val_files, test_files = train_test_split(temp_files, test_size=0.5, random_state=42)

        move_files(train_files, 'train')
        move_files(val_files, 'val')
        move_files(test_files, 'test')

## MKA Dataset

In [3]:
import os

In [4]:
# Destination folders for the MKA run: one for the isolated keystroke clips,
# one for the (presumably plot) images.
OUTPUT_DIR = "./data/isolated_keystrokes_MKA/"
OUTPUT_DIR_IMG = "./data/isolated_keystrokes_img_MKA/"

# Create both folders up front; exist_ok keeps the cell re-runnable.
for _out_dir in (OUTPUT_DIR, OUTPUT_DIR_IMG):
    os.makedirs(_out_dir, exist_ok=True)

In [14]:
from src.utils.data_engineering_utils import get_audio_lengths, trim_silence_in_directory, process_audio_files
import shutil
TRIMMED_DATA_DIR = './data/isolated_keystrokes_trimmed/'

In [8]:
# Root folder of the MKA (Mac) raw recordings and the prefix handed to the helper.
AUDIO_FILE = './data/MKA/Mac/Raw Data/'
PREFIX = 'MKA'

# Separate the individual keystrokes of every recording. Unlike the
# practical_dl run, the detection arguments are set explicitly here; their
# exact meaning is defined in src.utils.data_engineering_utils.
process_audio_files(AUDIO_FILE, OUTPUT_DIR, OUTPUT_DIR_IMG, PREFIX, num_keystrokes=30, save_plots=True, show_plots=False, initial_threshold=0.4,
                    initial_step=0.003, num_tries=1000, before=3000, after=5000)

Processing files in : 100%|██████████| 61/61 [02:42<00:00,  2.67s/it]


In [5]:
# Length statistics of the isolated MKA clips before trimming
# (presumably in seconds, judging by the printed values — TODO confirm).
maximum, minimum, average, count = get_audio_lengths(OUTPUT_DIR)
print(f"Maximum: {maximum}, Minimum: {minimum}, Average: {average}, Count: {count}")

Maximum: 0.36363636363636365, Minimum: 0.18095454545454545, Average: 0.3621089428314637, Count: 1719


In [6]:
# Trim the MKA clips from OUTPUT_DIR into TRIMMED_DATA_DIR.
# NOTE(review): TRIMMED_DATA_DIR is the same path that the practical_dl
# section used; make sure it holds no leftover clips before running this.
count = trim_silence_in_directory(OUTPUT_DIR, TRIMMED_DATA_DIR, min_length=180)
count

116

In [7]:
# Same statistics after trimming, to check the effect on clip lengths.
maximum, minimum, average, count = get_audio_lengths(TRIMMED_DATA_DIR)
print(f"Maximum: {maximum}, Minimum: {minimum}, Average: {average}, Count: {count}")

Maximum: 0.364, Minimum: 0.181, Average: 0.35937870855148, Count: 1719


In [None]:
from sklearn.model_selection import train_test_split
FINAL_DATA_DIR = './data/final_keystrokes/'

def get_max_idx(label_dir):
    """Return the highest numeric index among the ``.wav`` files in ``label_dir``.

    The index is the integer between the last underscore of the file name and
    the first dot after it, e.g. ``7`` for ``mac_live_a_7.wav``. Files whose
    trailing token is not an integer are skipped instead of aborting the scan.

    Args:
        label_dir: Directory to scan (not recursive).

    Returns:
        The largest index found, or ``-1`` if no indexed ``.wav`` file exists.
    """
    idxs = []
    for name in os.listdir(label_dir):
        if not name.endswith('.wav'):
            continue
        if not os.path.isfile(os.path.join(label_dir, name)):
            continue
        try:
            idxs.append(int(name.split('_')[-1].split('.')[0]))
        except ValueError:
            # Stray .wav without a numeric suffix — it cannot clash with a
            # generated "<...>_<idx>.wav" name, so it is safe to ignore.
            continue
    return max(idxs, default=-1)

def generate_unique_filename(label_dir, base_filename):
    """Build a ``.wav`` name whose index is one past the highest in ``label_dir``.

    The first three underscore-separated tokens of ``base_filename`` (without
    its extension) are kept; the trailing token is replaced by the new index.

    Args:
        label_dir: Directory whose existing indices must not be reused.
        base_filename: Original file name, with or without extension.

    Returns:
        The new file name, ``<tok0>_<tok1>_<tok2>_<idx>.wav``.
    """
    next_idx = get_max_idx(label_dir) + 1
    stem, _ext = os.path.splitext(base_filename)
    tokens = stem.split('_')
    first, second, third = tokens[0], tokens[1], tokens[2]
    return '_'.join((first, second, third, str(next_idx))) + '.wav'

def move_files(files, data_split):
    """Copy clips into ``FINAL_DATA_DIR/<data_split>/<label>/`` under a fresh index.

    The label is the third underscore-separated token of the file name. Each
    clip is copied (the source is kept) and always renamed to the next free
    index of its target folder, so it cannot overwrite a clip that is already
    there.

    Args:
        files: Iterable of paths to ``.wav`` clips.
        data_split: Name of the split folder, e.g. ``'train'``, ``'val'`` or
            ``'test'``.
    """
    for file in files:
        base_filename = os.path.splitext(os.path.basename(file))[0]
        label = base_filename.split('_')[2]
        target_dir = os.path.join(FINAL_DATA_DIR, data_split, label)
        os.makedirs(target_dir, exist_ok=True)

        # target_dir is guaranteed to exist here, so the former
        # "folder exists?" check was always true and its else-branch dead:
        # every clip is renumbered.
        new_filename = generate_unique_filename(target_dir, base_filename)

        target_path = os.path.join(target_dir, new_filename)
        shutil.copy(file, target_path)

# Collect the trimmed clips that sit directly in TRIMMED_DATA_DIR.
# - Only .wav files are kept, so stray entries (e.g. hidden system files) do
#   not break the label parsing in move_files.
# - os.listdir returns entries in arbitrary order; sorting makes the seeded
#   split (random_state=42) reproducible across runs and machines.
files = sorted(
    os.path.join(TRIMMED_DATA_DIR, f)
    for f in os.listdir(TRIMMED_DATA_DIR)
    if f.endswith('.wav') and os.path.isfile(os.path.join(TRIMMED_DATA_DIR, f))
)

# 70 % train, then the remaining 30 % halved into validation and test.
train_files, temp_files = train_test_split(files, test_size=0.3, random_state=42)
val_files, test_files = train_test_split(temp_files, test_size=0.5, random_state=42)

move_files(train_files, 'train')
move_files(val_files, 'val')
move_files(test_files, 'test')