Idea of this notebook:
- separate keys with process_audio_files function
- ideally for both datasets (practical_dl and MKA)
- remember the optimal thresholds and do some majority voting in the future data loader

save separated audio in folder structure:
- processed data
    - train/val/test
        - label 
            - segmented recordings
- later, also add the files from https://www.kaggle.com/datasets/nguyncaoduy/keystroke-noiseless-final there

Idea for DataLoader:
- Split the audio into 10 ms frames
- Calculate the energy level in each frame; if the threshold is exceeded, extend the frame with the subsequent ~90 ms

In [11]:
from tqdm import tqdm
import os
from preprocessing_utils import process_audio_files

In [4]:
# Destination folders: one for the isolated keystroke audio, one for the images.
OUTPUT_DIR = "./Data/isolated_keystrokes/"
OUTPUT_DIR_IMG = "./Data/isolated_keystrokes_img/"

# Create both folders up front; exist_ok makes the cell safe to re-run.
for _out_dir in (OUTPUT_DIR, OUTPUT_DIR_IMG):
    os.makedirs(_out_dir, exist_ok=True)

In [3]:
# Run the keystroke separation helper on the practical_dl recordings.
# AUDIO_FILE is a directory path despite its name; PREFIX is passed through to
# the helper (presumably used as the output filename prefix - confirm in
# preprocessing_utils).
# NOTE(review): the recorded progress output lists folders such as `.git`,
# `hooks` and `refs`, so the helper appears to walk into the dataset's git
# metadata as well - consider excluding it.
AUDIO_FILE = './Data/practical_dl/'
PREFIX = 'practical_dl'

# num_keystrokes=25: presumably the expected keystroke count per recording - TODO confirm.
process_audio_files(AUDIO_FILE, OUTPUT_DIR, OUTPUT_DIR_IMG, PREFIX, num_keystrokes=25, save_plots=True, show_plots=False)

Processing files in : 100%|██████████| 1/1 [00:00<00:00, 26379.27it/s]
Processing files in MBPWavs: 100%|██████████| 36/36 [00:12<00:00,  2.90it/s]
Processing files in Zoom: 100%|██████████| 36/36 [00:10<00:00,  3.28it/s]
Processing files in .git: 100%|██████████| 5/5 [00:00<00:00, 238312.73it/s]
Processing files in objects: 0it [00:00, ?it/s]
Processing files in pack: 100%|██████████| 3/3 [00:00<00:00, 165564.63it/s]
Processing files in info: 0it [00:00, ?it/s]
Processing files in info: 100%|██████████| 1/1 [00:00<00:00, 52428.80it/s]
Processing files in logs: 100%|██████████| 1/1 [00:00<00:00, 66576.25it/s]
Processing files in refs: 0it [00:00, ?it/s]
Processing files in heads: 100%|██████████| 1/1 [00:00<00:00, 66576.25it/s]
Processing files in remotes: 0it [00:00, ?it/s]
Processing files in origin: 100%|██████████| 1/1 [00:00<00:00, 66576.25it/s]
Processing files in hooks: 100%|██████████| 14/14 [00:00<00:00, 932067.56it/s]
Processing files in refs: 0it [00:00, ?it/s]
Processing fi

In [19]:
# Rename files - should already be handled by the original function.
# Strips zero-padding from the trailing index of every segmented .wav under
# OUTPUT_DIR: "<a>_<b>_<c>_<0007>.wav" -> "<a>_<b>_<c>_7.wav".
for root, dirs, files in os.walk(OUTPUT_DIR):
    subfolder = os.path.basename(root)
    for file in tqdm(files, desc=f"Processing files in {subfolder}"):
        if not file.endswith('.wav'):
            continue

        old_name = file.split('_')
        old_name_num = old_name[-1].split('.')[0]

        # Three name fields plus a numeric index are required. With fewer
        # fields the index would be appended to a name that still contains it.
        if len(old_name) < 4:
            print(f"Skipping {file}: unexpected name format")
            continue
        try:
            index = int(old_name_num)
        except ValueError:
            print(f"Skipping {file}: non-numeric index {old_name_num!r}")
            continue

        new_name = f"{old_name[0]}_{old_name[1]}_{old_name[2]}_{index}.wav"
        if new_name == file:
            # Already normalised; nothing to do.
            continue

        loc = os.path.join(root, file)
        new_loc = os.path.join(root, new_name)
        # os.rename silently replaces an existing file on POSIX, so refuse to
        # clobber a recording that already owns the target name.
        if os.path.exists(new_loc):
            print(f"Skipping {file}: {new_name} already exists")
            continue
        os.rename(loc, new_loc)

Processing files in : 100%|██████████| 2393/2393 [00:00<00:00, 26228.65it/s]


MKA dataset has not yet been segmented

In [None]:
# Scratch output folders for test runs. This rebinds OUTPUT_DIR and
# OUTPUT_DIR_IMG, so cells executed afterwards write to these paths.
OUTPUT_DIR = "./Data/isolated_keystrokes_tests/"
OUTPUT_DIR_IMG = "./Data/isolated_keystrokes_img_tests/"

# Create both folders; exist_ok keeps the cell re-runnable.
for _out_dir in (OUTPUT_DIR, OUTPUT_DIR_IMG):
    os.makedirs(_out_dir, exist_ok=True)

In [1]:
# AUDIO_FILE = './Data/MKA/All Dataset/Raw Data/'
# PREFIX = 'MKA'

# process_audio_files(AUDIO_FILE, OUTPUT_DIR, OUTPUT_DIR_IMG, PREFIX, num_keystrokes=30, save_plots=True, show_plots=True)

### Trim the leading silence from data

In [4]:
from preprocessing_utils import get_audio_lengths, trim_silence_in_directory
import shutil
# Input folder for trimming: the same path the first output cell used as
# OUTPUT_DIR for the isolated keystrokes.
DATA_DIR = './Data/isolated_keystrokes/'
# Destination folder passed to trim_silence_in_directory.
TRIMMED_DATA_DIR = './Data/isolated_keystrokes_trimmed/'

In [5]:
# Length statistics before trimming.
# NOTE(review): the recorded output is a 4-tuple; presumably
# (max, min, mean, file count) - confirm in preprocessing_utils.
get_audio_lengths(DATA_DIR)

(0.9090909090909091, 0.47745454545454546, 0.8029219156998437, 2389)

In [8]:
# Trim recordings from DATA_DIR into TRIMMED_DATA_DIR.
# NOTE(review): the unit of min_length=600 (samples? ms?) and the meaning of the
# returned count are not visible here - confirm in preprocessing_utils.
count = trim_silence_in_directory(DATA_DIR, TRIMMED_DATA_DIR, min_length=600)
count

10

In [9]:
# Same statistics on the trimmed copies, for comparison with the values above.
get_audio_lengths(TRIMMED_DATA_DIR)

(0.909, 0.477, 0.80174213729029, 2389)

In [22]:
# Collect the distinct labels found in DATA_DIR. The label is the third
# underscore-separated field of each .wav filename.
labels_in_dir = set()
for dirname, _, filenames in os.walk(DATA_DIR):
    for filename in filenames:
        if filename.endswith('.wav'):
            label = filename.split('_')[2]
            labels_in_dir.add(label)

# Sort instead of list(set): set iteration order for strings changes between
# kernel sessions, which would make any index derived from `labels`
# non-reproducible.
labels = sorted(labels_in_dir)

In [23]:
# Map each label to a 1-based integer id, following the order of `labels`.
labels_dict = dict(zip(labels, range(1, len(labels) + 1)))

In [None]:
# Copy the trimmed recordings into one folder per label under FINAL_DATA_DIR.
FINAL_DATA_DIR = "./Data/final_keystrokes/"

# Create a folder for every known label, even if no trimmed file ends up in it.
for label in labels:
    os.makedirs(os.path.join(FINAL_DATA_DIR, label), exist_ok=True)

for root, _, files in os.walk(TRIMMED_DATA_DIR):
    for filename in files:
        if filename.endswith('.wav'):
            # The label is the third underscore-separated field of the filename.
            label = filename.split('_')[2]
            src_path = os.path.join(root, filename)
            dst_dir = os.path.join(FINAL_DATA_DIR, label)
            # `labels` was collected from DATA_DIR, so make sure the folder
            # exists for labels that only occur in TRIMMED_DATA_DIR.
            os.makedirs(dst_dir, exist_ok=True)
            dst_path = os.path.join(dst_dir, filename)
            shutil.copy(src_path, dst_path)

In [None]:
from sklearn.model_selection import train_test_split

# Split the per-label folders in FINAL_DATA_DIR into train/val/test
# (roughly 70/15/15). The per-label folders live in FINAL_DATA_DIR, not in
# DATA_DIR, which only holds the flat list of segmented recordings.
for split in ['train', 'val', 'test']:
    for label in labels:
        os.makedirs(os.path.join(FINAL_DATA_DIR, split, label), exist_ok=True)


def move_files(file_paths, subset_name):
    """Move files into FINAL_DATA_DIR/<subset_name>/<label>/.

    Args:
        file_paths: Paths of the files to move. The label is taken from the
            name of each file's parent directory.
        subset_name: Name of the split folder ('train', 'val' or 'test').
    """
    for file_path in file_paths:
        # Use os.path instead of split('/') so this also works on Windows.
        label = os.path.basename(os.path.dirname(file_path))
        file_name = os.path.basename(file_path)
        dest_path = os.path.join(FINAL_DATA_DIR, subset_name, label, file_name)
        shutil.move(file_path, dest_path)


# Split files for each label
for label in labels:
    label_dir = os.path.join(FINAL_DATA_DIR, label)
    if not os.path.isdir(label_dir):
        # Already split and removed by a previous run of this cell.
        continue

    files = [os.path.join(label_dir, f) for f in os.listdir(label_dir) if os.path.isfile(os.path.join(label_dir, f))]
    # 30% is held out, then halved into validation and test.
    train_files, temp_files = train_test_split(files, test_size=0.3, random_state=42)
    val_files, test_files = train_test_split(temp_files, test_size=0.5, random_state=42)

    move_files(train_files, 'train')
    move_files(val_files, 'val')
    move_files(test_files, 'test')

    # Remove only this label's folder once it is empty. Looping over every
    # label here would hit folders removed in earlier iterations.
    if not os.listdir(label_dir):
        os.rmdir(label_dir)