# Generate Stats
Processes a set of audio files, collects per-file stats (name, kind, date, type, duration) into a pandas DataFrame, and saves the result to a CSV file.

## Imports

In [1]:
# (Re)load the IPython autoreload extension; mode 2 re-imports edited modules
# before each cell runs, so changes to the local lib_henryk code are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
from datetime import timedelta, datetime
import numpy as np

import os
import re
from glob import glob

# audio info processing
from pydub.utils import mediainfo

# local import
from lib_henryk.logger import *
from lib_henryk.config import *

## Parameters

In [3]:
# params: log the configured output CSV and the audio source directory
for _message in (
    f"using csv file: {FILE_AUDIO_STATS_CSV}",
    f"using audio files path: {DIR_WIADOMOSCI_DO_HENRYCZKA}",
):
    logger.info(_message)

2024-05-04 14:02:01 - [32mINFO [0m - [34m<module>[0m - using csv file: ../../data/processed/henryk_audio_stats.csv
2024-05-04 14:02:01 - [32mINFO [0m - [34m<module>[0m - using audio files path: /mnt/onedrive/My Private/My documents/Other/Henryk/Alienacja rodzicielska materiały/Wiadomości do Henryczka


## Process Files

In [4]:
# Load paths to all .m4a audio files under the configured directory (recursively).
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes the
# processing order (and the order of same-date rows in the CSV) reproducible.
recordings_files = sorted(glob(f'{DIR_WIADOMOSCI_DO_HENRYCZKA}/**/*.m4a', recursive=True))
logger.info(f'retrieved {len(recordings_files)} files')

2024-05-04 14:02:01 - [32mINFO [0m - [34m<module>[0m - retrieved 499 files


In [5]:
def get_file_info(path: str) -> "dict | None":
    """
    Extract metadata for a single audio file into a flat dictionary.

    The file name (without directories) must match
    ``<kind> <digits-digits-digits> <name>.<extension>``, where the middle
    part is parsed as a ``%Y-%m-%d`` date. The duration is read from the
    dictionary returned by ``mediainfo``.

    Args:
        path: path to the audio file.

    Returns:
        A dict with the keys 'file', 'name', 'kind', 'date' (a ``datetime``),
        'type' (the file extension) and 'duration' (the value reported by
        ``mediainfo``, passed through unchanged), or ``None`` when the file
        name does not match the expected pattern or any step fails. Failures
        are logged as warnings instead of being raised, so one bad file does
        not abort a batch run.
    """

    try:
        _filename = os.path.basename(path)

        # Raw string: '\d' and '\.' are invalid escape sequences in a plain literal.
        # Groups: kind, date, name, extension.
        _match = re.search(r'^(.+) (\d+-\d+-\d+) (.+)\.(.+)$', _filename)
        if _match is None:
            # Report the real cause instead of "'NoneType' object is not subscriptable".
            logger.warning(
                f'encountered error for {path}: file name does not match '
                f'"<kind> <YYYY-MM-DD> <name>.<extension>"'
            )
            return None

        _recording_kind, _date_text, _recording_name, _recording_type = _match.groups()
        _recording_date = datetime.strptime(_date_text, "%Y-%m-%d")

        # Probe the media only after the name has been validated, so files that
        # would be rejected anyway do not pay for the probe.
        _info = mediainfo(path)

        return {
            'file': _filename,
            'name': _recording_name,
            'kind': _recording_kind,
            'date': _recording_date,
            'type': _recording_type,
            'duration': _info['duration']
        }
    except Exception as e:
        # Deliberate best-effort: log and skip the file (e.g. bad date, missing
        # 'duration' key, probe failure) rather than stopping the whole run.
        logger.warning(f'encountered error for {path}: {e}')
        return None

# Collect one record per successfully parsed file and build the DataFrame once,
# instead of growing it row by row with df.loc[len(df)] (which re-allocates the
# frame on every insert).
_columns = ['file', 'name', 'kind', 'date', 'type', 'duration']
_total = len(recordings_files)
_rows = []
for ix, _path in enumerate(recordings_files):
    # Report progress as (ix + 1) of _total so the total is never 0 when there is
    # a single file; the last call still reports completion.
    # NOTE(review): printProgressBar comes from lib_henryk and is not visible
    # here - assumed to take (iteration, total); confirm.
    printProgressBar(ix + 1, _total, prefix=f'processing {_total} audio files')
    _row = get_file_info(_path)
    if _row is not None:  # files that failed to parse are skipped
        _rows.append(_row)

# Passing columns= keeps the expected schema even when no file could be parsed.
df = pd.DataFrame(_rows, columns=_columns)

processing 499 audio files |██████████████████████████████████████████████████| 100.0% 


In [6]:
# Order the stats chronologically, then write them to the configured CSV file
# (without the index column).
df = df.sort_values('date')
df.to_csv(FILE_AUDIO_STATS_CSV, index=False)

# References
- https://www.kaggle.com/code/robikscube/working-with-audio-in-python
- https://towardsdatascience.com/get-to-know-audio-feature-extraction-in-python-a499fdaefe42
- https://stackoverflow.com/questions/55669182/how-to-fix-filenotfounderror-winerror-2-the-system-cannot-find-the-file-speci