In [9]:
from collections import defaultdict
import os
from pathlib import Path
import pandas as pd

In [10]:
# Path to the ANALYSIS folder.
# Set the VOCAL_NOTES_ANALYSIS_PATH environment variable to run this notebook
# on another machine without editing the code; when it is not set, the
# original location below is used.
PATH_TO_ANALYSIS = os.environ.get(
    "VOCAL_NOTES_ANALYSIS_PATH",
    "/Users/polinap/Yandex.Disk.localized/RESEARCH_RU/VOCAL_NOTES_PROJECT/RUSSIAN/ANALYSIS_quan/",
)

# Name of the file (inside the ANALYSIS folder) listing the transcriber initials
PATH_TRANS_NAME = "transcribers.csv"

# Name of the file (inside the ANALYSIS folder) listing the songs / recordings
PATH_SONG_NAME = "recordings.csv"

# The divider used in filenames to separate SONG, TRANSCRIBER, and FILETYPE:
# e.g. "SONG__TRANSCRIBER__FILETYPE"
DIVIDER = "__"

# File types expected for every song / transcriber pair
FILETYPE_LIST = ['notes', 'segments', 'pitches']


In [11]:
# Root of the analysis tree as a Path object; every path checked by the
# functions in this cell is built relative to it.
BASE = Path(PATH_TO_ANALYSIS)

def check_transcriber_directories(song, transcriber_dirs, transcriber_list):
    """Compare the transcriber folders of one song with the expected initials.

    Prints one message listing the expected transcribers that have no folder,
    and one message listing the folders whose name is not an expected
    transcriber (i.e. incorrectly labelled folders). Nothing is returned.

    Args:
        song: Name of the recording; only used in the printed messages.
        transcriber_dirs: Folder names found inside the song directory.
        transcriber_list: Expected transcriber initials.
    """
    absent = [name for name in transcriber_list if name not in transcriber_dirs]
    unexpected = [name for name in transcriber_dirs if name not in transcriber_list]

    if absent:
        print(f"No data for Transcribers {absent}, for {song}\n")

    if unexpected:
        print(f"Incorrect transcriber folder(s) {unexpected}, for {song}\n")


def check_all_files_exist(song, transcriber_dirs):
    """Report which expected data files are missing for one song.

    Only the folders that actually exist (``transcriber_dirs``) are inspected,
    so a transcriber with no folder at all does not produce a "File not found"
    message for every single file.

    For each transcriber folder and each entry of FILETYPE_LIST, both a
    ``.csv`` and a ``.svl`` file named
    ``SONG<DIVIDER>TRANSCRIBER<DIVIDER>FILETYPE`` are expected.

    Returns:
        A tuple ``(path_list, csv_path_dict)``: the list of all expected paths
        that exist, and a nested mapping ``transcriber -> filetype -> path``
        holding only the existing ``.csv`` files.
    """
    extensions = ('.csv', '.svl')
    found_paths = []
    csv_lookup = defaultdict(dict)
    for transcriber in transcriber_dirs:
        for filetype in FILETYPE_LIST:
            stem = f"{song}{DIVIDER}{transcriber}{DIVIDER}{filetype}"
            for suffix in extensions:
                candidate = BASE.joinpath(song, transcriber, stem + suffix)
                if candidate.exists():
                    found_paths.append(candidate)
                    if suffix == '.csv':
                        csv_lookup[transcriber][filetype] = candidate
                else:
                    print(f"File not found:\n\t{candidate}\n")
    return found_paths, csv_lookup
    
    
def identify_short_notes(path, df, min_duration=0.01):
    """Print a warning for every note whose duration is suspiciously short.

    Args:
        path: Location of the file ``df`` was read from; only used in messages.
        df: Table of notes; must contain a ``DURATION`` column.
        min_duration: Durations strictly below this value are reported. The
            default of 0.01 is meant as 10 ms (assumes DURATION is in seconds).

    If the ``DURATION`` column is absent, a message about the column name is
    printed instead and no note is checked. Notes are numbered from 1 in the
    messages.
    """
    if 'DURATION' in df.columns:
        for i, dur in enumerate(df['DURATION']):
            if not dur < min_duration:
                continue
            print(f"Short duration found for note {i+1} in {path}\n\tPlease check if this is correct!\n")
    else:
        print(f"Incorrect column name in {path}\n\tPlease ensure that note duration column is called DURATION\n")
        

def check_file_contents(path):
    """Try to read a data file and print any problem that is found.

    Only ``.csv`` files are inspected; files with any other suffix (such as
    ``.svl``) are not opened, since no content check is defined for them yet.

    A csv file is parsed with pandas. If it is a 'notes' file, its note
    durations are additionally checked with ``identify_short_notes``. Any
    exception raised while reading or checking is printed, not propagated.

    Args:
        path: ``pathlib.Path`` of the file to check.
    """
    if path.suffix != '.csv':
        return

    try:
        df = pd.read_csv(path)
        # Decide on the trailing FILETYPE component of the name only. A plain
        # substring test on the whole stem would also match segments/pitches
        # files of any song whose own name contains "notes", and wrongly
        # report a missing DURATION column for them.
        if path.stem.endswith('notes'):
            identify_short_notes(path, df)
    except Exception as e:
        print(f"Error found while reading {path}\n\t{e}\n")
            
            
def check_duration_consistency(song, csv_path_dict, transcriber_dirs, threshold=0.2):
    """Flag a song whose transcribers disagree on the total notated duration.

    Two file types are compared across transcribers:

    * 'notes'   - the total is the sum of the ``DURATION`` column;
    * 'pitches' - the total is the number of rows in the file.

    'segments' files are not compared. For each file type, if the total of any
    transcriber deviates from the mean of all totals by more than ``threshold``
    (a fraction, 0.2 = 20%), one message is printed.

    Args:
        song: Name of the recording; only used in the printed message.
        csv_path_dict: Mapping ``transcriber -> filetype -> csv path``.
            Transcribers or file types that are absent are skipped.
        transcriber_dirs: Transcriber folder names to compare.
        threshold: Maximum tolerated relative deviation from the mean.

    Files that cannot be read, and notes files without a ``DURATION`` column,
    are silently left out of the comparison instead of aborting the run.
    """
    # How the "total duration" is measured for each comparable file type.
    measures = {
        'notes': lambda df: df['DURATION'].sum(),
        'pitches': len,
    }

    for ft, measure in measures.items():
        total_duration = []
        for td in transcriber_dirs:
            # .get() works for plain dicts and defaultdicts alike and does not
            # insert empty entries for unknown transcribers.
            path = csv_path_dict.get(td, {}).get(ft)
            if path is None:
                continue

            try:
                df = pd.read_csv(path)
            except Exception:
                # Unreadable file: leave it out of the comparison.
                continue

            if ft == 'notes' and 'DURATION' not in df.columns:
                # Wrongly named column: skip instead of raising KeyError.
                continue

            total_duration.append(measure(df))

        if len(total_duration) <= 1:
            continue

        mean_duration = sum(total_duration) / len(total_duration)
        if mean_duration == 0:
            # The relative deviation is undefined for a zero mean; the totals
            # agree only if every one of them is zero.
            differs = any(dur != 0 for dur in total_duration)
        else:
            differs = any(abs(1 - dur / mean_duration) > threshold for dur in total_duration)

        if differs:
            print(f"Transcribers differ in total notated duration of the {ft} file for {song}\n")
            
            
    
def check_each_song(song, transcriber_list):
    """Run the whole series of checks for one song.

    The checks are, in order: transcriber folders against the expected list,
    presence of every expected file, consistency of the total duration between
    transcribers (only when more than one transcriber folder exists), and
    finally the contents of every file that was found.

    Args:
        song: Name of the recording, which is also the name of its folder
            directly under BASE.
        transcriber_list: Expected transcriber initials.
    """
    song_dir = BASE.joinpath(song)
    # is_dir() rather than exists(): a regular file with the song's name is
    # not a usable song folder and would otherwise fail when it is listed.
    if not song_dir.is_dir():
        print(f"Directory for recording {song} could not be found at\n\t{song_dir}\n")
        return

    # Immediate sub-folders only; sorted so the report order is the same on
    # every run and platform.
    transcriber_dirs = sorted(entry.name for entry in song_dir.iterdir() if entry.is_dir())
    check_transcriber_directories(song, transcriber_dirs, transcriber_list)
    path_list, csv_path_dict = check_all_files_exist(song, transcriber_dirs)

    if len(transcriber_dirs) > 1:
        check_duration_consistency(song, csv_path_dict, transcriber_dirs)

    for path in path_list:
        check_file_contents(path)
        
    
def check_all_files():
    """Check every recording listed in the songs file.

    Reads the 'name' column of the PATH_SONG_NAME file and the 'initials'
    column of the PATH_TRANS_NAME file (both located in BASE), then runs
    ``check_each_song`` for each recording name.
    """
    songs_csv = BASE.joinpath(PATH_SONG_NAME)
    song_names = pd.read_csv(songs_csv)['name']

    transcribers_csv = BASE.joinpath(PATH_TRANS_NAME)
    initials = pd.read_csv(transcribers_csv)['initials'].values

    for song in song_names:
        check_each_song(song, initials)
    

In [12]:
# Run all checks for every listed recording; each problem found is printed
# as part of this cell's output.
check_all_files()

Error found while reading /Users/polinap/Yandex.Disk.localized/RESEARCH_RU/VOCAL_NOTES_PROJECT/RUSSIAN/ANALYSIS_quan/Da_po_zoriushke_1/PP/Da_po_zoriushke_1__PP__pitches.csv
	Error tokenizing data. C error: Expected 2 fields in line 1918, saw 3


Error found while reading /Users/polinap/Yandex.Disk.localized/RESEARCH_RU/VOCAL_NOTES_PROJECT/RUSSIAN/ANALYSIS_quan/Da_po_zoriushke_1/OV/Da_po_zoriushke_1__OV__pitches.csv
	Error tokenizing data. C error: Expected 2 fields in line 1918, saw 3


Error found while reading /Users/polinap/Yandex.Disk.localized/RESEARCH_RU/VOCAL_NOTES_PROJECT/RUSSIAN/ANALYSIS_quan/Da_po_zoriushke_1/YN/Da_po_zoriushke_1__YN__pitches.csv
	Error tokenizing data. C error: Expected 2 fields in line 1918, saw 3


Error found while reading /Users/polinap/Yandex.Disk.localized/RESEARCH_RU/VOCAL_NOTES_PROJECT/RUSSIAN/ANALYSIS_quan/Da_po_zoriushke_3/OV/Da_po_zoriushke_3__OV__pitches.csv
	Error tokenizing data. C error: Expected 2 fields in line 5298, saw 3


Error found whil