In [1]:
import pandas as pd
import numpy as np

## Train test split

In [2]:
# Load the speaker metadata, expose the accent column under the name "label",
# and discard every row that contains a missing value.
df = (
    pd.read_csv("speech-accent-archive-march-version.csv")
    .rename(columns={'accent': 'label'})
    .dropna()
)
df

Unnamed: 0.1,Unnamed: 0,index,url-id,age,sex,label
0,0,0,428,47.0,male,arabic
1,1,7,421,21.0,male,uk
2,2,8,420,26.0,female,usa
3,3,9,419,21.0,male,uk
4,4,12,416,19.0,male,usa
...,...,...,...,...,...,...
1468,1468,3086,442,38.0,female,usa
1469,1469,3087,441,18.0,male,spanish
1470,1470,3089,439,31.0,male,uk
1471,1471,3091,437,40.0,male,arabic


In [5]:
# Count speakers per sex to see how balanced the dataset is overall.
df['sex'].value_counts()

female    769
male      703
Name: sex, dtype: int64

In [6]:
# Bin speakers into 3 age groups with quantile cuts (pd.qcut), so the groups are
# roughly equal in size; the labels run from 1 (youngest) to 3 (oldest).
df['age-group'] = pd.qcut(df['age'], 3, labels=[1, 2, 3])
df

Unnamed: 0,url-id,age,sex,label,age-group
0,428,47.0,male,arabic,3
1,421,21.0,male,uk,1
2,420,26.0,female,usa,2
3,419,21.0,male,uk,1
4,416,19.0,male,usa,1
...,...,...,...,...,...
1468,442,38.0,female,usa,3
1469,441,18.0,male,spanish,1
1470,439,31.0,male,uk,2
1471,437,40.0,male,arabic,3


In [7]:
# Count speakers per (label, sex, age-group) cell to judge whether the split can be
# stratified on age as well. Some cells are tiny (as few as 2 speakers in the output
# below), so full stratification on age is only partly feasible.
grouped_df = df.groupby(['label', 'sex', 'age-group']).size().reset_index(name='count')
print(grouped_df)

         label     sex age-group  count
0       arabic  female         1     31
1       arabic  female         2     39
2       arabic  female         3     20
3       arabic    male         1     36
4       arabic    male         2     34
5       arabic    male         3     40
6        dutch  female         1     22
7        dutch  female         2      2
8        dutch  female         3      8
9        dutch    male         1      9
10       dutch    male         2      2
11       dutch    male         3     11
12      french  female         1     20
13      french  female         2     10
14      french  female         3     17
15      french    male         1     14
16      french    male         2      8
17      french    male         3     16
18      korean  female         1     30
19      korean  female         2     11
20      korean  female         3     20
21      korean    male         1     10
22      korean    male         2     17
23      korean    male         3      9


In [8]:
# Size of each age group produced by the quantile cut above.
df['age-group'].value_counts()

1    534
3    482
2    456
Name: age-group, dtype: int64

In [9]:
# Function dividing the dataset into training, validation and testing sets

def _draw_holdout(pool, per_label):
    """Draw one hold-out set from ``pool`` and return ``(holdout, remaining_pool)``."""
    chosen_parts = []

    for label in pool["label"].unique():
        label_df = pool[pool["label"] == label]

        # Stratified part: one random speaker from every non-empty
        # (sex, age-group) cell of this label.
        cell_picks = []
        for sex in label_df["sex"].unique():
            for age_group in label_df["age-group"].unique():
                subset = label_df[(label_df["sex"] == sex) & (label_df["age-group"] == age_group)]
                if not subset.empty:
                    cell_picks.append(subset.sample(1))
        chosen = pd.concat(cell_picks) if cell_picks else label_df.iloc[0:0]

        # Top-up part: draw only from rows of this label that were NOT picked above,
        # so no speaker can be selected twice, and never ask for more rows than exist.
        leftovers = label_df.drop(chosen.index)
        shortfall = min(per_label - len(chosen), len(leftovers))
        if shortfall > 0:
            chosen = pd.concat([chosen, leftovers.sample(shortfall)])

        chosen_parts.append(chosen)
        pool = pool.drop(chosen.index)

    holdout = pd.concat(chosen_parts) if chosen_parts else pool.iloc[0:0]
    return holdout, pool


def train_test_validation_split(df, test_per_label=6, validation_per_label=6, random_state=None):
    """Split speakers into train / test / validation sets, stratified by sex and age group.

    For every label the test set first receives one random speaker from each
    non-empty (sex, age-group) cell and is then topped up with random speakers of
    that label until it holds ``test_per_label`` rows. The validation set is drawn
    the same way from what is left, and everything still remaining becomes "train".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "label", "sex" and "age-group". Rows are removed
        by index label, so the index is assumed to be unique. The frame itself is
        not modified.
    test_per_label, validation_per_label : int
        Target number of speakers per label in the respective set. A label with
        more non-empty (sex, age-group) cells than the target ends up with one
        speaker per cell (i.e. more than the target); a label with too few
        speakers left contributes as many as it has.
    random_state : int or None
        When given, seeds NumPy's global random generator before sampling.

    Returns
    -------
    pandas.DataFrame
        All input rows with a fresh 0..n-1 index and an added "set" column holding
        "train", "test" or "validation". Train rows come first, then test, then
        validation.
    """
    # Seed the global generator (not a private one): pandas' ``sample`` falls back
    # to it when no ``random_state`` is passed.
    if random_state is not None:
        np.random.seed(random_state)

    test_df, remaining = _draw_holdout(df, test_per_label)
    validation_df, remaining = _draw_holdout(remaining, validation_per_label)

    result_df = pd.concat(
        [
            remaining.assign(set="train"),
            test_df.assign(set="test"),
            validation_df.assign(set="validation"),
        ],
        ignore_index=True,
    )

    return result_df


In [10]:
# Run the split with a fixed seed (random_state=42) so the assignment is repeatable.
result_df = train_test_validation_split(df, test_per_label=6, validation_per_label=6, random_state=42)

In [11]:
# Inspect the split result; the new "set" column holds train / test / validation.
result_df

Unnamed: 0,url-id,age,sex,label,age-group,set
0,428,47.0,male,arabic,3,train
1,421,21.0,male,uk,1,train
2,420,26.0,female,usa,2,train
3,419,21.0,male,uk,1,train
4,412,21.0,male,french,1,train
...,...,...,...,...,...,...
1467,1484,25.0,male,dutch,2,validation
1468,1275,22.0,male,dutch,1,validation
1469,770,54.0,female,dutch,3,validation
1470,2989,28.0,female,dutch,2,validation


In [12]:
# Sanity check on one stratum (usa, female): how its speakers are spread over the three sets.
result_df[(result_df['label'] == "usa") & (result_df['sex'] == "female")]["set"].value_counts()

train         210
test            3
validation      3
Name: set, dtype: int64

In [14]:
# Sanity check: number of male test speakers per label.
result_df[(result_df['set']=="test") & (result_df['sex']=="male")]["label"].value_counts()

arabic        3
uk            3
usa           3
french        3
spanish       3
russian       3
portuguese    3
mandarin      3
korean        3
dutch         3
Name: label, dtype: int64

In [15]:
# Deciding which samples go into the n-f-n dataset (in the thesis this was called dataset A)

def sampler_n_f_n(df, max_per_label=100, random_state=None):
    """Flag which training speakers belong to the n-f-n dataset.

    For every label, up to ``max_per_label`` training rows are chosen at random
    (all of them when the label has fewer) and marked ``True`` in a new boolean
    column "n-f-n". Test and validation rows stay ``False``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns "label" and "set". It is modified IN PLACE (the
        "n-f-n" column is added or overwritten) and also returned.
    max_per_label : int
        Upper bound on the number of flagged training speakers per label.
    random_state : int or None
        Seed for a private generator used for all draws. With ``None`` the draws
        use NumPy's global random state.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # One generator shared by all labels, so every label gets an independent draw
    # even when a seed is supplied.
    rng = np.random.RandomState(random_state) if random_state is not None else None

    df["n-f-n"] = False
    train_rows = df[df["set"] == "train"]

    for label in df["label"].unique():
        label_df = train_rows[train_rows["label"] == label]

        num_samples = min(max_per_label, len(label_df))
        if num_samples <= 0:
            # Label occurs only in the held-out sets: nothing to flag.
            continue

        sampled_indices = label_df.sample(num_samples, random_state=rng).index
        df.loc[sampled_indices, "n-f-n"] = True

    return df

In [16]:
# sampler_n_f_n adds the column in place and returns the same frame, so
# result_df_with_n_f_n and result_df refer to the same object after this call.
result_df_with_n_f_n = sampler_n_f_n(result_df)
result_df_with_n_f_n

Unnamed: 0,url-id,age,sex,label,age-group,set,n-f-n
0,428,47.0,male,arabic,3,train,True
1,421,21.0,male,uk,1,train,True
2,420,26.0,female,usa,2,train,False
3,419,21.0,male,uk,1,train,True
4,412,21.0,male,french,1,train,True
...,...,...,...,...,...,...,...
1467,1484,25.0,male,dutch,2,validation,False
1468,1275,22.0,male,dutch,1,validation,False
1469,770,54.0,female,dutch,3,validation,False
1470,2989,28.0,female,dutch,2,validation,False


In [19]:
# Number of speakers selected for n-f-n per label (capped at 100 by the sampler).
result_df_with_n_f_n[result_df_with_n_f_n["n-f-n"] == True]["label"].value_counts()

arabic        100
spanish       100
mandarin      100
usa           100
korean         85
french         73
russian        69
uk             58
portuguese     57
dutch          42
Name: label, dtype: int64

In [32]:
# Deciding how many times each speaker gets sampled for the s-f-o (aka C) dataset

def sampler_s_f_o(df, target_sum=210):
    """Compute per-speaker copy counts for the s-f-o dataset.

    Every training row starts with a count of 1 in the column "s-f-o"; test and
    validation rows get 0. Then, for each (label, sex) pair whose training counts
    add up to less than ``target_sum``, random training speakers of that pair have
    their count increased by one, pass after pass, until the pair's total reaches
    ``target_sum``. Pairs that already meet or exceed the target are left as is.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns "label", "sex" and "set". It is modified IN PLACE and
        also returned. Rows are addressed by index label, so the index is assumed
        to be unique.
    target_sum : int
        Total count every (label, sex) pair of the training set is raised to.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    in_train = (df["set"] == "train").to_numpy()

    df["s-f-o"] = 1
    df.loc[~in_train, "s-f-o"] = 0

    # Training-set membership never changes inside the loop, so the rows of each
    # (label, sex) pair are looked up once. Pairs without any training rows are
    # dropped here: there is nothing to oversample, and sampling from an empty
    # selection would raise.
    pair_rows = []
    for label in df["label"].unique():
        label_mask = in_train & (df["label"] == label).to_numpy()
        for sex in df["sex"].unique():
            rows = df.index[label_mask & (df["sex"] == sex).to_numpy()]
            if len(rows) > 0:
                pair_rows.append(rows)

    # One increment per deficient pair per pass, until no pair is below the target.
    while True:
        below_target = False

        for rows in pair_rows:
            counts = df.loc[rows, "s-f-o"]
            if counts.sum() < target_sum:
                below_target = True
                sampled_index = counts.sample(1).index
                df.loc[sampled_index, "s-f-o"] += 1

        if not below_target:
            break

    return df


In [33]:
# Adds the "s-f-o" copy-count column (in place) to the frame that already carries "n-f-n".
result_df_with_s_f_o = sampler_s_f_o(result_df_with_n_f_n)
result_df_with_s_f_o


Unnamed: 0,url-id,age,sex,label,age-group,set,n-f-n,s-f-o
0,428,47.0,male,arabic,3,train,True,1
1,421,21.0,male,uk,1,train,True,5
2,420,26.0,female,usa,2,train,False,1
3,419,21.0,male,uk,1,train,True,5
4,412,21.0,male,french,1,train,True,5
...,...,...,...,...,...,...,...,...
1467,1484,25.0,male,dutch,2,validation,False,0
1468,1275,22.0,male,dutch,1,validation,False,0
1469,770,54.0,female,dutch,3,validation,False,0
1470,2989,28.0,female,dutch,2,validation,False,0


In [38]:
# Sanity check: total s-f-o count for one (label, sex) pair; the sampler's target is 210.
result_df_with_s_f_o[(result_df_with_s_f_o["label"]=="french") & (result_df_with_s_f_o["sex"]=="female")]["s-f-o"].sum()

210

In [39]:
# Deciding how many times each sample is used in s-f-c (dataset B)

def sampler_s_f_c(df):
    """Compute per-speaker copy counts for the s-f-c dataset.

    Every training row starts with a count of 1 in the column "s-f-c"; test and
    validation rows get 0. Within each label, random training speakers of the sex
    with the smaller total have their count increased by one, pass after pass,
    until the sexes of that label have equal totals.

    A label is only balanced across the sexes that actually have training rows
    for it; a label with training rows for a single sex is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns "label", "sex" and "set". It is modified IN PLACE and
        also returned. Rows are addressed by index label, so the index is assumed
        to be unique.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    in_train = (df["set"] == "train").to_numpy()

    df["s-f-c"] = 1
    df.loc[~in_train, "s-f-c"] = 0

    # Look up the training rows of every (label, sex) group once. Groups without
    # training rows are dropped (sampling from them would raise), and labels left
    # with fewer than two groups have nothing to be balanced against.
    label_groups = []
    for label in df["label"].unique():
        label_mask = in_train & (df["label"] == label).to_numpy()
        groups = []
        for sex in df["sex"].unique():
            rows = df.index[label_mask & (df["sex"] == sex).to_numpy()]
            if len(rows) > 0:
                groups.append(rows)
        if len(groups) > 1:
            label_groups.append(groups)

    # One increment per unbalanced label per pass, until every label is balanced.
    while True:
        unequal_sum = False

        for groups in label_groups:
            counts = [df.loc[rows, "s-f-c"] for rows in groups]
            current_sums = [group_counts.sum() for group_counts in counts]

            if min(current_sums) != max(current_sums):
                unequal_sum = True

                # Raise the group that is currently furthest behind.
                smaller_subset_index = current_sums.index(min(current_sums))
                sampled_index = counts[smaller_subset_index].sample(1).index
                df.loc[sampled_index, "s-f-c"] += 1

        if not unequal_sum:
            break

    return df


In [40]:
# Adds the "s-f-c" copy-count column (in place) on top of the previous sampler columns.
result_df_with_s_f_c = sampler_s_f_c(result_df_with_s_f_o)
result_df_with_s_f_c

Unnamed: 0,url-id,age,sex,label,age-group,set,n-f-n,s-f-o,s-f-c
0,428,47.0,male,arabic,3,train,True,1,1
1,421,21.0,male,uk,1,train,True,5,1
2,420,26.0,female,usa,2,train,False,1,1
3,419,21.0,male,uk,1,train,True,5,1
4,412,21.0,male,french,1,train,True,5,1
...,...,...,...,...,...,...,...,...,...
1467,1484,25.0,male,dutch,2,validation,False,0,0
1468,1275,22.0,male,dutch,1,validation,False,0,0
1469,770,54.0,female,dutch,3,validation,False,0,0
1470,2989,28.0,female,dutch,2,validation,False,0,0


In [44]:
# Sanity check: total s-f-c count for one (label, sex) pair.
result_df_with_s_f_c[(result_df_with_s_f_c["label"]=="dutch") & (result_df_with_s_f_c["sex"]=="female")]["s-f-c"].sum()

26

In [45]:
# Sum of the s-f-c counts over all rows (held-out rows are 0 at this point).
result_df_with_s_f_c["s-f-c"].sum()

1516

In [48]:
# s-r-5o (dataset E)

def sampler_s_r_5o(df, target_sum=1000):
    """Compute per-speaker copy counts for the s-r-5o dataset.

    Every training row starts with a count of 1 in the column "s_r_5o"; test and
    validation rows get 0. Then, for each (label, sex) pair whose training counts
    add up to less than ``target_sum``, random training speakers of that pair have
    their count increased by one, pass after pass, until the pair's total reaches
    ``target_sum``. Pairs that already meet or exceed the target are left as is.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns "label", "sex" and "set". It is modified IN PLACE and
        also returned. Rows are addressed by index label, so the index is assumed
        to be unique.
    target_sum : int
        Total count every (label, sex) pair of the training set is raised to.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    in_train = (df["set"] == "train").to_numpy()

    df["s_r_5o"] = 1
    df.loc[~in_train, "s_r_5o"] = 0

    # Training-set membership never changes inside the loop, so the rows of each
    # (label, sex) pair are looked up once. Pairs without any training rows are
    # dropped here: there is nothing to oversample, and sampling from an empty
    # selection would raise.
    pair_rows = []
    for label in df["label"].unique():
        label_mask = in_train & (df["label"] == label).to_numpy()
        for sex in df["sex"].unique():
            rows = df.index[label_mask & (df["sex"] == sex).to_numpy()]
            if len(rows) > 0:
                pair_rows.append(rows)

    # One increment per deficient pair per pass, until no pair is below the target.
    while True:
        below_target = False

        for rows in pair_rows:
            counts = df.loc[rows, "s_r_5o"]
            if counts.sum() < target_sum:
                below_target = True
                sampled_index = counts.sample(1).index
                df.loc[sampled_index, "s_r_5o"] += 1

        if not below_target:
            break

    return df

In [49]:
# Adds the "s_r_5o" copy-count column (in place); it is renamed to "s-r-5o" in a later cell.
result_df_with_s_r_5o = sampler_s_r_5o(result_df_with_s_f_c)
result_df_with_s_r_5o

Unnamed: 0,url-id,age,sex,label,age-group,set,n-f-n,s-f-o,s-f-c,s_r_5o
0,428,47.0,male,arabic,3,train,True,1,1,14
1,421,21.0,male,uk,1,train,True,5,1,22
2,420,26.0,female,usa,2,train,False,1,1,5
3,419,21.0,male,uk,1,train,True,5,1,28
4,412,21.0,male,french,1,train,True,5,1,31
...,...,...,...,...,...,...,...,...,...,...
1467,1484,25.0,male,dutch,2,validation,False,0,0,0
1468,1275,22.0,male,dutch,1,validation,False,0,0,0
1469,770,54.0,female,dutch,3,validation,False,0,0,0
1470,2989,28.0,female,dutch,2,validation,False,0,0,0


In [51]:
# Sum of the s_r_5o counts over all rows. The column is reachable through
# result_df_with_s_f_c because the sampler modified that same frame in place.
result_df_with_s_f_c["s_r_5o"].sum()

20000

In [56]:
# This dataset was not used in the thesis

def sampler_s_r_10c(df, factor=10):
    """Compute per-speaker copy counts for the s-r-10c dataset.

    Every training row starts with a count of 1 in the column "s-r-10c"; test and
    validation rows get 0. For each label the target total is ``factor`` times the
    number of training speakers of its larger sex group. While any sex group of a
    label is below that target, a random training speaker of the group with the
    smallest total has its count increased by one.

    Only sex groups that have training rows for a label take part; labels without
    any training rows are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns "label", "sex" and "set". It is modified IN PLACE and
        also returned. Rows are addressed by index label, so the index is assumed
        to be unique.
    factor : int
        Multiplier applied to the size of the larger sex group to obtain the
        per-group target total of a label.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    in_train = (df["set"] == "train").to_numpy()

    df["s-r-10c"] = 1
    df.loc[~in_train, "s-r-10c"] = 0

    # Look up the training rows of every (label, sex) group once, together with
    # the label's target. Groups without training rows are dropped because
    # sampling from them would raise.
    label_groups = []
    for label in df["label"].unique():
        label_mask = in_train & (df["label"] == label).to_numpy()
        groups = []
        for sex in df["sex"].unique():
            rows = df.index[label_mask & (df["sex"] == sex).to_numpy()]
            if len(rows) > 0:
                groups.append(rows)
        if groups:
            target_sum = factor * max(len(rows) for rows in groups)
            label_groups.append((groups, target_sum))

    # One increment per deficient label per pass, until every group is on target.
    while True:
        below_target = False

        for groups, target_sum in label_groups:
            counts = [df.loc[rows, "s-r-10c"] for rows in groups]
            current_sums = [group_counts.sum() for group_counts in counts]

            if any(current_sum < target_sum for current_sum in current_sums):
                below_target = True

                # Raise the group that is furthest behind; on a tie the later
                # group is chosen (for two groups: "0 if s0 < s1 else 1").
                lowest = min(current_sums)
                smaller_subset_index = max(
                    i for i, current_sum in enumerate(current_sums) if current_sum == lowest
                )
                sampled_index = counts[smaller_subset_index].sample(1).index
                df.loc[sampled_index, "s-r-10c"] += 1

        if not below_target:
            break

    return df


In [57]:
# NOTE(review): `result_df_with_s_r_5c` is not defined by any earlier cell of this
# notebook, and the output below already contains an "s-r-5c" column that no visible
# cell creates. On a fresh kernel this line raises NameError; the cell that built the
# s-r-5c counts appears to be missing — TODO restore it.
result_df_with_s_r_10c = sampler_s_r_10c(result_df_with_s_r_5c)
result_df_with_s_r_10c


Unnamed: 0,url-id,age,sex,label,age-group,set,n-f-n,s-f-o,s-f-c,s_r_5o,s-r-5c,s-r-10c
0,428,47.0,male,arabic,3,train,True,1,1,14,3,10
1,421,21.0,male,uk,1,train,True,5,1,22,2,7
2,420,26.0,female,usa,2,train,False,1,1,5,6,6
3,419,21.0,male,uk,1,train,True,5,1,28,5,14
4,412,21.0,male,french,1,train,True,5,1,31,9,13
...,...,...,...,...,...,...,...,...,...,...,...,...
1467,1484,25.0,male,dutch,2,validation,False,0,0,0,0,0
1468,1275,22.0,male,dutch,1,validation,False,0,0,0,0,0
1469,770,54.0,female,dutch,3,validation,False,0,0,0,0,0
1470,2989,28.0,female,dutch,2,validation,False,0,0,0,0,0


In [59]:
# Turn the boolean n-f-n flag into 0/1 so it can serve as a copy count like the other dataset columns.
result_df_with_s_r_10c["n-f-n"] = result_df_with_s_r_10c["n-f-n"].astype(int)


In [61]:
# Give the column the same hyphenated naming as the other dataset columns.
# rename() returns a new frame, so from here on the name no longer aliases result_df.
result_df_with_s_r_10c = result_df_with_s_r_10c.rename(columns={"s_r_5o": "s-r-5o"})


In [66]:
# NOTE(review): this cell was executed (In [66]) after the cell shown below it
# (In [65]); the displayed validation rows already carry the held-out copy counts
# assigned there. Run the notebook top to bottom and this output will differ.
result_df_with_s_r_10c

Unnamed: 0,url-id,age,sex,label,age-group,set,n-f-n,s-f-o,s-f-c,s-r-5o,s-r-5c,s-r-10c
0,428,47.0,male,arabic,3,train,1,1,1,14,3,10
1,421,21.0,male,uk,1,train,1,5,1,22,2,7
2,420,26.0,female,usa,2,train,0,1,1,5,6,6
3,419,21.0,male,uk,1,train,1,5,1,28,5,14
4,412,21.0,male,french,1,train,1,5,1,31,9,13
...,...,...,...,...,...,...,...,...,...,...,...,...
1467,1484,25.0,male,dutch,2,validation,1,1,1,5,5,5
1468,1275,22.0,male,dutch,1,validation,1,1,1,5,5,5
1469,770,54.0,female,dutch,3,validation,1,1,1,5,5,5
1470,2989,28.0,female,dutch,2,validation,1,1,1,5,5,5


In [65]:
# Number of clips written per test / validation speaker for each dataset column.
# The head-cropped datasets use every held-out speaker once; the randomly cropped
# datasets use five clips per speaker so that the held-out sets also contain
# several different random excerpts.
held_out_copies = {
    "n-f-n": 1,
    "s-f-o": 1,
    "s-f-c": 1,
    "s-r-5o": 5,
    "s-r-5c": 5,
    "s-r-10c": 5,
}

for column_name, copies in held_out_copies.items():
    for set_name in ("validation", "test"):
        in_set = result_df_with_s_r_10c["set"] == set_name
        result_df_with_s_r_10c.loc[in_set, column_name] = copies

In [67]:
import os
import librosa
import numpy as np
import soundfile as sf

# This is the function that actually creates the audio datasets
# It samples according to the instructions of the dataframe created above

def organize_audio_files(df, column, src_dir, target_duration=10):
    """Write the audio dataset described by ``df[column]`` to ``<column>/<set>/<label>/``.

    For every row, ``<src_dir>/<url-id>.mp3`` is loaded at its native sample rate,
    cut to its first ``target_duration`` seconds (or zero-padded at the end when
    shorter) and written ``row[column]`` times as ``<url-id>_<i>.wav``.

    NOTE: a later cell of this notebook defines a random-crop function under the
    same name, which replaces this one once that cell has run.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns "set", "label", "url-id" and ``column``.
    column : str
        Name of the copy-count column; also used as the output root folder,
        relative to the current working directory.
    src_dir : str
        Folder that holds the source ``<url-id>.mp3`` files.
    target_duration : int or float
        Clip length in seconds.
    """
    os.makedirs(column, exist_ok=True)

    for set_value in df["set"].unique():
        for label in df["label"].unique():
            # Creates the intermediate <set> folder as well.
            label_folder = os.path.join(column, set_value, label)
            os.makedirs(label_folder, exist_ok=True)

            filtered_df = df[(df["set"] == set_value) & (df["label"] == label)]

            for _, row in filtered_df.iterrows():
                num_copies = int(row[column])
                if num_copies <= 0:
                    # Speaker is not part of this dataset: skip the (slow) MP3 decode.
                    continue

                src_file = os.path.join(src_dir, f"{row['url-id']}.mp3")
                audio, sr = librosa.load(src_file, sr=None)

                # Whole number of samples, also for a fractional target_duration.
                target_length = int(round(target_duration * sr))

                # Keep the head of the recording, or pad the tail with zeros.
                if len(audio) >= target_length:
                    audio = audio[:target_length]
                else:
                    audio = np.pad(audio, (0, target_length - len(audio)))

                for i in range(num_copies):
                    dst_file = os.path.join(label_folder, f"{row['url-id']}_{i}.wav")
                    sf.write(dst_file, audio, sr)


In [68]:
# Write the n-f-n dataset (head-cropped clips). The source folder is an absolute, machine-specific path.
organize_audio_files(result_df_with_s_r_10c, "n-f-n", r"D:\Data Science and Entrepreneurship MSc\Thesis\Data\SAA\all_mp3s")


In [69]:
# Write the s-f-o dataset (head-cropped clips). The source folder is an absolute, machine-specific path.
organize_audio_files(result_df_with_s_r_10c, "s-f-o", r"D:\Data Science and Entrepreneurship MSc\Thesis\Data\SAA\all_mp3s")


In [70]:
# Write the s-f-c dataset (head-cropped clips). The source folder is an absolute, machine-specific path.
organize_audio_files(result_df_with_s_r_10c, "s-f-c", r"D:\Data Science and Entrepreneurship MSc\Thesis\Data\SAA\all_mp3s")


In [72]:
import os
import librosa
import numpy as np
import soundfile as sf

# Another version of the above function that takes random samples rather than the first part of the audio

def organize_audio_files(df, column, src_dir, target_duration=10, random_seed=42):
    """Write the audio dataset described by ``df[column]`` using random crops.

    For every row, ``<src_dir>/<url-id>.mp3`` is loaded at its native sample rate
    and written ``row[column]`` times to ``<column>/<set>/<label>/<url-id>_<i>.wav``.
    Each copy is an independently chosen random window of ``target_duration``
    seconds. Recordings shorter than that are zero-padded at the end instead, so
    all copies of such a recording are identical.

    NOTE: this definition reuses the name of the head-cropping function defined
    in an earlier cell and replaces it once this cell has run; the cells must
    therefore be executed in notebook order.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns "set", "label", "url-id" and ``column``.
    column : str
        Name of the copy-count column; also used as the output root folder,
        relative to the current working directory.
    src_dir : str
        Folder that holds the source ``<url-id>.mp3`` files.
    target_duration : int or float
        Clip length in seconds.
    random_seed : int
        Seed applied to NumPy's global random generator before any crop is drawn.
    """
    np.random.seed(random_seed)

    os.makedirs(column, exist_ok=True)

    for set_value in df["set"].unique():
        for label in df["label"].unique():
            # Creates the intermediate <set> folder as well.
            label_folder = os.path.join(column, set_value, label)
            os.makedirs(label_folder, exist_ok=True)

            filtered_df = df[(df["set"] == set_value) & (df["label"] == label)]

            for _, row in filtered_df.iterrows():
                num_copies = int(row[column])
                if num_copies <= 0:
                    # Speaker is not part of this dataset: skip the (slow) MP3 decode.
                    continue

                src_file = os.path.join(src_dir, f"{row['url-id']}.mp3")
                audio, sr = librosa.load(src_file, sr=None)

                # Whole number of samples, also for a fractional target_duration.
                target_length = int(round(target_duration * sr))

                # A short recording has only one possible clip; pad it once
                # instead of once per copy.
                padded_audio = None
                if len(audio) < target_length:
                    padded_audio = np.pad(audio, (0, target_length - len(audio)))

                for i in range(num_copies):
                    if padded_audio is None:
                        # Random window start such that the window fits entirely.
                        start = np.random.randint(0, len(audio) - target_length + 1)
                        cropped_audio = audio[start:start + target_length]
                    else:
                        cropped_audio = padded_audio

                    dst_file = os.path.join(label_folder, f"{row['url-id']}_{i}.wav")
                    sf.write(dst_file, cropped_audio, sr)


In [73]:
# Write the s-r-5o dataset (random crops). The source folder is an absolute, machine-specific path.
organize_audio_files(result_df_with_s_r_10c, "s-r-5o", r"D:\Data Science and Entrepreneurship MSc\Thesis\Data\SAA\all_mp3s")

In [74]:
# Write the s-r-5c dataset (random crops). The source folder is an absolute, machine-specific path.
organize_audio_files(result_df_with_s_r_10c, "s-r-5c", r"D:\Data Science and Entrepreneurship MSc\Thesis\Data\SAA\all_mp3s")

In [75]:
# Write the s-r-10c dataset (random crops). The source folder is an absolute, machine-specific path.
organize_audio_files(result_df_with_s_r_10c, "s-r-10c", r"D:\Data Science and Entrepreneurship MSc\Thesis\Data\SAA\all_mp3s")

## Upload datasets to Hugging Face Hub

In [76]:
# Record the installed pyarrow version for reference.
import pyarrow
pyarrow.__version__

'11.0.0'

In [77]:
import datasets
from datasets import load_dataset

In [78]:
# Load the "n-f-n" data (the cell output shows it resolved through the
# `audiofolder` builder) under the configuration name "SAA_n-f-n".
dataset = load_dataset(path="n-f-n", name="SAA_n-f-n")

Resolving data files: 100%|██████████| 784/784 [00:00<00:00, 193397.30it/s]
Resolving data files: 100%|██████████| 60/60 [00:00<00:00, 60090.32it/s]
Resolving data files: 100%|██████████| 60/60 [00:00<00:00, 60190.92it/s]


Downloading and preparing dataset audiofolder/n-f-n to C:/Users/Rita/.cache/huggingface/datasets/audiofolder/n-f-n-336de0eaea3d8d91/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc...


Downloading data files: 100%|██████████| 784/784 [00:00<00:00, 8426.03it/s]
Downloading data files: 0it [00:00, ?it/s]
Extracting data files: 0it [00:00, ?it/s]
Downloading data files: 100%|██████████| 60/60 [00:00<00:00, 7507.48it/s]
Downloading data files: 0it [00:00, ?it/s]
Extracting data files: 0it [00:00, ?it/s]
Downloading data files: 100%|██████████| 60/60 [00:00<00:00, 4999.67it/s]
Downloading data files: 0it [00:00, ?it/s]
Extracting data files: 0it [00:00, ?it/s]
                                                                

Dataset audiofolder downloaded and prepared to C:/Users/Rita/.cache/huggingface/datasets/audiofolder/n-f-n-336de0eaea3d8d91/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 57.69it/s]


In [81]:
# Publish the currently loaded `dataset` to the Hub repository
# "reralle/n-f-n"; private=False makes the repository public.
dataset.push_to_hub(repo_id="reralle/n-f-n", private=False)

Pushing split train to the Hub.
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.31s/ba]
Upload 1 LFS files: 100%|██████████| 1/1 [01:50<00:00, 110.47s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.33s/ba].23s/it]
Upload 1 LFS files: 100%|██████████| 1/1 [01:29<00:00, 89.52s/it]
Pushing dataset shards to the dataset hub: 100%|██████████| 2/2 [03:31<00:00, 105.54s/it]
Pushing split test to the Hub.
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  4.58ba/s]
Upload 1 LFS files: 100%|██████████| 1/1 [00:14<00:00, 14.14s/it]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:15<00:00, 15.24s/it]
Pushing split validation to the Hub.
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  4.33ba/s]
Upload 1 LFS files: 100%|██████████| 1/1 [00:15<00:00, 15.37s/it]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:16<00:00, 16.50s/it]


In [82]:
# Load the "s-f-o" data (resolved through the `audiofolder` builder, per the
# cell output) under the configuration name "SAA_s-f-o" ...
dataset = load_dataset(path="s-f-o", name="SAA_s-f-o")
# ... and publish it to the public Hub repository "reralle/s-f-o".
dataset.push_to_hub(repo_id="reralle/s-f-o", private=False)

Resolving data files: 100%|██████████| 4200/4200 [00:00<00:00, 8741.72it/s] 
Resolving data files: 100%|██████████| 60/60 [00:00<00:00, 57548.19it/s]
Resolving data files: 100%|██████████| 60/60 [00:00<?, ?it/s]


Downloading and preparing dataset audiofolder/s-f-o to C:/Users/Rita/.cache/huggingface/datasets/audiofolder/s-f-o-22fb25859134090f/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc...


Downloading data files: 100%|██████████| 4200/4200 [00:00<00:00, 9900.81it/s] 
Downloading data files: 0it [00:00, ?it/s]
Extracting data files: 0it [00:00, ?it/s]
Downloading data files: 100%|██████████| 60/60 [00:00<00:00, 9936.36it/s]
Downloading data files: 0it [00:00, ?it/s]
Extracting data files: 0it [00:00, ?it/s]
Downloading data files: 100%|██████████| 60/60 [00:00<00:00, 7510.62it/s]
Downloading data files: 0it [00:00, ?it/s]
Extracting data files: 0it [00:00, ?it/s]
                                                                 

Dataset audiofolder downloaded and prepared to C:/Users/Rita/.cache/huggingface/datasets/audiofolder/s-f-o-22fb25859134090f/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 33.71it/s]
Pushing split train to the Hub.
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  1.46ba/s]
Upload 1 LFS files: 100%|██████████| 1/1 [02:20<00:00, 140.56s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  2.81ba/s].35s/it]
Upload 1 LFS files: 100%|██████████| 1/1 [00:19<00:00, 19.40s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  2.43ba/s]02s/it] 
Upload 1 LFS files: 100%|██████████| 1/1 [00:48<00:00, 48.74s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  1.64ba/s]74s/it]
Upload 1 LFS files: 100%|██████████| 1/1 [00:57<00:00, 57.33s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  3.08ba/s]43s/it]
Upload 1 LFS files: 100%|██████████| 1/1 [00:30<00:00, 30.69s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  1.65ba/s]08s/it]
Upload 1 LFS files: 100%|██████████| 1/1 [01:13<00:00, 73.39s/

In [83]:
# Load the "s-f-c" data (resolved through the `audiofolder` builder, per the
# cell output) under the configuration name "SAA_s-f-c" ...
dataset = load_dataset(path="s-f-c", name="SAA_s-f-c")
# ... and publish it to the public Hub repository "reralle/s-f-c".
dataset.push_to_hub(repo_id="reralle/s-f-c", private=False)

Resolving data files: 100%|██████████| 1516/1516 [00:00<00:00, 16478.09it/s]
Resolving data files: 100%|██████████| 60/60 [00:00<00:00, 60176.53it/s]
Resolving data files: 100%|██████████| 60/60 [00:00<00:00, 61802.12it/s]


Downloading and preparing dataset audiofolder/s-f-c to C:/Users/Rita/.cache/huggingface/datasets/audiofolder/s-f-c-5859f3ec5242cb1f/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc...


Downloading data files: 100%|██████████| 1516/1516 [00:00<00:00, 9298.99it/s]
Downloading data files: 0it [00:00, ?it/s]
Extracting data files: 0it [00:00, ?it/s]
Downloading data files: 100%|██████████| 60/60 [00:00<00:00, 9936.36it/s]
Downloading data files: 0it [00:00, ?it/s]
Extracting data files: 0it [00:00, ?it/s]
Downloading data files: 100%|██████████| 60/60 [00:00<00:00, 8536.86it/s]
Downloading data files: 0it [00:00, ?it/s]
Extracting data files: 0it [00:00, ?it/s]
                                                                 

Dataset audiofolder downloaded and prepared to C:/Users/Rita/.cache/huggingface/datasets/audiofolder/s-f-c-5859f3ec5242cb1f/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 52.41it/s]
Pushing split train to the Hub.
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.26s/ba]
Upload 1 LFS files: 100%|██████████| 1/1 [01:57<00:00, 117.44s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.50s/ba].11s/it]
Upload 1 LFS files: 100%|██████████| 1/1 [01:50<00:00, 110.41s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.57s/ba].23s/it]
Upload 1 LFS files: 100%|██████████| 1/1 [02:08<00:00, 128.43s/it]
Pushing dataset shards to the dataset hub: 100%|██████████| 3/3 [06:14<00:00, 124.99s/it]
Pushing split test to the Hub.
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  5.35ba/s]
Upload 1 LFS files: 100%|██████████| 1/1 [00:17<00:00, 17.18s/it]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:18<00:00, 18.21s/it]
Pushing split validation to the Hub.
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<

In [84]:
# Load the "s-r-5o" data (resolved through the `audiofolder` builder, per the
# cell output) under the configuration name "SAA_s-r-5o" ...
dataset = load_dataset(path="s-r-5o", name="SAA_s-r-5o")
# ... and publish it to the public Hub repository "reralle/s-r-5o".
dataset.push_to_hub(repo_id="reralle/s-r-5o", private=False)

Resolving data files: 100%|██████████| 20000/20000 [00:01<00:00, 12357.54it/s]
Resolving data files: 100%|██████████| 300/300 [00:00<00:00, 299735.87it/s]
Resolving data files: 100%|██████████| 300/300 [00:00<00:00, 298739.60it/s]


Downloading and preparing dataset audiofolder/s-r-5o to C:/Users/Rita/.cache/huggingface/datasets/audiofolder/s-r-5o-1663b601c9454370/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc...


Downloading data files: 100%|██████████| 20000/20000 [00:02<00:00, 8254.46it/s]
Downloading data files: 0it [00:00, ?it/s]
Extracting data files: 0it [00:00, ?it/s]
Downloading data files: 100%|██████████| 300/300 [00:00<00:00, 7892.09it/s]
Downloading data files: 0it [00:00, ?it/s]
Extracting data files: 0it [00:00, ?it/s]
Downloading data files: 100%|██████████| 300/300 [00:00<00:00, 5657.10it/s]
Downloading data files: 0it [00:00, ?it/s]
Extracting data files: 0it [00:00, ?it/s]
                                                                  

Dataset audiofolder downloaded and prepared to C:/Users/Rita/.cache/huggingface/datasets/audiofolder/s-r-5o-1663b601c9454370/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 11.91it/s]
Pushing split train to the Hub.
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.95s/ba]
Upload 1 LFS files: 100%|██████████| 1/1 [02:01<00:00, 121.97s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.95s/ba]125.77s/it]
Upload 1 LFS files: 100%|██████████| 1/1 [01:59<00:00, 119.42s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:02<00:00,  2.15s/ba]128.19s/it]
Upload 1 LFS files: 100%|██████████| 1/1 [02:11<00:00, 131.09s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.58s/ba]134.22s/it]
Upload 1 LFS files: 100%|██████████| 1/1 [01:45<00:00, 105.37s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:02<00:00,  2.04s/ba]126.86s/it]
Upload 1 LFS files: 100%|██████████| 1/1 [02:21<00:00, 141.34s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.99s/ba]135.94s/it]
Upload 1 LFS files: 100%|██████████| 1/1

In [85]:
# Load the "s-r-5c" data (resolved through the `audiofolder` builder, per the
# cell output) under the configuration name "SAA_s-r-5c" ...
dataset = load_dataset(path="s-r-5c", name="SAA_s-r-5c")
# ... and publish it to the public Hub repository "reralle/s-r-5c".
dataset.push_to_hub(repo_id="reralle/s-r-5c", private=False)

Resolving data files: 100%|██████████| 7580/7580 [00:00<00:00, 30938.91it/s]
Resolving data files: 100%|██████████| 300/300 [00:00<00:00, 300379.85it/s]
Resolving data files: 100%|██████████| 300/300 [00:00<00:00, 299950.23it/s]


Downloading and preparing dataset audiofolder/s-r-5c to C:/Users/Rita/.cache/huggingface/datasets/audiofolder/s-r-5c-4fbd18b3f7f407f7/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc...


Downloading data files: 100%|██████████| 7580/7580 [00:00<00:00, 11048.89it/s]
Downloading data files: 0it [00:00, ?it/s]
Extracting data files: 0it [00:00, ?it/s]
Downloading data files: 100%|██████████| 300/300 [00:00<00:00, 10699.85it/s]
Downloading data files: 0it [00:00, ?it/s]
Extracting data files: 0it [00:00, ?it/s]
Downloading data files: 100%|██████████| 300/300 [00:00<00:00, 11101.23it/s]
Downloading data files: 0it [00:00, ?it/s]
Extracting data files: 0it [00:00, ?it/s]
                                                                  

Dataset audiofolder downloaded and prepared to C:/Users/Rita/.cache/huggingface/datasets/audiofolder/s-r-5c-4fbd18b3f7f407f7/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 30.84it/s]
Pushing split train to the Hub.
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.65s/ba]
Upload 1 LFS files: 100%|██████████| 1/1 [02:05<00:00, 125.65s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.45s/ba]8.97s/it]
Upload 1 LFS files: 100%|██████████| 1/1 [01:24<00:00, 84.08s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.59s/ba]7.57s/it]
Upload 1 LFS files: 100%|██████████| 1/1 [01:20<00:00, 80.97s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.68s/ba].41s/it] 
Upload 1 LFS files: 100%|██████████| 1/1 [01:23<00:00, 83.40s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.61s/ba].28s/it]
Upload 1 LFS files: 100%|██████████| 1/1 [01:22<00:00, 82.83s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.44s/ba].47s/it]
Upload 1 LFS files: 100%|██████████| 1/1 [01:14<00:00, 7

In [88]:
# Load the "s-r-10c" data (served from the cached `audiofolder` build, per the
# cell output) under the configuration name "SAA_s-r-10c" ...
dataset = load_dataset(path="s-r-10c", name="SAA_s-r-10c")
# ... and publish it to the public Hub repository "reralle/s-r-10c".
dataset.push_to_hub(repo_id="reralle/s-r-10c", private=False)

Resolving data files: 100%|██████████| 15160/15160 [00:00<00:00, 18748.52it/s]
Resolving data files: 100%|██████████| 300/300 [00:00<00:00, 300308.16it/s]
Resolving data files: 100%|██████████| 300/300 [00:00<00:00, 149778.74it/s]
Found cached dataset audiofolder (C:/Users/Rita/.cache/huggingface/datasets/audiofolder/s-r-10c-11fb54e2e31918cf/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)
100%|██████████| 3/3 [00:00<00:00, 12.27it/s]
Pushing split train to the Hub.
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.67s/ba].33s/it]
Upload 1 LFS files: 100%|██████████| 1/1 [01:04<00:00, 64.72s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.76s/ba].17s/it]
Upload 1 LFS files: 100%|██████████| 1/1 [01:07<00:00, 67.57s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.92s/ba].95s/it]
Upload 1 LFS files: 100%|██████████| 1/1 [01:14<00:00, 74.07s/it]
Creating parquet from Arrow format: 10

In [None]:
# Export result_df_with_s_r_10c to the Excel workbook 'saa_april.xlsx'.
# The path is relative, so the file lands in the kernel's current working directory.
result_df_with_s_r_10c.to_excel('saa_april.xlsx')