In [2]:
import pandas as pd
import requests
import re

In [24]:
# CSV files were generated using webscraper.io to pull metadata from every speaker in the Speech Accent Archive on February 17th, 2023
df1 = pd.read_csv('speech-accent-archive-part-1.csv')
df2 = pd.read_csv('speech-accent-archive-part-2.csv')
df = pd.concat([df1, df2], axis=0).reset_index()

In [29]:
# Dropping rows with native language not specified
df = df.dropna(subset=['native-language'])

In [None]:
# Creating cleaned data columns using regex
df['url-id'] = df.apply(lambda x: re.findall('=(\d+)', x['web-scraper-start-url'])[0], axis=1)
df['length-residence2'] = df.apply(lambda x: re.findall('length of english residence: (\d+\.*\d*) years', x['length-residence'])[0], axis=1)
df['english-residence2'] = df.apply(lambda x: re.findall('english residence: (.*)', x['english-residence'])[0] if len(re.findall('english residence: (.*)', x['english-residence'])) > 0 else "", axis=1)
df['age-of-english-onset2'] = df.apply(lambda x: re.findall('age of english onset: (\d*)', x['age-of-english-onset'])[0], axis=1)
df['other-languages2'] = df.apply(lambda x: re.findall('other language\(s\): (.*)', x['other-languages'])[0] if len(re.findall('other language\(s\): (.*)', x['other-languages'])) > 0 else "", axis=1)
df['birth-place2'] = df.apply(lambda x: re.findall('birth place: (.*) \(map\)', x['birth-place'])[0] if len(re.findall('birth place: (.*) \(map\)', x['birth-place'])) > 0 else "", axis=1)
df['age2'] = df.apply(lambda x: re.findall('age, sex: (\d+),', x['age-sex'])[0] if len(re.findall('age, sex: (\d+),', x['age-sex'])) > 0 else pd.NA, axis=1)
df['sex2'] = df.apply(lambda x: re.findall('age, sex: \d+, (.*)', x['age-sex'])[0] if len(re.findall('age, sex: \d+, (.*)', x['age-sex'])) > 0 else pd.NA, axis=1)
df['english-dialect'] = df['birth-place2']
df['english-dialect'] = df.apply(lambda x: x['birth-place2'] if x['native-language'] == 'english' else "", axis=1)
df['english-dialect'] = df.apply(lambda x: 'usa' if 'usa' in x['english-dialect'] else x['english-dialect'], axis=1)
df['english-dialect'] = df.apply(lambda x: 'uk' if 'uk' in x['english-dialect'] else x['english-dialect'], axis=1)
df['english-dialect'] = df.apply(lambda x: 'canada' if 'canada' in x['english-dialect'] else x['english-dialect'], axis=1)
df['english-dialect'] = df.apply(lambda x: 'ireland' if 'ireland' in x['english-dialect'] else x['english-dialect'], axis=1)
df['english-dialect'] = df.apply(lambda x: 'new zealand' if 'new zealand' in x['english-dialect'] else x['english-dialect'], axis=1)
df['english-dialect'] = df.apply(lambda x: 'jamaica' if 'jamaica' in x['english-dialect'] else x['english-dialect'], axis=1)
df['english-dialect'] = df.apply(lambda x: 'india' if 'india' in x['english-dialect'] else x['english-dialect'], axis=1)
df['english-dialect'] = df.apply(lambda x: 'south africa' if 'south africa' in x['english-dialect'] else x['english-dialect'], axis=1)

In [132]:
df['native-language'].value_counts()[:30]

english       652
spanish       235
arabic        200
mandarin      156
korean         97
french         85
russian        81
portuguese     69
dutch          54
japanese       45
german         45
turkish        42
vietnamese     40
italian        39
farsi          39
polish         39
cantonese      35
hindi          34
urdu           29
macedonian     27
tagalog        27
amharic        27
romanian       24
thai           23
swedish        23
bengali        22
bulgarian      20
serbian        19
nepali         18
gujarati       17
Name: native-language, dtype: int64

In [None]:
# Creating the first version of the dataset, containing the 10 most frequent native languages, English broken down into dialects

march = df[df['native-language'].isin(['english', 'spanish', 'arabic', 'mandarin', 'korean', 'french', 'russian', 'portuguese', 'dutch'])]
def accent_class(x):
    if x['native-language'] == 'english':
        if x['english-dialect'] == 'usa':
            return 'usa'
        if x['english-dialect'] == 'uk':
            return 'uk'
        else:
            return pd.NA
    else:
        return x['native-language']

march['accent'] = march.apply(accent_class, axis=1)

In [163]:
march['accent'].value_counts()

usa           426
spanish       235
arabic        200
mandarin      156
korean         97
french         85
russian        81
uk             70
portuguese     69
dutch          54
Name: accent, dtype: int64

In [165]:
march = march.dropna(subset=['accent'])
march['age'] = march['age2']
march['sex'] = march['sex2']
march = march[['url-id', 'age', 'sex', 'accent']]
march = march.reset_index()
march.to_csv('speech-accent-archive-march-version.csv')
march

Unnamed: 0,index,url-id,age,sex,accent
0,0,428,47,male,arabic
1,7,421,21,male,uk
2,8,420,26,female,usa
3,9,419,21,male,uk
4,12,416,19,male,usa
...,...,...,...,...,...
1468,3086,442,38,female,usa
1469,3087,441,18,male,spanish
1470,3089,439,31,male,uk
1471,3091,437,40,male,arabic


In [45]:
# Downloading the corresponding mp3 files from the web

saa_url = "https://accent.gmu.edu/"

with requests.Session() as req:

    for index, row, in df.iterrows():
        mp3_url = row['mp3-link']
        name = row['url-id']

        download = req.get(saa_url + mp3_url)
        if download.status_code == 200:
            with open(name + ".mp3", 'wb') as f:
                f.write(download.content)
        else:
            print(f"Download Failed For File {name}")

## Train test split

In [5]:
df = pd.read_csv("speech-accent-archive-march-version.csv")
df.rename(columns={'accent': 'label'}, inplace=True)
df = df[['url-id', 'label']]
df

Unnamed: 0,url-id,label
0,428,arabic
1,421,uk
2,420,usa
3,419,uk
4,416,usa
...,...,...
1468,442,usa
1469,441,spanish
1470,439,uk
1471,437,arabic


In [6]:
# train test extra
# test = 10 instances of each class
# train = 100 instances of each class or remaining
# extra = all remaining

import random
random.seed(42)

i = list(df[df['label'] == 'uk'].index.values)
random.shuffle(i)

df['set'] = ''

In [7]:
for label in list(df['label'].values):
  index_list = list(df[df['label'] == label].index.values)
  random.shuffle(index_list)
  for i, value in enumerate(index_list):
    if i < 10:
      df.loc[value, 'set'] = 'test'
    elif i < 110:
      df.loc[value, 'set'] = 'train'
    else:
      df.loc[value, 'set'] = 'extra'

In [8]:
df

Unnamed: 0,url-id,label,set
0,428,arabic,train
1,421,uk,train
2,420,usa,extra
3,419,uk,train
4,416,usa,extra
...,...,...,...
1468,442,usa,extra
1469,441,spanish,extra
1470,439,uk,test
1471,437,arabic,extra


In [9]:
df['label'].value_counts()

usa           426
spanish       235
arabic        200
mandarin      156
korean         97
french         85
russian        81
uk             70
portuguese     69
dutch          54
Name: label, dtype: int64

In [10]:
df[df['label']=='usa']['set'].value_counts()

extra    316
train    100
test      10
Name: set, dtype: int64

In [12]:
df[df['set']=='train'].iloc[100]

url-id      2974
label     arabic
set        train
Name: 188, dtype: object

In [187]:
import os
cwd = os.getcwd()

for index, row, in df.iterrows():
    dataset = row['set']
    label = row['label']
    name = str(row['url-id'])  + ".mp3"
    newdir = os.path.join(cwd, "folder-based", dataset, label) + "\\"
    os.makedirs(os.path.dirname(newdir), exist_ok=True)
    newpath = os.path.join(newdir, name)
    shutil.copy2("march_mp3s/" + name, newdir)


## Create Hugging Face Dataset

In [1]:
import pyarrow
pyarrow.__version__

'11.0.0'

In [5]:
import datasets
from datasets import load_dataset
#from datasets import AudioFolder

In [8]:
dataset = load_dataset("folder-based", name='SAA_march')

Resolving data files: 100%|██████████| 796/796 [00:00<00:00, 198848.48it/s]
Resolving data files: 100%|██████████| 100/100 [00:00<00:00, 99959.58it/s]


Downloading and preparing dataset audiofolder/folder-based to C:/Users/Rita/.cache/huggingface/datasets/audiofolder/folder-based-dc17e94d8fca8d9a/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc...


Downloading data files: 100%|██████████| 796/796 [00:00<00:00, 6352.79it/s]
Downloading data files: 0it [00:00, ?it/s]
Extracting data files: 0it [00:00, ?it/s]
Downloading data files: 100%|██████████| 100/100 [00:00<00:00, 4763.22it/s]
Downloading data files: 0it [00:00, ?it/s]
Extracting data files: 0it [00:00, ?it/s]
                                                                

Dataset audiofolder downloaded and prepared to C:/Users/Rita/.cache/huggingface/datasets/audiofolder/folder-based-dc17e94d8fca8d9a/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc. Subsequent calls will reuse this data.


100%|██████████| 2/2 [00:00<00:00, 22.73it/s]


In [9]:
dataset["train"][0]

{'audio': {'path': 'd:\\Data Science and Entrepreneurship MSc\\Thesis\\Data\\SAA\\folder-based\\train\\arabic\\1011.mp3',
  'array': array([ 4.8441798e-06,  3.1870608e-05,  5.9425816e-05, ...,
         -6.9502983e-07, -2.1234566e-05, -1.6558781e-05], dtype=float32),
  'sampling_rate': 44100},
 'label': 0}

In [18]:
dataset["train"][200]
# this is an existing file that has a length, what is going wrong?

{'audio': {'path': 'd:\\Data Science and Entrepreneurship MSc\\Thesis\\Data\\SAA\\folder-based\\train\\french\\2984.mp3',
  'array': array([], dtype=float32),
  'sampling_rate': 44100},
 'label': 2}

## Upload dataset to hugging face hub

In [19]:
dataset.push_to_hub("reralle/saa-march", private=True)

Pushing split train to the Hub.
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:03<00:00,  3.92s/ba]
Upload 1 LFS files: 100%|██████████| 1/1 [02:47<00:00, 167.18s/it]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [02:55<00:00, 175.66s/it]
Pushing split test to the Hub.
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  2.02ba/s]
Upload 1 LFS files: 100%|██████████| 1/1 [00:25<00:00, 25.84s/it]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:28<00:00, 28.57s/it]


### Ignore

In [3]:
df = pd.read_csv("speech-accent-archive-march-version.csv")

In [1]:
print("hi")

hi


In [4]:
df

Unnamed: 0.1,Unnamed: 0,index,url-id,age,sex,accent
0,0,0,428,47.0,male,arabic
1,1,7,421,21.0,male,uk
2,2,8,420,26.0,female,usa
3,3,9,419,21.0,male,uk
4,4,12,416,19.0,male,usa
...,...,...,...,...,...,...
1468,1468,3086,442,38.0,female,usa
1469,1469,3087,441,18.0,male,spanish
1470,1470,3089,439,31.0,male,uk
1471,1471,3091,437,40.0,male,arabic
