In [2]:
import pandas as pd
import requests
import re

In [24]:
# CSV files were generated using webscraper.io to pull metadata from every speaker in the Speech Accent Archive on February 17th, 2023
part_paths = ['speech-accent-archive-part-1.csv', 'speech-accent-archive-part-2.csv']
df1, df2 = [pd.read_csv(path) for path in part_paths]
# Stack both parts; reset_index() renumbers the rows and keeps each part's previous row labels in an 'index' column
df = pd.concat([df1, df2], axis='index').reset_index()

In [29]:
# Speakers without a recorded native language cannot be labelled, so they are removed
required_columns = ['native-language']
df = df.dropna(subset=required_columns, how='any')

In [None]:
# Creating cleaned data columns using regex
def _first_capture(pattern, text, default):
    """Return capture group 1 of the first match of `pattern` in `text`.

    `default` is returned when `text` is not a string (for example a NaN
    coming from an empty cell) or when the pattern does not match.
    """
    if not isinstance(text, str):
        return default
    match = re.search(pattern, text)
    return match.group(1) if match else default


# First keyword found wins, so the order of this tuple decides the label of a
# birth place that mentions more than one of them.
_DIALECT_KEYWORDS = ('usa', 'uk', 'canada', 'ireland', 'new zealand', 'jamaica', 'india', 'south africa')


def _english_dialect(row):
    """Return the dialect label of a native English speaker, "" for anyone else.

    The label is the first keyword of `_DIALECT_KEYWORDS` that occurs in the
    cleaned birth place; when none occurs, the birth place itself is returned.
    """
    if row['native-language'] != 'english':
        return ""
    place = row['birth-place2']
    for keyword in _DIALECT_KEYWORDS:
        # Whole-word match: a plain substring test would also label e.g. 'ukraine' as 'uk'
        if re.search(r'\b' + re.escape(keyword) + r'\b', place):
            return keyword
    return place


# Patterns are raw strings so that \d, \( and \. reach the regex engine unchanged
# instead of being treated as (invalid) string escape sequences.
# Numeric / categorical fields fall back to pd.NA, free-text fields to "".
df['url-id'] = df['web-scraper-start-url'].map(lambda text: _first_capture(r'=(\d+)', text, pd.NA))
df['length-residence2'] = df['length-residence'].map(lambda text: _first_capture(r'length of english residence: (\d+\.*\d*) years', text, pd.NA))
df['english-residence2'] = df['english-residence'].map(lambda text: _first_capture(r'english residence: (.*)', text, ""))
df['age-of-english-onset2'] = df['age-of-english-onset'].map(lambda text: _first_capture(r'age of english onset: (\d*)', text, pd.NA))
df['other-languages2'] = df['other-languages'].map(lambda text: _first_capture(r'other language\(s\): (.*)', text, ""))
df['birth-place2'] = df['birth-place'].map(lambda text: _first_capture(r'birth place: (.*) \(map\)', text, ""))
df['age2'] = df['age-sex'].map(lambda text: _first_capture(r'age, sex: (\d+),', text, pd.NA))
df['sex2'] = df['age-sex'].map(lambda text: _first_capture(r'age, sex: \d+, (.*)', text, pd.NA))

# Only native English speakers get a dialect; it is derived from their birth place
df['english-dialect'] = df.apply(_english_dialect, axis=1)

In [132]:
# Speaker counts for the 30 most common native languages
native_language_counts = df['native-language'].value_counts()
native_language_counts.head(30)

english       652
spanish       235
arabic        200
mandarin      156
korean         97
french         85
russian        81
portuguese     69
dutch          54
japanese       45
german         45
turkish        42
vietnamese     40
italian        39
farsi          39
polish         39
cantonese      35
hindi          34
urdu           29
macedonian     27
tagalog        27
amharic        27
romanian       24
thai           23
swedish        23
bengali        22
bulgarian      20
serbian        19
nepali         18
gujarati       17
Name: native-language, dtype: int64

In [None]:
# Creating the first version of the dataset: the 9 most frequent native languages, with English
# broken down into dialects, which gives 10 accent classes in total

# .copy() makes march an independent frame, so adding columns to it later does not raise SettingWithCopyWarning
march = df[df['native-language'].isin(['english', 'spanish', 'arabic', 'mandarin', 'korean', 'french', 'russian', 'portuguese', 'dutch'])].copy()
def accent_class(x):
    """Return the accent label for one speaker row.

    Speakers whose native language is not English are labelled with that
    language. Native English speakers are labelled with their dialect, but
    only 'usa' and 'uk' are kept; any other dialect yields pd.NA.
    """
    language = x['native-language']
    if language != 'english':
        return language

    dialect = x['english-dialect']
    return dialect if dialect in ('usa', 'uk') else pd.NA

# Label every speaker row by row; rows labelled pd.NA are English dialects other than usa/uk
march['accent'] = march.apply(accent_class, axis='columns')

In [163]:
# Number of speakers per accent class (missing labels are not counted)
accent_counts = march['accent'].value_counts()
accent_counts

usa           426
spanish       235
arabic        200
mandarin      156
korean         97
french         85
russian        81
uk             70
portuguese     69
dutch          54
Name: accent, dtype: int64

In [165]:
# Keep only labelled speakers, expose the cleaned age/sex under their final names,
# reduce the frame to the exported columns and renumber the rows
# (reset_index() keeps the previous row labels in an 'index' column).
march = (
    march.dropna(subset=['accent'])
    .assign(age=lambda frame: frame['age2'], sex=lambda frame: frame['sex2'])
    .loc[:, ['url-id', 'age', 'sex', 'accent']]
    .reset_index()
)

# to_csv is left at its default index=True, so the file also gets a leading unnamed row-number column
march.to_csv('speech-accent-archive-march-version.csv')
march

Unnamed: 0,index,url-id,age,sex,accent
0,0,428,47,male,arabic
1,7,421,21,male,uk
2,8,420,26,female,usa
3,9,419,21,male,uk
4,12,416,19,male,usa
...,...,...,...,...,...
1468,3086,442,38,female,usa
1469,3087,441,18,male,spanish
1470,3089,439,31,male,uk
1471,3091,437,40,male,arabic


In [45]:
# Downloading the corresponding mp3 files from the web

saa_url = "https://accent.gmu.edu/"
# Without a timeout a stalled connection would block the loop forever
timeout_seconds = 30

with requests.Session() as req:

    for index, row in df.iterrows():
        mp3_url = row['mp3-link']
        name = row['url-id']

        # A row without a link or an id can neither be fetched nor named; report it and move on
        if pd.isna(mp3_url) or pd.isna(name):
            print(f"Download Failed For File {name}")
            continue

        try:
            download = req.get(saa_url + mp3_url, timeout=timeout_seconds)
        except requests.RequestException as err:
            # One unreachable file must not abort the remaining downloads
            print(f"Download Failed For File {name}: {err}")
            continue

        if download.status_code == 200:
            # Each recording is written to the working directory as <url-id>.mp3
            with open(name + ".mp3", 'wb') as f:
                f.write(download.content)
        else:
            print(f"Download Failed For File {name}")