In [28]:
import pandas as pd
import requests

## Audio Sentence Processing

In [2]:
# Load the audio listing with pd.read_table, supplying the five column
# labels explicitly through `names`.
ingest = pd.read_table(
    'sentences_with_audio.csv',
    names=['sentence_id', 'audio_id', 'username', 'license', 'attribution_url'],
)

In [3]:
# Inspect the loaded audio listing.
ingest

Unnamed: 0,sentence_id,audio_id,username,license,attribution_url
0,61,1,fucongcong,,
1,68,2,fucongcong,,
2,78,754915,mramosch,,
3,85,566395,driini,CC BY-NC 4.0,https://tatoeba.org/deu/user/profile/driini
4,88,592881,driini,CC BY-NC 4.0,https://tatoeba.org/deu/user/profile/driini
...,...,...,...,...,...
1195633,12858087,1238345,PaulP,CC BY-NC 4.0,
1195634,12865980,1239468,PaulP,CC BY-NC 4.0,
1195635,12867905,1239469,PaulP,CC BY-NC 4.0,
1195636,12875115,1239470,PaulP,CC BY-NC 4.0,


In [4]:
# Load the vocabulary sentences with pd.read_table, labelling the three
# columns explicitly through `names`.
vocab_list = pd.read_table(
    'vocab_basket.tsv',
    names=['sentence_id', 'text', 'translation'],
)

In [5]:
# Inspect the vocabulary sentences and their translations.
vocab_list

Unnamed: 0,sentence_id,text,translation
0,1729338,Ich nehme Geschenke an.,I accept gifts.
1,1907195,Es war ein Geschenk.,It was a gift.
2,2776108,Tom bleibt bei uns.,Tom stays with us.
3,2776108,Tom bleibt bei uns.,Tom will stay with us.
4,6960575,Tom akzeptierte mein Geschenk.,Tom accepted my present.
5,7636008,Tom schickte mir ein Geschenk.,Tom sent me a present.


In [11]:
# Inner-join the audio listing with the vocabulary sentences on sentence_id,
# keeping only sentences present in both frames.
# Caveat: a pandas merge matches NaN keys against each other (unlike SQL), so
# missing sentence_id values could produce unexpected rows -- double check
# your understanding of the result before relying on it.
with_audio = ingest.merge(vocab_list, on='sentence_id', how='inner')

In [10]:
# Inspect the merged result: vocabulary rows that have a matching audio entry.
with_audio

Unnamed: 0,sentence_id,audio_id,username,license,attribution_url,text,translation
0,1907195,87415,gretelen,CC BY-NC 4.0,,Es war ein Geschenk.,It was a gift.
1,2776108,166329,Yeti,CC BY 4.0,,Tom bleibt bei uns.,Tom stays with us.
2,2776108,166329,Yeti,CC BY 4.0,,Tom bleibt bei uns.,Tom will stay with us.
3,6960575,484943,moskytoo,CC BY-NC 4.0,,Tom akzeptierte mein Geschenk.,Tom accepted my present.
4,7636008,757809,mramosch,,,Tom schickte mir ein Geschenk.,Tom sent me a present.


In [24]:
# Trying to do this with the .join method.
ingest_reindex = ingest.set_index('sentence_id')
vocab_reindex = vocab_list.set_index('sentence_id')

In [23]:
# Inspect the vocabulary frame, now indexed by sentence_id.
vocab_reindex

Unnamed: 0_level_0,text,translation
sentence_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1729338,Ich nehme Geschenke an.,I accept gifts.
1907195,Es war ein Geschenk.,It was a gift.
2776108,Tom bleibt bei uns.,Tom stays with us.
2776108,Tom bleibt bei uns.,Tom will stay with us.
6960575,Tom akzeptierte mein Geschenk.,Tom accepted my present.
7636008,Tom schickte mir ein Geschenk.,Tom sent me a present.


In [25]:
# Inspect the audio listing, now indexed by sentence_id.
ingest_reindex

Unnamed: 0_level_0,audio_id,username,license,attribution_url
sentence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
61,1,fucongcong,,
68,2,fucongcong,,
78,754915,mramosch,,
85,566395,driini,CC BY-NC 4.0,https://tatoeba.org/deu/user/profile/driini
88,592881,driini,CC BY-NC 4.0,https://tatoeba.org/deu/user/profile/driini
...,...,...,...,...
12858087,1238345,PaulP,CC BY-NC 4.0,
12865980,1239468,PaulP,CC BY-NC 4.0,
12867905,1239469,PaulP,CC BY-NC 4.0,
12875115,1239470,PaulP,CC BY-NC 4.0,


In [26]:
# Index-to-index inner join: keep only the sentence_id values that appear in
# both the audio listing and the vocabulary frame.
with_audio_reindex = ingest_reindex.join(
    other=vocab_reindex,
    how='inner',
)

In [27]:
# Inspect the .join result, indexed by sentence_id.
with_audio_reindex

Unnamed: 0_level_0,audio_id,username,license,attribution_url,text,translation
sentence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1907195,87415,gretelen,CC BY-NC 4.0,,Es war ein Geschenk.,It was a gift.
2776108,166329,Yeti,CC BY 4.0,,Tom bleibt bei uns.,Tom stays with us.
2776108,166329,Yeti,CC BY 4.0,,Tom bleibt bei uns.,Tom will stay with us.
6960575,484943,moskytoo,CC BY-NC 4.0,,Tom akzeptierte mein Geschenk.,Tom accepted my present.
7636008,757809,mramosch,,,Tom schickte mir ein Geschenk.,Tom sent me a present.


## Get the audio files

In [68]:
# URL pattern for downloading one audio clip; {0} is replaced with an audio_id.
audio_url_template = 'https://tatoeba.org/audio/download/{0}'

In [71]:
# Download every referenced audio clip and save it as '<audio_id>.mp3' in the
# current working directory.
# drop_duplicates(): a sentence with several translations occupies several
# rows of the joined frame, so the same audio_id can repeat; one download per
# clip is enough.
for audio_id in with_audio_reindex['audio_id'].drop_duplicates():
    request_url = audio_url_template.format(audio_id)
    # A timeout keeps a single stalled connection from hanging the whole cell.
    mp3data_request = requests.get(request_url, timeout=30)
    # Fail loudly on an HTTP error status instead of silently writing the
    # server's error page to disk under an .mp3 name.
    mp3data_request.raise_for_status()
    mp3data = mp3data_request.content
    with open('{0}.mp3'.format(audio_id), 'wb') as mp3file:
        mp3file.write(mp3data)