### Scrape Location Data for Family Dollar in U.S.

In [381]:
import requests
from bs4 import BeautifulSoup
import json
from pandas import DataFrame as df
import re
import base64
from random import randint, shuffle
from time import sleep
import os
import genanki

In [382]:
dummy_headers = {'User-Agent': 'Foo bar'}

In [383]:
language = 'fr'

In [384]:
page = requests.get("https://minimalpairs.net/en/" + language)
soup = BeautifulSoup(page.text, 'html.parser')

In [385]:
# find all 
rows = soup.find_all('tr')

In [386]:
pairs = []

for row in rows:
    ipa_cells = row.find_all('td', class_ = 'ipa')
    is_ipa = len(ipa_cells) > 0
    if is_ipa:
        wordpair_spans = row.find_all('span', class_ = 'wordpair')
        
        wordpairs = []
        
        for wordpair_span in wordpair_spans:
            word_1 = wordpair_span.contents[0].strip()
            word_2 = wordpair_span.contents[2].replace('|', '').strip()
            
            words[word_1] = { 'ipa': wordpair_span.contents[1].contents[0] }
            words[word_2] = { 'ipa': wordpair_span.contents[3].contents[0] }
            
            wordpairs.append([ word_1, word_2 ])

        pairs.append({
            "characters": [ipa_cells[0].contents[0], ipa_cells[1].contents[0]],
            "wordpairs": wordpairs
        })

{
    'words': words,
    'pairs': pairs
}

{'words': {'b': {'ipa': '/be/'},
  'blanc': {'ipa': '/blɑ̃/'},
  'p': {'ipa': '/pe/'},
  'pain': {'ipa': '/pɛ̃/'},
  'par': {'ipa': '/paʁ/'},
  'pas': {'ipa': '/pa/'},
  'peau': {'ipa': '/po/'},
  'pis': {'ipa': '/pi/'},
  'pont': {'ipa': '/pɔ̃/'},
  'port': {'ipa': '/pɔʁ/'},
  'puce': {'ipa': '/pys/'},
  'prix': {'ipa': '/pʁi/'},
  't': {'ipa': '/te/'},
  'tard': {'ipa': '/taʁ/'},
  'taux': {'ipa': '/to/'},
  'tire': {'ipa': '/tiʁ/'},
  'tort': {'ipa': '/tɔʁ/'},
  'tour': {'ipa': '/tuʁ/'},
  'trop': {'ipa': '/tʁo/'},
  'tu': {'ipa': '/ty/'},
  'deux': {'ipa': '/dø/'},
  'pair': {'ipa': '/pɛʁ/'},
  'pire': {'ipa': '/piʁ/'},
  'pose': {'ipa': '/poz/'},
  'pu': {'ipa': '/py/'},
  'pure': {'ipa': '/pyʁ/'},
  'pape': {'ipa': '/pap/'},
  'peur': {'ipa': '/pœʁ/'},
  'plan': {'ipa': '/plɑ̃/'},
  'puis': {'ipa': '/pɥi/'},
  'quai': {'ipa': '/kɛ/'},
  'qui': {'ipa': '/ki/'},
  'gare': {'ipa': '/ɡaʁ/'},
  'gros': {'ipa': '/ɡʁo/'},
  'guerre': {'ipa': '/ɡɛʁ/'},
  'langue': {'ipa': '/lɑ̃ɡ/'},
  'p

In [387]:
def sleep_random():
    sleep(randint(25, 75) / 100)

In [415]:
def audio_file_name(word):
    return 'output/' + language + '/audios/' + language + '-' + word + '.mp3'

In [416]:
def download_audio(audio_url, word):
    audio = requests.get(audio_url, headers=dummy_headers)
    full_file_name = audio_file_name(word)
    with open(full_file_name, 'wb') as f:
        f.write(audio.content)
        f.close()
        print('File saved to ' + full_file_name)

# download_audio(audio_url = 'https://audio00.forvo.com/mp3/9505057/49/9505057_49_4013.mp3', file_name = 'pain.mp3')

In [417]:
def download_audio_for_word(word):
    forvo_url = "https://forvo.com/word/" + word
    page = requests.get(forvo_url, headers=dummy_headers)
    soup = BeautifulSoup(page.text, 'html.parser')

    div = soup.find('div', id = 'language-container-' + language)
    
    if div is None:
        print('No french audio file for word "' + word + '"')
        return
    
    spans = div.find_all('span', class_ = 'play')

    if len(spans) > 0:
        span = spans[0]
        matched = re.search("^Play\([0-9]+,'([0-9a-zA-Z=]+)'", span.attrs['onclick'])
        decoded = base64.b64decode(matched.group(1)).decode("utf-8")
        audio_url = "https://audio00.forvo.com/mp3/" + decoded
        download_audio(audio_url, word)
        
# download_audio_for_word('plan')

In [418]:
for word in words:
    full_file_name = audio_file_name(word)
    if not os.path.isfile(full_file_name):
        download_audio_for_word(word)
        sleep_random()
    else:
        print('File already exist at ' + full_file_name)
        
print('Done!')

File already exist at output/fr/audios/fr-b.mp3
File already exist at output/fr/audios/fr-blanc.mp3
File already exist at output/fr/audios/fr-p.mp3
File already exist at output/fr/audios/fr-pain.mp3
File already exist at output/fr/audios/fr-par.mp3
File already exist at output/fr/audios/fr-pas.mp3
File already exist at output/fr/audios/fr-peau.mp3
File already exist at output/fr/audios/fr-pis.mp3
File already exist at output/fr/audios/fr-pont.mp3
File already exist at output/fr/audios/fr-port.mp3
File already exist at output/fr/audios/fr-puce.mp3
File already exist at output/fr/audios/fr-prix.mp3
File already exist at output/fr/audios/fr-t.mp3
File already exist at output/fr/audios/fr-tard.mp3
File already exist at output/fr/audios/fr-taux.mp3
File already exist at output/fr/audios/fr-tire.mp3
File already exist at output/fr/audios/fr-tort.mp3
File already exist at output/fr/audios/fr-tour.mp3
File already exist at output/fr/audios/fr-trop.mp3
File already exist at output/fr/audios/fr-

No french audio file for word "nomme"
File already exist at output/fr/audios/fr-nuit.mp3
File already exist at output/fr/audios/fr-colloque.mp3
File already exist at output/fr/audios/fr-flot.mp3
File already exist at output/fr/audios/fr-folie.mp3
File already exist at output/fr/audios/fr-folle.mp3
File already exist at output/fr/audios/fr-colle.mp3
File already exist at output/fr/audios/fr-queue.mp3
File already exist at output/fr/audios/fr-vache.mp3
File already exist at output/fr/audios/fr-voter.mp3
File already exist at output/fr/audios/fr-cent.mp3
File already exist at output/fr/audios/fr-sas.mp3
File already exist at output/fr/audios/fr-sœur.mp3
File already exist at output/fr/audios/fr-chapeau.mp3
File already exist at output/fr/audios/fr-manque.mp3
File already exist at output/fr/audios/fr-marque.mp3
File already exist at output/fr/audios/fr-marquer.mp3
File already exist at output/fr/audios/fr-camp.mp3
File already exist at output/fr/audios/fr-colis.mp3
File already exist at ou

No french audio file for word "klein"
File already exist at output/fr/audios/fr-plainte.mp3
File already exist at output/fr/audios/fr-loin.mp3
File already exist at output/fr/audios/fr-soin.mp3
File already exist at output/fr/audios/fr-louis.mp3
File already exist at output/fr/audios/fr-cru.mp3
File already exist at output/fr/audios/fr-u.mp3
File already exist at output/fr/audios/fr-ou.mp3
File already exist at output/fr/audios/fr-sous.mp3
File already exist at output/fr/audios/fr-et.mp3
File already exist at output/fr/audios/fr-j.mp3
File already exist at output/fr/audios/fr-creux.mp3
File already exist at output/fr/audios/fr-eux.mp3
File already exist at output/fr/audios/fr-meuse.mp3
File already exist at output/fr/audios/fr-i.mp3
File already exist at output/fr/audios/fr-aura.mp3
File already exist at output/fr/audios/fr-dos.mp3
File already exist at output/fr/audios/fr-o.mp3
File already exist at output/fr/audios/fr-ait.mp3
File already exist at output/fr/audios/fr-il.mp3
File alre

No french audio file for word "der"
File already exist at output/fr/audios/fr-dose.mp3
File already exist at output/fr/audios/fr-cap.mp3
File already exist at output/fr/audios/fr-cri.mp3
File already exist at output/fr/audios/fr-q.mp3
File already exist at output/fr/audios/fr-lampes.mp3
File already exist at output/fr/audios/fr-glace.mp3
File already exist at output/fr/audios/fr-mari.mp3
File already exist at output/fr/audios/fr-feu.mp3
File already exist at output/fr/audios/fr-file.mp3
File already exist at output/fr/audios/fr-frappe.mp3
File already exist at output/fr/audios/fr-pense.mp3
File already exist at output/fr/audios/fr-passe.mp3
File already exist at output/fr/audios/fr-couper.mp3
File already exist at output/fr/audios/fr-juge.mp3
File already exist at output/fr/audios/fr-étage.mp3
File already exist at output/fr/audios/fr-top.mp3
File already exist at output/fr/audios/fr-la.mp3
File already exist at output/fr/audios/fr-lié.mp3
File already exist at output/fr/audios/fr-libr

No french audio file for word "process"
File already exist at output/fr/audios/fr-écran.mp3
File already exist at output/fr/audios/fr-défend.mp3
File already exist at output/fr/audios/fr-sobre.mp3
File already exist at output/fr/audios/fr-votre.mp3
File already exist at output/fr/audios/fr-campo.mp3
File already exist at output/fr/audios/fr-afin.mp3
Done!


In [419]:
css = '.card { font-size: 12px; text-transform: uppercase; text-align: center; } .word { font-size: 32px; text-transform: none; font-weight: bold; } .ipa { font-size: 32px; text-transform: none; } .correct { color: green } .wrong { color: red }'
fields = [
    {'name': 'Name'},
    {'name': 'Character 1'},
    {'name': 'Character 2'},
    {'name': 'Word 1'},
    {'name': 'Word 2'},
    {'name': 'IPA 1'},
    {'name': 'IPA 2'},
    {'name': 'Audio 1'},
    {'name': 'Audio 2'}
]

model_sound_1 = genanki.Model(
    1607392322,
    'Minimal Pair (1)',
    fields = fields,
    templates = [{
        'name': 'Sound 1',
        'qfmt': '{{Audio 1}}<br/><br/><span class="word">{{Word 1}}</span> <span class="ipa">{{IPA 1}}</span> or <span class="word">{{Word 2}}</span> <span class="ipa">{{IPA 2}}</span>',
        'afmt': '{{FrontSide}}<hr id="answer"><span class="correct">Correct: <span class="word">{{Word 1}}</span> <span class="ipa">{{IPA 1}}</span></span><br/><br/>{{Audio 1}}<br/><br/><span class="wrong">Wrong: <span class="word">{{Word 2}}</span> <span class="ipa">{{IPA 2}}</span></span><br/><br/>{{Audio 2}}',
    }],
    css = css
)

model_sound_2 = genanki.Model(
    1607392323,
    'Minimal Pair (2)',
    fields = fields,
    templates = [{
        'name': 'Sound 2',
        'qfmt': '{{Audio 2}}<br/><br/><span class="word">{{Word 1}}</span> <span class="ipa">{{IPA 1}}</span> or <span class="word">{{Word 2}}</span> <span class="ipa">{{IPA 2}}</span>',
        'afmt': '{{FrontSide}}<hr id="answer"><span class="correct">Correct: <span class="word">{{Word 2}}</span> <span class="ipa">{{IPA 2}}</span></span><br/><br/>{{Audio 2}}<br/><br/><span class="wrong">Wrong: <span class="word">{{Word 1}}</span> <span class="ipa">{{IPA 1}}</span></span><br/><br/>{{Audio 1}}',
    }],
    css = css
)

models = {
    1: model_sound_1,
    2: model_sound_2
}

In [420]:
def create_anki_note(model, characterpair, wordpair):
    note = genanki.Note(
        model = models[model],
        fields = [
            characterpair[0] + '/' + characterpair[1] + ' - ' + wordpair[0] + '/' + wordpair[1] + ' (' + str(model) + ')',
            characterpair[0],
            characterpair[1],
            wordpair[0],
            wordpair[1],
            words[wordpair[0]]['ipa'],
            words[wordpair[1]]['ipa'],
            '[sound:' + wordpair[0] + '.mp3]',
            '[sound:' + wordpair[1] + '.mp3]',
        ]
    )
    return note

In [421]:
def generate_anki_deck(pairs, deck_name, output_file):
    deck_id = abs(hash(deck_name)) % (10 ** 10)
    
    print('Generating deck ' + str(deck_id) + ' - ' + deck_name)
    
    deck = genanki.Deck(
      deck_id,
      deck_name)

    words = set()
    
    for pair in pairs:
        notes = []
        for wordpair in pair['wordpairs']:
            if os.path.isfile(audio_file_name(wordpair[0])) and os.path.isfile(audio_file_name(wordpair[1])):
                notes.append(create_anki_note(model = 1, characterpair = pair['characters'], wordpair = wordpair))
                notes.append(create_anki_note(model = 2, characterpair = pair['characters'], wordpair = wordpair))
                words.add(wordpair[0])
                words.add(wordpair[1])

        shuffle(notes)

        for note in notes:
            deck.add_note(note)

    package = genanki.Package(deck)

    package.media_files = []
    
    for word in words:
        full_file_name = audio_file_name(word)
        if os.path.isfile(full_file_name):
            package.media_files.append(full_file_name)

    package.write_to_file(output_file)

In [422]:
generate_anki_deck(
    pairs = pairs,
    deck_name = 'Minimal Pairs - ' + language,
    output_file = 'output/' + language + '/' + language + '-all.apkg'
)

Generating deck 566674856 - Minimal Pairs - fr


In [423]:
def generate_anki_deck_for_pair(pair):
    generate_anki_deck(
        pairs = [pair],
        deck_name = 'Minimal Pairs - ' + language + ' (' + pair['characters'][0] + '/' + pair['characters'][1] + ')',
        output_file = 'output/' + language + '/' + language + '-' + pair['characters'][0] + '-' + pair['characters'][1] + '.apkg'
    )

In [424]:
for pair in pairs:
    generate_anki_deck_for_pair(pair)

Generating deck 9560275807 - Minimal Pairs - fr (p/b)
Generating deck 3020951074 - Minimal Pairs - fr (p/t)
Generating deck 3604637837 - Minimal Pairs - fr (p/d)
Generating deck 1524405358 - Minimal Pairs - fr (p/k)
Generating deck 6469855419 - Minimal Pairs - fr (p/ɡ)
Generating deck 3149189225 - Minimal Pairs - fr (p/m)
Generating deck 9053345303 - Minimal Pairs - fr (p/n)
Generating deck 5733807195 - Minimal Pairs - fr (p/f)
Generating deck 2705648764 - Minimal Pairs - fr (p/v)
Generating deck 1708375932 - Minimal Pairs - fr (p/s)
Generating deck 2372971867 - Minimal Pairs - fr (p/z)
Generating deck 6075223329 - Minimal Pairs - fr (p/ʃ)
Generating deck 5065344034 - Minimal Pairs - fr (p/ʒ)
Generating deck 3045943078 - Minimal Pairs - fr (p/ʁ)
Generating deck 7179853408 - Minimal Pairs - fr (p/l)
Generating deck 8013430368 - Minimal Pairs - fr (p/j)
Generating deck 780427506 - Minimal Pairs - fr (p/w)
Generating deck 159053870 - Minimal Pairs - fr (b/p)
Generating deck 6705159422 - M

Generating deck 3522252690 - Minimal Pairs - fr (f/d)
Generating deck 2884839767 - Minimal Pairs - fr (f/k)
Generating deck 1279504804 - Minimal Pairs - fr (f/ɡ)
Generating deck 6742391762 - Minimal Pairs - fr (f/m)
Generating deck 1522510895 - Minimal Pairs - fr (f/n)
Generating deck 9969261683 - Minimal Pairs - fr (f/v)
Generating deck 1409779525 - Minimal Pairs - fr (f/s)
Generating deck 6843831238 - Minimal Pairs - fr (f/z)
Generating deck 5253250762 - Minimal Pairs - fr (f/ʃ)
Generating deck 8777224028 - Minimal Pairs - fr (f/ʒ)
Generating deck 2755734526 - Minimal Pairs - fr (f/ʁ)
Generating deck 977823418 - Minimal Pairs - fr (f/l)
Generating deck 139251002 - Minimal Pairs - fr (f/j)
Generating deck 2016416386 - Minimal Pairs - fr (v/p)
Generating deck 9019242843 - Minimal Pairs - fr (v/b)
Generating deck 5095326521 - Minimal Pairs - fr (v/t)
Generating deck 9618702162 - Minimal Pairs - fr (v/d)
Generating deck 2318253212 - Minimal Pairs - fr (v/k)
Generating deck 5856401919 - M

Generating deck 6294432559 - Minimal Pairs - fr (w/t)
Generating deck 643561896 - Minimal Pairs - fr (w/d)
Generating deck 649898838 - Minimal Pairs - fr (w/k)
Generating deck 676394561 - Minimal Pairs - fr (w/m)
Generating deck 6412794483 - Minimal Pairs - fr (w/n)
Generating deck 2232551714 - Minimal Pairs - fr (w/v)
Generating deck 9399460607 - Minimal Pairs - fr (w/s)
Generating deck 7743010883 - Minimal Pairs - fr (w/ʒ)
Generating deck 4698425913 - Minimal Pairs - fr (w/ʁ)
Generating deck 9610129089 - Minimal Pairs - fr (w/l)
Generating deck 7431578786 - Minimal Pairs - fr (w/j)
Generating deck 8341077039 - Minimal Pairs - fr (w/ɥ)
Generating deck 295242872 - Minimal Pairs - fr (ɥ/b)
Generating deck 8030859161 - Minimal Pairs - fr (ɥ/d)
Generating deck 6565290126 - Minimal Pairs - fr (ɥ/k)
Generating deck 7688613613 - Minimal Pairs - fr (ɥ/m)
Generating deck 5930610368 - Minimal Pairs - fr (ɥ/v)
Generating deck 3095297167 - Minimal Pairs - fr (ɥ/s)
Generating deck 5586861003 - Min

Generating deck 2293473925 - Minimal Pairs - fr (ɔ/e)
Generating deck 6969051885 - Minimal Pairs - fr (ɔ/ø)
Generating deck 8989209423 - Minimal Pairs - fr (ɔ/ə)
Generating deck 9055176113 - Minimal Pairs - fr (ɔ/ɛ)
Generating deck 5923511216 - Minimal Pairs - fr (ɔ/ɛ̃)
Generating deck 6496891657 - Minimal Pairs - fr (ɔ/œ)
Generating deck 4892658482 - Minimal Pairs - fr (ɔ/ɔ̃)
Generating deck 9516897381 - Minimal Pairs - fr (ɔ/a)
Generating deck 2771799557 - Minimal Pairs - fr (ɔ/ɑ̃)
Generating deck 9616189917 - Minimal Pairs - fr (ɔ̃/i)
Generating deck 6051727511 - Minimal Pairs - fr (ɔ̃/y)
Generating deck 9282460641 - Minimal Pairs - fr (ɔ̃/u)
Generating deck 1891332083 - Minimal Pairs - fr (ɔ̃/e)
Generating deck 9991847987 - Minimal Pairs - fr (ɔ̃/ø)
Generating deck 5151420368 - Minimal Pairs - fr (ɔ̃/ə)
Generating deck 2033452018 - Minimal Pairs - fr (ɔ̃/o)
Generating deck 5596888360 - Minimal Pairs - fr (ɔ̃/ɛ)
Generating deck 496026415 - Minimal Pairs - fr (ɔ̃/ɛ̃)
Generating deck 