In [2]:
# Headwords containing any of these characters (punctuation or digits) are skipped.
specialchars = '!@#$%^&*()-+"\',./:1234567890'

# Build [word, ipa] pairs from the tab-separated dictionary file.
# - The file holds IPA text, so it is decoded as UTF-8 explicitly instead of
#   relying on the platform default encoding.
# - The `with` block closes the handle once the file has been read.
pronunciations = []
with open('cmudict-0.7b-ipa.txt', 'r', encoding='utf-8') as fh:
    for line in fh:
        fields = line.split('\t')
        if len(fields) < 2:
            # No pronunciation column (blank or malformed line): nothing to keep.
            continue
        word, raw_ipa = fields[0], fields[1]
        if any(c in specialchars for c in word):
            continue
        # Entries whose pronunciation field contains a comma are dropped
        # (presumably these list several variants -- verify against the data).
        if ',' in raw_ipa:
            continue
        # The length limit is applied to the raw field, i.e. before the 'ɝ'
        # expansion and before the trailing newline is stripped.
        if len(raw_ipa) >= 7:
            continue
        pronunciations.append([word, raw_ipa.replace('ɝ', 'ər').rstrip('\n')])

In [4]:
# Preview the first entries; slicing copes with lists shorter than 10 items,
# where indexing with range(10) would raise IndexError.
for entry in pronunciations[:10]:
    print(entry)

['AAH', 'ˈɑː']
['AAKER', 'ˈɑːkər']
['AARGH', 'ˈɑːrg']
['AARON', 'ˈɛrən']
['AASE', 'ˈɑːs']
['AB', 'ˈæb']
['ABACK', 'əˈbæk']
['ABAIR', 'əˈbɛr']
['ABASH', 'əˈbæʃ']
['ABBA', 'ˈæbə']


In [5]:
# Number of [word, ipa] entries that passed the filters used to build `pronunciations`.
len(pronunciations)

11490

In [6]:
# Concatenate every transcription (space-separated) into one long string ...
allipa = " ".join(entry[1] for entry in pronunciations)
# ... and display the distinct characters it contains. The separator space is
# part of the string, so it shows up in the set as well.
print(set(allipa))

{'ː', 'w', 'p', 'v', 'ð', ' ', 'æ', 'θ', 'm', 'f', 'ɪ', 'ʒ', 'ʃ', 'e', 'r', 'o', 'z', 'ɛ', 'j', 'b', 'a', 'd', 'ŋ', 'ˈ', 'i', 'ə', 'h', 'k', 'ɑ', 'u', 'g', 'n', 's', 'ʌ', 'ɔ', 'l', 'ʊ', 't'}


In [7]:
# Maps each symbol that can occur in a transcription to the list of strings
# that may replace it when a wrong answer is generated. Replacements are
# either another IPA symbol or a plain-letter spelling such as "th" or "oo".
# The one-element list ["NONE"] is a sentinel: generate_answers never
# substitutes a symbol mapped to it.
error_dict = {
    # Consonant symbols
    "p": ["b"],
    "b": ["p"],
    "f": ["v"],
    "v": ["f"],
    "ð": ["θ", "th"],
    "θ": ["ð", "th"],
    "t": ["d"],
    "d": ["t"],
    "s": ["z", "ss"],
    "z": ["s", "zz"],
    "ʃ": ["ʒ", "sh"],
    "ʒ": ["ʃ", "zh"],
    "k": ["g"],
    "g": ["k"],
    "h": ["x"],
    "n": ["ŋ", "m"],
    "m": ["n", "ŋ"],
    "ŋ": ["n", "m"],
    "j": ["y"],
    "w": ["u"],
    "r": ["w"],
    "l": ["w"],
    # Stress/length marks and the space: left untouched (sentinel value)
    "ˌ": ["NONE"],
    " ": ["NONE"],
    "ː": ["NONE"],
    "ˈ": ["NONE"],
    # Vowel symbols
    "ɛ": ["e", "æ", "ɪ"],
    "æ": ["e", "ɛ", "a"],
    "u": ["ʊ", "ɔ", "oo"],
    "ʌ": ["uh", "ɔ", "o"],
    "ʊ": ["u", "oo"],
    "ə": ["uh", "ʌ"],
    "i": ["ɪ", "ee", "ie"],
    "ɑ": ["ah", "æ"],
    "a": ["ah", "æ"],
    "e": ["ɛ", "ɪ", "æ"],
    "o": ["oh", "ɔ"],
    "ɔ": ["o"],
    "ɪ": ["i", "ie", "ee"],
}





In [8]:
import random

In [9]:
def generate_answers(pron):
    """Return a shuffled list of answer options for one transcription.

    For every position in ``pron`` whose symbol has replacements in
    ``error_dict``, one wrong option is produced by swapping that single
    symbol for a randomly chosen replacement. The correct transcription is
    added, duplicates are removed, and the result is shuffled.

    Symbols mapped to the ``['NONE']`` sentinel are never substituted.
    Symbols missing from ``error_dict`` (or mapped to an empty list) are
    skipped as well, so one unmapped character no longer aborts the whole
    run with a KeyError.

    Args:
        pron: The correct transcription as a string.

    Returns:
        A list of unique strings in random order that always contains
        ``pron`` itself.
    """
    answer_choices = []
    for i, symbol in enumerate(pron):
        replacements = error_dict.get(symbol)
        if not replacements or replacements == ['NONE']:
            continue
        answer_choices.append(pron[:i] + random.choice(replacements) + pron[i + 1:])
    answer_choices.append(pron)
    # Substitutions at different positions can yield the same string
    # (e.g. doubling either of two adjacent identical letters), so de-duplicate.
    answer_choices = list(set(answer_choices))
    random.shuffle(answer_choices)
    return answer_choices

In [10]:
# Spot-check on a single transcription. Replacements and ordering are chosen
# at random, so the displayed list differs from run to run.
generate_answers('əˈbʌz')

['əˈbɔz', 'əˈbʌz', 'ʌˈbʌz', 'əˈbʌzz', 'əˈpʌz']

In [11]:
# One [word, correct transcription, shuffled answer options] triple per dictionary entry.
all_answers = [
    [spelling, ipa, generate_answers(ipa)]
    for spelling, ipa in pronunciations
]

In [12]:
# Preview the first entries; slicing copes with lists shorter than 10 items,
# where indexing with range(10) would raise IndexError.
for entry in all_answers[:10]:
    print(entry)

['AAH', 'ˈɑː', ['ˈɑː', 'ˈahː']]
['AAKER', 'ˈɑːkər', ['ˈɑːkər', 'ˈɑːgər', 'ˈæːkər', 'ˈɑːkʌr', 'ˈɑːkəw']]
['AARGH', 'ˈɑːrg', ['ˈæːrg', 'ˈɑːrk', 'ˈɑːrg', 'ˈɑːwg']]
['AARON', 'ˈɛrən', ['ˈɛwən', 'ˈɛrʌn', 'ˈɛrən', 'ˈærən', 'ˈɛrəŋ']]
['AASE', 'ˈɑːs', ['ˈahːs', 'ˈɑːz', 'ˈɑːs']]
['AB', 'ˈæb', ['ˈeb', 'ˈæp', 'ˈæb']]
['ABACK', 'əˈbæk', ['əˈbɛk', 'əˈbæk', 'əˈbæg', 'əˈpæk', 'uhˈbæk']]
['ABAIR', 'əˈbɛr', ['əˈpɛr', 'ʌˈbɛr', 'əˈbɛw', 'əˈber', 'əˈbɛr']]
['ABASH', 'əˈbæʃ', ['uhˈbæʃ', 'əˈbæʃ', 'əˈbaʃ', 'əˈpæʃ', 'əˈbæʒ']]
['ABBA', 'ˈæbə', ['ˈæbʌ', 'ˈæbə', 'ˈæpə', 'ˈɛbə']]


In [13]:
# Write one quiz-question object per line to 'answer_choices.json'.
# NOTE: despite the file extension, the emitted text is JavaScript
# object-literal syntax (unquoted keys, trailing commas, no enclosing array),
# not strict JSON; the format is kept exactly as before.
# The file is opened write-only and encoded as UTF-8 explicitly, because the
# options contain IPA characters that a non-UTF-8 platform default encoding
# cannot represent.
with open('answer_choices.json', 'w', encoding='utf-8') as js_file:
    for word, correct_ipa, options in all_answers:
        # Exactly the option equal to the correct transcription is flagged true.
        rendered_options = ''.join(
            '{ text:"' + option + '", correct: '
            + ('true' if option == correct_ipa else 'false') + ' },'
            for option in options
        )
        js_file.write('{ question: "How is the word \'' + word + '\' transcribed?",')
        js_file.write('answers: [' + rendered_options + ']},\n')