In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel 
from typing import List

In [2]:
WORD_LIST = 'top_all'

In [3]:
PHONEMES = ['AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'DH', 'EH', 'ER', 'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OY', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UW', 'V', 'W', 'Y', 'Z', 'ZH']
phoneme_to_idx = {p:i for i, p in enumerate(PHONEMES)}
idx_to_phoneme = {i:p for i, p in enumerate(PHONEMES)}

pairs = []
for p1 in PHONEMES:
    for p2 in PHONEMES:
        pairs.append((p1,p2))
        
print(len(pairs))
print(phoneme_to_idx)
print(idx_to_phoneme)

1521
{'AA': 0, 'AE': 1, 'AH': 2, 'AO': 3, 'AW': 4, 'AY': 5, 'B': 6, 'CH': 7, 'D': 8, 'DH': 9, 'EH': 10, 'ER': 11, 'EY': 12, 'F': 13, 'G': 14, 'HH': 15, 'IH': 16, 'IY': 17, 'JH': 18, 'K': 19, 'L': 20, 'M': 21, 'N': 22, 'NG': 23, 'OW': 24, 'OY': 25, 'P': 26, 'R': 27, 'S': 28, 'SH': 29, 'T': 30, 'TH': 31, 'UH': 32, 'UW': 33, 'V': 34, 'W': 35, 'Y': 36, 'Z': 37, 'ZH': 38}
{0: 'AA', 1: 'AE', 2: 'AH', 3: 'AO', 4: 'AW', 5: 'AY', 6: 'B', 7: 'CH', 8: 'D', 9: 'DH', 10: 'EH', 11: 'ER', 12: 'EY', 13: 'F', 14: 'G', 15: 'HH', 16: 'IH', 17: 'IY', 18: 'JH', 19: 'K', 20: 'L', 21: 'M', 22: 'N', 23: 'NG', 24: 'OW', 25: 'OY', 26: 'P', 27: 'R', 28: 'S', 29: 'SH', 30: 'T', 31: 'TH', 32: 'UH', 33: 'UW', 34: 'V', 35: 'W', 36: 'Y', 37: 'Z', 38: 'ZH'}


In [4]:
# Parse text file and create CSV
def parse_line(line):
    line = line.strip()
    parts = line.split('\t')
    assert len(parts) == 2 # testing
    phonemes = []
    for phoneme in parts[1].split(' '):
        phonemes.append(phoneme.strip('012')) # remove stress numbers
    parts[1] = phonemes
    return parts


def get_phoneme_vector(phonemes: List[str]) -> List[int]:
    vector = [0 for i in range(len(PHONEMES))]
    for p in phonemes:
        idx = phoneme_to_idx[p]
        vector[idx] = 1
    return vector

with open('{}.txt'.format(WORD_LIST)) as source:
    with open('{}.csv'.format(WORD_LIST), 'w') as dest:
        heading = 'word,phonemes\n'
        dest.write(heading)
        for line in source.readlines():
            parts = parse_line(line)
            #phoneme_vector = [str(i) for i in get_phoneme_vector(parts[1])]
            l = parts[0].upper() + ',' + ' '.join(parts[1]) + '\n'
            dest.write(l)

In [5]:
class User:
    def __init__(self):
        self.performance = [0 for i in range(39)]
        self.counts = [0 for i in range(39)]
        self.window = 10 # change to class variable?
    def update_performance(self, new_score: int, phoneme_idx: int):
        """
        Estimate exponential moving average
        Source: https://stackoverflow.com/questions/12636613/how-to-calculate-moving-average-without-keeping-the-count-and-data-total
        """
        if self.counts[phoneme_idx] < self.window:
            self.counts[phoneme_idx] += 1
        self.performance[phoneme_idx] = self.performance[phoneme_idx] + (new_score - self.performance[phoneme_idx]) / min(self.window, self.counts[phoneme_idx])
    
user = User()


#user.update_performance(50, 0)
#print(user.performance)

In [6]:
# Create similarity matrix

ds = pd.read_csv('{}.csv'.format(WORD_LIST))
cf = CountVectorizer(analyzer='word', binary=True, ngram_range=(1, 2))
count_matrix = cf.fit_transform(ds['phonemes'])

In [7]:
cosine_similarities = linear_kernel(count_matrix, count_matrix)
similar_words = {}
for idx, row in ds.iterrows():
    if idx % 100 == 0:
        print(idx)
    sorted_indices = cosine_similarities[idx].argsort()[:-100:-1] # Take only top 100
    sorted_words = []
    for i in sorted_indices:
        if str(ds['word'][i]) != str(row['word']):
            sorted_words.append((cosine_similarities[idx][i], ds['word'][i]))
    similar_words[row['word']] = sorted_words

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600


In [8]:
def top_n_similar_words(word, n):
    return similar_words[word.upper()][:n]

test_words = ['property', 'area', 'quote', 'apple', 'student']

for w in test_words:
    print(w, top_n_similar_words(w, 10))
    print('\n')

property [(5.0, 'PROPERLY'), (5.0, 'POVERTY'), (4.0, 'CONTROVERSY'), (4.0, 'OPPORTUNITY'), (3.0, 'WORRIED'), (3.0, 'LIBERTY'), (3.0, 'ANNIVERSARY'), (3.0, 'STARTER'), (3.0, 'SCHOLARSHIP'), (3.0, 'FIRMLY')]


area [(5.0, 'ANYBODY'), (5.0, 'VARIABLE'), (5.0, 'VARIOUS'), (5.0, 'EVERYONE'), (4.0, 'NECESSARY'), (4.0, 'VARIATION'), (4.0, 'SECRETARY'), (4.0, 'ORDINARY'), (4.0, 'COMMENTARY'), (4.0, 'ASSEMBLY')]


quote [(1.0, 'ROSE'), (1.0, 'ROW'), (1.0, 'HORMONE'), (1.0, 'SUPPOSE'), (1.0, 'COASTAL'), (1.0, 'OWNER'), (1.0, 'HOUSEHOLD'), (1.0, 'VOTER'), (1.0, 'POEM'), (1.0, 'MOMENTUM')]


apple [(3.0, 'CLASSIFY'), (3.0, 'ANNIVERSARY'), (3.0, 'SATELLITE'), (3.0, 'GALLON'), (3.0, 'SAMPLE'), (3.0, 'ANALYST'), (3.0, 'CAMPUS'), (3.0, 'RANDOM'), (3.0, 'STRATEGY'), (3.0, 'RAPIDLY')]


student [(3.0, 'COMMUNICATE'), (3.0, 'COMMUNICATION'), (3.0, 'UNIFORM'), (3.0, 'UNIVERSE'), (3.0, 'BEAUTIFUL'), (3.0, 'RITUAL'), (3.0, 'SPIRITUAL'), (3.0, 'UNION'), (3.0, 'APPROVAL'), (3.0, 'USEFUL')]




In [9]:
from joblib import dump, load
dump(similar_words, 'similar_words_{}.joblib'.format(WORD_LIST))

['similar_words_top_all.joblib']