In [60]:
import numpy as np
import pandas as pd
import re
import statistics

In [61]:
df = pd.read_csv("forR_input.csv", dtype=str)
df = df.apply(lambda col: col.str.lower() if col.dtype == 'object' else col)

# df = df[df['STRING'].isin(['all', 'cat', 'con', 'hat', 'slam', 'break', 'shock'])] # USE THIS FOR TESTING (COMMENT LINE BELOW WHEN TESTING)

df = df.loc[df['initial2'].isna()]

df

Unnamed: 0,STRING,initial1,initial1.1,initial1.2,1a,1a.1,1a.2,1b,1b.1,1b.2,...,initial6.2,6a,6a.1,6a.2,6b,6b.1,6b.2,6c,6c.1,6c.2
218,all,ɔ,a,ɔ+a,l,ll,l+ll,,,,...,,,,,,,,,,
1189,break,b,b,b+b,r,r,r+r,eɪ,ea,eɪ+ea,...,,,,,,,,,,
1546,cat,k,c,k+c,æ,a,æ+a,t,t,t+t,...,,,,,,,,,,
2028,con,k,c,k+c,ɑ,o,ɑ+o,n,n,n+n,...,,,,,,,,,,
4422,hat,h,h,h+h,æ,a,æ+a,t,t,t+t,...,,,,,,,,,,
8336,shock,ʃ,sh,ʃ+sh,ɑ,o,ɑ+o,k,ck,k+ck,...,,,,,,,,,,
8506,slam,s,s,s+s,l,l,l+l,æ,a,æ+a,...,,,,,,,,,,


In [62]:
df.insert(1,'complexity', 0)
df.insert(2,'syllables', 1)
df

Unnamed: 0,STRING,complexity,syllables,initial1,initial1.1,initial1.2,1a,1a.1,1a.2,1b,...,initial6.2,6a,6a.1,6a.2,6b,6b.1,6b.2,6c,6c.1,6c.2
218,all,0,1,ɔ,a,ɔ+a,l,ll,l+ll,,...,,,,,,,,,,
1189,break,0,1,b,b,b+b,r,r,r+r,eɪ,...,,,,,,,,,,
1546,cat,0,1,k,c,k+c,æ,a,æ+a,t,...,,,,,,,,,,
2028,con,0,1,k,c,k+c,ɑ,o,ɑ+o,n,...,,,,,,,,,,
4422,hat,0,1,h,h,h+h,æ,a,æ+a,t,...,,,,,,,,,,
8336,shock,0,1,ʃ,sh,ʃ+sh,ɑ,o,ɑ+o,k,...,,,,,,,,,,
8506,slam,0,1,s,s,s+s,l,l,l+l,æ,...,,,,,,,,,,


In [63]:
vowels = ['a', 'e', 'i', 'o', 'u']

def find_onset_and_rime(row):
    word = str(row['STRING'])
    onset = ''
    rime = ''
    index = 0
    while index < len(word) and word[index] not in vowels:
        onset += word[index]
        index += 1
    rime = word[index:]
    return pd.Series([onset, rime], index=['onset', 'rime'])

df[['onset', 'rime']] = df.apply(find_onset_and_rime, axis=1)

df['onset'] = df['onset'].replace('', np.nan)
df['rime'] = df['rime'].replace('', np.nan)

df

Unnamed: 0,STRING,complexity,syllables,initial1,initial1.1,initial1.2,1a,1a.1,1a.2,1b,...,6a.1,6a.2,6b,6b.1,6b.2,6c,6c.1,6c.2,onset,rime
218,all,0,1,ɔ,a,ɔ+a,l,ll,l+ll,,...,,,,,,,,,,all
1189,break,0,1,b,b,b+b,r,r,r+r,eɪ,...,,,,,,,,,br,eak
1546,cat,0,1,k,c,k+c,æ,a,æ+a,t,...,,,,,,,,,c,at
2028,con,0,1,k,c,k+c,ɑ,o,ɑ+o,n,...,,,,,,,,,c,on
4422,hat,0,1,h,h,h+h,æ,a,æ+a,t,...,,,,,,,,,h,at
8336,shock,0,1,ʃ,sh,ʃ+sh,ɑ,o,ɑ+o,k,...,,,,,,,,,sh,ock
8506,slam,0,1,s,s,s+s,l,l,l+l,æ,...,,,,,,,,,sl,am


In [53]:
pg_df = pd.DataFrame()
all_phonemes = df['initial1'].tolist() + df['1a'].tolist() + df['1b'].tolist()
all_graphemes = df['initial1.1'].tolist() + df['1a.1'].tolist() + df['1b.1'].tolist()
all_pg_pairs = df['initial1.2'].tolist() + df['1a.2'].tolist() + df['1b.2'].tolist()
pg_df['phoneme'] = all_phonemes
pg_df['grapheme'] = all_graphemes
pg_df['pg_pair'] = all_pg_pairs
pg_df = pg_df.dropna()
    
pg_df

Unnamed: 0,phoneme,grapheme,pg_pair
0,eɪ,a,eɪ+a
1,ə,a,ə+a
2,ɑ,aah,ɑ+aah
3,eɪ,a,eɪ+a_e
4,eɪ,a,eɪ+a_e
...,...,...,...
11505,p,p,p+p
11506,t,t,t+t
11507,n,n,n+n
11509,m,m,m+m


In [None]:
onset_rime_df = pd.DataFrame()

def get_onset(row):
    onset = str(row['onset'])
    if onset.lower() == 'nan':
        return np.nan()
    all_phonemes = row['initial1'].tolist() + row['1a'].tolist() + row['1b'].tolist()
    all_graphemes = row['initial1.1'].tolist() + row['1a.1'].tolist() + row['1b.1'].tolist()
    onset_phonemes = []
    onset_graphemes = []
    grapheme_start_index = 0
    # while the word onset is not completely looked thru yet
    for g in all_graphemes:
        index = 0
        while onset[grapheme_start_index : grapheme_start_index + index] != g and index < len(onset):
            index += 1
        grapheme_start_index = index
            
    return pd.Series([onset, rime], index=['onset', 'rime'])

In [54]:
def conditional_prob_p_to_g(p, g):
    p_g_instances = len((pg_df.loc[(pg_df['phoneme'] == p) & (pg_df['grapheme'] == g)]).index)
    p_instances = len((pg_df.loc[(pg_df['phoneme'] == p)]).index)
    return p_g_instances / p_instances

def conditional_prob_g_to_p(g, p):
    p_g_instances = len((pg_df.loc[(pg_df['phoneme'] == p) & (pg_df['grapheme'] == g)]).index)
    g_instances = len((pg_df.loc[(pg_df['grapheme'] == g)]).index)
    return p_g_instances / g_instances

def p_plus_g_frequency(p, g):
    p_g_instances = len((pg_df.loc[(pg_df['phoneme'] == p) & (pg_df['grapheme'] == g)]).index)
    all_instances = len(pg_df.index)
    return p_g_instances / all_instances

print("p->g for 'ɔ' to 'a':", conditional_prob_p_to_g('ɔ', 'a'))
print("g->p for 'a' to 'ɔ':", conditional_prob_g_to_p('a', 'ɔ'))
print("ɔ+a frequency:", p_plus_g_frequency('ɔ', 'a'))
    


p->g for 'ɔ' to 'a': 0.2
g->p for 'a' to 'ɔ': 0.06463878326996197
ɔ+a frequency: 0.004544644448404919


In [56]:
p_input = input("Enter phoneme:\n")
g_input = input("Enter grapheme:\n")

print(conditional_prob_p_to_g(p_input, g_input))
print(conditional_prob_g_to_p(g_input, p_input))
print(p_plus_g_frequency(p_input, g_input))

option = input("OPTIONS:\n1 for p->g conditional probability.\n2 for g->p conditional probability.\n3 for p+g frequency.\nEnter option:")


ZeroDivisionError: division by zero

In [None]:
if option == '1':
    print(conditional_prob_p_to_g(p_input, g_input))
elif option == '2':
    print(conditional_prob_g_to_p(g_input, p_input))
elif option == '3':
    print(p_plus_g_frequency(p_input, g_input))

In [124]:
# P->G PHONOGRAPHEME CONSISTENCY

def calculate_p_to_g_phonographeme_probs(word):
    word = word.lower()
    p_to_g_probs = []
    if word in df['STRING'].values:
        word_phonemes = [df.loc[(df['STRING'] == word), 'initial1'].values[0], df.loc[(df['STRING'] == word), '1a'].values[0], df.loc[(df['STRING'] == word), '1b'].values[0]]
        word_phonemes = [x for x in word_phonemes if str(x) != 'nan']
        word_graphemes = [df.loc[(df['STRING'] == word), 'initial1.1'].values[0], df.loc[(df['STRING'] == word), '1a.1'].values[0], df.loc[(df['STRING'] == word), '1b.1'].values[0]]
        word_graphemes = [x for x in word_graphemes if str(x) != 'nan']
        for index in range(len(word_phonemes)):
            p_to_g_probs.append(conditional_prob_p_to_g(word_phonemes[index], word_graphemes[index]))
    return p_to_g_probs
    
def p_to_g_phonographeme_consistency_median(word):
    p_to_g_probs = calculate_p_to_g_phonographeme_probs(word)
    if len(p_to_g_probs) != 0:
        return statistics.median(p_to_g_probs)
    return None

def p_to_g_phonographeme_consistency_mean(word):
    p_to_g_probs = calculate_p_to_g_phonographeme_probs(word)
    if len(p_to_g_probs) != 0:
        return statistics.mean(p_to_g_probs)
    return None

def p_to_g_phonographeme_consistency_max(word):
    p_to_g_probs = calculate_p_to_g_phonographeme_probs(word)
    if len(p_to_g_probs) != 0:
        return max(p_to_g_probs)
    return None

def p_to_g_phonographeme_consistency_min(word):
    p_to_g_probs = calculate_p_to_g_phonographeme_probs(word)
    if len(p_to_g_probs) != 0:
        return min(p_to_g_probs)
    return None

def p_to_g_phonographeme_consistency_distribution_quantile(word, q): # e.g. q = 0.25 for Q1, q = 0.7 for 70th percentile
    p_to_g_probs = calculate_p_to_g_phonographeme_probs(word)
    if len(p_to_g_probs) != 0 and q >= 0 and q <= 100:
        return np.quantile(p_to_g_probs, q)
    return None

def p_to_g_phonographeme_consistency_IQR(word):
    return p_to_g_phonographeme_consistency_distribution_quantile(word, 0.75) - p_to_g_phonographeme_consistency_distribution_quantile(word, 0.25)

def p_to_g_phonographeme_consistency_range(word):
    return p_to_g_phonographeme_consistency_max(word) - p_to_g_phonographeme_consistency_min(word)

In [125]:
word_input = input("P->G PHONOGRAPHEME CONSISTENCY\nEnter word:\n")
if word_input.lower() not in df['STRING'].values:
    print("Word not found. Exiting.")
else:
    print("MEDIAN:", p_to_g_phonographeme_consistency_median(word_input))
    print("MEAN:", p_to_g_phonographeme_consistency_mean(word_input))
    print("MAX:", p_to_g_phonographeme_consistency_max(word_input))
    print("MIN:", p_to_g_phonographeme_consistency_min(word_input))
    print("Q1:", p_to_g_phonographeme_consistency_distribution_quantile(word_input, 0.25))
    print("IQR:", p_to_g_phonographeme_consistency_IQR(word_input))
    print("RANGE:", p_to_g_phonographeme_consistency_range(word_input))
    # p_to_g_photographeme_consistency_option = input("OPTIONS:\n1 for median.\n2 for mean.\n3 for max.\n4 for min.\nEnter option:")
    
    # if p_to_g_photographeme_consistency_option == '1':
    #     print(p_to_g_phonographeme_consistency_median(word_input))
    # elif p_to_g_photographeme_consistency_option == '2':
    #     print(p_to_g_phonographeme_consistency_mean(word_input))
    # elif p_to_g_photographeme_consistency_option == '3':
    #     print(p_to_g_phonographeme_consistency_max(word_input))
    # elif p_to_g_photographeme_consistency_option == "4":
    #     print(p_to_g_phonographeme_consistency_min(word_input))



MEDIAN: 0.14383735705209658
MEAN: 0.14383735705209658
MAX: 0.2
MIN: 0.08767471410419314
Q1: 0.11575603557814486
IQR: 0.05616264294790345
RANGE: 0.11232528589580687


In [126]:
# G->P PHONOGRAPHEME CONSISTENCY

def calculate_g_to_p_phonographeme_probs(word):
    word = word.lower()
    g_to_p_probs = []
    if word in df['STRING'].values:
        word_phonemes = [df.loc[(df['STRING'] == word), 'initial1'].values[0], df.loc[(df['STRING'] == word), '1a'].values[0], df.loc[(df['STRING'] == word), '1b'].values[0]]
        word_phonemes = [x for x in word_phonemes if str(x) != 'nan']
        word_graphemes = [df.loc[(df['STRING'] == word), 'initial1.1'].values[0], df.loc[(df['STRING'] == word), '1a.1'].values[0], df.loc[(df['STRING'] == word), '1b.1'].values[0]]
        word_graphemes = [x for x in word_graphemes if str(x) != 'nan']
        for index in range(len(word_phonemes)):
            g_to_p_probs.append(conditional_prob_g_to_p(word_graphemes[index], word_phonemes[index]))
    return g_to_p_probs
    
def g_to_p_phonographeme_consistency_median(word):
    g_to_p_probs = calculate_g_to_p_phonographeme_probs(word)
    if len(g_to_p_probs) != 0:
        return statistics.median(g_to_p_probs)
    return None

def g_to_p_phonographeme_consistency_mean(word):
    g_to_p_probs = calculate_g_to_p_phonographeme_probs(word)
    if len(g_to_p_probs) != 0:
        return statistics.mean(g_to_p_probs)
    return None

def g_to_p_phonographeme_consistency_max(word):
    g_to_p_probs = calculate_g_to_p_phonographeme_probs(word)
    if len(g_to_p_probs) != 0:
        return max(g_to_p_probs)
    return None

def g_to_p_phonographeme_consistency_min(word):
    g_to_p_probs = calculate_g_to_p_phonographeme_probs(word)
    if len(g_to_p_probs) != 0:
        return min(g_to_p_probs)
    return None

def g_to_p_phonographeme_consistency_distribution_quantile(word, q): # e.g. q = 0.25 for Q1, q = 0.7 for 70th percentile
    g_to_p_probs = calculate_g_to_p_phonographeme_probs(word)
    if len(g_to_p_probs) != 0 and q >= 0 and q <= 100:
        return np.quantile(g_to_p_probs, q)
    return None

def g_to_p_phonographeme_consistency_IQR(word):
    return g_to_p_phonographeme_consistency_distribution_quantile(word, 0.75) - g_to_p_phonographeme_consistency_distribution_quantile(word, 0.25)

def g_to_p_phonographeme_consistency_range(word):
    return g_to_p_phonographeme_consistency_max(word) - g_to_p_phonographeme_consistency_min(word)

In [128]:
word_input = input("G->P PHONOGRAPHEME CONSISTENCY\nEnter word:\n")
if word_input.lower() not in df['STRING'].values:
    print("Word not found. Exiting.")
else:
    print("MEDIAN:", g_to_p_phonographeme_consistency_median(word_input))
    print("MEAN:", g_to_p_phonographeme_consistency_mean(word_input))
    print("MAX:", g_to_p_phonographeme_consistency_max(word_input))
    print("MIN:", g_to_p_phonographeme_consistency_min(word_input))
    print("Q1:", g_to_p_phonographeme_consistency_distribution_quantile(word_input, 0.7)) # changed this so its not currently Q1
    print("IQR:", g_to_p_phonographeme_consistency_IQR(word_input))
    print("RANGE:", g_to_p_phonographeme_consistency_range(word_input))
    
    # g_to_p_photographeme_consistency_option = input("OPTIONS:\n1 for median.\n2 for mean.\n3 for max.\n4 for min.\nEnter option:")
    
    # if g_to_p_photographeme_consistency_option == '1':
    #     print(g_to_p_phonographeme_consistency_median(word_input))
    # elif g_to_p_photographeme_consistency_option == '2':
    #     print(g_to_p_phonographeme_consistency_mean(word_input))
    # elif g_to_p_photographeme_consistency_option == '3':
    #     print(g_to_p_phonographeme_consistency_max(word_input))
    # elif g_to_p_photographeme_consistency_option == "4":
    #     print(g_to_p_phonographeme_consistency_min(word_input))

    # 0.25 - 0.5 range? not just 0.7 and lower etc etc

MEDIAN: 0.532319391634981
MEAN: 0.532319391634981
MAX: 1.0
MIN: 0.06463878326996197
Q1: 0.7193916349809886
IQR: 0.46768060836501907
RANGE: 0.935361216730038
