Remove row where some column values are empty on asr_data.csv (18 rows have empty column values)

In [5]:
import pandas as pd

asr_data = pd.read_csv("asr_data.csv")

num_rows_before = len(asr_data)

asr_data_cleaned = asr_data.dropna(subset=['kaldi_data', 'kaldiNa_data', 'wav2vec_transcript_words', 'wav2vec_transcript_phonemes'], how='any')

num_rows_after = len(asr_data_cleaned)

num_rows_deleted = num_rows_before - num_rows_after

print("Number of rows deleted:", num_rows_deleted)

Number of rows deleted: 18


In [31]:
c1 = 'wav2vec_transcript_phonemes'
c2 = 'story_text'
same = 0
diff = 0

same_length_df = pd.DataFrame(columns=asr_data.columns)
different_length_df = pd.DataFrame(columns=asr_data.columns)

for index, row in asr_data.iterrows():
    # Retrieve data from the specified columns for the current row
    data_from_column1 = row[c1]
    data_from_column2 = row[c2]
    
    # Initialize lengths
    l1 = 0
    l2 = 0
    
    # Process the data as needed
    if pd.notna(data_from_column1):
        l1 = len(data_from_column1.split())
    if pd.notna(data_from_column2):
        l2 = len(data_from_column2.split())
    
    if l1 == l2:
        same_length_df = pd.concat([same_length_df, row.to_frame().T], ignore_index=True)
        same += 1
    else:
        different_length_df = pd.concat([different_length_df, row.to_frame().T], ignore_index=True)
        diff += 1

print("Same length rows:", same)
print("Different length rows:", diff)


Same length rows: 2115
Different length rows: 3924


In [33]:
print("First 5 rows of same_length_df:")
print(same_length_df.head())

First 5 rows of same_length_df:
                     activityId  \
0  001FFD4535C911EC89641635D148   
1  002B522B38B711EC89641635D148   
2  002B522B38B711EC89641635D148   
3  002B522B38B711EC89641635D148   
4  002B522B38B711EC89641635D148   

                                          story_text phrase_index  \
0          I love water. I enjoy playing with water.            0   
1            The man stopped to rest under the tree.            1   
2  The monkeys joined hands and sang songs, swapp...            8   
3  The hats disappeared from the monkey's heads a...           16   
4  The man carried on towards the market with all...           17   

                                         amazon_data  \
0  {"text": "i love water i don't play i am sure ...   
1  {"text": "the man stalked to rest under the tr...   
2  {"text": "the nucky's join hands and song song...   
3  {"text": "the hats has disappeared from the bu...   
4  {"text": "the man's back a man cried on powers...   

     

In [34]:
print("First 5 rows of same_length_df:")
print(different_length_df.head())

First 5 rows of same_length_df:
                     activityId                                   story_text  \
0  001FFD4535C911EC89641635D148          It is raining. Rain gives us water.   
1  001FFD4535C911EC89641635D148  I enjoy playing outside. But it is raining.   
2  001FFD4535C911EC89641635D148             I enjoy playing out in the rain.   
3  001FFD4535C911EC89641635D148     My hair is wet from playing in the rain.   
4  001FFD4535C911EC89641635D148                 My father is fixing my hair.   

  phrase_index                                        amazon_data  \
0            1  {"text": "it is training raining raining very ...   
1            2  {"text": "oh okay done it's out during works i...   
2            3  {"text": "i enjoy playing at it's rained", "co...   
3            4  {"text": "my ham is what oh please and is well...   
4            5  {"text": "mm hmm my father oh the exports to l...   

                                          kaldi_data  \
0  {"text": "it 

In [38]:
import pandas as pd

file_path = "labels.csv"
columns_to_drop = ["storyId","phraseIndex","word_index"]
labels = pd.read_csv(file_path, usecols=lambda column: column not in columns_to_drop)

pd.set_option('display.max_rows', None)
print("Preview of the data:")
print(labels.head())
c1 = "activityId"
c2 = "label"
dict = {}
for index, row in labels.iterrows():
    data_from_column1 = row[c1]
    data_from_column2 = row[c2]

    if data_from_column2 and data_from_column2 not in dict:
        dict[data_from_column1] = 1
    else:
        dict[data_from_column1] = 0

diff_len_errors = 0
diff_len_no_errors = 0
same_len_errors = 0
same_len_no_errors = 0

for index, row in different_length_df.iterrows():
    k = row["activityId"]
    if dict[k] == 1:
        diff_len_no_errors += 1
    else:
        diff_len_errors += 1

print(diff_len_no_errors, diff_len_errors)
    

Preview of the data:
                     activityId expected_text  label
0  001FFD4535C911EC89641635D148             i      1
1  001FFD4535C911EC89641635D148          love      1
2  001FFD4535C911EC89641635D148         water      1
3  001FFD4535C911EC89641635D148             i      1
4  001FFD4535C911EC89641635D148         enjoy      1
3691 233


In [41]:
for index, row in same_length_df.iterrows():
    k = row["activityId"]
    if dict[k] == 1:
        same_len_no_errors += 1
    else:
        same_len_errors += 1

print(same_len_no_errors, same_len_errors)

6156 189


For rows in asr_data.csv where length of words in story_text and wav2vec_transcript_phonemes are equal, error rate is just 3.2% and if length of words in story_text and wav2vec_transcript_phonemes are not equal, error rate is just 6.3%. Hence, the error rate increases by approximately 50% if the length of words in story_text and wav2vec_transcript_phonemes are not equal. So this should be one of the features used for training the Machine Learning model.

Hence, this means that students are 50% less likely to make an error if length of words in story_text and wav2vec_transcript_phonemes are equal(assuming our initial dataset in random).

In [74]:
import json

total_sum = 0
num_rows_with_conf = 0
num_rows_without_conf = 0
dict_with_conf = {}
for index, row in different_length_df.iterrows():
    activity = row["activityId"]
    if pd.isna(row["kaldiNa_data"]):
        num_rows_without_conf += 1
        continue
        
    try:
        kaldiNa_data_json = json.loads(row["kaldiNa_data"])
        if 'raw' in kaldiNa_data_json and 'ref-conf' in kaldiNa_data_json['raw']:
            conf = kaldiNa_data_json['raw']['ref-conf']
            t = [entry[1] for entry in conf]
            temp_sum = 0
            for entry in conf:
                temp_sum += entry[1]
            dict_with_conf[activity] = {'label':dict[activity],'avg_conf':temp_sum}
            total_sum += temp_sum / len(conf)
            num_rows_with_conf += 1
        else:
            num_rows_without_conf += 1
    except KeyError:
        num_rows_without_conf += 1

if num_rows_with_conf > 0:
    avg_confidence = total_sum / num_rows_with_conf
else:
    avg_confidence = 0  # Handle division by zero if there are no rows with confidence

print(f"Avg confidence for rows with confidence: {avg_confidence}")
print(f"Number of rows without confidence: {num_rows_without_conf}")



Avg confidence for rows with confidence: 0.7575266387788445
Number of rows without confidence: 26


In [69]:
import json

total_sum = 0
num_rows_with_conf = 0
num_rows_without_conf = 0

for index, row in same_length_df.iterrows():
    activity = row["activityId"]
    if pd.isna(row["kaldiNa_data"]):
        num_rows_without_conf += 1
        continue
        
    try:
        kaldiNa_data_json = json.loads(row["kaldiNa_data"])
        
        if 'raw' in kaldiNa_data_json and 'ref-conf' in kaldiNa_data_json['raw']:
            conf = kaldiNa_data_json['raw']['ref-conf']
            t = [entry[1] for entry in conf]
            temp_sum = 0
            for entry in conf:
                temp_sum += entry[1]
            dict_with_conf[activity] = {'label':dict[activity],'avg_conf':temp_sum}
            total_sum += temp_sum / len(conf)
            num_rows_with_conf += 1
        else:
            num_rows_without_conf += 1
    except KeyError:
        num_rows_without_conf += 1

if num_rows_with_conf > 0:
    avg_confidence = total_sum / num_rows_with_conf
else:
    avg_confidence = 0  # Handle division by zero if there are no rows with confidence

print(f"Avg confidence for rows with confidence: {avg_confidence}")
print(f"Number of rows without confidence: {num_rows_without_conf}")



Avg confidence for rows with confidence: 0.882456959846706
Number of rows without confidence: 4


In [71]:
total_confidence_label_0 = 0
total_confidence_label_1 = 0
count_label_0 = 0
count_label_1 = 0

for activity_id, info in dict_with_conf.items():
    label = info['label']
    avg_conf = info['avg_conf']
    
    if label == 0:
        total_confidence_label_0 += avg_conf
        count_label_0 += 1
    elif label == 1:
        total_confidence_label_1 += avg_conf
        count_label_1 += 1

avg_conf_label_0 = total_confidence_label_0 / count_label_0 if count_label_0 != 0 else 0
avg_conf_label_1 = total_confidence_label_1 / count_label_1 if count_label_1 != 0 else 0

print("Average confidence for label 0:", avg_conf_label_0)
print("Average confidence for label 1:", avg_conf_label_1)

    

Average confidence for label 0: 4.950755430685328
Average confidence for label 1: 7.283170798631513


Using confidence of Kaldi_Na column
Average confidence for label 0: 4.950755430685328
Average confidence for label 1: 7.283170798631513

Using confidence of Kaldi column
Average confidence for label 0: 0.6466020774622308
Average confidence for label 1: 0.802685579253625

Using Kaldi_data

In [93]:
import json
import pandas as pd

dict_with_conf_kaldi = {}
def calculate_avg_confidence(df, json_column):
    total_sum = 0
    num_rows_with_conf = 0
    num_rows_without_conf = 0
    
    
    for index, row in df.iterrows():
        activity = row["activityId"]
        if pd.isna(row[json_column]):
            num_rows_without_conf += 1
            continue
            
        try:
            data_json = json.loads(row[json_column])
            
            if 'raw' in data_json and 'ref-conf' in data_json['raw']:
                conf = data_json['raw']['ref-conf']
                t = [entry[1] for entry in conf]
                temp_sum = 0
                for entry in conf:
                    temp_sum += entry[1]
                dict_with_conf_kaldi[activity] = {'label': dict[activity], 'avg_conf': temp_sum / len(conf)}
                total_sum += temp_sum / len(conf)
                num_rows_with_conf += 1
            else:
                num_rows_without_conf += 1
        except KeyError:
            num_rows_without_conf += 1
    
    if num_rows_with_conf > 0:
        avg_confidence = total_sum / num_rows_with_conf
    else:
        avg_confidence = 0  # Handle division by zero if there are no rows with confidence
    
    print(f"Avg confidence for rows with confidence: {avg_confidence}")
    print(f"Number of rows without confidence: {num_rows_without_conf}")
    
    return dict_with_conf

result = calculate_avg_confidence(different_length_df, 'kaldi_data')


Avg confidence for rows with confidence: 0.7499239832057832
Number of rows without confidence: 26


In [94]:
result = calculate_avg_confidence(same_length_df, 'kaldi_data')

Avg confidence for rows with confidence: 0.8457600372537353
Number of rows without confidence: 4


In [95]:
def calculate_avg_confidence_by_label(dict_with_conf_kaldi):
    total_confidence_label_0 = 0
    total_confidence_label_1 = 0
    count_label_0 = 0
    count_label_1 = 0

    for activity_id, info in dict_with_conf_kaldi.items():
        label = info['label']
        avg_conf = info['avg_conf']

        if label == 0:
            total_confidence_label_0 += avg_conf
            count_label_0 += 1
        elif label == 1:
            total_confidence_label_1 += avg_conf
            count_label_1 += 1

    avg_conf_label_0 = total_confidence_label_0 / count_label_0 if count_label_0 != 0 else 0
    avg_conf_label_1 = total_confidence_label_1 / count_label_1 if count_label_1 != 0 else 0

    return avg_conf_label_0, avg_conf_label_1

avg_conf_label_0, avg_conf_label_1 = calculate_avg_confidence_by_label(dict_with_conf_kaldi)
print("Average confidence for label 0:", avg_conf_label_0)
print("Average confidence for label 1:", avg_conf_label_1)


Average confidence for label 0: 0.6466020774622308
Average confidence for label 1: 0.802685579253625


Correlation between time taken and label

In [107]:
import pandas as pd

labels_df = pd.read_csv('labels.csv')
asr_df = pd.read_csv('asr_data.csv')

total_time_1 = 0
total_time_0 = 0
count_1 = 0
count_0 = 0

for index, row in labels_df.iterrows():
    activity_id = row['activityId']
    phrase_index = row['phraseIndex']
    word_index = row['word_index']
    expected_text = row['expected_text']
    label = row['label']
    
    matching_rows = asr_df[(asr_df['activityId'] == activity_id) & 
                           (asr_df['phrase_index'] == phrase_index)]
    
    time_taken_list = []
    
    for _, asr_row in matching_rows.iterrows():
        kaldina_data = asr_row['kaldiNa_data']
        
        if not pd.isnull(kaldina_data) and len(kaldina_data)>0:
            try:
                kaldi_data_json = eval(kaldina_data)  # Convert JSON string to dictionary
            except:
                print(len(kaldina_data))
                print(kaldina_data)
            
            for word_data in kaldi_data_json['transcription']:
                if word_data['word'] == expected_text:
                    time_taken = word_data['end_time'] - word_data['start_time']
                    time_taken_list.append(time_taken)
                
    total_time = 0
    for t in time_taken_list:
        total_time += t
    
    if label == 1:
        total_time_1 += total_time
        count_1 += len(time_taken_list)
    else:
        total_time_0 += total_time
        count_0 += len(time_taken_list)

avg_time_1 = total_time_1 / count_1 if count_1 > 0 else 0
avg_time_0 = total_time_0 / count_0 if count_0 > 0 else 0

print("Average time taken for labels with 1:", avg_time_1)
print("Average time taken for labels with 0:", avg_time_0)


424
{"text": "", "transcription": [], "raw": {"data": [{"confidence": null, "log-like": 250.31170654296875, "text": []}], "model": "/home/kaldi/amira/am/amira-acoustic-model/final.mdl", "phrase-id": "our_gift_to_the_beach_2", "ref-conf": [["the", 0.0], ["beach", 0.0], ["was", 0.0], ["messy", 0.0], ["we", 0.0], ["wanted", 0.0], ["to", 0.0], ["help", 0.0], ["the", 0.0], ["beach", 0.0]], "status": "ok", "total-samples": 49197}}
424
{"text": "", "transcription": [], "raw": {"data": [{"confidence": null, "log-like": 250.31170654296875, "text": []}], "model": "/home/kaldi/amira/am/amira-acoustic-model/final.mdl", "phrase-id": "our_gift_to_the_beach_2", "ref-conf": [["the", 0.0], ["beach", 0.0], ["was", 0.0], ["messy", 0.0], ["we", 0.0], ["wanted", 0.0], ["to", 0.0], ["help", 0.0], ["the", 0.0], ["beach", 0.0]], "status": "ok", "total-samples": 49197}}
424
{"text": "", "transcription": [], "raw": {"data": [{"confidence": null, "log-like": 250.31170654296875, "text": []}], "model": "/home/kald

Average time taken for labels with 1: 0.40954961710195475
Average time taken for labels with 0: 0.4912787150087315

In [109]:
import pandas as pd

labels_df = pd.read_csv('labels.csv')
asr_df = pd.read_csv('asr_data.csv')

word_info_1 = {}
word_info_0 = {}

for index, row in labels_df.iterrows():
    activity_id = row['activityId']
    phrase_index = row['phraseIndex']
    word_index = row['word_index']
    expected_text = row['expected_text']
    label = row['label']
    
    matching_rows = asr_df[(asr_df['activityId'] == activity_id) & 
                           (asr_df['phrase_index'] == phrase_index)]
    
    time_taken_list = []
    
    for _, asr_row in matching_rows.iterrows():
        kaldina_data = asr_row['kaldiNa_data']
        
        if not pd.isnull(kaldina_data) and len(kaldina_data) > 0:
            try:
                kaldi_data_json = eval(kaldina_data)  # Convert JSON string to dictionary
            except:
                print(len(kaldina_data))
                print(kaldina_data)
            
            for word_data in kaldi_data_json['transcription']:
                if word_data['word'] == expected_text:
                    time_taken = word_data['end_time'] - word_data['start_time']
                    time_taken_list.append(time_taken)
                    
                    word_info = word_info_1 if label == 1 else word_info_0
                    if expected_text in word_info:
                        word_info[expected_text]['total_time'] += time_taken
                        word_info[expected_text]['count'] += 1
                    else:
                        word_info[expected_text] = {'total_time': time_taken, 'count': 1}
                
    # Calculate the total time taken for the word
    total_time = 0
    for t in time_taken_list:
        total_time += t
    
# Calculate average time taken for labels with 1 and 0
avg_time_1 = total_time_1 / count_1 if count_1 > 0 else 0
avg_time_0 = total_time_0 / count_0 if count_0 > 0 else 0

sorted_words_1 = sorted(word_info_1.items(), key=lambda x: x[1]['count'], reverse=True)
sorted_words_0 = sorted(word_info_0.items(), key=lambda x: x[1]['count'], reverse=True)

print("Top incorrect words and their average time taken:")
for word, info in sorted_words_0[:10]:
    avg_time = info['total_time'] / info['count']
    print(f"Word: {word}, Count: {info['count']}, Average Time Taken: {avg_time} seconds")

print("\nTop correct words and their average time taken:")
for word, info in sorted_words_1[:10]:
    avg_time = info['total_time'] / info['count']
    print(f"Word: {word}, Count: {info['count']}, Average Time Taken: {avg_time} seconds")


424
{"text": "", "transcription": [], "raw": {"data": [{"confidence": null, "log-like": 250.31170654296875, "text": []}], "model": "/home/kaldi/amira/am/amira-acoustic-model/final.mdl", "phrase-id": "our_gift_to_the_beach_2", "ref-conf": [["the", 0.0], ["beach", 0.0], ["was", 0.0], ["messy", 0.0], ["we", 0.0], ["wanted", 0.0], ["to", 0.0], ["help", 0.0], ["the", 0.0], ["beach", 0.0]], "status": "ok", "total-samples": 49197}}
424
{"text": "", "transcription": [], "raw": {"data": [{"confidence": null, "log-like": 250.31170654296875, "text": []}], "model": "/home/kaldi/amira/am/amira-acoustic-model/final.mdl", "phrase-id": "our_gift_to_the_beach_2", "ref-conf": [["the", 0.0], ["beach", 0.0], ["was", 0.0], ["messy", 0.0], ["we", 0.0], ["wanted", 0.0], ["to", 0.0], ["help", 0.0], ["the", 0.0], ["beach", 0.0]], "status": "ok", "total-samples": 49197}}
424
{"text": "", "transcription": [], "raw": {"data": [{"confidence": null, "log-like": 250.31170654296875, "text": []}], "model": "/home/kald

In [110]:
sorted_words_0

[('the', {'total_time': 62.07175984978676, 'count': 219}),
 ('to', {'total_time': 13.822248339653015, 'count': 42}),
 ('they', {'total_time': 12.245291352272034, 'count': 30}),
 ('and', {'total_time': 8.136034727096558, 'count': 29}),
 ('in', {'total_time': 8.374390125274658, 'count': 28}),
 ('a', {'total_time': 12.057051487267017, 'count': 27}),
 ('was', {'total_time': 11.042044162750244, 'count': 26}),
 ('their', {'total_time': 8.620776891708374, 'count': 23}),
 ('like', {'total_time': 9.091710329055786, 'count': 22}),
 ('i', {'total_time': 7.651761114597321, 'count': 22}),
 ('cut', {'total_time': 11.243272304534912, 'count': 22}),
 ('is', {'total_time': 5.583200216293335, 'count': 21}),
 ('or', {'total_time': 10.059689044952393, 'count': 21}),
 ('its', {'total_time': 6.694409132003784, 'count': 20}),
 ('hop', {'total_time': 9.088325142860413, 'count': 20}),
 ('were', {'total_time': 7.723349332809448, 'count': 19}),
 ('we', {'total_time': 8.019213318824768, 'count': 19}),
 ('an', {'t

In [111]:
sorted_words_1

[('the', {'total_time': 1489.67365424335, 'count': 5992}),
 ('a', {'total_time': 565.5008073300123, 'count': 2390}),
 ('to', {'total_time': 411.55913849920034, 'count': 1560}),
 ('and', {'total_time': 472.3986601382494, 'count': 1517}),
 ('i', {'total_time': 520.3269797936082, 'count': 1418}),
 ('in', {'total_time': 323.7312120050192, 'count': 1228}),
 ('of', {'total_time': 260.7015891075134, 'count': 1041}),
 ('is', {'total_time': 307.02325972914696, 'count': 917}),
 ('they', {'total_time': 234.37919472903013, 'count': 769}),
 ('are', {'total_time': 230.67101150006056, 'count': 742}),
 ('it', {'total_time': 136.7015202641487, 'count': 543}),
 ('on', {'total_time': 168.56786536425352, 'count': 493}),
 ('she', {'total_time': 172.2128221988678, 'count': 478}),
 ('their', {'total_time': 150.88655281066895, 'count': 476}),
 ('was', {'total_time': 178.44330203533173, 'count': 475}),
 ('you', {'total_time': 139.05753219127655, 'count': 455}),
 ('he', {'total_time': 137.10297655314207, 'count

In [113]:
import json

def load_arpabet_to_amirabet(filepath):
    with open(filepath, 'r') as f:
        return json.load(f)

def load_all_story_words(filepath):
    with open(filepath, 'r') as f:
        return {line.split()[0]: ' '.join(line.split()[1:]) for line in f}

def get_phonetic_expression(sentence, arpabet_to_amirabet, all_story_words):
    words = sentence.split()
    phonetic_expression = []
    for word in words:
        if word.upper() in all_story_words:
            arpabet = all_story_words[word.upper()]
            amirabet = [arpabet_to_amirabet[arp] for arp in arpabet.split() if arp in arpabet_to_amirabet]
            phonetic_expression.append(''.join(amirabet))
    return ' '.join(phonetic_expression)

def get_sentence_phonetic_expression(sentence):
    arpabet_to_amirabet_filepath = 'arpabet_to_amirabet.json'
    all_story_words_filepath = 'all_story_words.dic'

    arpabet_to_amirabet = load_arpabet_to_amirabet(arpabet_to_amirabet_filepath)
    all_story_words = load_all_story_words(all_story_words_filepath)

    return get_phonetic_expression(sentence, arpabet_to_amirabet, all_story_words)

# Example usage
sentence = "I love water. I enjoy playing with water"
phonetic_expression = get_sentence_phonetic_expression(sentence)
print(phonetic_expression)

phonetic_expressions_of_incorrectly_spelled_words = []
for word_details in sorted_words_0:
    phonetic_expression = get_sentence_phonetic_expression(word_details[0])
    phonetic_expressions_of_incorrectly_spelled_words.append(phonetic_expression)

print(phonetic_expressions_of_incorrectly_spelled_words)

γ lʌv γ ɛnjω plaɪŋ wɪθ wɔtɝ
['θʌ', 'tu', 'ða', 'ænd', 'ɪn', 'ʌ', 'wɑz', 'ðɛɹ', 'lγk', 'γ', 'kʌt', 'ɪz', 'ɔɹ', 'ɪts', 'hɑp', 'wɝ', 'wi', 'ʌn', 'ɑɹ', 'ɪt', 'ʌs', 'ɔn', 'si', 'kæn', 'doz', 'wʌn', 'yuɑn', 'wɔtɝ', 'bɔt', 'ɹan', 'yu', 'ʌnʌðɝ', 'kɛɹʌlγnʌ', 'hɛɹ', 'fɔɹ', 'hæd', 'kati', 'θɹu', 'æt', 'sɔ', 'wɛn', 'spɑɹks', 'lɪv', 'wʌt', 'pʌps', 'hi', 'lʊkt', 'wɪθ', 'ʌmɛɹʌkʌn', 'wʊd', 'ðʌt', 'ʌv', 'nɑt', 'ʃi', 'ðɛn', 'ʌminʌ', 'wɪl', 'gɹæshɑpɝz', 'hæz', 'wɛnt', 'it', 'hæv', 'yɔɹ', 'kæt', 'ʌðɝ', 'hɑɹd', 'du', 'ɹæn', 'ðo', 'ɛksɝsγz', 'sɪz', 'pʌp', 'mɑnɑɹk', 'gɹu', 'plasɪz', 'ðɪs', 'gɹændmɑ', 'wɪʃ', 'mad', 'mγ', 'ʌbαt', 'pla', 'ðɛɹ', 'sad', 'bɑni', 'fild', 'yuz', 'mɪks', 'bæt', 'kɛnt', 'naoki', 'kʌdim', 'mγt', 'ɹʌn', 'kʊd', 'sæk', 'gɹɪnd', 'wɝld', 'kɑtiʌ', 'lʊk', 'no', 'læm', 'ʃɛlz', 'nam', 'mun', 'flαɝ', 'fɑɹli', 'okɹʌ', 'ɑntu', 'kɑlʌnɪsts', 'fɹʌm', 'pɹa', 'wok', 'livz', 'stænli', 'mɛni', 'bɛn', 'kol', 'vɑlkano', 'ðɛm', 'ɹγdɪŋ', 'mʌg', 'mæn', 'tɹi', 'wɑnt', 'hʌndɹʌd', 'bi', 'pʌl', 'ɹol', 'kɹæbz', 's

In [115]:
import pickle
with open('error_phonemes.pickle', 'wb') as f:
    pickle.dump(phonetic_expressions_of_incorrectly_spelled_words, f)

In [118]:
from collections import Counter

def find_top_symbols(phonetic_expressions, n=1, top=5):
    symbol_counts = Counter()
    for expression in phonetic_expressions:
        symbols = expression.split()  # Split the expression into symbols
        for i in range(len(symbols) - n + 1):
            # Count occurrences of n consecutive symbols
            symbol_counts[tuple(symbols[i:i+n])] += 1
            
    # Get the top n occurrences
    top_symbols = symbol_counts.most_common(top)
    return top_symbols

# Example list of phonetic expressions

# Find the top 5 pairs of consecutive symbols
top_5_pairs = find_top_symbols(phonetic_expressions_of_incorrectly_spelled_words, n=1, top=5)
print("Top 5 pairs of consecutive symbols:", top_5_pairs)

# Find the top 5 triplets of consecutive symbols
top_5_triplets = find_top_symbols(phonetic_expressions_of_incorrectly_spelled_words, n=3, top=5)
print("Top 5 triplets of consecutive symbols:", top_5_triplets)


Top 5 pairs of consecutive symbols: [(('tu',), 3), (('ðɛɹ',), 2), (('hɛɹ',), 2), (('θɹu',), 2), (('wʊd',), 2)]
Top 5 triplets of consecutive symbols: []


In [121]:
from collections import Counter
import re

def find_top_symbols(phonetic_expressions, n=1, top=5):
    symbol_counts = Counter()
    for expression in phonetic_expressions:
        symbols = re.findall('.{%d}' % n, expression)  # Split the expression into symbols of length n
        for i in range(len(symbols) - n + 1):
            # Count occurrences of n consecutive symbols
            symbol_counts[tuple(symbols[i:i+n])] += 1
            
    # Get the top n occurrences
    top_symbols = symbol_counts.most_common(top)
    return top_symbols

# Example list of phonetic expressions
# Find the top 2 pairs of consecutive symbols
top_1_pairs = find_top_symbols(phonetic_expressions_of_incorrectly_spelled_words, n=1, top=2)
print("Top 2 pairs of consecutive symbols:", top_1_pairs)

top_2_pairs = find_top_symbols(phonetic_expressions_of_incorrectly_spelled_words, n=2, top=2)
print("Top 2 pairs of consecutive symbols:", top_2_pairs)

top_3_pairs = find_top_symbols(phonetic_expressions_of_incorrectly_spelled_words, n=3, top=2)
print("Top 2 pairs of consecutive symbols:", top_3_pairs)

Top 2 pairs of consecutive symbols: [(('ʌ',), 458), (('n',), 410)]
Top 2 pairs of consecutive symbols: [(('kʌ', 'nt'), 6), (('hɛ', 'lp'), 4)]
Top 2 pairs of consecutive symbols: [(('ɔɹg', 'ʌnɪ', 'zʌm'), 2), (('gɹæ', 'shɑ', 'pɝz'), 1)]


Top 2 pairs of consecutive symbols: [(('ʌ',), 458), (('n',), 410)]
Top 2 pairs of consecutive symbols: [(('kʌ', 'nt'), 6), (('hɛ', 'lp'), 4)]
Top 2 pairs of consecutive symbols: [(('ɔɹg', 'ʌnɪ', 'zʌm'), 2), (('gɹæ', 'shɑ', 'pɝz'), 1)]
