In [None]:
import os 
import pandas as pd
from tqdm import tqdm

In [None]:
# Load every per-part-of-speech CSV into memory, keyed by its file path.
df_dict = {}
dataframe_dir = '../dataframes/dataframes_by_pos'
# os.listdir returns entries in arbitrary order and includes hidden entries
# (e.g. macOS .DS_Store). Sort for a reproducible processing order and skip
# dot-entries so they cannot trip the checks below.
for pos in sorted(os.listdir(dataframe_dir)):
    if pos.startswith('.'):
        continue
    pos_dir = os.path.join(dataframe_dir, pos)
    for file in sorted(os.listdir(pos_dir)):
        if file.startswith('.'):
            continue
        path = os.path.join(pos_dir, file)
        # Anything that is not a regular .csv file is unexpected: stop and
        # report which path is at fault.
        assert os.path.isfile(path), f'Expected a regular file: {path}'
        assert file.endswith('.csv'), f'Expected a .csv file: {path}'
        df_dict[path] = pd.read_csv(path)

In [None]:
# Reference table used to look up the audio hashes; the commented-out line
# keeps the alternative input file (combined_and_reorganized.csv) at hand.
# ref_df = pd.read_csv('/Users/stevie/repos/lingo_kit_data/dataframes/combined_and_reorganized.csv')
ref_df = pd.read_csv(
    '/Users/stevie/repos/lingo_kit_data/dataframes/dataframe_with_audio.csv'
)
# Show the row count and the available columns as the cell output.
(len(ref_df), ref_df.columns)

In [None]:
# Canonical part-of-speech tags, and every accepted spelling mapped onto them.
allowed_pos = {'adj', 'adv', 'art', 'conj', 'det', 'noun', 'prep', 'pron', 'verb'}
pos_mapping = {
    'adj': 'adj', 'adjective': 'adj', 'adjectives': 'adj',
    'adv': 'adv', 'adverb': 'adv', 'adverbs': 'adv',
    'art': 'art', 'article': 'art', 'articles': 'art',
    'conj': 'conj', 'conjunction': 'conj', 'conjunctions': 'conj',
    'det': 'det', 'determiner': 'det', 'determiners': 'det',
    'noun': 'noun', 'nouns': 'noun',
    'prep': 'prep', 'preposition': 'prep', 'prepositions': 'prep',
    'pron': 'pron', 'pronoun': 'pron', 'pronouns': 'pron',
    'verb': 'verb', 'verbs': 'verb',
}


def normalize_pos(value):
    """Return the canonical short tag for a part_of_speech value.

    Missing values (anything ``pd.isna`` reports as missing) are returned
    unchanged. Otherwise the value is converted to a string, stripped of
    surrounding whitespace and lower-cased before being looked up.

    Raises:
        ValueError: if the cleaned value is neither a key of ``pos_mapping``
            nor a member of ``allowed_pos``.
    """
    if pd.isna(value):
        return value
    key = str(value).strip().lower()
    canonical = pos_mapping.get(key)
    if canonical is not None:
        return canonical
    # Fallback for canonical tags that have no entry in pos_mapping.
    if key in allowed_pos:
        return key
    raise ValueError(f"Unexpected part_of_speech value: {value!r}")

In [None]:
# Count the reference rows that have no Italian audio hash.
ref_df['italian_audio_hash'].isnull().sum()

In [None]:
# For every row of every per-POS dataframe: normalize part_of_speech, find the
# matching row in ref_df and copy its audio hashes over. Each dataframe is
# written back to its CSV once, after all of its rows have been matched.
match_columns = [
    'example_sentence_english',
    'example_sentence_italian',
    'term_italian',
    'translation_english',
]
for path, df in tqdm(df_dict.items(), total=len(df_dict)):
    # Results are collected per row and assigned as whole columns afterwards,
    # so df is only modified once every row has found its reference match.
    normalized_pos = []
    italian_hashes = []
    english_hashes = []
    for _, row in df.iterrows():
        pos = normalize_pos(row['part_of_speech'])
        # Narrow ref_df one field at a time; a reference row matches only when
        # all four text fields and the normalized part of speech are equal.
        # NOTE(review): `==` is never true for NaN, so a row with a missing
        # key field ends up in the "No match" branch below.
        # NOTE(review): ref_df['part_of_speech'] is compared as-is, i.e. it is
        # assumed to already hold normalized tags - confirm.
        sel_df = ref_df
        for column in match_columns:
            sel_df = sel_df[sel_df[column] == row[column]]
        sel_df = sel_df[sel_df['part_of_speech'] == pos]
        if len(sel_df) == 0:
            print(f'No match for {row["example_sentence_english"]} / {row["example_sentence_italian"]}')
            print(path)
            raise Exception('No match found')
        if len(sel_df) > 1:
            print(f'Multiple matches for {row["example_sentence_english"]} / {row["example_sentence_italian"]}')
            print(path)
            print(sel_df)
            # Duplicate reference rows are tolerated only when they agree on
            # both hashes, so taking the first one below is unambiguous.
            assert sel_df['italian_audio_hash'].nunique() == 1, \
                'Duplicate reference rows disagree on italian_audio_hash'
            assert sel_df['english_audio_hash'].nunique() == 1, \
                'Duplicate reference rows disagree on english_audio_hash'
        first_match = sel_df.iloc[0]
        normalized_pos.append(pos)
        italian_hashes.append(first_match['italian_audio_hash'])
        english_hashes.append(first_match['english_audio_hash'])
    df['part_of_speech'] = normalized_pos
    df['italian_audio_hash'] = italian_hashes
    df['english_audio_hash'] = english_hashes
    # Write once per file. Writing inside the row loop rewrote the whole CSV
    # for every row and left a half-updated file behind when a later row failed.
    df.to_csv(path, index=False)