In [11]:
import os
import shutil
import pandas as pd

In [12]:
# get all csv files recursively
def get_csv_files(dir_path):
    """Return the sorted paths of every CSV file found beneath ``dir_path``.

    The directory tree is traversed recursively with ``os.walk``. A file
    counts as CSV when its name ends in ``.csv``, compared case-insensitively.

    Args:
        dir_path: Root directory to search.

    Returns:
        A list of paths (each directory joined with the file name), sorted
        in ascending string order. The list is empty when nothing matches.
    """
    matches = [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(dir_path)
        for name in names
        if name.lower().endswith('.csv')
    ]
    # Sort so the result does not depend on the traversal order of os.walk.
    matches.sort()
    return matches

# Collect the sorted list of CSV paths under the dataframes directory.
all_files = get_csv_files(
    dir_path='/Users/stevie/repos/lingo_kit_data/dataframes/v1.1.1/dataframes',
)


In [13]:
# Read each discovered CSV into its own DataFrame, in the order given by all_files.
dataframes = [pd.read_csv(file_path) for file_path in all_files]
# Stack the frames into one with a fresh index. When no files were found,
# pd.concat is skipped and all_df is an empty DataFrame instead.
all_df = pd.concat(dataframes, ignore_index=True) if dataframes else pd.DataFrame()


In [14]:
# Remove the 'is_base' column; errors='ignore' makes this a no-op for frames without it.
all_df = all_df.drop(
    labels=['is_base'],
    axis='columns',
    errors='ignore',
)


In [15]:
# Canonicalize the part_of_speech column to a fixed set of short tags.
allowed_pos = {'adj', 'adv', 'art', 'conj', 'det', 'noun', 'prep', 'pron', 'verb'}
# Accepted spellings (short tag, singular, plural) -> canonical short tag.
pos_mapping = {
    'adj': 'adj', 'adjective': 'adj', 'adjectives': 'adj',
    'adv': 'adv', 'adverb': 'adv', 'adverbs': 'adv',
    'art': 'art', 'article': 'art', 'articles': 'art',
    'conj': 'conj', 'conjunction': 'conj', 'conjunctions': 'conj',
    'det': 'det', 'determiner': 'det', 'determiners': 'det',
    'noun': 'noun', 'nouns': 'noun',
    'prep': 'prep', 'preposition': 'prep', 'prepositions': 'prep',
    'pron': 'pron', 'pronoun': 'pron', 'pronouns': 'pron',
    'verb': 'verb', 'verbs': 'verb',
}
if "part_of_speech" in all_df.columns:
    def normalize_pos(value):
        """Return the canonical part-of-speech tag for a single cell value.

        Missing values are passed through unchanged. Any other value is
        converted to text, stripped of surrounding whitespace and lower-cased,
        then looked up in ``pos_mapping``; text that is already a member of
        ``allowed_pos`` is accepted as-is.

        Raises:
            ValueError: if the cleaned text is neither a known spelling nor
                an allowed tag.
        """
        if pd.isna(value):
            return value
        cleaned = str(value).strip().lower()
        if cleaned in pos_mapping:
            tag = pos_mapping[cleaned]
        elif cleaned in allowed_pos:
            tag = cleaned
        else:
            raise ValueError(f"Unexpected part_of_speech value: {value!r}")
        return tag

    all_df['part_of_speech'] = all_df['part_of_speech'].apply(normalize_pos)

    # Defensive re-check: every non-missing value must now be an allowed tag.
    unexpected_pos = set(all_df['part_of_speech'].dropna().unique()).difference(allowed_pos)
    if unexpected_pos:
        raise ValueError(f"Unexpected part_of_speech values after normalization: {sorted(unexpected_pos)}")


In [16]:
# Canonicalize the gender column to the codes 'm', 'f' or 'n/a'.
allowed_gender = {'m', 'f', 'n/a'}
# Long-form / blank spellings -> canonical code.
gender_mapping = {
    'masculine': 'm',
    'feminine': 'f',
    '': 'n/a',
    'none': 'n/a',
}
if "gender" in all_df.columns:
    def normalize_gender(value):
        """Return the canonical gender code for a single cell value.

        Missing values become ``'n/a'``. Any other value is converted to
        text, stripped of surrounding whitespace and lower-cased, then
        translated through ``gender_mapping``; text that is already a member
        of ``allowed_gender`` is accepted as-is.

        Raises:
            ValueError: if the cleaned text is neither a known spelling nor
                an allowed code.
        """
        if pd.isna(value):
            return 'n/a'
        cleaned = str(value).strip().lower()
        if cleaned in gender_mapping:
            code = gender_mapping[cleaned]
        elif cleaned in allowed_gender:
            code = cleaned
        else:
            raise ValueError(f"Unexpected gender value: {value!r}")
        return code

    all_df['gender'] = all_df['gender'].apply(normalize_gender)

    # Defensive re-check: every non-missing value must now be an allowed code.
    unexpected_gender = set(all_df['gender'].dropna().unique()).difference(allowed_gender)
    if unexpected_gender:
        raise ValueError(f"Unexpected gender values after normalization: {sorted(unexpected_gender)}")


In [17]:
# Canonicalize the plurality column to the codes 's', 'p' or 'n/a'.
allowed_plurality = {'s', 'p', 'n/a'}
# Long-form / blank spellings -> canonical code.
plurality_mapping = {
    'plural': 'p',
    'singular': 's',
    '': 'n/a',
    'none': 'n/a',
}
if "plurality" in all_df.columns:
    def normalize_plurality(value):
        """Return the canonical plurality code for a single cell value.

        Missing values become ``'n/a'``. Any other value is converted to
        text, stripped of surrounding whitespace and lower-cased, then
        translated through ``plurality_mapping``; text that is already a
        member of ``allowed_plurality`` is accepted as-is.

        Raises:
            ValueError: if the cleaned text is neither a known spelling nor
                an allowed code.
        """
        if pd.isna(value):
            return 'n/a'
        cleaned = str(value).strip().lower()
        if cleaned in plurality_mapping:
            code = plurality_mapping[cleaned]
        elif cleaned in allowed_plurality:
            code = cleaned
        else:
            raise ValueError(f"Unexpected plurality value: {value!r}")
        return code

    all_df['plurality'] = all_df['plurality'].apply(normalize_plurality)

    # Defensive re-check: every non-missing value must now be an allowed code.
    unexpected_plurality = set(all_df['plurality'].dropna().unique()).difference(allowed_plurality)
    if unexpected_plurality:
        raise ValueError(f"Unexpected plurality values after normalization: {sorted(unexpected_plurality)}")


In [18]:
# Ensure both audio-hash columns exist and hold '' instead of missing values:
# an existing column has its gaps filled, an absent one is created as all ''.
for column in ('italian_audio_hash', 'english_audio_hash'):
    all_df[column] = all_df[column].fillna('') if column in all_df.columns else ''


In [19]:
# Write the combined frame to a CSV in the current working directory, omitting the index column.
all_df.to_csv(
    path_or_buf='combined_and_reorganized.csv',
    index=False,
)