# In [None]:
"""
This script finds all CSV files containing dialect translation pairs in a directory and turns them into JSONs readable by DSW/didigen.
"""

import pandas as pd
import os
import glob
from datetime import date
import json

# function that finds all CSVs in a directory
def find_csv_filenames(path_to_dir, suffix=".csv"):
    """Return the names of the entries in *path_to_dir* that end with *suffix*.

    Only bare entry names are returned (not joined with *path_to_dir*), in the
    order produced by ``os.listdir``. The suffix comparison is case-sensitive.
    """
    matching = []
    for entry in os.listdir(path_to_dir):
        if entry.endswith(suffix):
            matching.append(entry)
    return matching

# function to generate json
def generate_json(data, dialect, nameSuffix, *, store=None, base_dir=None):
    """Aggregate translation pairs from *data* and dump them as a JSON file.

    Each row contributes one pair: the ``de`` column is the source word and
    the ``gsw`` column is its translation. Both values are converted with
    ``str()`` and stripped of surrounding whitespace. Identical pairs are
    counted, and the result is a list of entries shaped like::

        {"src": <source>, "target": [{"translation": <t>, "count": <n>}, ...]}

    Entries appear in order of first occurrence of the source; each
    ``target`` list is sorted by ``count`` in descending order, with ties kept
    in order of first occurrence.

    The list is stored under ``store[dialect]`` (replacing any previous value)
    and written to ``<base_dir>_parsed/<dialect><nameSuffix>.json``; the
    output directory is created if it does not exist.

    Args:
        data: DataFrame with at least the columns ``de`` and ``gsw``.
        dialect: Key under which the result is stored; also the first part of
            the output file name.
        nameSuffix: Text appended to *dialect* to form the output file name.
        store: Mapping that receives the result. Defaults to the module-level
            ``data_standard`` dict.
        base_dir: Directory name the ``_parsed`` output directory is derived
            from. Defaults to the module-level ``csv_dir``.
    """
    # Fall back to the module-level globals so existing positional callers
    # keep working unchanged.
    if store is None:
        store = data_standard
    if base_dir is None:
        base_dir = csv_dir

    # source -> {translation -> count}; dicts preserve insertion order, so
    # first-occurrence order is kept without scanning lists for every row.
    counts = {}
    for raw_src, raw_translation in zip(data['de'], data['gsw']):
        src = str(raw_src).strip()
        translation = str(raw_translation).strip()
        per_src = counts.setdefault(src, {})
        per_src[translation] = per_src.get(translation, 0) + 1

    entries = []
    for src, per_src in counts.items():
        targets = [
            {'translation': translation, 'count': count}
            for translation, count in per_src.items()
        ]
        # list.sort is stable, also with reverse=True, so equally frequent
        # translations stay in first-occurrence order.
        targets.sort(key=lambda target: target['count'], reverse=True)
        entries.append({'src': src, 'target': targets})

    store[dialect] = entries

    out_dir = base_dir + '_parsed'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(out_dir, exist_ok=True)

    out_path = os.path.join(out_dir, dialect + nameSuffix + '.json')
    with open(out_path, 'w', encoding="utf-8") as f:
        json.dump(entries, f, indent=4, ensure_ascii=False)

# csv_dir = input("Directory name: ")
# Directory that holds the input CSVs; output is written to "<csv_dir>_parsed".
csv_dir = 'words'

# CSV name without extension -> DataFrame read from that CSV.
all_data = {}
# Filled by generate_json(): dialect name -> list of aggregated entries.
data_standard = {}
data_dialect = {}  # not used anywhere in the visible code; kept in case other code reads it

# os.listdir() returns entries in arbitrary order; sorting makes the order of
# the combined "all" output reproducible between runs and machines.
filenames = sorted(find_csv_filenames(csv_dir))
if not filenames:
    # pd.concat() below would fail with an opaque ValueError on an empty list;
    # fail early with the same exception type and a clear message.
    raise ValueError(f"No CSV files found in directory {csv_dir!r}")

for name in filenames:
    # add the CSV as DataFrame keyed by file name without extension
    # NOTE(review): read_csv parses empty cells and strings such as "null" or
    # "NA" as missing values by default; the str conversions below then turn
    # them into the literal text "nan". Confirm whether
    # keep_default_na=False is wanted here.
    all_data[os.path.splitext(name)[0]] = pd.read_csv(os.path.join(csv_dir, name))

df_list = []

# go through all CSVs
for dialect, frame in all_data.items():
    # dialect words are compared case-insensitively: normalise to lower case
    frame['gsw'] = frame['gsw'].astype(str).str.lower()

    generate_json(frame, dialect, 'Standard')
    print(f'done\t{dialect}\tstandard')

    # swapping the two columns makes the dialect word the source
    generate_json(frame.rename(columns={'de': 'gsw', 'gsw': 'de'}), dialect, 'Dialect')
    print(f'done\t{dialect}\tdialect')

    df_list.append(frame)

all_data['all'] = pd.concat(df_list)
generate_json(all_data['all'], 'all', 'Standard')
print('done\tall\tstandard')

# set meta
# This must happen before the 'Dialect' run below: generate_json() stores its
# result under data_standard['all'] for both directions, so the later call
# overwrites the standard-keyed list counted here.
today = date.today()  # read once so year/month/day are always consistent
meta = {
    "uniqueStandard": len(data_standard['all']),
    "allWords": len(all_data['all']),
    "date": {
        "year": today.year,
        "month": today.month,
        "day": today.day,
    },
}

# the "<csv_dir>_parsed" directory already exists: generate_json() created it
with open(os.path.join(csv_dir + '_parsed', 'meta.json'), 'w', encoding="utf-8") as f:
    json.dump(meta, f, indent=4, ensure_ascii=False)

generate_json(all_data['all'].rename(columns={'de': 'gsw', 'gsw': 'de'}), 'all', 'Dialect')
print('done\tall\tdialect')