In [53]:
import pandas as pd
import os

csv_file = input("CSV file name (no extension): ")
df = pd.read_csv(csv_file + ".csv")

# remove all already processed rows, then remove the 'processed' and 'Timestamp' columns
df = df[df.processed != 1]
df.drop(['processed', 'Timestamp'], axis = 1, inplace = True)

# Define the canton names and abbreviations in alphabetical order
canton_dict = {
    'Aargau': 'ag',
    'Appenzell Ausserrhoden': 'ar',
    'Appenzell Innerrhoden': 'ai',
    'Basel-Landschaft': 'bl',
    'Basel-Stadt': 'bs',
    'Bern': 'be',
    'Freiburg': 'fr',
    'Glarus': 'gl',
    'Graubünden': 'gr',
    'Jura': 'ju',
    'Luzern': 'lu',
    'Nidwalden': 'nw',
    'Obwalden': 'ow',
    'Schaffhausen': 'sh',
    'Schwyz': 'sz',
    'Solothurn': 'so',
    'St. Gallen': 'sg',
    'Thurgau': 'tg',
    'Tessin': 'ti',
    'Uri': 'ur',
    'Wallis': 'vs',
    'Zug': 'zg',
    'Zürich': 'zh'
}

# replace the 'canton' values in the dataframe with their 2-letter abbreviations
df['canton'] = df['canton'].replace(canton_dict)

# create a folder for the output, unless it exists already
if not os.path.exists('parsed_' + csv_file):
    os.mkdir('parsed_' + csv_file)

total_translations = 0

# print progress header
print("\ncanton\tlength")

# create a separate dataframe for each unique value in the 'canton' column
for canton in df['canton'].unique():
    canton_df = df[df['canton'] == canton]
    
    # remove the now unnecessary 'canton' column
#     canton_df.drop(['canton'], axis = 1, inplace = True)
    
    # create new dataframe with de, gsw columns
    new_df = pd.DataFrame()
    new_df[['de', 'gsw']] = None
    
    # iterate over current column
    for col in canton_df.columns:
        
        if col == 'canton':
            continue
        
        # iterate over cells in the current column
        for idx, cell in canton_df[col].items():
            
            # if the cell isn't empty, add a new de/gsw row
            if not pd.isna(cell):
                new_df.loc[len(new_df)] = [col.strip(), cell.strip()]
                total_translations += 1
    
    # print progress info
    new_df.to_csv(f"parsed_{csv_file}/{canton}.csv", index = False)
    print(f"{canton}\t{new_df.size}")
    
print(f"Job done for {total_translations} translations from {df['canton'].unique().size} cantons")

CSV file name (no extension): verbs

canton	length
zh	4146
so	406
ag	1792
be	582
bl	502
sg	786
gr	316
fr	264
zg	714
bs	236
lu	332
tg	268
ow	386
sz	236
Job done for 5483 translations from 14 cantons
