## Notebook for transforming original data to tsv format

The notebook uses the ```.conllu``` files to create ```.tsv``` files containing metadata, full text and lemmatized text. Original data can be downloaded [here](https://www.clarin.si/repository/xmlui/handle/11356/1431). Don't forget to reference ParlaMint!

In [1]:
import fnmatch
from glob import glob as gb
from conllu import parse
import pandas as pd
import numpy as np
import os 

data_path = "path/to/original/data"
data_transformed_path = "path/to/transformed/data"

In [3]:
def parse_conllu_year(language,year,folder):
    filenames = gb(f"{folder}/*.conllu")
    file_id = []
    doc_id = []
    text = []
    lemmatized_text = []

    for file in sorted(filenames):

        if fnmatch.fnmatch(file.split("/")[-1], ".*") or file.startswith(
                "~$"):
            continue
        with open(file, 'r') as f:
            doc = f.read()
        sentences = parse(doc)
        utterance = None
        for sentence in sentences:
            if "newdoc id" in sentence.metadata.keys():
                utterance = sentence.metadata["newdoc id"]
            file_id.append(file)
            doc_id.append(utterance)
            text.append(sentence.metadata['text'])
            lemmatized_text.append(" ".join(["_".join([w['lemma'],w['upos']]) for w in sentence if w['lemma'].isalnum()]))

    df = pd.DataFrame(data=np.column_stack([file_id, doc_id, text,lemmatized_text]),
                      columns=['file', 'ID', 'text','lemmatized'])
    df.to_csv(os.path.join(data_transformed_path,language,f"ParlaMint-{language}-{year}.csv"), index=False)

    meta_all = []
    
    for i, file in enumerate(sorted(gb(f"{folder}/*-meta.tsv"))):
        if fnmatch.fnmatch(file.split("/")[-1], ".*") or file.startswith("~$"):
            continue
        meta = pd.read_csv(file, delimiter="\t")
        meta_all.append(meta)
    
    meta_all = pd.concat(meta_all)
    meta_all.to_csv(os.path.join(data_transformed_path,language,f"ParlaMint-{language}-{year}-meta.csv"),index=False)
    return os.path.join(data_transformed_path,language,f"ParlaMint-{language}-{year}.csv"), os.path.join(data_transformed_path,language,f"ParlaMint-{language}-{year}-meta.csv")

def parse_language_year(lan):
    print('working on',lan)
    if os.path.exists(os.path.join(data_transformed_path,lan)) == False:
        os.mkdir(os.path.join(data_transformed_path,lan))
    fp = os.path.join(data_path,f"ParlaMint-{lan.upper()}.conllu")
    if os.path.exists(fp) == False:
        print('path does not exist')

    if "2020" in os.listdir(fp):
        print(lan,"years found:",", ".join([x for x in os.listdir(fp) if len(x) == 4 and "20" in x]))
        for year in [x for x in os.listdir(fp) if len(x) == 4 and "20" in x]:
            print('\t working on',year)

            year_folder = os.path.join(fp,year)
            new_df,new_meta_df = parse_conllu(lan,year,year_folder)
            df = pd.read_csv(new_df)
            dfm = pd.read_csv(new_meta_df)
            dft = df.groupby(["ID"]).agg({"text": lambda x: " ".join(x)})
            dfl = df.groupby(["ID"]).agg({"lemmatized": lambda x: " ".join(x)})
            del df
            dft['lemmatized'] = dfl['lemmatized']
            del dfl
            df = pd.merge(dft, dfm, on="ID", how="outer")
            df.to_csv(os.path.join(data_transformed_path,lan,f"ParlaMint-{lan}-{year}.csv"),index=False)
            os.remove(new_meta_df)
            del df
    else:
        print('all files in one folder:','aborting')

def parse_conllu_month(language,month):
    folder = os.path.join(data_path,f"ParlaMint-{language.upper()}.conllu",month[:4])

    filenames = gb(f"{folder}/*.conllu")
    filenames = [f for f in filenames if month in f]
    
    # Text data
    file_id = []
    doc_id = []
    text = []
    lemmatized_text = []

    for file in sorted(filenames):

        if fnmatch.fnmatch(file.split("/")[-1], ".*") or file.startswith(
                "~$"):
            continue
        with open(file, 'r') as f:
            doc = f.read()
        sentences = parse(doc)
        utterance = None
        for sentence in sentences:
            if "newdoc id" in sentence.metadata.keys():
                utterance = sentence.metadata["newdoc id"]
            file_id.append(file)
            doc_id.append(utterance)
            text.append(sentence.metadata['text'])
            lemmatized_text.append(" ".join([w['lemma'] for w in sentence if w['lemma'].isalnum()]))

    df = pd.DataFrame(data=np.column_stack([file_id, doc_id, text,lemmatized_text]),columns=['file', 'ID', 'text','lemmatized'])
    dft = df.groupby(["ID"]).agg({"text": lambda x: " ".join(x)})
    dfl = df.groupby(["ID"]).agg({"lemmatized": lambda x: " ".join(x)})
    del df
    dft['lemmatized'] = dfl['lemmatized']
    del dfl

    ## Metadata
    meta_all = []
    filenames = gb(f"{folder}/*meta.tsv")
    filenames = [f for f in filenames if month in f and "meta" in f]

    for i, file in enumerate(sorted(filenames)):
        if fnmatch.fnmatch(file.split("/")[-1], ".*") or file.startswith("~$"):
            continue
        meta = pd.read_csv(file, delimiter="\t")
        meta_all.append(meta)
    
    meta_all = pd.concat(meta_all)
    df = pd.merge(dft, meta_all, on="ID", how="outer")
    df.to_csv(os.path.join(data_transformed_path,language,f"ParlaMint-{language}-{month}.csv"),index=False)


def parse_language_month(language):
    print('working on',language)
    if os.path.exists(os.path.join(data_transformed_path,language)) == False:
        os.mkdir(os.path.join(data_transformed_path,language))
    fp = os.path.join(data_path,f"ParlaMint-{language.upper()}.conllu")
    if os.path.exists(fp) == False:
        print('path does not exist')

    if "2020" in os.listdir(fp):
        print(language,"years found:",", ".join([x for x in os.listdir(fp) if len(x) == 4 and "20" in x]))
        for year in [x for x in os.listdir(fp) if len(x) == 4 and "20" in x]:

            year_folder = os.path.join(fp,year)
            for month in set([f.split('_')[1][:7] for f in gb(year_folder + "/*")]):
                print('\t working on',month)
                parse_conllu_month(language,month)

    else:
        print('all files in one folder:','aborting')

In [5]:
for l in "pl be cz dk gb it".split(' '):
    parse_language_month(l)

working on pl
pl years found: 2015, 2016, 2017, 2018, 2019, 2020
	 working on 2015-12
	 working on 2015-11
	 working on 2016-01
	 working on 2016-09
	 working on 2016-02
	 working on 2016-08
	 working on 2016-04
	 working on 2016-07
	 working on 2016-12
	 working on 2016-05
	 working on 2016-10
	 working on 2016-03
	 working on 2016-11
	 working on 2016-06
	 working on 2017-09
	 working on 2017-02
	 working on 2017-10
	 working on 2017-11
	 working on 2017-01
	 working on 2017-07
	 working on 2017-05
	 working on 2017-12
	 working on 2017-04
	 working on 2017-06
	 working on 2017-03
	 working on 2018-02
	 working on 2018-01
	 working on 2018-06
	 working on 2018-09
	 working on 2018-05
	 working on 2018-04
	 working on 2018-10
	 working on 2018-07
	 working on 2018-03
	 working on 2018-12
	 working on 2018-11
	 working on 2019-10
	 working on 2019-09
	 working on 2019-02
	 working on 2019-04
	 working on 2019-11
	 working on 2019-05
	 working on 2019-03
	 working on 2019-07
	 working o