# Reading and Cleaning Datasets
Each of the DTRS datasets has a slightly different format, so this notebook reads all of them, cleans up the text, and saves them as CSV files in the following format:

`dataset ; session ; speaker ; utterance `




In [1]:
import re
import pandas as pd

In [2]:
# Read the newline-separated list of transcript file names in one step.
with open('data/filenames.txt') as fo:
    filenames = fo.read().split('\n')

In [3]:
def read_file_into_string(file_name, dir_name='data/'):
    """Return the entire text of the file at ``dir_name + file_name``.

    An empty ``file_name`` is never opened: a notice is printed and an
    empty string is returned instead.
    """
    if file_name == '':
        print('no file read')
        return ''
    # dir_name is used as a plain string prefix, so it must end with a separator.
    with open(dir_name + file_name) as source:
        return source.read()

In [4]:
def clean_text(input_text):
    """Flatten ``input_text`` onto a single line.

    Each carriage return, newline and tab is replaced by one space; every
    other character is left as it is.
    """
    whitespace_to_space = str.maketrans({'\r': ' ', '\n': ' ', '\t': ' '})
    # add other code to remove special characters that need removing here
    return input_text.translate(whitespace_to_space)

In [5]:
def break_lines(input_str):
    """Split transcript text into one string per speaker turn.

    A line break is inserted in front of every run of ASCII letters that is
    immediately followed by a colon (the shape of a speaker label such as
    ``MaleA:``), and the text is then split on line breaks. Note that any
    word followed by a colon inside an utterance starts a new segment too,
    and newlines already present in ``input_str`` also act as separators.

    Args:
        input_str: Transcript text to split.

    Returns:
        list of str: The stripped, non-empty segments in order of appearance.
    """
    # Raw strings are required here: "\g<0>" in a normal literal is an invalid
    # escape sequence. In the replacement template, "\n" is interpreted by
    # ``re`` as a newline and "\g<0>" as the whole match.
    text_with_breaks = re.sub(r"[a-zA-Z]+:", r"\n\g<0>", input_str)
    # Strip before testing for emptiness so whitespace-only segments (for
    # example leading spaces before the first label) are dropped rather than
    # being returned as empty strings.
    stripped_segments = (segment.strip() for segment in text_with_breaks.split('\n'))
    return [segment for segment in stripped_segments if segment]

In [6]:
def convert_text_to_columns(string_list, additional_columns):
    """Turn ``"speaker: utterance"`` lines into row lists.

    Args:
        string_list: Iterable of transcript lines.
        additional_columns: Values placed at the start of every output row.

    Returns:
        list of list: One row per line that contains a colon, laid out as
        ``additional_columns + [speaker, utterance]`` with speaker and
        utterance stripped of surrounding whitespace. Lines without a colon
        are reported on stdout and skipped.
    """
    data_columns = []
    leading_values = list(additional_columns)
    for line in string_list:
        # Only the first colon separates the speaker from the speech; any
        # later colons stay inside the utterance.
        speaker, separator, utterance = line.partition(':')
        if not separator:
            # Report the offending line itself (the previous code printed an
            # undefined name here, which raised NameError).
            print("weird format!")
            print(line)
            print("-----")
            continue
        data_columns.append(leading_values + [speaker.strip(), utterance.strip()])
    return data_columns

In [7]:
def write_csv(df, file_name, column_names, write_path='output/'):
    """Write ``df`` to ``write_path + file_name`` as a semicolon-separated CSV.

    Before writing, double quotes are removed from every string cell and
    semicolons are replaced by commas, so cell text cannot collide with the
    ``;`` delimiter or the CSV quote character. The caller's ``df`` is not
    modified.

    Args:
        df: DataFrame to write.
        file_name: Name of the output file, appended to ``write_path``.
        column_names: Columns to write, in output order.
        write_path: String prefix for the output location; it is concatenated
            with ``file_name`` as-is, so it must end with a path separator.
    """
    def _strip_delimiters(value):
        # Non-string cells (e.g. missing values) are passed through untouched.
        if isinstance(value, str):
            return value.replace('"', '').replace(';', ',')
        return value

    # The cleanup produces new data, so it has to be kept and written out;
    # the previous code discarded the result and wrote the uncleaned frame.
    cleaned = df.copy()
    for column in cleaned.columns:
        cleaned[column] = cleaned[column].map(_strip_delimiters)
    cleaned.to_csv(write_path + file_name, sep=';', columns=column_names, index=False, header=True)

In [8]:
def convert_file_to_dataframe(file_name,
                              dir_name='data/',
                              write_output_file=False,
                              output_dir='output/'):
    """Read one transcript file and return it as a four-column DataFrame.

    The dataset and session labels are taken from the file name: the first
    two hyphen-separated parts form the dataset, and the remaining parts, up
    to the first ``.``, form the session.

    Args:
        file_name: Name of the transcript file inside ``dir_name``.
        dir_name: String prefix of the input location.
        write_output_file: When true, the frame is also written with
            ``write_csv`` under the same ``file_name``.
        output_dir: String prefix of the output location used when
            ``write_output_file`` is true.

    Returns:
        pandas.DataFrame: Columns ``dataset``, ``session``, ``speaker`` and
        ``utterance``, one row per parsed speaker turn.
    """
    name_parts = file_name.split('-')
    dataset_name = '-'.join(name_parts[0:2])
    session_name = '-'.join(name_parts[2:]).split('.')[0]
    column_names = ['dataset', 'session', 'speaker', 'utterance']

    file_content = read_file_into_string(file_name, dir_name=dir_name)
    lines = break_lines(clean_text(file_content))
    text_columns = convert_text_to_columns(lines, [dataset_name, session_name])
    df = pd.DataFrame(text_columns, columns=column_names)

    if write_output_file:
        # Pass output_dir through; it was previously accepted but ignored, so
        # files always landed in write_csv's default location.
        write_csv(df, file_name, column_names, write_path=output_dir)
    return df

In [9]:
# Discard blank entries (empty or a single space) left over from splitting the file list.
filenames = [name for name in filenames if name not in ('', ' ')]

In [10]:
# Convert every transcript (writing one CSV per file along the way) and stack
# the results into a single frame.
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration; collecting the frames and concatenating once
# avoids both problems. Row labels are left as they are in each per-file
# frame, exactly as the append-based loop left them.
frames = []
for filename in filenames:
    frames.append(convert_file_to_dataframe(filename, write_output_file=True))
# pd.concat raises on an empty list; keep the previous "no files -> None" result.
df = pd.concat(frames) if frames else None

In [11]:
# Report the overall size, then show a five-row positional window as a spot check.
print(df.shape)
df.iloc[6240:6245]

(20902, 4)


Unnamed: 0,dataset,session,speaker,utterance
37,dtrs-07,engineers-meeting-02,MaleA,Erm you have to follow the You don't necessar...
38,dtrs-07,engineers-meeting-02,MaleE,So the quality of the image would depend on the-
39,dtrs-07,engineers-meeting-02,MaleA,The qual-
40,dtrs-07,engineers-meeting-02,MaleE,nominal speed and how you would match that if ...
41,dtrs-07,engineers-meeting-02,MaleA,Yeah absolutely yeah
