In [1]:
!pip install python-Levenshtein



In [2]:
import pandas as pd
import numpy as np
from Levenshtein import distance
from collections import Counter
import re
import numpy as np

In [3]:
### Data Reading

In [4]:
# Load the utterance-level annotations and append two empty string columns,
# 'start' and 'end', that the alignment cells below fill with word timings.
data = pd.read_csv('text_data_nathan.csv').assign(start='', end='')
data.head(5)

Unnamed: 0,Channel,Utterance,Label,file,start,end
0,A,Okay.,"fo_o_fw_""_by_bc",2228.txt,,
1,B,"All right, uh,",b,2228.txt,,
2,B,feelings on what caused the S and L crisis,sd,2228.txt,,
3,B,I guess I don't have a real technical knowledg...,sd,2228.txt,,
4,B,I gather that there where large numbers of sit...,sd,2228.txt,,


In [5]:
# Pre-process text-data-nathan before alignment

In [6]:
## Performing cleanup on both channels

In [7]:
# Lower-case every utterance and trim surrounding whitespace in one pass.
data['Utterance'] = data['Utterance'].str.lower().str.strip()

In [8]:
# Remove every character that is not a word character, a space or '+'.
# The pattern is a raw string so that `\w` is not treated as a (invalid) string
# escape. Trimming again afterwards matters: removing punctuation from an
# utterance such as "i --" leaves a trailing space, which would otherwise turn
# into an empty "last word" when the utterance is split on spaces.
data['Utterance'] = data['Utterance'].replace(r'[^\w +]', '', regex=True).str.strip()

In [9]:
# Map utterances that consist solely of 'uhhuh' onto 'umhum'. This is a
# whole-value replacement (no regex), so 'uhhuh' inside a longer utterance is
# left untouched.
data['Utterance'] = data['Utterance'].replace({'uhhuh': 'umhum'})

In [10]:
# Display the cleaned frame (pandas abbreviates it to the first and last rows).
data

Unnamed: 0,Channel,Utterance,Label,file,start,end
0,A,okay,"fo_o_fw_""_by_bc",2228.txt,,
1,B,all right uh,b,2228.txt,,
2,B,feelings on what caused the s and l crisis,sd,2228.txt,,
3,B,i guess i dont have a real technical knowledge...,sd,2228.txt,,
4,B,i gather that there where large numbers of sit...,sd,2228.txt,,
...,...,...,...,...,...,...
199735,A,no,nn,3528.txt,,
199736,A,i dont,sd,3528.txt,,
199737,A,i dont either,sd,3528.txt,,
199738,B,but i i know thats certainly helped a lot in t...,sv,3528.txt,,


In [11]:
# Preview the first five cleaned rows (five is the default for `head`).
data.head()

Unnamed: 0,Channel,Utterance,Label,file,start,end
0,A,okay,"fo_o_fw_""_by_bc",2228.txt,,
1,B,all right uh,b,2228.txt,,
2,B,feelings on what caused the s and l crisis,sd,2228.txt,,
3,B,i guess i dont have a real technical knowledge...,sd,2228.txt,,
4,B,i gather that there where large numbers of sit...,sd,2228.txt,,


In [12]:
## Functions to clean up individual transcription files

In [13]:
def clean_transcripts(df):
    """Return a cleaned copy of a word-level transcript frame.

    Rows whose 'text' is one of the non-speech markers ('[noise]',
    '[silence]', '[laughter]', '[vocalized-noise]') are dropped. The remaining
    text is lower-cased, stripped of every character that is not a word
    character, a space or '+', and trimmed. Rows that end up as 'umhum' are
    dropped as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named 'text'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The filtered, cleaned rows, keeping their original index labels.
    """
    non_speech_markers = ['[noise]', '[silence]', '[laughter]', '[vocalized-noise]']
    # Work on an explicit copy: assigning into a filtered slice triggers
    # SettingWithCopyWarning and leaves it ambiguous which object is modified.
    df = df[~df['text'].isin(non_speech_markers)].copy()
    df['text'] = (
        df['text']
        .str.lower()
        # Raw string so that `\w` reaches the regex engine unchanged.
        .replace(r'[^\w +]', '', regex=True)
        .str.strip()
    )
    # 'um-hum' becomes 'umhum' once the hyphen is removed, so filter afterwards.
    return df[df['text'] != 'umhum']

In [14]:
def clean_transcripts_lines(lines, sep=' '):
    """Parse raw lines of a word-level transcript into cleaned 4-field rows.

    Each line is split into fields; the fourth field is treated as the word.
    Lines whose word is a non-speech marker ('[noise]', '[silence]',
    '[laughter]', '[vocalized-noise]') or 'um-hum' are dropped. The word of
    every remaining line is lower-cased and stripped of every character that
    is not a word character, a space or '+'.

    Parameters
    ----------
    lines : iterable of str
        Raw transcript lines.
    sep : str, default ' '
        Field separator to try first. If it does not yield at least four
        fields, the other known separator (space or tab) is tried.

    Returns
    -------
    list of list of str
        One four-element list per kept line; the word is the last element.
        Lines that do not contain four fields under any separator (for example
        blank lines) are skipped.
    """
    skipped_words = {'[noise]', '[silence]', '[laughter]', '[vocalized-noise]', 'um-hum'}
    # Honour the caller's separator first, then fall back to the remaining one.
    separators = [sep] + [other for other in (' ', '\t') if other != sep]

    cleaned = []
    for line in lines:
        splits = []
        for candidate_sep in separators:
            stripped_fields = (field.strip() for field in line.split(candidate_sep))
            # Filter after stripping so whitespace-only tokens (e.g. a trailing
            # newline after a trailing separator) do not count as fields.
            splits = [field for field in stripped_fields if field]
            if len(splits) >= 4:
                break
        if len(splits) < 4:
            # Malformed or blank line: there is no word column to read.
            continue
        if splits[3] in skipped_words:
            continue
        splits[3] = re.sub(r'[^\w +]', '', splits[3].lower())
        if len(splits) > 4:
            print('Data from word level transcript file has garbage cols. Reading first 4...')
            splits = splits[:4]
        cleaned.append(splits)
    return cleaned

In [15]:
### Performing time alignments, after cleaning data in individual transcript files

In [16]:
# Root folder of the word-level transcripts (trailing slash required). The
# alignment cells build paths of the form
# <root><first two digits of the file id>/<file id>/sw<file id><channel>-ms98-a-word.text
switchboard_data_folder_path = 'Switchboard/swb_ms98_transcriptions/'

In [17]:
# Read the word-level timings of channel 'A' and write the start / end time of
# every channel-'A' utterance into the 'start' / 'end' columns of `data`.
#
# The alignment is greedy and sequential: a cursor (`prev_file_index`) walks
# through the cleaned word-level transcript of the current file, and each
# utterance consumes words from the cursor onwards. Utterances that cannot be
# matched are left without timings.
prev_file = None        # file whose transcript is currently loaded
prev_file_index = 0     # cursor: position of the next unconsumed transcript word
word_timings = None     # cleaned word-level transcript of `prev_file`
processed_files = set()
for index, row in data[data['Channel'] == 'A'].iterrows():
    processed_files.add(row['file'])
    # A leftover debugging filter that skipped every file except '3158.txt'
    # used to sit here; it restricted channel A to one conversation and made
    # the check below unreachable, so it has been removed.
    if row['file'] == '3136.txt':
        continue

    utterance = row['Utterance']
    if not isinstance(utterance, str):
        # Missing utterance (NaN): nothing to align.
        continue
    if utterance == 'umhum':
        continue

    if prev_file != row['file']:
        # New conversation: load and parse its transcript once, reset the cursor.
        prev_file = row['file']
        prev_file_index = 0
        folder_id = row['file'][:2]
        conversation_id = row['file'][:-4]
        transcript_file = f"{switchboard_data_folder_path}{folder_id}/{conversation_id}/sw{conversation_id}A-ms98-a-word.text"
        with open(transcript_file, mode='r') as transcript:
            raw_lines = transcript.readlines()
        if row['file'] in ('3646.txt', '2776.txt', '3751.txt', '4628.txt', '2927.txt', '3187.txt'):
            parsed_lines = clean_transcripts_lines(raw_lines, sep='\t')
        else:
            parsed_lines = clean_transcripts_lines(raw_lines)
        word_timings = pd.DataFrame(parsed_lines, columns=['_', 'start', 'end', 'text'])

    # split() ignores leading, trailing and repeated spaces, so no empty
    # "words" are produced.
    uttr_words = utterance.split()

    try:
        if len(uttr_words) <= 1:
            # Single-word utterance: must equal the word under the cursor.
            candidate = word_timings.iloc[prev_file_index]
            if distance(utterance.strip(), candidate['text']) == 0:
                # `.at` writes into `data` itself; `data.iloc[index][col] = ...`
                # only modified a temporary copy of the row.
                data.at[index, 'start'] = candidate['start']
                data.at[index, 'end'] = candidate['end']
                prev_file_index += 1
        else:
            first_uttr_word, last_uttr_word = uttr_words[0], uttr_words[-1]
            # Number of times the last word occurs in the utterance; the scan
            # below must see it that many times before the utterance ends.
            count_last_uttr = uttr_words.count(last_uttr_word)

            # The word under the cursor must roughly match the first word:
            # edit distance at most 4 and smaller than the word's length.
            first_word_distance = distance(word_timings.iloc[prev_file_index]['text'], first_uttr_word)
            if first_word_distance > 4 or len(first_uttr_word) <= first_word_distance:
                continue

            # Remember where the utterance starts. (Testing `not temp` instead
            # mistook position 0 for "unset".)
            utterance_start = prev_file_index
            while count_last_uttr != 0:
                current_word = word_timings.iloc[prev_file_index]['text']
                if prev_file_index - utterance_start >= len(uttr_words):
                    # Past the expected length: accept a near match of the last
                    # word (e.g. 'loans' vs 'loan').
                    is_last_word = distance(current_word, last_uttr_word) <= 1
                else:
                    is_last_word = distance(current_word, last_uttr_word) == 0
                prev_file_index += 1
                if is_last_word:
                    count_last_uttr -= 1

            data.at[index, 'start'] = word_timings.iloc[utterance_start]['start']
            data.at[index, 'end'] = word_timings.iloc[prev_file_index - 1]['end']
    except IndexError:
        # The cursor ran past the end of the transcript; leave this utterance
        # without timings.
        continue


In [18]:
# Read the word-level timings of channel 'B' and write the start / end time of
# every channel-'B' utterance into the 'start' / 'end' columns of `data`.
#
# The alignment is greedy and sequential: a cursor (`prev_file_index`) walks
# through the cleaned word-level transcript of the current file, and each
# utterance consumes words from the cursor onwards. Utterances that cannot be
# matched are left without timings.
prev_file = None        # file whose transcript is currently loaded
prev_file_index = 0     # cursor: position of the next unconsumed transcript word
word_timings = None     # cleaned word-level transcript of `prev_file`
processed_files = set()
for index, row in data[data['Channel'] == 'B'].iterrows():
    processed_files.add(row['file'])
    if row['file'] == '3136.txt':
        continue

    utterance = row['Utterance']
    if not isinstance(utterance, str):
        # Missing utterance (NaN): nothing to align.
        continue
    if utterance == 'umhum':
        continue

    if prev_file != row['file']:
        # New conversation: load and parse its channel-B transcript once and
        # reset the cursor.
        prev_file = row['file']
        prev_file_index = 0
        folder_id = row['file'][:2]
        conversation_id = row['file'][:-4]
        transcript_file = f"{switchboard_data_folder_path}{folder_id}/{conversation_id}/sw{conversation_id}B-ms98-a-word.text"
        with open(transcript_file, mode='r') as transcript:
            raw_lines = transcript.readlines()
        parsed_lines = clean_transcripts_lines(raw_lines)
        word_timings = pd.DataFrame(parsed_lines, columns=['_', 'start', 'end', 'text'])

    # split() ignores leading, trailing and repeated spaces, so no empty
    # "words" are produced.
    uttr_words = utterance.split()

    try:
        if len(uttr_words) <= 1:
            # Single-word utterance: must equal the word under the cursor.
            candidate = word_timings.iloc[prev_file_index]
            if distance(utterance.strip(), candidate['text']) == 0:
                # `.at` writes into `data` itself; `data.iloc[index][col] = ...`
                # only modified a temporary copy of the row.
                data.at[index, 'start'] = candidate['start']
                data.at[index, 'end'] = candidate['end']
                prev_file_index += 1
        else:
            first_uttr_word, last_uttr_word = uttr_words[0], uttr_words[-1]
            # Number of times the last word occurs in the utterance; the scan
            # below must see it that many times before the utterance ends.
            count_last_uttr = uttr_words.count(last_uttr_word)

            # The word under the cursor must roughly match the first word:
            # edit distance at most 4 and smaller than the word's length.
            first_word_distance = distance(word_timings.iloc[prev_file_index]['text'], first_uttr_word)
            if first_word_distance > 4 or len(first_uttr_word) <= first_word_distance:
                continue

            # Remember where the utterance starts. (Testing `not temp` instead
            # mistook position 0 for "unset".)
            utterance_start = prev_file_index
            while count_last_uttr != 0:
                current_word = word_timings.iloc[prev_file_index]['text']
                if prev_file_index - utterance_start >= len(uttr_words):
                    # Past the expected length: accept a near match of the last
                    # word (e.g. 'loans' vs 'loan').
                    is_last_word = distance(current_word, last_uttr_word) <= 1
                else:
                    is_last_word = distance(current_word, last_uttr_word) == 0
                prev_file_index += 1
                if is_last_word:
                    count_last_uttr -= 1

            data.at[index, 'start'] = word_timings.iloc[utterance_start]['start']
            data.at[index, 'end'] = word_timings.iloc[prev_file_index - 1]['end']
    except IndexError:
        # The cursor ran past the end of the transcript; leave this utterance
        # without timings.
        continue


Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file h

Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file has garbage cols. Reading first 4...
Data from word level transcript file h

In [19]:
# Persist the utterances together with their 'start' / 'end' columns.
# index=False omits the DataFrame index; the 'Unnamed: 0' column read from the
# input CSV is still written out as a regular column.
data.to_csv('text_data_nathan__with_timestamps.csv', index=False)