In [8]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from matplotlib import colors
import pickle
import math
from datetime import datetime 
import os

In [2]:
# Data loading:
# input: the ID of the conversation, the path that contains the data, the speaker ('A' or 'B'), and the file type ('trans' or 'word')
# output: DataFrame of the transcript or of the words

def download_data1(id_conv, path, speaker, tipo_file):
    """Load one transcript file of a conversation side into a DataFrame.

    The file is read from
    ``<path>/R<first two characters of id_conv>/R<id_conv>/sw<id_conv><speaker>-ms98-a-<tipo_file>.text``.
    Every line is expected to look like ``<label> <start> <end> <text>``, where
    the label has the form ``sw2001A-ms98-a-0001`` and both times are written
    with six decimals.

    Parameters
    ----------
    id_conv : str or int
        Conversation ID; converted to ``str`` before the path is built.
    path : str
        Root directory that contains the ``R..`` sub-directories.
    speaker : str
        Speaker letter used in the file name ('A' or 'B').
    tipo_file : str
        File type used in the file name. ``'word'`` names the text column
        ``'word'``; any other value names it ``'utterance'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``raw`` (line with whitespace runs collapsed to one space),
        ``label``, ``start_time`` and ``end_time`` (numeric), and the text
        column (``word`` or ``utterance``). The text keeps the trailing space
        left over from the line break; callers strip it.
    """
    name_column = 'word' if tipo_file == 'word' else 'utterance'

    id_conv = str(id_conv)
    file_name = 'sw' + id_conv + speaker + '-ms98-a-' + tipo_file + '.text'
    # os.path.join keeps the original directory layout but works on any OS.
    file_path = os.path.join(path, 'R' + id_conv[:2], 'R' + id_conv, file_name)

    # Context manager so the handle is closed even if reading fails.
    with open(file_path) as doc:
        lines = doc.readlines()

    # Building blocks of the line format (raw strings: no invalid escapes).
    label_re = r'[a-zA-Z]{2}\d{4}[a-zA-Z]-[a-zA-Z]{2}\d{2}-\w-[0-9]*'
    time_re = r'[0-9]*\.\d{6}'  # decimal point escaped on purpose

    df_A = pd.DataFrame(lines, columns=['raw'])
    # regex=True is required: recent pandas treats the pattern literally by
    # default, which silently left the text untouched.
    df_A['raw'] = df_A['raw'].str.replace(r'\s+', ' ', regex=True)
    df_A['label'] = df_A['raw'].str.extract('(' + label_re + ')', expand=False)

    # Capture both timestamps in one pass instead of extract-then-erase.
    times = df_A['raw'].str.extract('(' + time_re + r')\s(' + time_re + ')', expand=True)
    df_A['start_time'] = pd.to_numeric(times[0])
    df_A['end_time'] = pd.to_numeric(times[1])

    # Whatever follows "<label> <start> <end> " is the word / utterance text.
    prefix_re = label_re + r'\s+' + time_re + r'\s+' + time_re + r'\s+'
    df_A[name_column] = df_A['raw'].str.replace(prefix_re, '', regex=True)
    return df_A

In [3]:
 import time, sys

# update_progress() : Displays or updates a console progress bar
## Accepts a float between 0 and 1. Any int will be converted to a float.
## A value under 0 represents a 'halt'.
## A value at 1 or bigger represents 100%
def update_progress(progress):
    """Draw (or redraw) a one-line text progress bar on stdout.

    ``progress`` is a fraction between 0 and 1; ints are converted to float.
    A negative value is shown as a halted, empty bar; a value of 1 or more is
    shown as a full bar marked done; any non-numeric value is shown as an
    empty bar with an error note.
    """
    bar_width = 20  # number of characters between the brackets
    note = ""

    if isinstance(progress, int):
        progress = float(progress)

    # The clamped values are deliberately the ints 0 / 1, so the percentage
    # prints as "0" / "100" rather than "0.0" / "100.0".
    if not isinstance(progress, float):
        progress, note = 0, "error: progress var must be float\r\n"
    elif progress < 0:
        progress, note = 0, "Halt...\r\n"
    elif progress >= 1:
        progress, note = 1, "Done...\r\n"

    filled = int(round(bar_width * progress))
    bar = "=" * filled + " " * (bar_width - filled)
    # Leading "\r" rewinds to the start of the line so the bar updates in place.
    sys.stdout.write("\rPercent: [{0}] {1}% {2}".format(bar, progress * 100, note))
    sys.stdout.flush()

In [None]:
# This function collects all the words in the SWITCHBOARD corpus.
# INPUT: path (directory where the data are stored), conv_dati (DataFrame whose 'id_conv' column holds the conversation IDs)
# OUTPUT: data (the entire set of words)
def vocabulary(path, conv_dati):
    """Collect the word-level rows of every conversation into one DataFrame.

    For each ID in ``conv_dati['id_conv']`` the 'word' files of speakers 'A'
    and 'B' are loaded with ``download_data1`` and stacked. The result is
    pickled to ``..\\output\\data.pkl`` and also returned.

    Parameters
    ----------
    path : str
        Root directory handed to ``download_data1``.
    conv_dati : pandas.DataFrame
        Must provide an ``id_conv`` column with the conversation IDs.

    Returns
    -------
    pandas.DataFrame
        The columns produced by ``download_data1`` (``word`` stripped of
        surrounding whitespace) plus ``ID_utterance``, ``ID_conv``,
        ``ID_speaker``, ``ID`` (all parsed from ``label``) and ``duration``
        (``end_time - start_time``).
    """
    from datetime import datetime
    startTime = datetime.now()

    conversation_ids = conv_dati['id_conv'].apply(str)
    n_conversations = len(conversation_ids)

    # Gather the pieces in a list and concatenate once: concatenating inside
    # the loop re-copies everything read so far on every iteration.
    frames = []
    for done, id_conv in enumerate(conversation_ids, start=1):
        for speaker in ['A', 'B']:
            frames.append(download_data1(id_conv, path, speaker, 'word'))
        # Fraction actually completed; reaches exactly 1 on the last conversation.
        update_progress(done / n_conversations)

    if frames:
        data = pd.concat(frames, ignore_index=True)
    else:
        # No conversations requested: keep the expected schema.
        data = pd.DataFrame(columns=['raw', 'label', 'start_time', 'end_time', 'word'])

    data['word'] = data['word'].str.strip()

    # Patterns applied to labels such as 'sw2001A-ms98-a-0001'.
    utterance_re = r'(\-\d\d\d\d)'
    speaker_re = r'(\d\d\d\d[A-Z])'
    full_id_re = r'(\d\d\d\d[A-Z]-[a-z]{2}[0-9]{2}-[a-z]-\d\d\d\d)'

    # regex=True is required on every replace below: recent pandas matches the
    # pattern literally by default and would leave the values uncleaned.
    data['ID_utterance'] = data['label'].str.extract(utterance_re, expand=False)
    data['ID_utterance'] = data['ID_utterance'].str.replace(r'(\-)', '', regex=True)
    # NOTE(review): ID_conv receives the full '<conv><speaker>-ms98-a-<utt>'
    # match, not just the conversation number. Kept as-is because the pickled
    # output may already be consumed elsewhere -- confirm the intended value.
    data['ID_conv'] = data['label'].str.extract(full_id_re, expand=False)
    data['ID_speaker'] = data['label'].str.extract(speaker_re, expand=False)
    data['ID_speaker'] = data['ID_speaker'].str.replace(r'(\d\d\d\d)', '', regex=True)
    data['ID'] = data['label'].str.extract(full_id_re, expand=False)
    data['ID'] = data['ID'].str.replace(r'(-[a-z]{2}[0-9]{2}-[a-z]-)', '', regex=True)
    data['duration'] = data['end_time'] - data['start_time']

    timeElapsed = datetime.now() - startTime
    print('Time elpased (hh:mm:ss.ms) {}'.format(timeElapsed))

    ## to save data
    data.to_pickle('..\\output\\data.pkl')
    return data

In [5]:
### This function computes the count and the median duration of each word in the vocabulary

# input: data (all the words in the vocabulary and their durations)
# output: frequency_voc (median duration and count for each word)

def frequency_median(data):
    """Count every word and compute the median of its durations.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``word``, ``start_time`` and ``end_time``.

    Returns
    -------
    pandas.DataFrame
        Indexed by word, sorted by descending count, with the columns
        ``counts``, ``word`` and ``median_duration`` (median of
        ``end_time - start_time``). The frame is also pickled to
        ``..\\output\\frequency_vocabulary.pkl``.
    """
    from datetime import datetime
    startTime = datetime.now()

    # Group on the stripped word so index, 'word' column and medians agree
    # even if the input still carries surrounding whitespace.
    words = data['word'].str.strip()
    durations = pd.to_numeric(data['end_time'] - data['start_time'])

    # A single grouped pass replaces the former per-word scan of the whole
    # frame, so no progress bar is needed any more.
    grouped = durations.groupby(words)
    frequency_voc = grouped.size().to_frame('counts')
    frequency_voc['word'] = frequency_voc.index
    frequency_voc['median_duration'] = grouped.median()
    frequency_voc = frequency_voc.sort_values('counts', ascending=False)

    timeElapsed = datetime.now() - startTime
    print('Time elpased (hh:mm:ss.ms) {}'.format(timeElapsed))

    ### to save frequency_voc
    frequency_voc.to_pickle('..\\output\\frequency_vocabulary.pkl')
    return frequency_voc


In [4]:
# This function computes the percentage of the words covered by: 
#     the first n words (specifying the int argument 'first_n_word' and setting the int 'N_min_occurences' to None)
#     the words which have at least a minimum number of occurrences (specifying the int argument 'N_min_occurences' and setting the int 'first_n_word' to None)
# It is possible to exclude some words by specifying the string list 'word_to_exclude'

def frequency_word_analysis(frequency_voc, N_min_occurences, word_to_exclude, first_n_word):
    """Report and plot how much of the corpus a subset of the vocabulary covers.

    Exactly one of ``N_min_occurences`` / ``first_n_word`` must be given; the
    other must be ``None``.

    Parameters
    ----------
    frequency_voc : pandas.DataFrame
        Needs the columns ``counts``, ``word`` and ``median_duration``.
    N_min_occurences : int or None
        Keep the words whose count is at least this value.
    word_to_exclude : list of str
        Words removed before the selection is made.
    first_n_word : int or None
        Keep the first ``first_n_word`` rows (after exclusion), in the order
        they have in ``frequency_voc``.

    Returns
    -------
    (pandas.DataFrame, number) or None
        The selected rows and the percentage (0-100) of all word occurrences
        they account for. If both or neither selector is given, ``'error'`` is
        printed and ``None`` is returned.
    """
    N_Tot = frequency_voc['counts'].sum()
    not_excluded = ~frequency_voc['word'].isin(word_to_exclude)

    # Plain `and` for Python booleans; `&` is kept for the pandas masks only.
    if first_n_word is None and N_min_occurences is not None:
        frequency_filter = frequency_voc[(frequency_voc['counts'] >= N_min_occurences) & not_excluded]
    elif first_n_word is not None and N_min_occurences is None:
        frequency_filter = frequency_voc[not_excluded].head(first_n_word)
    else:
        print('error')
        return None

    # Label-based count of the 'word' column (positional lookup on a
    # label-indexed Series is deprecated in pandas).
    n_words = frequency_filter['word'].count()
    L_Tot = frequency_filter['counts'].sum()
    perc_vocabulary = (L_Tot / N_Tot) * 100

    # Print the real fraction: the old '0.%d' glued the integer percentage
    # after "0.", so 5% was shown as 0.5 and 100% as 0.100.
    print('\nFraction of the vocabulary covered by %d word = %.2f, excluding %s ' % (n_words, L_Tot / N_Tot, word_to_exclude))
    plt.figure(1)
    frequency_filter['counts'].plot(kind = 'bar', grid=False,  alpha = 0.3, figsize= (20,15))
    plt.xlabel('word')
    plt.ylabel('count')
    plt.title('Word Frequency Vocabulary SWITCHBOARD')
    plt.figure(2)
    frequency_filter['median_duration'].plot(kind = 'bar', grid=False,  alpha = 0.3, figsize= (20,15))
    plt.xlabel('word')
    plt.ylabel('time(s)')
    plt.title('Median words SWITCHBOARD')
    plt.show()

    return frequency_filter, perc_vocabulary

In [6]:
## This function computes a dataframe with the following information: ['word','ID','lenght_utterance','distance_to_end','duration']
# input: frequency_voc, data, word_to_exclude (array of words to be excluded), n (the minimum length an utterance must have to be taken into account)
def processing_data(frequency_voc, data, word_to_exclude, n):
    """Annotate every word with its utterance length and distance to the end.

    Rows whose word is in ``word_to_exclude`` are dropped first; utterances
    (rows sharing an ``ID``) left with fewer than ``n`` words are then
    discarded.

    Parameters
    ----------
    frequency_voc : pandas.DataFrame
        Needs the columns ``word`` and ``median_duration``.
    data : pandas.DataFrame
        Needs the columns ``duration``, ``ID``, ``word``, ``start_time`` and
        ``end_time``.
    word_to_exclude : list of str
        Words to remove before utterance lengths are computed.
    n : int
        Minimum number of remaining words for an utterance to be kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``word``, ``ID``, ``lenght_utterance``, ``distance_to_end``
        (0 for the last word of the utterance), ``duration``, ``start_time``,
        ``end_time`` and ``median_duration``. Utterances appear in order of
        first appearance in ``data``, words in their original order, with a
        fresh 0..N-1 index. The same columns are returned when nothing
        qualifies.
    """
    from datetime import datetime

    startTime = datetime.now()
    cols = ['word']
    out_columns = ['word', 'ID', 'lenght_utterance', 'distance_to_end',
                   'duration', 'start_time', 'end_time', 'median_duration']
    frequency_voc_median = frequency_voc[['word', 'median_duration']]

    # Rows without an ID can never belong to an utterance, so drop them with
    # the excluded words. `.copy()` avoids writing into a view of `data`.
    keep = ~data['word'].isin(word_to_exclude) & data['ID'].notna()
    words = data.loc[keep, ['duration', 'ID', 'word', 'start_time', 'end_time']].copy()

    if words.empty:
        X = pd.DataFrame(columns=out_columns)
    else:
        # Make each utterance contiguous, ordered by first appearance; the
        # stable sort keeps the word order inside an utterance.
        words['_first_seen'] = pd.factorize(words['ID'])[0]
        words = words.sort_values('_first_seen', kind='mergesort').drop(columns='_first_seen')

        # One grouped pass instead of re-filtering the whole frame per ID.
        by_utterance = words.groupby('ID', sort=False)
        lengths = by_utterance['word'].transform('size')
        distances = by_utterance.cumcount(ascending=False)
        words = words.assign(lenght_utterance=lengths, distance_to_end=distances)

        words = words[words['lenght_utterance'] >= n]
        words = words.join(frequency_voc_median.set_index(cols), on=cols, sort=False)
        X = words[out_columns].reset_index(drop=True)

    timeElapsed = datetime.now() - startTime
    print('Time elpased (hh:mm:ss.ms) {}'.format(timeElapsed))

    return X
     

In [7]:
## This function divides the dataframe `data` into intervals of about n_each_interval (int) rows each; a boundary is always placed on the last row of an utterance (rows sharing the same ID)

def divide_interval_data(data, n_each_interval):
    """Return row positions that cut ``data`` into chunks of about equal size.

    Starting from row 0 the cursor jumps ``n_each_interval`` rows ahead and is
    then pushed forward to the last row that still has the same ``ID``, so a
    boundary never falls inside an utterance.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs an ``ID`` column; rows are addressed by position.
    n_each_interval : int
        Approximate number of rows per chunk; must be at least 1.

    Returns
    -------
    list of int
        Boundary positions, beginning with 0 and ending with the position of
        the last row. ``[0]`` is returned for an empty frame.

    Raises
    ------
    ValueError
        If ``n_each_interval`` is smaller than 1 (the scan could not advance).
    """
    if n_each_interval < 1:
        raise ValueError('n_each_interval must be at least 1')

    ids = data['ID'].values  # plain array: much cheaper than data.iloc[i]['ID']
    indices = [0]
    if len(ids) == 0:
        return indices

    # Positional last row, consistent with the positional lookups below.
    last = len(ids) - 1
    i = 0
    while i < last - n_each_interval:
        i = i + n_each_interval
        # `i < last` guards the look-ahead: the final utterance may run up to
        # the very last row, where i + 1 would be out of bounds.
        while i < last and ids[i] == ids[i + 1]:
            i = i + 1
        indices.append(i)

    # Close the final chunk, without repeating a boundary that the scan
    # already placed on the last row.
    if len(indices) == 1 or indices[-1] != last:
        indices.append(last)
    return indices

In [9]:
def join_dataframe(indices, n_directory):
    """Merge the pickled chunk files of one output directory into one frame.

    Reads ``..\\output\\X<n_directory>\\X_<i>.pkl`` for ``i`` in
    ``range(len(indices) - 1)``, stacks the chunks in that order, derives
    ``ID_conversation`` (first four digits of ``ID``), ``ID_speaker`` (first
    capital letter) and ``ID_utterance`` (last four digits), and writes the
    result to ``..\\output\\X<n_directory>\\X.pkl``.

    Parameters
    ----------
    indices : list
        Chunk boundaries; only the length is used (one file per interval).
    n_directory : int or str
        Suffix of the ``X<n_directory>`` directory.

    Returns
    -------
    list of pandas.DataFrame
        A one-element list holding the merged frame, or an empty list when
        there are no chunks to read.
    """
    directory = '..\\output\\X' + str(n_directory)

    # NOTE: unpickling can execute arbitrary code -- only load chunk files
    # produced by this notebook.
    dataframe = [pd.read_pickle(directory + '\\X_' + str(i) + '.pkl')
                 for i in range(len(indices) - 1)]
    if not dataframe:
        return dataframe

    # A single concat keeps the chunk order and replaces the repeated
    # pairwise merging (which also re-saved the file after every round).
    merged = pd.concat(dataframe, ignore_index=True)

    # Derived once on the final frame; this also covers the one-chunk case,
    # which previously skipped both these columns and the save.
    merged['ID_conversation'] = merged['ID'].str.extract(r'(\d\d\d\d)', expand=False)
    merged['ID_speaker'] = merged['ID'].str.extract(r'([A-Z])', expand=False)
    merged['ID_utterance'] = merged['ID'].str.extract(r'(\d\d\d\d$)', expand=False)

    merged.to_pickle(directory + '\\X' + '.pkl')
    return [merged]
        