#### Load Libraries

In [1]:
import numpy as np
import os
import librosa
import pandas as pd
from pydub import AudioSegment
import string
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import re

##### Configuration settings

In [2]:
os.chdir("C:/Users/danie/Speaknow/projects")

# Specify the path to the directory containing the MP3 files
folder_path = './Data/audio'
audio_dir = "./Data/training/audio"
wav_file_directory = "./Data/training/audio"
chatgpt_transcripts = "./Data/training/chatgpt_transcripts"

# List all files in the directory
files = os.listdir(folder_path)

# Filter out only the MP3 files
mp3_files = [file for file in files if file.endswith('.mp3')]
# wav_files = [file for file in files if file.endswith('.wav')]

# Load pre-trained GloVe word embeddings
def load_glove_embeddings(glove_file):
    embeddings = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings
    
glove_file = "./Model/pretrained/Glove6B/glove.6B.100d.txt"  # Path to pre-trained GloVe embeddings file
embeddings = load_glove_embeddings(glove_file)

##### Convert mp3 files to wav files

In [3]:
def convert_mp3_to_wav(audio_input_dir, audio_output_dir):
    # Loop through all files in the folder
    for filename in os.listdir(folder_path):
        # Check if it's an mp3 file
        if filename.endswith(".mp3"):
            # Define input and output filenames
            input_file = folder_path + "/" + filename
            output_file = os.path.splitext(input_file)[0] + ".wav"
    
            print(input_file)
            print(output_file)
            # Load the audio segment
            sound = AudioSegment.from_mp3(input_file)
    
            # Export the audio as wav
            sound.export(output_file, format="wav")
    
            print(f"Converted {input_file} to {output_file}")


##### Generate transcripts using the HuggingFace tokenizer 

Note: This model has not been fine-tuned (yet)

##### Load and preprocess the audio files so that they are normalized and noise-reduced

In [4]:
def preprocess_audio(audio_file, target_sr=16000, noise_threshold=0.005):
    """
    Preprocess audio file.
    
    Args:
    - audio_file: path to the audio file
    - target_sr: target sampling rate (default: 16000 Hz)
    - noise_threshold: threshold for noise reduction (default: 0.005)
    
    Returns:
    - audio_data: preprocessed audio data (numpy array)
    - sampling_rate: sampling rate of the audio data
    """
    try:
        # Load entire audio file
        audio_data, sampling_rate = librosa.load(audio_file, sr=target_sr, mono=False, duration=None)

        audio_duration = len(audio_data) / sampling_rate

        # # Ensure fixed duration
        # if len(audio_data) < target_sr * duration:
        #     shortage = target_sr * duration - len(audio_data)
        #     audio_data = np.pad(audio_data, (0, shortage), mode='constant')

        if audio_data.ndim == 1:
            channels = 1  # Mono audio
        else:
            channels = y.shape[0]

        # Normalize audio data
        audio_data = librosa.util.normalize(audio_data)

        # Apply noise reduction
        threshold = np.max(audio_data) * noise_threshold
        audio_data[np.abs(audio_data) < threshold] = 0

        return audio_data, sampling_rate, channels, audio_duration 
    
    except Exception as e:
        print(f"Error processing audio file {audio_file}: {e}")
        return None, None, None, None


##### Extract relevant features from the audio files to be used in the models
These are features that are derived from the sound file itself and not from the text

In [5]:
def extract_audio_features(audio_data, sampling_rate):
    """
    Extract relevant features from the audio data.
    
    Args:
    - audio_data: preprocessed audio data (numpy array)
    - sampling_rate: sampling rate of the audio data
    
    Returns:
    features containing:
    - formants: Resonant frequencies of the vocal tract
        --> flattened into mean, median, std, min and max
    - pitch: Fundamental frequency (F0) of the speech signal
    - intensity: Energy level of the speech signal
    - speech_rate: Number of words spoken per unit time
    - pauses_duration: Total duration of pauses in speech (in seconds)
    - pauses_frequency: Frequency of pauses in speech (number of pauses per second)
    - audio_duration: Duration of the audio signal (in seconds)
    """
    try:
        # Extract MFCCs (Mel-Frequency Cepstral Coefficients)
        mfccs = librosa.feature.mfcc(y=audio_data, sr=sampling_rate, n_mfcc=13)
        mfccs_mean = np.mean(np.mean(mfccs, axis=1))
        mfccs_std = np.std(np.std(mfccs, axis=1))
        mfccs_var = np.var(np.var(mfccs, axis=1))
        
        # Extract spectral centroid
        spectral_centroid = librosa.feature.spectral_centroid(y=audio_data, sr=sampling_rate)
        spectral_centroid_mean = np.mean(spectral_centroid)
        
        # Extract spectral bandwidth
        spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio_data, sr=sampling_rate)
        spectral_bandwidth_mean = np.mean(spectral_bandwidth)
    
        # Extract pitch (fundamental frequency)
        pitch, _ = librosa.piptrack(y=audio_data, sr=sampling_rate)
        pitch = np.mean(pitch[pitch > 0])  # Average non-zero pitch values
    
        # Extract intensity (energy level)
        intensity = np.mean(librosa.feature.rms(y=audio_data))
    
        # Extract formants (resonant frequencies)
        formants = librosa.effects.harmonic(audio_data)
        formants = librosa.feature.spectral_centroid(y=formants, sr=sampling_rate)
        
        # Flatten formants
        formants_flat = [np.mean(formants), np.median(formants), np.std(formants), np.min(formants), np.max(formants)]
    
        # Extract speech rate (number of words spoken per unit time)
        # We can estimate speech rate by counting the number of zero-crossings in the audio signal
        zcr = librosa.feature.zero_crossing_rate(y=audio_data)
        speech_rate = np.mean(zcr)

        intensity_to_speech_rate_ratio = intensity / speech_rate
        
        # Extract pauses (duration and frequency)
        # We can estimate pauses by detecting segments of low energy in the audio signal
        energy = librosa.feature.rms(y=audio_data)
        threshold = np.mean(energy) * 0.5  # Using 50% of the mean energy as threshold
        pauses_duration = np.sum(energy[0] < threshold) / sampling_rate  # Total duration of pauses in seconds
        pauses_frequency = np.mean(energy[0] < threshold)  # Frequency of pauses per second
    
        # Calculate the duration of the audio signal
        audio_duration = len(audio_data) / sampling_rate
    
        features = pd.DataFrame({"formants_mean": np.mean(formants), 
                                 "formants_median": np.median(formants), 
                                 "formants_std": np.std(formants), 
                                 "formants_min": np.min(formants), 
                                 "formants_max": np.max(formants), 
                                 "mfccs_mean": mfccs_mean,
                                 "mfccs_std": mfccs_std, 
                                 "mfccs_var": mfccs_var,
                                 "spectral_centroid_mean": spectral_centroid_mean, 
                                 "spectral_bandwidth_mean": spectral_bandwidth_mean, 
                                 "pitch": pitch, 
                                 "intensity": intensity, 
                                 "speech_rate": speech_rate, 
                                 "intensity_to_speech_rate_ratio": intensity_to_speech_rate_ratio,
                                 "pauses_duration": pauses_duration, 
                                 "pauses_frequency": pauses_frequency}, index=[0])
    
        return features.reset_index(drop=True)

    except Exception as e:
        print(f"Error extracting audio features: {e}")
        return None


##### Run the preprocessing and feature extraction functions for each audio file and return a corresponding feature set

In [6]:
def retrieve_audio_features(mp3_files):
    features = []
    features_df = pd.DataFrame([])
    
    for audio_file in mp3_files:
        audio_file = folder_path+"/"+audio_file
        
        preprocessed_audio, sampling_rate, channels, audio_duration = preprocess_audio(audio_file)
    
        _, audio_file = os.path.split(audio_file) #remove the folder path
        audio_file, _ = os.path.splitext(audio_file) # remove the file extension
        
        initial_features = pd.DataFrame({"audio_file": audio_file, "sampling_rate": sampling_rate, "channels": channels, "audio_duration": audio_duration}, index=[0]) 
        
        features = extract_audio_features(preprocessed_audio, sampling_rate)
    
        temp_df = pd.concat([initial_features, features], axis=1)
    
        features_df = pd.concat([features_df, temp_df], axis=0)

    return features_df

##### Load the csv file into a dataframe & add the transcripts

In [7]:
# Function to read and return the transcript text from file 
def read_transcript(transcript_dir, file_name, suffix):
    try:
        fn = os.path.join(transcript_dir, f"{file_name}-{suffix}.txt")
        with open(fn, 'r') as file:
            return file.read().strip()
    except FileNotFoundError:
        return None  

##### Reshape the dataframe so that all transcripts from common assesment id's are on separate rows

##### Compute the Brunets Index
This gives an idea of the lexical diversity of the text

In [20]:
df

Unnamed: 0,assessment_id,pronunciation_avg,vocab_avg,fluency_avg,cohesion_avg,grammar_avg,cefr_avg,question_1_transcript,question_2_transcript,question_3_transcript,question_4_transcript,question_5_transcript,transcript-1,transcript-2,transcript-3,transcript-4,transcript-5
0,1692530001326214,4.50,4.50,4.00,4.0,4.50,4.5,"So my favorite TV show would be the, uh the Fi...","Uh, there are actually many different jobs I d...","Uh, I consider charity organizations important...","I, I think that the home schooling becomes uh ...",I consider studying foreign languages. Uh real...,So my favorite TV show would be The Firefly. I...,There are actually many different jobs I don't...,I consider charity organizations important bec...,I think that homeschooling became really popul...,I consider studying foreign languages really i...
1,1647885061811312,3.33,4.33,3.67,4.0,3.67,4.0,"Yeah, I would be happy to go to the past and m...",I prefer to be most remembered for my great wi...,I prefer to be the first person to explore a p...,I think there should be restrictions for publi...,"Yeah, I prefer to be remembered for my great w...",I would be happy to go to the past and meet my...,I prefer to be most remembered for my great wi...,I prefer to be the first person to explore a p...,I think there should be restrictions for publi...,"Yeah, I prefer to be remembered for my great w..."
2,1667330281849843,5.50,5.00,4.50,5.0,5.00,5.0,I would much rather be a first version me than...,This is an excellent question. I think I would...,This is an interesting question. The potential...,I think that this is an excellent question. It...,"As a student for an All girls high school, I a...",I would much rather be a first version me than...,This is an excellent question. I think I would...,This is an interesting question. The potential...,I think that this is an excellent question. It...,"As a student for an all-girls high school, I a..."
3,1691402791754105,4.00,4.00,3.50,3.0,4.00,4.0,My advice to a worker in my office is if you d...,"Uh, I would, I would love to communicate with ...","That's a good question. I do not have, I don't...","Ok. Then if I'll have an, if I have an ideal p...",In my city. Uh the system is that uh everyone ...,My advice to a worker in my office is if you d...,I would love to communicate with dolphins. I d...,That's a good question. I do not have... I don...,"Okay then, if I have an ideal project, I would...","In my city, the system is that everyone should..."
4,1691402791754221,2.50,3.00,2.50,2.5,2.50,2.5,"I'm working in the production, in my, in, in m...","I'm, uh, got to talk in the teacher of the, an...","Er, I like to go to the, er, in engineering, i...","No, I just, uh, stay just me. I, I like the, w...","No, I don't want to move to a new city or, uh,...","I am working in the production in my job, I am...",I am going to talk to the teacher of the Engli...,I like to go to the engineering in my workplac...,"No, I just stay just me. I like what I do, I l...","No, I don't want to move to a new city or to a..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,1687964058482484,1.50,2.00,1.50,1.5,1.50,1.5,My favorite class in school is informatics. Er...,"My family is, uh, make me happy. Um, my family...","In the last, in the vacation of last year I tr...","In my time happy when my sister, not, not the ...","I don't think, er, planning, er, to last summe...",My favorite class in school is informatics bec...,My family is make me happy. My family is more....,"In the last, in the vacation of last year, I t...",I'm happy when my sister not destroy the house...,I don't think planning to next summer. Where w...
95,1685651809260972,3.50,3.50,2.50,2.0,2.50,2.5,"I use, uh, uh, computer because it, it's so fu...",We are six people and my father is uh is a war...,"I'm gonna talk about my friend, my best friend...","And for me, uh, an interesting, some, somethin...","To the next summer I will go to, to the don't ...",I use a computer because it's so fun to use it...,"We are six people and my father is a work, is ...","I'm gonna talk about my friend, my best friend...","And for me, an interesting, something interest...",to the next summer i will go to the down cost ...
96,1662420268098487,3.00,4.00,3.00,3.0,3.00,3.0,the person who is the most important people. N...,"my best teacher was maybe, I don't know. My my...",,"Well, I think um different um work office is n...","is is um, difficult question because, um, I th...",The person who is the most important person in...,"My best teacher was maybe, I don't know, my fi...",Thank you. Bye bye. Bye bye. Bye bye. Bye bye....,"Well, I think different work office is not usu...",It's a difficult question because I think the ...
97,1687976114519479,3.50,4.50,4.00,3.0,3.00,4.0,I will travel with my parents to Colombia beca...,I think the biggest problem for kids today is ...,I didn't know about the system of recycling in...,"If I start my own business, it will be a cafet...",I think high school students will have to wear...,I will travel with my parents to Colombia beca...,I think the biggest problem for kids today is ...,I didn't know about the system of recycling in...,If I start my own business it will be a cafete...,I think high school students should have to we...


In [21]:
def getfirstindexvalue(text):
    try:
        retval = text.split('_')[1]
    except:
        retval = " "
    return retval
    
def get_transcript_df(get_transcript_df, provided):
    df['assessment_id'] = df['assessment_id'].astype(str)
    
    # Reshape the DataFrame
    if provided == True:
        reshaped_df = pd.melt(df[['assessment_id', 'question_1_transcript','question_2_transcript','question_3_transcript','question_4_transcript','question_5_transcript']], id_vars=['assessment_id'], var_name='value', value_name='transcript')
        reshaped_df['assessment_id'] = reshaped_df['assessment_id'] + '-' + reshaped_df['value'].apply(getfirstindexvalue)
    else:
        reshaped_df = pd.melt(df[['assessment_id', 'transcript-1','transcript-2','transcript-3','transcript-4','transcript-5']], id_vars=['assessment_id'], var_name='value', value_name='transcript')
        reshaped_df['assessment_id'] = reshaped_df['assessment_id'] + '-' + reshaped_df['value'].str.split('-').str[-1]        

    # # Drop the 'value' column
    reshaped_df.drop(columns=['value'], inplace=True)
    
    return reshaped_df

In [9]:
def get_brunets_index(transcript):
    """
    Calculate Brunet's Index for lexical diversity of a text.

    Args:
    - text: List of words representing the text.

    Returns:
    - Brunet's Index value.
    """
    if not transcript is None: 
        transcript = str(transcript)
        text_without_punctuation = transcript.translate(str.maketrans('', '', string.punctuation)).lower()
        
        transcript_list = text_without_punctuation.split()
        
        # Calculate the total number of words in the text
        total_words = len(transcript)
    
        # Calculate the total number of unique words in the text
        unique_words = len(set(transcript))
    
        # Calculate Brunet's Index
        brunets_index = unique_words ** (0.165)
    else:
        brunets_index = None
        
    return brunets_index

##### Compute the average sentence length

In [10]:
# Function to tokenize text into sentences and words, and compute average sentence length
def average_sentence_length(transcript):

    if not transcript is None:
        # Tokenize text into sentences
        sentences = nltk.sent_tokenize(str(transcript))
    
        # Tokenize each sentence into words
        tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    
        # Calculate total number of words and sentences
        total_words = sum(len(sentence) for sentence in tokenized_sentences)
        total_sentences = len(tokenized_sentences)
    
        # Calculate average sentence length
        if total_sentences == 0:
            return 0
    else:
        return 0
        
    return total_words / total_sentences

##### Compute text cohesion
This refers to the degree of semantic relatedness or connectedness between words in a text. It is a measure of how well the words in a text are integrated or linked together in meaning

In [11]:
# Function to compute cohesion based on cosine similarity of word embeddings
def embedding_cohesion(text):

    if not text is None:
        text = str(text).lower()
        # Tokenize text into words
        words = nltk.word_tokenize(text)
        
        # Calculate pairwise cosine similarity between word embeddings
        similarities = []
        for i in range(len(words)):
            for j in range(i+1, len(words)):
                word1 = words[i]
                word2 = words[j]
                if word1 in embeddings and word2 in embeddings:
                    similarity = cosine_similarity([embeddings[word1]], [embeddings[word2]])[0][0]
                    similarities.append(similarity)
        
        # Calculate average similarity (cohesion)
        if similarities:
            return np.mean(similarities)
        else:
            return 0
    else:
        return 0

##### Compute the flesch_kincaid grade level and coleman_liau index of the transcript, to be added to the transcript data
Both are readability measures:
<br>- Flesch-Kincaid Grade Level: takes into account syllable count and sentence length
<br>- Coleman-Liau Index: primarily considers word length and sentence length

In [12]:
# Helper function to count syllables in a word
def syllable_count(word):
    word = word.lower()
    if len(word) <= 3:
        return 1
    count = 0
    vowels = "aeiouy"
    if word[0] in vowels:
        count += 1
    for i in range(1, len(word)):
        if word[i] in vowels and word[i - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if count == 0:
        count = 1
    return count
    
# Function to compute Flesch-Kincaid Grade Level
def flesch_kincaid_grade_level(transcript):
    if transcript is None:
        return 0
    else:
        transcript = str(transcript)
        sentences = sent_tokenize(transcript)
        words = word_tokenize(transcript)
        num_sentences = len(sentences)
        num_words = len(words)
        num_syllables = sum([syllable_count(word) for word in words])
        
        # Compute Flesch-Kincaid Grade Level
        if num_sentences == 0 or num_words == 0:
            return 0
        return 0.39 * (num_words / num_sentences) + 11.8 * (num_syllables / num_words) - 15.59

# Function to compute Coleman-Liau Index
def coleman_liau_index(transcript):

    if transcript is None:
        return 0
    else:
        transcript = str(transcript)
        sentences = sent_tokenize(transcript)
        words = word_tokenize(transcript)
        num_sentences = len(sentences)
        num_words = len(words)
        
        if len(words) == 0:
            return 0
        
        num_characters = sum(len(word) for word in words)
        
        # Compute Coleman-Liau Index
        L = num_characters / num_words * 100
        S = num_sentences / num_words * 100
        return 0.0588 * L - 0.296 * S - 15.8

#### Generate the Automated Readability Index (ARI)

In [27]:
def automated_readability_index(transcript):
    """
    Calculate the Automated Readability Index (ARI) of a text.
    
    Parameters:
        transcript (text): the text of the transcript
        
    Returns:
        float: The Automated Readability Index (ARI) score.
    """
    transcript = str(transcript)
    
    characters = len(transcript)
    words = len(transcript.split())
    sentences = transcript.count('.') + transcript.count('!') + transcript.count('?')  # Assuming sentences end with '.', '!', or '?'

    if len(transcript)>0 and sentences == 0:
        ari = 1
    elif words > 0 and sentences > 0:
        ari = 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43
    else: 
        ari = 0
    
    return ari

##### Count repeated words in the transcript, to be added to the transcript data

In [13]:
# Function to detect repetitions in text
def detect_repetitions(transcript):
    # Define regular expression pattern to match repeated sequences of words
    pattern = re.compile(r'\b(\w+)\s+\1\b', re.IGNORECASE)

    transcript = str(transcript)
    
    if transcript is None:
        return 0
    else:
        # Find all repeated sequences in the text
        repetitions = re.findall(pattern, transcript)
    
        return len(repetitions)

##### Count filler words in the transcript, to be added to the transcript data

In [14]:
# Count filler words in the transcript
def filler_words(transcript):
    filler_words = ["um", "uh", "like", "you know", "actually", "basically", "hmm", "ahem", "i mean", "so"]  
    if not transcript is None:
        transcript = str(transcript)
        cnt = sum(str(transcript).lower().count(word) for word in filler_words)
    else:
        cnt = 0
    return cnt

##### Merge all the dataframes

In [15]:
def merge_datasets(df, transcript_df, features_df):
    df['assessment_id'] = df['assessment_id'].astype(str)

    # Duplicate the rows for each assessment_id and rename the assessment_id's with -1 to -5 suffixes accordingly
    temp = df[['assessment_id', 'pronunciation_avg', 'vocab_avg', 'fluency_avg', 'cohesion_avg', 'grammar_avg', 'cefr_avg']]
    df_duplicated = pd.concat([temp]*5, ignore_index=True)
    
    suffixes = ['-1', '-2', '-3', '-4', '-5']
    df_duplicated['suffix'] = df_duplicated.groupby(['assessment_id']).cumcount() % len(suffixes)
    df_duplicated['assessment_id'] = df_duplicated['assessment_id'] + '-' + df_duplicated['suffix'].add(1).astype(str)
    
    df_duplicated = df_duplicated.sort_values(by=['assessment_id']).reset_index(drop=True)
    
    df_full = df_duplicated.merge(transcript_df, left_on='assessment_id', right_on='assessment_id', how='inner')
    df_full = df_full.merge(features_df, left_on='assessment_id', right_on='audio_file', how='inner')
    df_full.drop(columns=['audio_file','audio_duration','sampling_rate','channels'], inplace=True)
    df_full = df_full.sort_values(by='assessment_id').reset_index(drop=True)
    
    return df_full

##### Prepare full dataset for modelling

##### Run all code

In [28]:
print(">>> Execution Initiated")

# print(">>> Running retrieve_audio_features()")

# Generate the initial audio feature set 
features_df = retrieve_audio_features(mp3_files)

print(">>> Loading the provided transcripts")

# Load the CSV file and add the transcripts
df = pd.read_csv('./Data/text/SpeakNow_test_data.csv')


#### First extract the provided transcripts ####

# Get and reshape the transcript data from the dataset  
transcripts_df = get_transcript_df(df, provided=True)

print(">>> Running filler_words()")

# Add the filler words to the data
transcripts_df['filler_word_count'] = transcripts_df.transcript.apply(filler_words)

print(">>> Running merge_datasets()")

# Merge the datasets
full_text_df = merge_datasets(df, transcripts_df, features_df)

print(">>> Running get_brunets_index()")

# Get Brunets index (measure of lexical diversity that indicates the richness of the vocabulary used)
full_text_df['brunets_index'] = full_text_df.apply(lambda row: get_brunets_index(row['transcript']), axis=1)

print(">>> Running average_sentence_length()")

full_text_df['average_sentence_length'] = full_text_df.apply(lambda row: average_sentence_length(row['transcript']), axis=1)

print(">>> Running embedding_cohesion()")

full_text_df['cohesion_score'] = full_text_df.apply(lambda row: embedding_cohesion(row['transcript']), axis=1)

print(">>> Running flesch_kincaid_grade_level()")

full_text_df['flesch_kincaid_score'] = full_text_df.apply(lambda row: flesch_kincaid_grade_level(row['transcript']), axis=1)

print(">>> Running coleman_liau_index()")

full_text_df['coleman_liau_index'] = full_text_df.apply(lambda row: coleman_liau_index(row['transcript']), axis=1)

print(">>> Running automated_readability_index")

full_text_df['automated_readability_index'] = full_text_df.apply(lambda row: automated_readability_index(row['transcript']), axis=1)

print(">>> Running detect_repetitions()")

full_text_df['repetitions'] = full_text_df.apply(lambda row: detect_repetitions(row['transcript']), axis=1)

print(">>> Writing the dataframe to file")

full_text_df.to_csv("full_df_provided_transcripts.csv", index=False)

### Next extract the transcripts as transcribed from the mp3 files ####

# Get and reshape the transcript data from the dataset
print(">>> Running read_transcript()")

# Read the transcripts
for i in range(1,6):
    df[f'transcript-{str(i)}'] = df.apply(lambda row: read_transcript(chatgpt_transcripts, row['assessment_id'], str(i)), axis=1)

print(">>> Running get_transcript_df() - extracting the provided transcripts")
transcripts_df = get_transcript_df(df, provided=False)

print(">>> Running filler_words()")

# Add the filler words to the data
transcripts_df['filler_word_count'] = transcripts_df.transcript.apply(filler_words)

print(">>> Running merge_datasets()")

# Merge the datasets
full_speech_df = merge_datasets(df, transcripts_df, features_df)

print(">>> Running get_brunets_index()")

# Get Brunets index (measure of lexical diversity that indicates the richness of the vocabulary used)
full_speech_df['brunets_index'] = full_speech_df.apply(lambda row: get_brunets_index(row['transcript']), axis=1)

print(">>> Running average_sentence_length()")

full_speech_df['average_sentence_length'] = full_speech_df.apply(lambda row: average_sentence_length(row['transcript']), axis=1)

print(">>> Running embedding_cohesion()")

full_speech_df['cohesion_score'] = full_speech_df.apply(lambda row: embedding_cohesion(row['transcript']), axis=1)

print(">>> Running flesch_kincaid_grade_level()")

full_speech_df['flesch_kincaid_score'] = full_speech_df.apply(lambda row: flesch_kincaid_grade_level(row['transcript']), axis=1)

print(">>> Running coleman_liau_index()")

full_speech_df['coleman_liau_index'] = full_speech_df.apply(lambda row: coleman_liau_index(row['transcript']), axis=1)

print(">>> Running automated_readability_index")

full_speech_df['automated_readability_index'] = full_speech_df.apply(lambda row: automated_readability_index(row['transcript']), axis=1)

print(">>> Running detect_repetitions()")

full_speech_df['repetitions'] = full_speech_df.apply(lambda row: detect_repetitions(row['transcript']), axis=1)

print(">>> Writing the dataframe to file")

full_speech_df.to_csv("full_df_transcribed_transcripts.csv", index=False)

print(">>> Execution complete")


>>> Running automated_readability_index
>>> Running detect_repetitions()
>>> Writing the dataframe to file
>>> Running read_transcript()
>>> Running get_transcript_df() - extracting the provided transcripts
>>> Running filler_words()
>>> Running merge_datasets()
>>> Running get_brunets_index()
>>> Running average_sentence_length()
>>> Running embedding_cohesion()
>>> Running flesch_kincaid_grade_level()
>>> Running coleman_liau_index()
>>> Running automated_readability_index
>>> Running detect_repetitions()
>>> Writing the dataframe to file
>>> Execution complete


In [29]:
full_text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495 entries, 0 to 494
Data columns (total 33 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   assessment_id                   495 non-null    object 
 1   pronunciation_avg               495 non-null    float64
 2   vocab_avg                       495 non-null    float64
 3   fluency_avg                     495 non-null    float64
 4   cohesion_avg                    495 non-null    float64
 5   grammar_avg                     495 non-null    float64
 6   cefr_avg                        495 non-null    float64
 7   suffix                          495 non-null    int64  
 8   transcript                      328 non-null    object 
 9   filler_word_count               495 non-null    int64  
 10  formants_mean                   495 non-null    float64
 11  formants_median                 495 non-null    float64
 12  formants_std                    495 

In [30]:
full_speech_df[~full_speech_df['transcript'].isna()]

Unnamed: 0,assessment_id,pronunciation_avg,vocab_avg,fluency_avg,cohesion_avg,grammar_avg,cefr_avg,suffix,transcript,filler_word_count,...,intensity_to_speech_rate_ratio,pauses_duration,pauses_frequency,brunets_index,average_sentence_length,cohesion_score,flesch_kincaid_score,coleman_liau_index,automated_readability_index,repetitions
0,1647885061811312-1,3.33,4.33,3.67,4.0,3.67,4.0,0,I would be happy to go to the past and meet my...,2,...,0.377500,0.047500,0.386965,1.771535,22.400000,0.625115,7.790643,3.826071,12.208367,0
1,1647885061811312-2,3.33,4.33,3.67,4.0,3.67,4.0,1,I prefer to be most remembered for my great wi...,6,...,0.414006,0.059125,0.377043,1.762279,21.800000,0.580784,10.124844,8.358165,15.898763,0
2,1647885061811312-3,3.33,4.33,3.67,4.0,3.67,4.0,2,I prefer to be the first person to explore a p...,4,...,0.437442,0.050563,0.348857,1.762279,17.375000,0.569591,5.872581,3.689784,9.678438,0
3,1647885061811312-4,3.33,4.33,3.67,4.0,3.67,4.0,3,I think there should be restrictions for publi...,5,...,0.337909,0.057375,0.363708,1.752770,29.000000,0.603732,12.707931,9.081724,19.104533,0
4,1647885061811312-5,3.33,4.33,3.67,4.0,3.67,4.0,4,"Yeah, I prefer to be remembered for my great w...",1,...,0.425983,0.050812,0.415432,1.762279,26.500000,0.604109,10.329906,6.048302,16.031154,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490,1693181003292353-1,3.00,3.50,2.50,2.5,2.50,2.5,0,I live in Taipei and I like this area because ...,4,...,0.726157,0.044750,0.456051,1.722563,21.666667,0.578910,7.201538,2.916308,10.402204,0
491,1693181003292353-2,3.00,3.50,2.50,2.5,2.50,2.5,1,I think doctors are the most important job now...,2,...,0.542273,0.043625,0.416716,1.732930,19.750000,0.622965,7.497310,4.881519,11.708750,0
492,1693181003292353-3,3.00,3.50,2.50,2.5,2.50,2.5,2,I think in Taiwan the internet maybe is not ex...,3,...,0.598825,0.049625,0.408646,1.797924,21.000000,0.560056,8.333333,6.240476,16.359831,0
493,1693181003292353-4,3.00,3.50,2.50,2.5,2.50,2.5,3,I don't like to study online because I cannot ...,3,...,0.620194,0.045562,0.443971,1.752770,12.333333,0.615540,6.122703,4.207568,9.293177,0


In [31]:
speech_ax_features = full_speech_df[~full_speech_df['transcript'].isna()][['pronunciation_avg',
                              'vocab_avg',
                              'fluency_avg',
                              'cohesion_avg',
                              'grammar_avg',
                              'cefr_avg',
                              'filler_word_count',
                              'formants_mean',
                              'formants_median',
                              'formants_std',
                              'formants_min',
                              'formants_max',
                              'mfccs_mean',
                              'mfccs_std',
                              'mfccs_var',
                              'spectral_centroid_mean',
                              'spectral_bandwidth_mean',
                              'pitch',
                              'intensity',
                              'speech_rate',
                              'intensity_to_speech_rate_ratio',
                              'pauses_duration',
                              'pauses_frequency',
                              'average_sentence_length',
                              'cohesion_score',
                              'flesch_kincaid_score',
                              'coleman_liau_index',
                              'automated_readability_index',                                        
                              'repetitions',
                              'brunets_index']]

writing_ax_features = full_text_df[~full_text_df['transcript'].isna()][['pronunciation_avg',
                               'vocab_avg',
                               'fluency_avg',
                               'cohesion_avg',
                               'grammar_avg',
                               'cefr_avg',
                               'average_sentence_length',
                               'cohesion_score',
                               'flesch_kincaid_score',
                               'coleman_liau_index',
                               'automated_readability_index',                                        
                               'repetitions',
                               'brunets_index']]

all_features = full_df[~full_df['transcript'].isna()]

In [32]:
speech_ax_features

Unnamed: 0,pronunciation_avg,vocab_avg,fluency_avg,cohesion_avg,grammar_avg,cefr_avg,filler_word_count,formants_mean,formants_median,formants_std,...,intensity_to_speech_rate_ratio,pauses_duration,pauses_frequency,average_sentence_length,cohesion_score,flesch_kincaid_score,coleman_liau_index,automated_readability_index,repetitions,brunets_index
0,3.33,4.33,3.67,4.0,3.67,4.0,2,1575.008259,1280.371703,1077.019747,...,0.377500,0.047500,0.386965,22.400000,0.625115,7.790643,3.826071,12.208367,0,1.771535
1,3.33,4.33,3.67,4.0,3.67,4.0,6,1485.328376,1233.364249,1026.021623,...,0.414006,0.059125,0.377043,21.800000,0.580784,10.124844,8.358165,15.898763,0,1.762279
2,3.33,4.33,3.67,4.0,3.67,4.0,4,1577.452085,1300.776096,1015.432021,...,0.437442,0.050563,0.348857,17.375000,0.569591,5.872581,3.689784,9.678438,0,1.762279
3,3.33,4.33,3.67,4.0,3.67,4.0,5,1694.951272,1442.866943,1013.853015,...,0.337909,0.057375,0.363708,29.000000,0.603732,12.707931,9.081724,19.104533,0,1.752770
4,3.33,4.33,3.67,4.0,3.67,4.0,1,1694.526295,1396.984445,1132.584517,...,0.425983,0.050812,0.415432,26.500000,0.604109,10.329906,6.048302,16.031154,0,1.762279
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490,3.00,3.50,2.50,2.5,2.50,2.5,4,1905.328966,1838.152440,1015.867877,...,0.726157,0.044750,0.456051,21.666667,0.578910,7.201538,2.916308,10.402204,0,1.722563
491,3.00,3.50,2.50,2.5,2.50,2.5,2,1970.857456,1920.705853,1077.252573,...,0.542273,0.043625,0.416716,19.750000,0.622965,7.497310,4.881519,11.708750,0,1.732930
492,3.00,3.50,2.50,2.5,2.50,2.5,3,1820.651146,1492.729742,1060.049836,...,0.598825,0.049625,0.408646,21.000000,0.560056,8.333333,6.240476,16.359831,0,1.797924
493,3.00,3.50,2.50,2.5,2.50,2.5,3,1976.972462,1832.188552,1058.423466,...,0.620194,0.045562,0.443971,12.333333,0.615540,6.122703,4.207568,9.293177,0,1.752770
