# Analyse your transcript
What kind of behaviour would you like to analyse in your transcript? Use ChatGPT or any equivalent LLM-based application to give you a list of terms that relate to this particular behaviour. Use the prompt below if you like, or use your own.

---
#### NOTE:  
Replace **'______'** in the prompt below with a term that best describes the kind of behaviour you'd like to examine.

### Prompt to use:

Generate a list of words that could be used in a LIWC-like dictionary to identify instances of ______ in a transcript of a conversation between two or more people. Do not use wildcard characters, provide all inflected forms of words that might otherwise appear in a LIWC dictionary as wildcards. Make sure the list is all in lowercase, ordered alphabetically, with each set of inflected forms of a word together on the same line. Output this in the form of a list of strings that can be copied and pasted into python code as a list.

---


In [None]:
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from functools import reduce 
from IPython.display import Markdown, display

## Define your measure terms and give the measure a name

In [None]:
# Copy the output from the LLM and paste it next to the variable named "measure_words" below.
# Make sure you have also copied the square braces [ ] that were part of the LLM output,
# so that the result is a Python list of lowercase strings.

measure_words = [ ]

# Give your measure a single- or two-word name.
# Replace the 'XXXX' below with the name.
# Make sure the name is enclosed in single or double quotation marks.
# The name is reused further down to build dataframe column names,
# plot labels and the file name of the saved measure plot.
measure_name = 'XXXX'

# Load Transcript

In [None]:
## **** USE THE CODE BELOW IF USING GOOGLE COLAB. COMMENT OUT THE REST. ****

# Opens Colab's file-upload dialog.
# NOTE: `uploaded` is not referenced by the cells shown in this notebook;
# the next cell loads the transcript by its file path instead.
from google.colab import files
uploaded = files.upload()

## **** USE THE CODE BELOW IF USING JUPYTER NOTEBOOK/LAB. COMMENT OUT THE REST. ****
# Shows an upload widget restricted to .xlsx files; the next cell reads
# the uploaded content from `uploader.value`.
# import io
# import ipywidgets as widgets
# uploader = widgets.FileUpload(
#     accept='.xlsx',  # Accepted file extension e.g. '.txt', '.pdf', 'image/*', 'image/*,.pdf'
#     multiple=False  # True to accept multiple files upload else False
# )
# display(uploader)

In [None]:
## **** USE THE CODE BELOW IF USING GOOGLE COLAB. COMMENT OUT THE REST. ****
# Copy the file path from the navigator to the left and paste it between the
# quotation marks '' in the function call below.
# The cells further down read the 'speaker' and 'utterance' columns of this dataframe.
transcript_df = pd.read_excel('')

## **** USE THE CODE BELOW IF USING JUPYTER NOTEBOOK/LAB. COMMENT OUT THE REST. ****
# uploaded_file = uploader.value[0]
# transcript_df = pd.read_excel(io.BytesIO(uploaded_file.content))

## **** KEEP THIS BIT OF THE CODE REGARDLESS OF COLAB/JUPYTER USAGE ****
# Each dataframe row is reported as one speech turn.
print('###############################################')
print("Loaded", transcript_df.shape[0], "speech turns into a dataframe.")
print('###############################################')

In [None]:
# Print a random sample of the dataframe, showing up to 5 rows.
# The sample size is capped at the number of rows because sample() raises
# when asked for more rows than the dataframe holds.
transcript_df.sample(min(5, len(transcript_df)))

## Counting Relevant Terms in the Dataset
We now count the number of terms in each utterance that also exist in the above list of measure-related terms.

In [None]:
def make_regex(lex_list):
    r"""Build a whole-word regular expression matching any term in a lexicon.

    Each term is treated as literal text (regex metacharacters are escaped),
    and the alternation is wrapped in word boundaries so that, for example,
    "love" does not match inside "glove".

    Parameters:
        lex_list: iterable of strings (single words or phrases).

    Returns:
        A pattern string of the form \b(?:term1|term2|...)\b. When the
        lexicon holds no usable terms, a pattern that never matches.
    """
    escaped_terms = [re.escape(term) for term in lex_list if term]
    if not escaped_terms:
        # An empty alternation would match the empty string at every word
        # boundary and inflate every count, so return a never-matching pattern.
        return r'(?!)'
    return r'\b(?:' + '|'.join(escaped_terms) + r')\b'

def count_lexicon_terms(sentence, lexicon):
    """Count the lexicon matches found in the lower-cased sentence.

    The pattern comes from make_regex(lexicon); every non-overlapping match
    contributes one to the returned total.
    """
    pattern = make_regex(lexicon)
    lowered = sentence.lower()
    return sum(1 for _ in re.finditer(pattern, lowered))

def count_words(sentence):
    """Return the number of whitespace-separated tokens in the sentence."""
    return len(sentence.split())

In [None]:
def add_measure(df, lex_category_name, lex_category_list):
    """Add per-utterance lexicon counts for one measure to the dataframe.

    For every row of df['utterance'] this computes the number of words, the
    number of lexicon matches, and the matches divided by the word count.

    Parameters:
        df: dataframe with an 'utterance' column (one speech turn per row).
        lex_category_name: name of the measure, used to build column names.
        lex_category_list: list of lexicon terms to count.

    Returns:
        The dataframe with these columns appended, in this order:
        'word count' (only if not already present),
        '<name> count' and 'normalized <name> measure'.
        The columns are inserted into the dataframe that was passed in,
        unless stale measure columns had to be dropped first, in which case
        a new dataframe is returned. Always use the return value.
    """
    count_column = lex_category_name + ' count'
    normalized_column = 'normalized ' + lex_category_name + ' measure'

    # Re-running the cell must replace the previous results rather than
    # create duplicate columns, so drop the columns this function generates.
    stale_columns = [name for name in (count_column, normalized_column) if name in df.columns]
    if stale_columns:
        print("Column already exists for", lex_category_name, ". Repopulating...")
        df = df.drop(columns=stale_columns)

    column_index = len(df.columns)
    measures = []
    measures_normalized = []
    word_counts = []
    for utterance in df['utterance']:
        # Empty spreadsheet cells are read as NaN; treat them as empty text.
        text = '' if pd.isna(utterance) else str(utterance)
        num_words = count_words(text)
        # Every row gets an entry so the lists always line up with the dataframe.
        measure = count_lexicon_terms(text, lex_category_list) if num_words else 0
        word_counts.append(num_words)
        measures.append(measure)
        # A turn without words has no meaningful ratio; report 0 instead of dividing by zero.
        measures_normalized.append(measure / num_words if num_words else 0.0)

    # Inserting at the same index in reverse order yields:
    # word count, <name> count, normalized <name> measure.
    df.insert(column_index, normalized_column, measures_normalized)
    df.insert(column_index, count_column, measures)
    if 'word count' not in df.columns:
        df.insert(column_index, 'word count', word_counts)
    return df

In [None]:
# Compute the word count and the measure columns for every speech turn.
# The result is assigned back because add_measure returns the updated dataframe.
transcript_df = add_measure(transcript_df, measure_name, measure_words)

In [None]:
# Show up to 5 random rows, now including the newly added measure columns.
# The sample size is capped at the number of rows because sample() raises
# when asked for more rows than the dataframe holds.
transcript_df.sample(min(5, len(transcript_df)))

## Overview of speaker participation

In [None]:
# Build one summary row per speaker: total words, total measure words and number of turns.

# Unique speaker names (set order is arbitrary) and how many there are;
# num_speakers is also used to size the figures in the plotting cells below.
speakers = list(set(transcript_df['speaker'].to_list()))
num_speakers = len(speakers)

# Total words per speaker: broadcast the per-speaker sum onto every row,
# then keep one (speaker, total) row per speaker.
word_count_df = transcript_df[['speaker', 'word count']].copy()
word_count_df['total word count'] = word_count_df.groupby('speaker')['word count'].transform('sum')
word_count_df = word_count_df[['speaker', 'total word count']].drop_duplicates()

# Same aggregation for the number of measure-related words.
measure_count_df = transcript_df[['speaker', measure_name+' count']].copy()
measure_count_df['total '+measure_name+' count'] = measure_count_df.groupby('speaker')[measure_name+' count'].transform('sum')
measure_count_df = measure_count_df[['speaker', 'total '+measure_name+' count']].drop_duplicates()


# Number of speech turns (dataframe rows) per speaker.
turn_counts = []
for speaker in speakers :
    turn_count = transcript_df[transcript_df['speaker'] == speaker].shape[0]
    turn_counts.append(turn_count)
turns_df = pd.DataFrame({'speaker': speakers, 'turn count': turn_counts})
# speaker_stats_df = pd.merge(word_count_df, turns_df, on="speaker")
# Chain-merge the three per-speaker tables on the 'speaker' column.
speaker_stats_df = reduce(lambda  left,right: pd.merge(left,right,on='speaker'), [word_count_df, measure_count_df, turns_df])

In [None]:
# Per-speaker normalised score: measure-related words divided by all words spoken by that speaker.
speaker_stats_df['normalised '+measure_name+' count'] = speaker_stats_df['total '+measure_name+' count']/speaker_stats_df['total word count']
# Display the per-speaker summary table as the cell output.
speaker_stats_df

In [None]:
# Horizontal bar chart: number of speech turns per speaker.
sns.set_context('paper', font_scale = 1)
# Figure height scales with the number of speakers (0.5 per speaker).
f, ax = plt.subplots(figsize=(7, num_speakers*0.5))
sns.barplot(data=speaker_stats_df, x='turn count', y='speaker', hue='speaker', palette ='Set2', linewidth=0)
# The mathtext fragment renders "speech turns" in bold; '~' stands for a space.
ax.set(ylabel="", xlabel="Total "+ r"$\bf{speech~turns}$" + " by each speaker")
sns.despine(right=True, top=True)

In [None]:
# Horizontal bar chart: total number of words spoken by each speaker.
sns.set_context('paper', font_scale = 1)
# Figure height scales with the number of speakers (0.5 per speaker).
f, ax = plt.subplots(figsize=(7, num_speakers*0.5))
sns.barplot(data=speaker_stats_df, x='total word count', y='speaker', hue='speaker', palette ='Set2', linewidth=0)
# The mathtext fragment renders "words spoken" in bold; '~' stands for a space.
ax.set(ylabel="", xlabel="Total "+ r"$\bf{words~spoken}$" + " by each speaker")
sns.despine(right=True, top=True)

In [None]:
# Overlay chart: measure-related words (coloured) on top of total words (grey) per speaker.
sns.set_context('paper', font_scale = 1)
f, ax = plt.subplots(figsize=(7, num_speakers*0.5))
# Background bars: every word spoken by the speaker.
sns.barplot(data=speaker_stats_df, x='total word count', y='speaker', color ='#eee', linewidth=0)
# Foreground bars: only the words that belong to the chosen measure.
sns.barplot(data=speaker_stats_df, x='total '+ measure_name + ' count', y='speaker', hue='speaker', palette ='Set2', linewidth=0)
# Spaces in the measure name are replaced by '~' so the name renders inside the bold mathtext.
ax.set(ylabel="", xlabel= r"$\bf{" + "~".join(measure_name.split(" ")) + "}$-related words in proportion to total words spoken by each speaker")
sns.despine(right=True, top=True)


In [None]:
# Horizontal bar chart: normalised measure score (measure words / total words) per speaker.
sns.set_context('paper', font_scale = 1)
f, ax = plt.subplots(figsize=(7, num_speakers*0.5))
sns.barplot(data=speaker_stats_df, x='normalised '+ measure_name + ' count', y='speaker', hue='speaker', palette ='Set2', linewidth=0)
# Spaces in the measure name are replaced by '~' so the name renders inside the bold mathtext.
ax.set(ylabel="", xlabel=r"$\bf{" + "~".join(measure_name.split(" ")) + "}$-related words by each speaker (normalised)")
sns.despine(right=True, top=True)

-----

Note the differences between the number of words and the proportion (the two charts above). Is there a large difference in your case? 

Normalisation can distort reality, so it is important to examine the data from multiple perspectives.

-----

## Compute overall scores for the chosen measure
To compute the overall score for the measure, simply count all the occurrences of words from the dictionary category in the transcript, and divide that value by the total number of words in the transcript.

In [None]:
def printmd(string):
    """Render the given string as Markdown in the notebook output."""
    rendered = Markdown(string)
    display(rendered)
# Transcript-wide totals are obtained by summing the per-speaker totals.
total_word_count = speaker_stats_df['total word count'].sum()
total_measure_words = speaker_stats_df['total '+measure_name+' count'].sum()
# Overall score: share of all words in the transcript that belong to the measure.
normalised_measure_for_transcript = total_measure_words/total_word_count
printmd("------")
print("")
printmd("**%d** words from the **" % (total_measure_words) + measure_name + "** category found in the transcript out of **%d** total words." % (total_word_count))
printmd("Normalised **" + measure_name + "** score for the entire transcript: **%1.3f**" % (normalised_measure_for_transcript))
printmd("------")

## Examine word usage in detail

How do we know whether what you have identified (using dictionary categories and word counts) is accurate?

We can plot the occurrences at the speaker level to begin with.

### Plot turn-level word counts for each speaker

Let's start with visualising the number of words spoken for each turn. We do this separately for each speaker.

In [None]:
import os  # only needed to create the output folder for the saved figure

# Number every speech turn so that each turn has a position on the x-axis.
session_turns = transcript_df.shape[0]
transcript_df['turn index'] = np.arange(0,session_turns)
sns.set_style('ticks', {'axes.linewidth': 0.5, 'axes.edgecolor':'grey', 'axes.spinecolor' : 'grey'})
sns.set_context('talk', font_scale = 1)
# fig = plt.figure(dpi=200)
# One wide facet (row) per speaker, each showing that speaker's word count per turn.
g = sns.FacetGrid(transcript_df, row="speaker", aspect=10)
# NOTE(review): x is passed as the full 'turn index' Series rather than a column
# name -- presumably so that every facet shares the same turn axis; confirm.
g.map_dataframe(sns.barplot, x =transcript_df["turn index"], y="word count", color="cornflowerblue", linewidth=0)
# Label every 20th turn only, to keep the axis readable.
p = plt.xticks(np.arange(0, session_turns, 20))
plt.xlabel('Speech Turn Index')
# plt.title("Word count for each speaker over the duration of the conversation", fontsize=14)
# savefig does not create missing folders, so make sure './plots' exists first.
os.makedirs('./plots', exist_ok=True)
plt.savefig('./plots/speech_turns.pdf', bbox_inches='tight')

### Plot turn-level occurrences of words from chosen dictionary category

We can then examine the occurrences of the words from the chosen dictionary category in proportion to the total words per speech turn.

In [None]:
def plot_measure(df1, measure, show_all=False):
    """Plot a measure for every speech turn and save the figure as a PDF.

    Parameters:
        df1: dataframe with 'speaker', 'word count', '<measure> count' and
            'normalized <measure> measure' columns, one row per speech turn.
            A 'zero_index' column (0..n-1) is added to, or overwritten on,
            this dataframe.
        measure: name of the measure; used for column lookup, titles and
            the output file name './plots/<measure>.pdf'.
        show_all: if True, draw the raw measure counts on top of the total
            word counts; if False (default), draw only the normalised score.

    Returns:
        None. The figure is saved to './plots/' and shown.
    """
    import os  # local import: only needed to create the output folder

    session_turns = df1.shape[0]
    df1['zero_index'] = np.arange(0,session_turns)
    sns.set_style('ticks', {'axes.linewidth': 0.5, 'axes.edgecolor':'grey', 'axes.spinecolor' : 'grey'})
    sns.set_context('talk', font_scale = 1)
    # One unit of width per 20 turns, but never less than 1: short transcripts
    # would otherwise round down to a zero-width figure.
    fig_width = max(1, round(session_turns/20))
    plt.figure(figsize=[fig_width,4], dpi=300)
    if show_all :
        # Grey background bars: all words in the turn; coloured bars: measure words only.
        sns.barplot(x=df1.zero_index, y='word count', data=df1, color ='#eee', linewidth=0)
        ax = sns.barplot(x=df1.zero_index, y=measure+' count', data=df1, hue='speaker', palette ='Set2', linewidth=0)
        plt.title("Speech turn-wise word count with non-normalised measure for: " + measure, fontsize=9)
        ax.set_ylabel("word / " + measure + " counts\n(non-normalised)")
    else :
        ax = sns.barplot(x=df1.zero_index, y='normalized '+measure+' measure', data=df1, hue='speaker', palette ='Set2', linewidth=0)
        plt.title("Speech turn-wise normalised measure for: " + measure, fontsize=13)
        ax.set_ylabel(measure+" score\n(normalised)")

    # Label every 20th turn only, to keep the axis readable.
    plt.xticks(np.arange(0, session_turns, 20))
    ax.set_xlabel("Turn Index")
    # Spread the legend over more columns as the number of speakers grows.
    speakers = len(set(list(df1['speaker'])))
    legend_columns = round(speakers/4) + 1
    plt.legend(loc='best', ncol=legend_columns, borderaxespad=0.15)
    # savefig does not create missing folders, so make sure './plots' exists first.
    os.makedirs('./plots', exist_ok=True)
    plt.savefig('./plots/'+ measure +'.pdf', bbox_inches='tight')
    plt.show()

#### HINT:
If you want only the normalised measure, set `show_all = False` in the function call below. 

However, if you want to see the number of words from the dictionary category plotted against the total words spoken for each turn, set `show_all = True`.

In [None]:
# Plot the chosen measure for every speech turn of the transcript.
# With show_all=False only the normalised score is drawn.
plot_measure(transcript_df, measure_name, show_all=False)

### Examine the word occurrences in context

Choose a particular interval from the figure above based on what activity around your chosen behaviour you would like to examine.

Select a turn index closest to the activity and examine the text that follows this turn.

In [None]:
def match_lexicon_terms(sentence, lexicon):
    """Return the lower-cased sentence with every lexicon match wrapped in '**'.

    The '**' markers render as bold text when the result is shown as Markdown.

    Parameters:
        sentence: the utterance text to annotate.
        lexicon: list of lexicon terms, turned into a pattern by make_regex.

    Returns:
        The lower-cased sentence with each match replaced by '**match**'.
    """
    reg_str = make_regex(lexicon)
    # A single substitution pass keeps each match paired with its own text,
    # which the previous findall/split bookkeeping could not guarantee when
    # the pattern contained capture groups.
    return re.sub(reg_str, lambda match: '**' + match.group(0) + '**', sentence.lower())
    
def print_lines(df2, lexicon, start, num_lines=10):
    """Print consecutive speech turns with lexicon matches highlighted.

    Starting at row position `start`, up to `num_lines` turns are shown.
    Each turn is printed as '<index> : <speaker>', followed by the utterance
    rendered as Markdown (matches in bold) and a separator line.
    """
    stop = start + num_lines
    speaker_window = list(df2['speaker'])[start:stop]
    utterance_window = list(df2['utterance'])[start:stop]
    for index, (speaker, line) in enumerate(zip(speaker_window, utterance_window), start=start):
        highlighted = match_lexicon_terms(line, lexicon)
        print(str(index) + ' : ', end='')
        print(speaker)
        printmd(highlighted)
        print('----------')
        
def concordance(text, word, width=80, occurrences=10,nospace=False):
    """Print occurrences of a word in the text together with surrounding context.

    Parameters:
        text: the text to search.
        word: the exact substring to look for (case-sensitive).
        width: approximate total width of each printed context window;
            the context on each side is (width - len(word)) / 2 characters.
        occurrences: maximum number of matches to print.
        nospace: if True, print the window as one continuous string,
            left-padded with spaces when the match is near the start of the
            text; if False, set the match apart from its left and right
            context with two spaces on each side.

    Returns:
        None. Each match is printed as '<context> : <character offset>'.
    """
    if not word:
        # str.find('') succeeds at every position; there is nothing to show
        # and the search position would never advance.
        return

    # A word longer than `width` leaves no room for context.
    margin = max(0, int((width - len(word))/2))
    start = 0
    counter = 0
    while start < len(text) and counter < occurrences:
        # Search up to the very end of the text so a final match is not missed.
        ind = text.find(word, start)
        if ind < 0 :
            break

        match_end = ind + len(word)
        window_start = max(0, ind - margin)

        if nospace :
            # Pad on the left when the match is closer to the start than `margin`.
            spaces = ' ' * max(0, margin - ind)
            print(spaces + text[window_start:match_end+margin], ":", str(ind))
        else :
            left_text = text[window_start:ind]
            mid_text = text[ind:match_end]
            right_text = text[match_end:match_end+margin]
            print(left_text + "  " + mid_text + "  " + right_text, ":", str(ind))

        # Continue right after this match.
        start = match_end
        counter += 1
        

def print_concordances(df2, filter_keywords, match_limit=10, nospace=False):
    """Print concordance lines for each keyword across the whole transcript.

    All utterances are joined into one space-separated string, and every
    keyword is looked up in it with concordance(); a line of 80 dashes is
    printed after each keyword.

    Parameters:
        df2: dataframe with an 'utterance' column.
        filter_keywords: iterable of keywords to look up.
        match_limit: passed to concordance() as `occurrences`.
        nospace: passed through to concordance().
    """
    # Missing utterances are skipped and non-string cells converted, so the
    # join cannot fail on NaN values read from empty spreadsheet cells.
    utt_str = ' '.join(df2['utterance'].dropna().astype(str))
    for term in filter_keywords :
        concordance(utt_str, term, width=80, occurrences=match_limit, nospace=nospace)
        print("-" * 80)

#### HINT:
If you want to see more speech turns, increase the `num_lines` value in the function call below.

In [None]:
# Show 10 speech turns starting at turn index 35, with measure words in bold.
# Change 35 to the turn index you want to inspect.
print_lines(transcript_df, measure_words, 35, num_lines=10)