In [1]:
import os
import pandas as pd

from tqdm.notebook import tqdm

In [2]:
# Directory (relative to the working directory) that the cells below list and
# read CSV files from.
dir_name = "../data_preprocessed_csv/"

In [3]:
# os.listdir returns entries in arbitrary order and also lists non-CSV entries
# (e.g. a .ipynb_checkpoints directory), which pd.read_csv cannot load.
# Keep only CSV files and sort them so every run processes the same files in
# the same order.
filenames = sorted(
    name for name in os.listdir(dir_name) if name.lower().endswith(".csv")
)

In [4]:
def create_df_from_filename(filename):
    """Read one CSV file from ``dir_name`` into a DataFrame.

    Parameters
    ----------
    filename : str
        Name of the file inside ``dir_name``.

    Returns
    -------
    pandas.DataFrame
        The file contents, with the first CSV column used as the index.
    """
    # os.path.join works whether or not dir_name ends with a path separator,
    # unlike plain string concatenation.
    return pd.read_csv(os.path.join(dir_name, filename), index_col=0)


def create_list_of_questioners(df):
    """Return the distinct speakers of the rows whose ``text_type`` is ``"q"``.

    The speakers are listed in order of first appearance; the result is an
    empty list when the frame has no ``"q"`` rows.
    """
    is_question = df["text_type"] == "q"
    question_speakers = df.loc[is_question, "speaker"]
    return question_speakers.unique().tolist()


def find_filenames_with_multiples_questioners(filenames):
    """Return the files in which more than one speaker asks questions.

    Each matching filename is also printed, followed by its list of
    questioners. The returned list keeps the order of ``filenames``.
    """
    matches = []
    for filename in filenames:
        questioners = create_list_of_questioners(create_df_from_filename(filename))
        # zero or one questioner: nothing to report for this file
        if len(questioners) <= 1:
            continue
        print(f"{filename}\n{questioners}\n")
        matches.append(filename)

    return matches


def create_questioner_stats_from_filename(filename):
    """Compute per-speaker question statistics for a single file.

    Three helper columns are added to the loaded frame:

    * ``num_words``   -- whitespace-separated word count of ``text``
      (0 when ``text`` is missing),
    * ``objection``   -- True for a ``"q"`` row whose next row has
      ``text_type == "side_chat"``,
    * ``strike_that`` -- True when ``text`` contains "scratch that" or
      "strike that" (case-insensitive).

    Parameters
    ----------
    filename : str
        Name of the CSV file, passed to ``create_df_from_filename``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The annotated frame, and a frame built from the ``"q"`` rows only,
        indexed by ``speaker``, with the columns ``num_questions``,
        ``av_num_words``, ``objection_ratio``, ``strike_ratio`` and
        ``filename``.
    """
    df = create_df_from_filename(filename)

    # A missing text cell is NaN; str(NaN) is "nan", which would be counted as
    # a one-word utterance and skew the per-speaker mean, so count it as 0.
    df["num_words"] = df.text.map(
        lambda x: 0 if pd.isna(x) else len(str(x).split())
    )
    # shift(-1) looks at the following row; the last row is compared against
    # NaN and is therefore never flagged.
    df["objection"] = (df.text_type == "q") & (df.text_type.shift(-1) == "side_chat")
    df["strike_that"] = df.text.map(
        lambda x: "scratch that" in str(x).lower() or "strike that" in str(x).lower()
    )

    # The boolean flags are summed here (giving counts) and turned into ratios
    # below by dividing by the number of questions.
    df_questioners = (
        df[df.text_type == "q"]
        .groupby("speaker")
        .agg({"speaker": "count", "num_words": "mean", "objection": "sum", 'strike_that': "sum"})
        .rename(
            columns={
                "speaker": "num_questions",
                "num_words": "av_num_words",
                "objection": "objection_ratio",
                "strike_that": 'strike_ratio'
            }
        )
    )

    df_questioners["objection_ratio"] = (
        df_questioners.objection_ratio / df_questioners.num_questions
    )
    df_questioners["strike_ratio"] = (
        df_questioners.strike_ratio / df_questioners.num_questions
    )
    df_questioners["filename"] = filename

    return df, df_questioners


def print_questions_with_objections(filename):
    """Print every flagged question of a file together with the row after it.

    A row is flagged when its ``objection`` column (computed by
    ``create_questioner_stats_from_filename``) is True. Each selected row is
    printed as ``speaker: text`` with the speaker padded to 15 characters.
    """
    df, _ = create_questioner_stats_from_filename(filename)
    # Select the flagged questions *and* the row that follows each of them.
    # fill_value=False keeps the shifted column boolean; a plain shift would
    # put NaN in the first position and turn the mask into object dtype.
    indices = df.objection | df.objection.shift(1, fill_value=False)
    for _, row in df[indices].iterrows():
        print(f"{row.speaker:15}: {row.text}")


def print_questions_striked(filename):
    """Print every row of a file whose ``strike_that`` flag is True.

    The flag comes from ``create_questioner_stats_from_filename``. Rows are
    selected on that flag alone (no ``text_type`` filter) and printed as
    ``speaker: text`` with the speaker padded to 15 characters.
    """
    annotated, _ = create_questioner_stats_from_filename(filename)
    struck_rows = annotated[annotated.strike_that]
    for _, row in struck_rows.iterrows():
        print(f"{row.speaker:15}: {row.text}")
        

def create_questioner_stats_from_filenames(filenames):
    """Compute per-speaker question statistics for several files.

    Parameters
    ----------
    filenames : list of str
        Files to process; must contain at least one entry (an empty list
        raises ``IndexError``).

    Returns
    -------
    pandas.DataFrame
        The per-file statistics frames stacked in the order of ``filenames``.
        The ``speaker`` index of each frame is kept, so index values can
        repeat across files; the ``filename`` column tells them apart.
    """
    # the first file is processed outside the progress bar
    _, df_first = create_questioner_stats_from_filename(filenames[0])
    per_file_stats = [df_first]

    for filename in tqdm(filenames[1:]):
        _, df_file = create_questioner_stats_from_filename(filename)
        per_file_stats.append(df_file)

    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
    # every call; a single concat gives the same result in one pass.
    return pd.concat(per_file_stats)

## investigate questioner stats across all files

In [5]:
# Per-speaker question statistics for every file in `filenames`, stacked into
# one frame (one row per speaker per file).
df_questioners_corpus = create_questioner_stats_from_filenames(filenames)

  0%|          | 0/141 [00:00<?, ?it/s]

In [6]:
# Save the corpus-wide statistics as a CSV file in the working directory.
df_questioners_corpus.to_csv('lawyers_stats.csv')

### highest average number of words per question.
Questioners with 20 or fewer questions are excluded.

In [None]:
# Questioners with more than 20 questions, longest average question first.
(
    df_questioners_corpus
    .loc[lambda stats: stats["num_questions"] > 20]
    .loc[:, ["num_questions", "av_num_words", "filename"]]
    .sort_values(by="av_num_words", ascending=False)
    .head(10)
)

### lowest average number of words per question.
Questioners with 20 or fewer questions are excluded.

In [None]:
# Questioners with more than 20 questions, shortest average question first.
(
    df_questioners_corpus
    .loc[lambda stats: stats["num_questions"] > 20]
    .loc[:, ["num_questions", "av_num_words", "filename"]]
    .sort_values(by="av_num_words", ascending=True)
    .head(10)
)

### highest objection ratio
Entries with 20 or fewer questions are excluded.

In [None]:
# Questioners with more than 20 questions, highest objection ratio first.
(
    df_questioners_corpus
    .loc[lambda stats: stats["num_questions"] > 20]
    .loc[:, ["num_questions", "objection_ratio", "filename"]]
    .sort_values(by="objection_ratio", ascending=False)
    .head(30)
)

In [None]:
# File to inspect in the next cell; the commented-out line is an alternative.
filename = 'Morton,_David_-_Vol._1_-_Video.csv'
# filename = 'Lee_v_Hobart_-_8-26-19_-_Galatsis_-_FINAL.csv'

In [None]:
# Print each flagged question of the selected file with the row that follows it.
print_questions_with_objections(filename)

### highest strike ratio
Entries with 20 or fewer questions are excluded, as are the three files listed by name in the filter below.

In [None]:
# Questioners with more than 20 questions, highest strike ratio first.
# NOTE(review): the reason for excluding these three files is not recorded
# here -- confirm and document it.
(
    df_questioners_corpus
    .loc[
        lambda stats: (stats["num_questions"] > 20)
        & ~stats["filename"].isin(
            ["8-20-19-B-TS.csv", "8-20-19-TS.csv", "82819_Sicilia_Saimesier(1).csv"]
        )
    ]
    .loc[:, ["num_questions", "strike_ratio", "filename"]]
    .sort_values(by="strike_ratio", ascending=False)
    .head(30)
)

In [None]:
# File whose "strike that" / "scratch that" rows are printed in the next cell.
filename = 'SPorterfield.csv'

In [None]:
# Print every row of the selected file that contains "strike that" or "scratch that".
print_questions_striked(filename)

## investigate questioner stats in an individual file

In [None]:
# Choose one file (the commented-out line is an alternative) and compute both
# the annotated frame and the per-speaker statistics for it.
# filename = 'Morton,_David_-_Vol._1_-_Video.csv'
filename = 'Lee_v_Hobart_-_8-26-19_-_Galatsis_-_FINAL.csv'
df, df_questioners = create_questioner_stats_from_filename(filename)

In [None]:
# Preview the first rows of the per-speaker statistics for this file.
df_questioners.head()

In [None]:
# Print each flagged question of this file with the row that follows it.
print_questions_with_objections(filename)

## files with more than one questioner

In [None]:
# Collect the files with more than one questioner; the call also prints each
# matching filename with its list of questioners.
filenames_mult_questioners = find_filenames_with_multiples_questioners(filenames)