In [1]:
import pandas as pd
import scipy.stats as stats
from utils import ALL_SUBJECT_IDS

# Per-file audio durations and per-transcript size metrics.
interview_durations = pd.read_csv("../metadata/interview_durations.csv")
transcript_lengths = pd.read_csv("../metadata/transcript_lengths.csv")

# Predicted gender labels: normalise the label text (trim whitespace, upper
# case), rename the key column to match the other tables, and keep only the
# subjects listed in ALL_SUBJECT_IDS.
predicted_gender = (
    pd.read_csv("../metadata/predicted_gender_nopii.csv")
    .assign(
        PredictedGender=lambda frame: frame["PredictedGender"].str.strip().str.upper()
    )
    .rename(columns={"SubjectID": "subject_id"})
    .query("subject_id in @ALL_SUBJECT_IDS")
)

# Show the column layout of each table as a quick sanity check.
for loaded_frame in (interview_durations, transcript_lengths, predicted_gender):
    print(loaded_frame.columns)

Index(['filename', 'subject_id', 'duration'], dtype='object')
Index(['filename', 'subject_id', 'num_words', 'num_lines', 'num_characters'], dtype='object')
Index(['subject_id', 'PredictedGender'], dtype='object')


In [2]:
# Number of subjects per predicted-gender label.
gender_counts = predicted_gender["PredictedGender"].value_counts()
print(gender_counts)

PredictedGender
M    51
F    42
Name: count, dtype: int64


In [3]:
# Report the mean and standard deviation (pandas' default sample s.d.) of each
# transcript size metric across all rows of transcript_lengths.
# Fix: the separator in the message used to read "+/" (missing the "-");
# it now reads "+/-".
print(
    "Transcripts were on average \n {:.2f} +/- {:.2f} s.d. characters, \n {:.2f} +/- {:.2f} s.d. words, and \n {:.2f} +/- {:.2f} s.d. lines.".format(
        transcript_lengths["num_characters"].mean(),
        transcript_lengths["num_characters"].std(),
        transcript_lengths["num_words"].mean(),
        transcript_lengths["num_words"].std(),
        transcript_lengths["num_lines"].mean(),
        transcript_lengths["num_lines"].std(),
    )
)

Transcripts were on average 
 22508.54 +/ 7203.33 s.d. characters, 
 4044.30 +/ 1348.34 s.d. words, and 
 280.59 +/ 61.50 s.d. lines.


In [4]:
# interview_durations has one row per audio file. Some subjects have more than
# one file (the Part1/Part2 recordings), so rows can outnumber subjects;
# compare the row count with the number of unique subject_ids printed below.
print(interview_durations.head(n=5))
print(interview_durations.shape)
print(interview_durations.subject_id.nunique())

# Collapse to one row per subject by summing the durations of all of that
# subject's files.
total_durations = (
    interview_durations.groupby("subject_id")["duration"].sum().reset_index()
)
total_durations.columns = ["subject_id", "duration"]
print(len(total_durations))
# Enforce the expected number of subjects rather than relying on someone
# eyeballing the printed value; the later merge cells assert the same count.
assert len(total_durations) == 93, len(total_durations)
print(total_durations.head())

# duration is in seconds; divide by 60 to report minutes.
print(
    "Interviews lasted on average {:.1f} +/- {:.1f} s.d. minutes".format(
        total_durations["duration"].mean() / 60,
        total_durations["duration"].std() / 60,
    )
)

              filename subject_id  duration
0  C001_audio_only.m4a       C001      2029
1  C002_audio_only.m4a       C002      2238
2  C003_audio_only.m4a       C003      2156
3  C004_audio_only.m4a       C004      1017
4  C005_audio_only.m4a       C005      1722
(96, 3)
93
93
  subject_id  duration
0       C001      2029
1       C002      2238
2       C003      2156
3       C004      1017
4       C005      1722
Interviews lasted on average 26.7 +/- 8.9 s.d. minutes


In [5]:
# Attach the predicted gender label to each transcript row (inner join on
# subject_id), then compare the mean transcript size metrics of the M and F
# groups. Significance is tested in the next cell.
merged_transcript_data = transcript_lengths.merge(predicted_gender, on="subject_id")
n_merged_rows = len(merged_transcript_data)
# Every subject is expected to survive the join exactly once.
assert n_merged_rows == 93, n_merged_rows

length_columns = ["num_words", "num_lines", "num_characters"]
average_metrics_by_gender = (
    merged_transcript_data.groupby("PredictedGender")[length_columns].mean()
)

print(average_metrics_by_gender)

                   num_words   num_lines  num_characters
PredictedGender                                         
F                3832.595238  275.714286     21353.52381
M                4218.647059  284.607843     23459.72549


In [6]:
# Welch's t-test (equal_var=False, i.e. unequal variances allowed) comparing
# the M and F groups on each transcript size metric.
is_male = merged_transcript_data["PredictedGender"] == "M"
is_female = merged_transcript_data["PredictedGender"] == "F"

for metric in ("num_words", "num_lines", "num_characters"):
    male_metrics = merged_transcript_data.loc[is_male, metric]
    female_metrics = merged_transcript_data.loc[is_female, metric]
    t_stat, p_value = stats.ttest_ind(male_metrics, female_metrics, equal_var=False)
    print(f"{metric} - T-statistic: {t_stat}, P-value: {p_value}")

num_words - T-statistic: 1.4018535862222636, P-value: 0.16436635207905567
num_lines - T-statistic: 0.6891871822932596, P-value: 0.4925582648912449
num_characters - T-statistic: 1.4275924098942627, P-value: 0.15685097102515808


In [7]:
# Compare total interview duration (seconds) between the M and F groups.
merged_data = total_durations.merge(predicted_gender, on="subject_id", how="inner")
n_duration_rows = len(merged_data)
# NOTE(review): a previous TODO here asked why this assertion fails. The saved
# output for this cell shows it running to completion, so the check appears to
# pass with the current metadata - confirm on a fresh run and drop this note.
assert n_duration_rows == 93, n_duration_rows

duration_by_gender = merged_data.groupby("PredictedGender")["duration"]
average_durations_by_gender = duration_by_gender.mean()
print(average_durations_by_gender)

# Welch's t-test (unequal variances allowed) on the per-subject durations.
male_durations = merged_data.loc[merged_data["PredictedGender"] == "M", "duration"]
female_durations = merged_data.loc[merged_data["PredictedGender"] == "F", "duration"]
t_stat, p_value = stats.ttest_ind(male_durations, female_durations, equal_var=False)
print(f"T-statistic: {t_stat}, P-value: {p_value}")

PredictedGender
F    1485.595238
M    1701.705882
Name: duration, dtype: float64
T-statistic: 1.9895127759804625, P-value: 0.04965473728160002
