In [62]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [63]:
# Load the translated text CSV into the working frame.
# NOTE(review): 'latin8' is Python's alias for ISO-8859-14 (Celtic); confirm this is
# the file's real encoding and not a slip for 'latin1' or 'utf8'.
df = pd.read_csv('translatedDepText.csv', encoding='latin8')

In [64]:
df.shape

(386250, 12)

In [65]:
from tqdm.notebook import tqdm
import re

In [66]:
# import re

# def replace_cry(text):
#     # Define a regular expression pattern to match <cry> or <word cry> or <cry word> or any other variation
#     pattern = re.compile(r'<([^<>]*commotion[^<>]*)>', re.IGNORECASE)

#     # Replace occurrences with 'SAD'
#     replaced_text = re.sub(pattern, '<SAD>', text)

#     return replaced_text

# # Example usage
# original_text = "I feel <the commotion> when I see someone <filling in> or <fills in> or <fills>."
# modified_text = replace_cry(original_text)

# print(modified_text)

### Replacing the annotated tags in the English text based on Amir's suggestions

In [67]:
def _tag_pattern(keyword):
    """Compile a case-insensitive pattern for any <...> tag whose text contains `keyword`.

    The tag body may not itself contain '<' or '>', so a match never spans two tags.
    """
    return re.compile(rf'<([^<>]*{keyword}[^<>]*)>', re.IGNORECASE)


# One pattern per annotation keyword; each is substituted in the cleaning cell that follows.
crying = _tag_pattern('cry')
laughter = _tag_pattern('laugh')
chars = _tag_pattern('character')
figure = _tag_pattern('figure')
location = _tag_pattern('location')
place = _tag_pattern('place')
hums = _tag_pattern('fill')
hums2 = _tag_pattern('commotion')

In [68]:
# (pattern, replacement) pairs in the order they are applied. The order matters
# when one tag contains several keywords: the first matching pattern consumes it.
_TAG_REPLACEMENTS = (
    (crying, '(crying)'),
    (laughter, '(laughing)'),
    (chars, 'them'),
    (figure, 'them'),
    (place, 'this place'),
    (location, 'this place'),
    (hums, 'hmm'),
    (hums2, 'mm-hmm'),
)


def _replace_annotation_tags(text):
    """Return `text` with every annotation tag replaced by its plain-word stand-in.

    Missing values are returned as ``np.nan``. ``pd.isna`` is used rather than an
    identity test against ``np.nan`` because a NaN read from disk is not guaranteed
    to be the ``np.nan`` singleton; ``np.nan`` is also the spelling that still
    exists in NumPy 2.x (the ``np.NaN`` alias was removed).
    """
    if pd.isna(text):
        return np.nan
    for pattern, replacement in _TAG_REPLACEMENTS:
        text = pattern.sub(replacement, text)
    return text


# Iterate over the column itself instead of df.iterrows(): the values are the same,
# without building a Series per row. The result stays a plain list, one entry per
# row of df, in row order.
processed_text = [
    _replace_annotation_tags(text)
    for text in tqdm(df['eng_event_plaintext'], total=df.shape[0])
]

  0%|          | 0/386250 [00:00<?, ?it/s]

In [69]:
# df['eng_event_plaintext']

In [70]:
# processed_text[3]

In [71]:
# Overwrite the English text column with the tag-cleaned version
# (processed_text holds one entry per row of df, in row order).
df['eng_event_plaintext'] = processed_text

### Removing the first and last session of each treatment (c_code) from the dataset

In [79]:
# Drop the last session of every treatment (c_code).
# The keys use the same '<c_code>_<session_n>' format that filterindex holds. All
# of them are removed with one isin() mask rather than re-filtering the whole
# frame once per treatment.
# NOTE: this cell is not idempotent. The maximum is recomputed from the current df,
# so running it a second time also removes what is then the last remaining session.
last_sessions = df.groupby('c_code')['session_n'].max().reset_index()
last_session_ids = (
    last_sessions['c_code'].astype(str) + '_' + last_sessions['session_n'].astype(str)
)
df = df[~df.filterindex.isin(last_session_ids)]

In [80]:
df.shape

(360557, 12)

In [82]:
# Drop the first session of every treatment (c_code).
# The keys use the same '<c_code>_<session_n>' format that filterindex holds. All
# of them are removed with one isin() mask rather than re-filtering the whole
# frame once per treatment.
# NOTE: this cell is not idempotent. The minimum is recomputed from the current df,
# so running it a second time also removes what is then the first remaining session.
first_sessions = df.groupby('c_code')['session_n'].min().reset_index()
first_session_ids = (
    first_sessions['c_code'].astype(str) + '_' + first_sessions['session_n'].astype(str)
)
df = df[~df.filterindex.isin(first_session_ids)]

In [83]:
df.shape

(328689, 12)

### Flattening the dataframe: each timestamp is stored as its own row, so it is carried forward into a new `timestamp` column on the rows that follow

In [84]:
# Carry the text of each 'Timestamp' marker row forward onto the rows after it.
# A marker row contributes its own event_plaintext_full; every other row repeats
# the most recent value. `temp` ends up with one entry per row of df.
# NOTE(review): the carry-forward is not reset between sessions, so a session whose
# first row is not a marker inherits the previous session's last timestamp.
temp = []
# zip over the two columns gives the same values as df.iterrows() without
# constructing a Series for every row.
for speaker, stamp_text in zip(df['event_speaker'], df['event_plaintext_full']):
    if speaker == 'Timestamp':
        temp.append(stamp_text)
    elif temp:
        temp.append(temp[-1])
    else:
        # Nothing to carry forward yet; fail with an explanation instead of the
        # bare IndexError that temp[-1] would raise on an empty list.
        raise ValueError(
            "The first row of df is not a 'Timestamp' row, so there is no "
            "timestamp to carry forward."
        )

In [85]:
# Attach the carried-forward timestamp text to every row
# (positional assignment: temp has one entry per row of df).
df['timestamp'] = temp

In [86]:
# The 'Timestamp' marker rows are dropped now that their value lives in the
# timestamp column of the rows that followed them.
df = df[df.event_speaker!='Timestamp']

In [87]:
# Parse the carried-forward 'HH:MM:SS' strings into pandas timestamps.
# .str.split().str.join('') removes every whitespace character, exactly like
# "".join(x.split()) did per row, and the whole column is parsed in one
# pd.to_datetime call instead of one call per row.
# assign() returns a new frame, so nothing is written into the filtered slice
# produced by the previous cell (which would raise SettingWithCopyWarning).
# Only a time of day is present, so pandas fills in its default date (1900-01-01).
df = df.assign(
    timestamp=pd.to_datetime(df['timestamp'].str.split().str.join(''), format='%H:%M:%S')
)
# Conversion to datetime.time (.time()) is done later, after the 10-minute trimming,
# because the Timedelta arithmetic used there needs full timestamps.

### Segmenting the data based on quality (session-level)

In [88]:
# Snapshot the cleaned frame before the quality filter below shrinks df;
# og is used later as the pool from which annotation sessions are drawn.
og = df.copy()

In [89]:
# Session-level table providing the pre-/post-session ORS columns used for labelling.
# NOTE(review): 'latin8' is Python's alias for ISO-8859-14; confirm it matches the file.
sbs = pd.read_csv('sbsDepDone.csv', encoding='latin8')

In [90]:
# Keep only the session key columns and the two ORS scores.
# .copy() makes sbs an independent frame, so the column assignments in the next
# cells ('idx', 'diff', 'quality') do not trigger SettingWithCopyWarning.
sbs = sbs[['c_code', 'session_n', 'c_b_ors_Pre_Session_1', 'c_a_ors_Post_Session_1']].copy()

In [91]:
# Build the '<c_code>_<session_n>' session key; it is matched against
# df.filterindex when the quality label is merged in below.
sbs['idx'] = sbs['c_code'] + '_' + sbs['session_n'].astype(str)

In [92]:
# Within-session change: post-session ORS column minus pre-session ORS column
# (positive when the post-session value is higher).
sbs['diff'] = sbs['c_a_ors_Post_Session_1'] - sbs['c_b_ors_Pre_Session_1']

In [93]:
# Label every session by its pre/post change:
#   -1 when diff < -1.3, 1 when diff > 1.3, 0 otherwise.
# A missing diff also lands in the 0 class, because comparisons with NaN are False.
ors_change = sbs['diff']
conditions = [ors_change.lt(-1.3), ors_change.gt(1.3)]
choices = [-1, 1]
sbs['quality'] = np.select(conditions, choices, default=0)

In [94]:
sbs['quality'].value_counts()

 0    637
 1    173
-1     40
Name: quality, dtype: int64

In [95]:
# Attach the session-level quality label to every row by matching filterindex
# against the '<c_code>_<session_n>' key built on sbs.
# how='left' keeps rows with no match; their quality is NaN.
# NOTE(review): the merge gives df a fresh index and adds a redundant 'idx' column.
# If sbs ever held duplicate idx values, transcript rows would be duplicated here;
# verify that idx is unique.
df = pd.merge(df, sbs[['idx', 'quality']], left_on='filterindex', right_on='idx', how='left')

In [96]:
# Drop rows whose session had no entry in sbs (quality is NaN after the left merge).
df = df[df['quality'].notna()]

In [97]:
# Keep only sessions labelled 1 (diff > 1.3) or -1 (diff < -1.3); the 0 class is discarded.
# isin() also rejects NaN, so this filter on its own would cover the notna() step too.
df = df[df.quality.isin([-1, 1])]

### Extracting the middle of each session (dropping the first and last 10 minutes)

In [98]:
# Sessions that kept a -1/1 quality label. They form the training set and are
# later excluded from the annotation pool.
training_ids = df.filterindex.unique()

In [99]:
training_ids.shape

(186,)

In [100]:
# Keep only the middle of every training session: rows stamped within the first
# or the last ten minutes of the session are dropped.
ten_min = pd.Timedelta(minutes=10)

# Split the frame by session once instead of scanning all of df for every doc_id.
session_frames = {
    doc_id: subset
    for doc_id, subset in df[df.filterindex.isin(training_ids)].groupby('filterindex')
}

trimmed_sessions = []
for doc_id in training_ids:
    subset = session_frames.get(doc_id)
    if subset is None:
        # An id with no rows of its own (e.g. NaN) contributes nothing.
        continue
    startTime = subset.timestamp.min()
    endTime = subset.timestamp.max()
    in_window = (subset.timestamp >= startTime + ten_min) & (subset.timestamp <= endTime - ten_min)
    trimmed_sessions.append(subset[in_window])

# Concatenate once at the end. Growing the frame with pd.concat inside the loop
# copies everything accumulated so far on every iteration.
if trimmed_sessions:
    training_data = pd.concat(trimmed_sessions, ignore_index=True)
else:
    training_data = pd.DataFrame()

In [101]:
# Reduce every parsed timestamp to its time-of-day component (datetime.time).
clock_times = training_data.timestamp.map(lambda stamp: stamp.time())
training_data.timestamp = clock_times

In [102]:
# https://stackoverflow.com/questions/47492685/create-a-column-which-increments-value-for-changes-in-another-row
# https://stackoverflow.com/questions/70772311/remove-all-rows-until-some-value-in-a-column-is-met-using-pandas
# t = anno_data[anno_data.filterindex=='AA6090_10']
# t1 = t.dialog_turn_main_speaker
# t['turn_n'] = t1.ne(t1.shift()).cumsum()
# t[t.turn_n.eq(10).cummax()]

# Output folder per session-level quality label (1 -> good, -1 -> poor).
session_dirs = {
    1: './training_docs/good_sessions',
    -1: './training_docs/poor_sessions',
}
# to_csv does not create missing folders, so make sure they exist first.
for out_dir in session_dirs.values():
    os.makedirs(out_dir, exist_ok=True)

turn_columns = ['dialog_turn_main_speaker', 'event_speaker', 'eng_event_plaintext', 'turn_n']

# Write every training session out as consecutive windows of up to 10 dialogue turns.
# NOTE(review): the file names contain ':', which is not a legal file-name character
# on Windows; kept unchanged in case downstream code relies on these names.
for idx, group in training_data.groupby('filterindex'):
    # Sessions with fewer than 10 rows left after trimming are skipped.
    if group.shape[0] < 10:
        continue

    # A new turn starts whenever the main speaker differs from the previous row's.
    # assign() returns a new frame, so nothing is written into the groupby slice.
    speakers = group.dialog_turn_main_speaker
    group = group.assign(turn_n=speakers.ne(speakers.shift()).cumsum())

    # The quality label is looked up once per session rather than once per window.
    out_dir = session_dirs.get(group.quality.unique()[0])
    if out_dir is None:
        continue

    # The stop is max + 1 so the final turn is always covered. With range(min, max, 10)
    # the last turn was silently dropped whenever (max - min) was a multiple of 10.
    for i in range(group.turn_n.min(), group.turn_n.max() + 1, 10):
        turns = group.loc[(group.turn_n >= i) & (group.turn_n < i + 10), turn_columns]
        turns.to_csv(f'{out_dir}/{idx}_{i}:{i+10}.csv', index=None)

In [103]:
# Annotation candidates: every session in the pre-filter snapshot that is not a training session.
og = og[~og.filterindex.isin(training_ids)]

In [104]:
# Draw 350 distinct sessions for annotation from the remaining pool.
# A seeded generator makes the draw reproducible under Restart & Run All; the
# unseeded np.random.choice call picked a different set of sessions on every run.
# NOTE(review): the seed value is arbitrary, and the sessions drawn with it are not
# the ones any earlier unseeded run produced.
ANNO_SAMPLE_SEED = 0
anno_rng = np.random.default_rng(ANNO_SAMPLE_SEED)
anno_doc_ids = anno_rng.choice(og.filterindex.unique(), size=350, replace=False)

In [105]:
# Keep only the middle of every sampled annotation session: rows stamped within
# the first or the last ten minutes of the session are dropped.
ten_min = pd.Timedelta(minutes=10)

# Split the frame by session once instead of scanning all of og for every doc_id.
session_frames = {
    doc_id: subset
    for doc_id, subset in og[og.filterindex.isin(anno_doc_ids)].groupby('filterindex')
}

trimmed_sessions = []
# Iterate in anno_doc_ids order so the concatenated frame keeps the sampled order.
for doc_id in anno_doc_ids:
    subset = session_frames.get(doc_id)
    if subset is None:
        # An id with no rows of its own (e.g. NaN) contributes nothing.
        continue
    startTime = subset.timestamp.min()
    endTime = subset.timestamp.max()
    in_window = (subset.timestamp >= startTime + ten_min) & (subset.timestamp <= endTime - ten_min)
    trimmed_sessions.append(subset[in_window])

# Concatenate once at the end. Growing the frame with pd.concat inside the loop
# copies everything accumulated so far on every iteration.
if trimmed_sessions:
    anno_data = pd.concat(trimmed_sessions, ignore_index=True)
else:
    anno_data = pd.DataFrame()

In [106]:
# Reduce every parsed timestamp to its time-of-day component (datetime.time).
clock_times = anno_data.timestamp.map(lambda stamp: stamp.time())
anno_data.timestamp = clock_times

In [107]:
# dialogue_turns = {}

# for idx, group in anno_data.groupby('filterindex'):
#     if group.shape[0]<15: continue
        
#     max_idx = group.index.max()
#     random_idx = max_idx+1
#     while random_idx+15>max_idx:
#         random_idx = np.random.choice(group.index, 1)[0]
        
#     group = group.reset_index()
#     turns = group[(group['index']>random_idx) & (group['index']<random_idx+15)][['dialog_turn_main_speaker', 'event_speaker', 'eng_event_plaintext']]
#     dialogue_turns[idx] = list(turns.itertuples(name=None, index=False))
#     turns.to_csv(f'./anno_docs/{idx}.csv', index=None)

In [108]:
# https://stackoverflow.com/questions/47492685/create-a-column-which-increments-value-for-changes-in-another-row
# https://stackoverflow.com/questions/70772311/remove-all-rows-until-some-value-in-a-column-is-met-using-pandas
# t = anno_data[anno_data.filterindex=='AA6090_10']
# t1 = t.dialog_turn_main_speaker
# t['turn_n'] = t1.ne(t1.shift()).cumsum()
# t[t.turn_n.eq(10).cummax()]

# Seeded generator so the randomly chosen window of each session is reproducible.
# NOTE(review): the seed value is arbitrary.
ANNO_TURN_SEED = 0
turn_rng = np.random.default_rng(ANNO_TURN_SEED)

# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs('./anno_docs', exist_ok=True)

turn_columns = ['dialog_turn_main_speaker', 'event_speaker', 'eng_event_plaintext', 'turn_n']

# For every sampled session, pick one random starting turn and export the window
# of dialogue turns that begins there.
dialogue_turns = {}
for idx, group in anno_data.groupby('filterindex'):
    # Sessions with fewer than 10 rows left after trimming are skipped.
    if group.shape[0] < 10:
        continue

    # A new turn starts whenever the main speaker differs from the previous row's.
    # assign() returns a new frame, so nothing is written into the groupby slice.
    speakers = group.dialog_turn_main_speaker
    group = group.assign(turn_n=speakers.ne(speakers.shift()).cumsum())
    max_turn = group.turn_n.max()

    # A start turn is acceptable when at least 12 turns follow it, or when it is
    # turn 1 (the fallback for short sessions). Drawing directly from the acceptable
    # rows gives the same distribution as redrawing until one is hit, weighted by
    # rows per turn, but needs exactly one draw. Turn 1 always exists, so the
    # candidate set is never empty.
    eligible_starts = group.turn_n[(group.turn_n + 12 <= max_turn) | (group.turn_n == 1)]
    random_turn = turn_rng.choice(eligible_starts.to_numpy())

    # NOTE(review): the guard above reserves 12 turns but the window below keeps 11
    # (random_turn .. random_turn + 10); confirm which length is intended.
    turns = group.loc[
        (group.turn_n >= random_turn) & (group.turn_n < random_turn + 11),
        turn_columns,
    ]
    dialogue_turns[idx] = list(turns.itertuples(name=None, index=False))
    turns.to_csv(f'./anno_docs/{idx}.csv', index=None)

In [109]:
len(dialogue_turns)

348

In [112]:
dialogue_turns.keys()

dict_keys(['AA6090_4', 'AA6090_5', 'AA6090_8', 'AA7382_12', 'AA7382_15', 'AA7382_2', 'AA7382_5', 'AA7382_9', 'AC7672_11', 'AC7672_12', 'AC7672_14', 'AC7672_3', 'AC7672_5', 'AC7672_6', 'AC7672_7', 'AC7672_8', 'AD8893_4', 'AD8893_9', 'AL2797_12', 'AL2797_13', 'AL2797_3', 'AL2797_6', 'AL2797_7', 'AL2797_9', 'BH9982_10', 'BH9982_11', 'BH9982_14', 'BH9982_2', 'BH9982_4', 'BH9982_5', 'BH9982_6', 'BH9982_7', 'BH9982_9', 'DB0097_15', 'DB0097_7', 'DP6233_11', 'DP6233_12', 'DP6233_14', 'DP6233_15', 'DP6233_6', 'DY0994_11', 'DY0994_13', 'DY0994_14', 'DY0994_5', 'DY0994_6', 'DY0994_7', 'DY0994_8', 'EA2036_10', 'EA2036_11', 'EA2036_13', 'EA2036_14', 'EA2036_15', 'EA2036_3', 'EA2036_4', 'EA2036_6', 'EA2036_9', 'EB8723_10', 'EB8723_15', 'EB8723_2', 'EB8723_5', 'EB8723_7', 'EB8723_8', 'EC3599_10', 'EC3599_12', 'EC3599_15', 'EC3599_2', 'EC3599_3', 'EC3599_4', 'EC3599_5', 'EC3599_8', 'EC3599_9', 'EK4141_13', 'EK4141_14', 'EK4141_15', 'EL4745_10', 'EL4745_11', 'EL4745_15', 'EL4745_3', 'EL4745_4', 'EL4745