In [2]:
import pandas as pd
import numpy as np 
import re
import nltk
#from transformers import BertTokenizer

### Create a setence level data set using the filename, speaker, and timestamp as labels 

In [3]:
file_path = '/Users/steffenerickson/Box Sync/ECR Observation Data/2023-2024 Final Data/Quant Team - Results'
df = pd.read_csv(f'{file_path}/transcripts.csv')

In [4]:
df_lines_split = df['text'].str.split(r'\\n').apply(pd.Series).stack().to_frame('line_str')
df_lines_split.reset_index(drop=True, inplace=True)
df_lines_split['filename'] = df['filename'].repeat(df_lines_split.shape[0] // len(df)).reset_index(drop=True)
df_cleaned = df_lines_split[df_lines_split['line_str'].str.strip() != ""]

In [5]:
def is_teacher_timestamp(text):
    return bool(re.search(r'teacher \d+.*\d+:\d+', text, re.IGNORECASE))
def is_transcribed_by_otter(text):
    return bool(re.search(r'transcribed by https://otter\.ai', text, re.IGNORECASE))

In [6]:
df_cleaned = df_cleaned.copy()
df_cleaned.loc[:, 'is_teacher_timestamp'] = df_cleaned['line_str'].apply(is_teacher_timestamp)
df_cleaned.loc[:, 'firstrow'] = df_cleaned.groupby('filename')['is_teacher_timestamp'].transform(
    lambda x: x.idxmax() == x.index).fillna(False).astype(int)
df_cleaned.loc[:, 'is_transcribed_by_otter'] = df_cleaned['line_str'].apply(is_transcribed_by_otter)
df_cleaned.loc[:, 'lastrow'] = df_cleaned['is_transcribed_by_otter'].apply(lambda x: 1 if x else 0)

In [7]:
def keep_between_ones(group):
    start_idx = group['firstrow'].idxmax()  # First occurrence of 1 in the second column
    end_idx = group['lastrow'].idxmax()    # First occurrence of 1 in the third column
    return group.loc[start_idx:end_idx-1]
df_chopped = df_cleaned.groupby('filename', group_keys=False).apply(keep_between_ones)
df_chopped = df_chopped.drop(['firstrow', 'lastrow','is_teacher_timestamp','is_transcribed_by_otter'], axis=1)

In [8]:
# Define a function to classify the rows based on the 'line_str'
def classify_line_str(line):
    if re.match(r'Teacher.*\d{2}:\d{2}', line):
        return 1
    elif re.match(r'.*\d{2}:\d{2}', line):
        return 2
    else:
        return np.nan
df_chopped['classification'] = df_chopped['line_str'].apply(classify_line_str)

In [9]:
# Define a function to extract the person (text before the timestamp) and timestamp
def extract_person_and_timestamp(line):
    # Use regex to match the format "[text] [timestamp]"
    match = re.search(r'(.+?)\s+(\d{2}:\d{2})', line)  # Adjusted to find any text before the timestamp
    if match:
        return match.group(1).strip(), match.group(2)
    else:
        return np.nan, np.nan
df_chopped['person'], df_chopped['timestamp'] = zip(*df_chopped.apply(
    lambda row: extract_person_and_timestamp(row['line_str']) if pd.notna(row['classification']) else (np.nan, np.nan), axis=1))

In [10]:
df_chopped['keep'] = df_chopped['classification'].apply(lambda x: 1 if pd.isna(x) else 0)
df_chopped['classification'] = df_chopped['classification'].ffill()
df_chopped['person'] = df_chopped['person'].ffill()
df_chopped['timestamp'] = df_chopped['timestamp'].ffill()
df_filtered = df_chopped[df_chopped['keep'] == 1]
df_filtered = df_filtered.drop(['keep'], axis=1)
df_filtered.reset_index(drop=True, inplace=True)

In [11]:
df_sentences = df_filtered.line_str.apply(lambda x: pd.Series(nltk.sent_tokenize(x))).stack().to_frame('sent_str')
df_sentences = df_sentences.reset_index(level=1, drop=True)
variables = ['filename', 'classification', 'person', 'timestamp','line_str']
for var in variables:
    df_sentences[var] = df_filtered[var].repeat(df_sentences.groupby(level=0).size()).values

In [12]:
#df_speaker = df_sentences[df_sentences['person'].str.contains('speaker', case=False, na=False)]

In [13]:
## Apply the conditions to locate the rows to be updated
#condition = (
#    (df_sentences['filename'] == "01_0101_G4_L03_transcript.txt") & 
#    (df_sentences['person'].str.contains('speaker', case=False, na=False))
#)
## Update the 'person' column
#df_sentences.loc[condition, 'person'] = "Teacher 01_0101"
## Update the 'classification' column to 1 for the same rows
#df_sentences.loc[condition, 'classification'] = 1

### Tokenize Sentences 

In [14]:
keep_whitespace = True
if keep_whitespace:
    # Return a tokenized copy of text
    # using NLTK's recommended word tokenizer.
    df_tokens = df_sentences.sent_str\
            .apply(lambda x: pd.Series(nltk.pos_tag(nltk.word_tokenize(x))))\
            .stack()\
            .to_frame('pos_tuple')
else:
    # Tokenize a string on whitespace (space, tab, newline).
    # In general, users should use the string ``split()`` method instead.
    # Returns fewer tokens.
    df_tokens = SENTS.sent_str\
            .apply(lambda x: pd.Series(nltk.pos_tag(nltk.WhitespaceTokenizer().tokenize(x))))\
            .stack()\
            .to_frame('pos_tuple')
df_tokens['pos'] = df_tokens.pos_tuple.apply(lambda x: x[1])
df_tokens['token_str'] = df_tokens.pos_tuple.apply(lambda x: x[0])
df_tokens['term_str'] = df_tokens.token_str.str.lower().str.replace(r"\W+", "", regex=True)
df_tokens['pos_group'] = df_tokens.pos.str[:2]

In [15]:
variables = ['filename', 'classification', 'person', 'timestamp','sent_str','line_str']
repeat_counts = df_sentences['sent_str'].apply(lambda x: len(nltk.word_tokenize(x)))
for var in variables:
    df_tokens[var] = np.repeat(df_sentences[var].values, repeat_counts)

In [16]:
df_tokens = df_tokens[df_tokens.term_str != '']

In [17]:
df_tokens.head(50)

Unnamed: 0,Unnamed: 1,pos_tuple,pos,token_str,term_str,pos_group,filename,classification,person,timestamp,sent_str,line_str
0,0,"(You, PRP)",PRP,You,you,PR,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,46:25,"You could have wrote Jupiter, Saturn, Uranus a...","You could have wrote Jupiter, Saturn, Uranus a..."
0,1,"(could, MD)",MD,could,could,MD,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,46:25,"You could have wrote Jupiter, Saturn, Uranus a...","You could have wrote Jupiter, Saturn, Uranus a..."
0,2,"(have, VB)",VB,have,have,VB,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,46:25,"You could have wrote Jupiter, Saturn, Uranus a...","You could have wrote Jupiter, Saturn, Uranus a..."
0,3,"(wrote, VBD)",VBD,wrote,wrote,VB,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,46:25,"You could have wrote Jupiter, Saturn, Uranus a...","You could have wrote Jupiter, Saturn, Uranus a..."
0,4,"(Jupiter, NNP)",NNP,Jupiter,jupiter,NN,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,46:25,"You could have wrote Jupiter, Saturn, Uranus a...","You could have wrote Jupiter, Saturn, Uranus a..."
0,6,"(Saturn, NNP)",NNP,Saturn,saturn,NN,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,46:25,"You could have wrote Jupiter, Saturn, Uranus a...","You could have wrote Jupiter, Saturn, Uranus a..."
0,8,"(Uranus, NNP)",NNP,Uranus,uranus,NN,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,46:25,"You could have wrote Jupiter, Saturn, Uranus a...","You could have wrote Jupiter, Saturn, Uranus a..."
0,9,"(and, CC)",CC,and,and,CC,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,46:25,"You could have wrote Jupiter, Saturn, Uranus a...","You could have wrote Jupiter, Saturn, Uranus a..."
0,10,"(Neptune, NNP)",NNP,Neptune,neptune,NN,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,46:25,"You could have wrote Jupiter, Saturn, Uranus a...","You could have wrote Jupiter, Saturn, Uranus a..."
0,0,"(Or, CC)",CC,Or,or,CC,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,46:25,Or,"You could have wrote Jupiter, Saturn, Uranus a..."


### Vocab Tables 

In [18]:
# Teacher Vocab 
df_tokens_teacher = df_tokens[df_tokens['classification'] == 1]
vocab_teacher = df_tokens_teacher['term_str'].value_counts().to_frame('n')
vocab_teacher.index.name = 'term_str'
vocab_teacher['p'] = vocab_teacher['n'] / vocab_teacher['n'].sum()  # Probability of each term
vocab_teacher['i'] = -np.log2(vocab_teacher['p'])  # Information content
vocab_teacher['n_chars'] = vocab_teacher.index.str.len()  # Number of characters in each term

In [19]:
#Student Vocab
df_tokens_student = df_tokens[df_tokens['classification'] == 2]
vocab_student = df_tokens_student['term_str'].value_counts().to_frame('n')
vocab_student.index.name = 'term_str'
vocab_student['p'] = vocab_student['n'] / vocab_student['n'].sum()  # Probability of each term
vocab_student['i'] = -np.log2(vocab_student['p'])  # Information content
vocab_student['n_chars'] = vocab_student.index.str.len()  # Number of characters in each term
vocab_student.head()

Unnamed: 0_level_0,n,p,i,n_chars
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
the,1123,0.042482,4.557019,3
it,872,0.032987,4.921977,2
i,640,0.02421,5.368234,1
and,589,0.022281,5.488038,3
a,537,0.020314,5.621383,1


In [20]:
vocab_teacher.head(20)

Unnamed: 0_level_0,n,p,i,n_chars
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
you,5355,0.04203,4.572437,3
the,4040,0.031709,4.978968,3
to,3381,0.026537,5.235873,2
it,3316,0.026026,5.263879,2
i,2649,0.020791,5.587876,1
and,2508,0.019685,5.666786,3
that,2321,0.018217,5.778577,4
a,2031,0.015941,5.971133,1
s,1999,0.01569,5.994045,1
we,1945,0.015266,6.033553,2


In [21]:
vocab_student.head(20)

Unnamed: 0_level_0,n,p,i,n_chars
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
the,1123,0.042482,4.557019,3
it,872,0.032987,4.921977,2
i,640,0.02421,5.368234,1
and,589,0.022281,5.488038,3
a,537,0.020314,5.621383,1
you,514,0.019444,5.684537,3
to,502,0.01899,5.718618,2
s,443,0.016758,5.898999,1
that,403,0.015245,6.035526,4
we,399,0.015094,6.049917,2


In [22]:
df_tokens_student.head(50)

Unnamed: 0,Unnamed: 1,pos_tuple,pos,token_str,term_str,pos_group,filename,classification,person,timestamp,sent_str,line_str
1,0,"(Gas, NNP)",NNP,Gas,gas,NN,01_0101_G4_L03_transcript.txt,2.0,Unidentified Student,46:36,Gas giants.,Gas giants.
1,1,"(giants, NNS)",NNS,giants,giants,NN,01_0101_G4_L03_transcript.txt,2.0,Unidentified Student,46:36,Gas giants.,Gas giants.
6,0,"(I, PRP)",PRP,I,i,PR,01_0101_G4_L04_transcript.txt,2.0,Unidentified Student,34:51,I'm trying to think of the song where it sings...,I'm trying to think of the song where it sings...
6,1,"('m, VBP)",VBP,'m,m,VB,01_0101_G4_L04_transcript.txt,2.0,Unidentified Student,34:51,I'm trying to think of the song where it sings...,I'm trying to think of the song where it sings...
6,2,"(trying, VBG)",VBG,trying,trying,VB,01_0101_G4_L04_transcript.txt,2.0,Unidentified Student,34:51,I'm trying to think of the song where it sings...,I'm trying to think of the song where it sings...
6,3,"(to, TO)",TO,to,to,TO,01_0101_G4_L04_transcript.txt,2.0,Unidentified Student,34:51,I'm trying to think of the song where it sings...,I'm trying to think of the song where it sings...
6,4,"(think, VB)",VB,think,think,VB,01_0101_G4_L04_transcript.txt,2.0,Unidentified Student,34:51,I'm trying to think of the song where it sings...,I'm trying to think of the song where it sings...
6,5,"(of, IN)",IN,of,of,IN,01_0101_G4_L04_transcript.txt,2.0,Unidentified Student,34:51,I'm trying to think of the song where it sings...,I'm trying to think of the song where it sings...
6,6,"(the, DT)",DT,the,the,DT,01_0101_G4_L04_transcript.txt,2.0,Unidentified Student,34:51,I'm trying to think of the song where it sings...,I'm trying to think of the song where it sings...
6,7,"(song, NN)",NN,song,song,NN,01_0101_G4_L04_transcript.txt,2.0,Unidentified Student,34:51,I'm trying to think of the song where it sings...,I'm trying to think of the song where it sings...


In [23]:
df_tokens_teacher.head(50)

Unnamed: 0,Unnamed: 1,pos_tuple,pos,token_str,term_str,pos_group,filename,classification,person,timestamp,sent_str,line_str
0,0,"(You, PRP)",PRP,You,you,PR,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,46:25,"You could have wrote Jupiter, Saturn, Uranus a...","You could have wrote Jupiter, Saturn, Uranus a..."
0,1,"(could, MD)",MD,could,could,MD,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,46:25,"You could have wrote Jupiter, Saturn, Uranus a...","You could have wrote Jupiter, Saturn, Uranus a..."
0,2,"(have, VB)",VB,have,have,VB,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,46:25,"You could have wrote Jupiter, Saturn, Uranus a...","You could have wrote Jupiter, Saturn, Uranus a..."
0,3,"(wrote, VBD)",VBD,wrote,wrote,VB,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,46:25,"You could have wrote Jupiter, Saturn, Uranus a...","You could have wrote Jupiter, Saturn, Uranus a..."
0,4,"(Jupiter, NNP)",NNP,Jupiter,jupiter,NN,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,46:25,"You could have wrote Jupiter, Saturn, Uranus a...","You could have wrote Jupiter, Saturn, Uranus a..."
0,6,"(Saturn, NNP)",NNP,Saturn,saturn,NN,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,46:25,"You could have wrote Jupiter, Saturn, Uranus a...","You could have wrote Jupiter, Saturn, Uranus a..."
0,8,"(Uranus, NNP)",NNP,Uranus,uranus,NN,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,46:25,"You could have wrote Jupiter, Saturn, Uranus a...","You could have wrote Jupiter, Saturn, Uranus a..."
0,9,"(and, CC)",CC,and,and,CC,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,46:25,"You could have wrote Jupiter, Saturn, Uranus a...","You could have wrote Jupiter, Saturn, Uranus a..."
0,10,"(Neptune, NNP)",NNP,Neptune,neptune,NN,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,46:25,"You could have wrote Jupiter, Saturn, Uranus a...","You could have wrote Jupiter, Saturn, Uranus a..."
0,0,"(Or, CC)",CC,Or,or,CC,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,46:25,Or,"You could have wrote Jupiter, Saturn, Uranus a..."


In [24]:
df_tokens.to_csv(f'{file_path}/df_tokens.csv', index=False)