In [1]:
import pandas as pd
import numpy as np 
import re
import nltk
#from transformers import BertTokenizer

### Create a setence level data set using the filename, speaker, and timestamp as labels 

In [2]:
file_path = '/Users/steffenerickson/Box Sync/ECR Observation Data/2023-2024 Final Data/Quant Team - Results'
df = pd.read_csv(f'{file_path}/transcripts.csv')

In [3]:
df_lines_split = df['text'].str.split(r'\\n').apply(pd.Series).stack().to_frame('line_str')
df_lines_split.reset_index(drop=True, inplace=True)
df_lines_split['filename'] = df['filename'].repeat(df_lines_split.shape[0] // len(df)).reset_index(drop=True)
df_cleaned = df_lines_split[df_lines_split['line_str'].str.strip() != ""]

In [4]:
def is_teacher_timestamp(text):
    return bool(re.search(r'teacher \d+.*\d+:\d+', text, re.IGNORECASE))
def is_transcribed_by_otter(text):
    return bool(re.search(r'transcribed by https://otter\.ai', text, re.IGNORECASE))

In [5]:
df_cleaned = df_cleaned.copy()
df_cleaned.loc[:, 'is_teacher_timestamp'] = df_cleaned['line_str'].apply(is_teacher_timestamp)
df_cleaned.loc[:, 'firstrow'] = df_cleaned.groupby('filename')['is_teacher_timestamp'].transform(
    lambda x: x.idxmax() == x.index).fillna(False).astype(int)
df_cleaned.loc[:, 'is_transcribed_by_otter'] = df_cleaned['line_str'].apply(is_transcribed_by_otter)
df_cleaned.loc[:, 'lastrow'] = df_cleaned['is_transcribed_by_otter'].apply(lambda x: 1 if x else 0)

In [6]:
def keep_between_ones(group):
    start_idx = group['firstrow'].idxmax()  # First occurrence of 1 in the second column
    end_idx = group['lastrow'].idxmax()    # First occurrence of 1 in the third column
    return group.loc[start_idx:end_idx-1]
df_chopped = df_cleaned.groupby('filename', group_keys=False).apply(keep_between_ones)
df_chopped = df_chopped.drop(['firstrow', 'lastrow','is_teacher_timestamp','is_transcribed_by_otter'], axis=1)

In [7]:
# Define a function to classify the rows based on the 'line_str'
def classify_line_str(line):
    if re.match(r'Teacher.*\d{2}:\d{2}', line):
        return 1
    elif re.match(r'.*\d{2}:\d{2}', line):
        return 2
    else:
        return np.nan
df_chopped['classification'] = df_chopped['line_str'].apply(classify_line_str)

In [8]:
# Define a function to extract the person (text before the timestamp) and timestamp
def extract_person_and_timestamp(line):
    # Use regex to match the format "[text] [timestamp]"
    match = re.search(r'(.+?)\s+(\d{2}:\d{2})', line)  # Adjusted to find any text before the timestamp
    if match:
        return match.group(1).strip(), match.group(2)
    else:
        return np.nan, np.nan
df_chopped['person'], df_chopped['timestamp'] = zip(*df_chopped.apply(
    lambda row: extract_person_and_timestamp(row['line_str']) if pd.notna(row['classification']) else (np.nan, np.nan), axis=1))

In [9]:
df_chopped['keep'] = df_chopped['classification'].apply(lambda x: 1 if pd.isna(x) else 0)
df_chopped['classification'] = df_chopped['classification'].ffill()
df_chopped['person'] = df_chopped['person'].ffill()
df_chopped['timestamp'] = df_chopped['timestamp'].ffill()
df_filtered = df_chopped[df_chopped['keep'] == 1]
df_filtered = df_filtered.drop(['keep'], axis=1)
df_filtered.reset_index(drop=True, inplace=True)

In [10]:
df_sentences = df_filtered.line_str.apply(lambda x: pd.Series(nltk.sent_tokenize(x))).stack().to_frame('sent_str')
df_sentences = df_sentences.reset_index(level=1, drop=True)
variables = ['filename', 'classification', 'person', 'timestamp','line_str']
for var in variables:
    df_sentences[var] = df_filtered[var].repeat(df_sentences.groupby(level=0).size()).values

In [11]:
#df_speaker = df_sentences[df_sentences['person'].str.contains('speaker', case=False, na=False)]

In [12]:
## Apply the conditions to locate the rows to be updated
#condition = (
#    (df_sentences['filename'] == "01_0101_G4_L03_transcript.txt") & 
#    (df_sentences['person'].str.contains('speaker', case=False, na=False))
#)
## Update the 'person' column
#df_sentences.loc[condition, 'person'] = "Teacher 01_0101"
## Update the 'classification' column to 1 for the same rows
#df_sentences.loc[condition, 'classification'] = 1

### Tokenize Sentences 

In [13]:
keep_whitespace = True
if keep_whitespace:
    # Return a tokenized copy of text
    # using NLTK's recommended word tokenizer.
    df_tokens = df_sentences.sent_str\
            .apply(lambda x: pd.Series(nltk.pos_tag(nltk.word_tokenize(x))))\
            .stack()\
            .to_frame('pos_tuple')
else:
    # Tokenize a string on whitespace (space, tab, newline).
    # In general, users should use the string ``split()`` method instead.
    # Returns fewer tokens.
    df_tokens = SENTS.sent_str\
            .apply(lambda x: pd.Series(nltk.pos_tag(nltk.WhitespaceTokenizer().tokenize(x))))\
            .stack()\
            .to_frame('pos_tuple')
df_tokens['pos'] = df_tokens.pos_tuple.apply(lambda x: x[1])
df_tokens['token_str'] = df_tokens.pos_tuple.apply(lambda x: x[0])
df_tokens['term_str'] = df_tokens.token_str.str.lower().str.replace(r"\W+", "", regex=True)
df_tokens['pos_group'] = df_tokens.pos.str[:2]

In [14]:
variables = ['filename', 'classification', 'person', 'timestamp','sent_str','line_str']
repeat_counts = df_sentences['sent_str'].apply(lambda x: len(nltk.word_tokenize(x)))
for var in variables:
    df_tokens[var] = np.repeat(df_sentences[var].values, repeat_counts)

In [15]:
df_tokens = df_tokens[df_tokens.term_str != '']

In [16]:
df_tokens.head(50)

Unnamed: 0,Unnamed: 1,pos_tuple,pos,token_str,term_str,pos_group,filename,classification,person,timestamp,sent_str,line_str
0,0,"(You, PRP)",PRP,You,you,PR,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,45:33,You can't see those up there?,You can't see those up there? Which ones are y...
0,1,"(ca, MD)",MD,ca,ca,MD,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,45:33,You can't see those up there?,You can't see those up there? Which ones are y...
0,2,"(n't, RB)",RB,n't,nt,RB,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,45:33,You can't see those up there?,You can't see those up there? Which ones are y...
0,3,"(see, VB)",VB,see,see,VB,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,45:33,You can't see those up there?,You can't see those up there? Which ones are y...
0,4,"(those, DT)",DT,those,those,DT,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,45:33,You can't see those up there?,You can't see those up there? Which ones are y...
0,5,"(up, IN)",IN,up,up,IN,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,45:33,You can't see those up there?,You can't see those up there? Which ones are y...
0,6,"(there, RB)",RB,there,there,RB,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,45:33,You can't see those up there?,You can't see those up there? Which ones are y...
0,0,"(Which, JJ)",JJ,Which,which,JJ,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,45:33,Which ones are you going to put down?,You can't see those up there? Which ones are y...
0,1,"(ones, NNS)",NNS,ones,ones,NN,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,45:33,Which ones are you going to put down?,You can't see those up there? Which ones are y...
0,2,"(are, VBP)",VBP,are,are,VB,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,45:33,Which ones are you going to put down?,You can't see those up there? Which ones are y...


### Vocab Tables 

In [17]:
# Teacher Vocab 
df_tokens_teacher = df_tokens[df_tokens['classification'] == 1]
vocab_teacher = df_tokens_teacher['term_str'].value_counts().to_frame('n')
vocab_teacher.index.name = 'term_str'
vocab_teacher['p'] = vocab_teacher['n'] / vocab_teacher['n'].sum()  # Probability of each term
vocab_teacher['i'] = -np.log2(vocab_teacher['p'])  # Information content
vocab_teacher['n_chars'] = vocab_teacher.index.str.len()  # Number of characters in each term

In [18]:
#Student Vocab
df_tokens_student = df_tokens[df_tokens['classification'] == 2]
vocab_student = df_tokens_student['term_str'].value_counts().to_frame('n')
vocab_student.index.name = 'term_str'
vocab_student['p'] = vocab_student['n'] / vocab_student['n'].sum()  # Probability of each term
vocab_student['i'] = -np.log2(vocab_student['p'])  # Information content
vocab_student['n_chars'] = vocab_student.index.str.len()  # Number of characters in each term
vocab_student.head()

Unnamed: 0_level_0,n,p,i,n_chars
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
the,1139,0.040721,4.618092,3
it,952,0.034035,4.876826,2
i,797,0.028494,5.133208,1
you,573,0.020486,5.609253,3
and,561,0.020056,5.639787,3


In [19]:
vocab_teacher.head(20)

Unnamed: 0_level_0,n,p,i,n_chars
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
you,5848,0.043253,4.531072,3
the,4316,0.031922,4.969321,3
to,3658,0.027055,5.20796,2
it,3314,0.024511,5.350442,2
i,2816,0.020827,5.585368,1
that,2486,0.018387,5.765189,4
and,2484,0.018372,5.76635,3
s,2101,0.015539,6.007939,1
a,2093,0.01548,6.013443,1
we,2075,0.015347,6.025904,2


In [20]:
vocab_student.head(20)

Unnamed: 0_level_0,n,p,i,n_chars
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
the,1139,0.040721,4.618092,3
it,952,0.034035,4.876826,2
i,797,0.028494,5.133208,1
you,573,0.020486,5.609253,3
and,561,0.020056,5.639787,3
to,517,0.018483,5.757624,2
a,516,0.018448,5.760417,1
s,460,0.016446,5.926154,1
like,444,0.015874,5.977228,4
that,401,0.014336,6.124186,4


In [21]:
df_tokens_student.head(50)

Unnamed: 0,Unnamed: 1,pos_tuple,pos,token_str,term_str,pos_group,filename,classification,person,timestamp,sent_str,line_str
1,0,"(Those, DT)",DT,Those,those,DT,01_0101_G4_L03_transcript.txt,2.0,Unidentified Student,45:41,Those two three.,Those two three.
1,1,"(two, CD)",CD,two,two,CD,01_0101_G4_L03_transcript.txt,2.0,Unidentified Student,45:41,Those two three.,Those two three.
1,2,"(three, CD)",CD,three,three,CD,01_0101_G4_L03_transcript.txt,2.0,Unidentified Student,45:41,Those two three.,Those two three.
2,0,"(Do, VBP)",VBP,Do,do,VB,01_0101_G4_L03_transcript.txt,2.0,Unidentified Student,45:43,Do we hit done?,Do we hit done?
2,1,"(we, PRP)",PRP,we,we,PR,01_0101_G4_L03_transcript.txt,2.0,Unidentified Student,45:43,Do we hit done?,Do we hit done?
2,2,"(hit, VB)",VB,hit,hit,VB,01_0101_G4_L03_transcript.txt,2.0,Unidentified Student,45:43,Do we hit done?,Do we hit done?
2,3,"(done, VBN)",VBN,done,done,VB,01_0101_G4_L03_transcript.txt,2.0,Unidentified Student,45:43,Do we hit done?,Do we hit done?
4,0,"(Jupiter, NN)",NN,Jupiter,jupiter,NN,01_0101_G4_L03_transcript.txt,2.0,Unidentified Student,45:54,Jupiter?,Jupiter?
6,0,"(Giants, NNS)",NNS,Giants,giants,NN,01_0101_G4_L03_transcript.txt,2.0,Unidentified Student,46:05,Giants?,Giants?
8,0,"(Neptune, NNP)",NNP,Neptune,neptune,NN,01_0101_G4_L03_transcript.txt,2.0,Unidentified Student,46:20,Neptune?,Neptune?


In [22]:
df_tokens_teacher.head(50)

Unnamed: 0,Unnamed: 1,pos_tuple,pos,token_str,term_str,pos_group,filename,classification,person,timestamp,sent_str,line_str
0,0,"(You, PRP)",PRP,You,you,PR,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,45:33,You can't see those up there?,You can't see those up there? Which ones are y...
0,1,"(ca, MD)",MD,ca,ca,MD,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,45:33,You can't see those up there?,You can't see those up there? Which ones are y...
0,2,"(n't, RB)",RB,n't,nt,RB,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,45:33,You can't see those up there?,You can't see those up there? Which ones are y...
0,3,"(see, VB)",VB,see,see,VB,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,45:33,You can't see those up there?,You can't see those up there? Which ones are y...
0,4,"(those, DT)",DT,those,those,DT,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,45:33,You can't see those up there?,You can't see those up there? Which ones are y...
0,5,"(up, IN)",IN,up,up,IN,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,45:33,You can't see those up there?,You can't see those up there? Which ones are y...
0,6,"(there, RB)",RB,there,there,RB,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,45:33,You can't see those up there?,You can't see those up there? Which ones are y...
0,0,"(Which, JJ)",JJ,Which,which,JJ,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,45:33,Which ones are you going to put down?,You can't see those up there? Which ones are y...
0,1,"(ones, NNS)",NNS,ones,ones,NN,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,45:33,Which ones are you going to put down?,You can't see those up there? Which ones are y...
0,2,"(are, VBP)",VBP,are,are,VB,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,45:33,Which ones are you going to put down?,You can't see those up there? Which ones are y...


In [23]:
df_tokens.to_csv(f'{file_path}/df_tokens.csv', index=False)