In [53]:
import pandas as pd
import numpy as np 
import re
import nltk
#from transformers import BertTokenizer

### Create a setence level data set using the filename, speaker, and timestamp as labels 

In [54]:
file_path = '/Users/steffenerickson/Box Sync/ECR Observation Data/2023-2024 Final Data/Quant Team - Results'
df = pd.read_csv(f'{file_path}/transcripts.csv')

In [55]:
df_lines_split = df['text'].str.split(r'\\n').apply(pd.Series).stack().to_frame('line_str')
df_lines_split.reset_index(drop=True, inplace=True)
df_lines_split['filename'] = df['filename'].repeat(df_lines_split.shape[0] // len(df)).reset_index(drop=True)
df_cleaned = df_lines_split[df_lines_split['line_str'].str.strip() != ""]

In [56]:
df_lines_split.to_csv(f'{file_path}/df_lines_split.csv', index=False)

In [57]:
def is_teacher_timestamp(text):
    return bool(re.search(r'teacher \d+.*\d+:\d+', text, re.IGNORECASE))
def is_transcribed_by_otter(text):
    return bool(re.search(r'transcribed by https://otter\.ai', text, re.IGNORECASE))

In [58]:
df_cleaned = df_cleaned.copy()
df_cleaned.loc[:, 'is_teacher_timestamp'] = df_cleaned['line_str'].apply(is_teacher_timestamp)
df_cleaned.loc[:, 'firstrow'] = df_cleaned.groupby('filename')['is_teacher_timestamp'].transform(
    lambda x: x.idxmax() == x.index).fillna(False).astype(int)
df_cleaned.loc[:, 'is_transcribed_by_otter'] = df_cleaned['line_str'].apply(is_transcribed_by_otter)
df_cleaned.loc[:, 'lastrow'] = df_cleaned['is_transcribed_by_otter'].apply(lambda x: 1 if x else 0)

In [59]:
df_cleaned.to_csv(f'{file_path}/df_cleaned.csv', index=False)

In [43]:
def keep_between_ones(group):
    start_idx = group['firstrow'].idxmax()  # First occurrence of 1 in the second column
    end_idx = group['lastrow'].idxmax()    # First occurrence of 1 in the third column
    return group.loc[start_idx:end_idx-1]
df_chopped = df_cleaned.groupby('filename', group_keys=False).apply(keep_between_ones)
df_chopped = df_chopped.drop(['firstrow', 'lastrow','is_teacher_timestamp','is_transcribed_by_otter'], axis=1)

In [44]:
df_chopped.to_csv(f'{file_path}/df_chopped.csv', index=False)

In [32]:
# Define a function to classify the rows based on the 'line_str'
def classify_line_str(line):
    if re.match(r'Teacher.*\d{2}:\d{2}', line):
        return 1
    elif re.match(r'.*\d{2}:\d{2}', line):
        return 2
    else:
        return np.nan
df_chopped['classification'] = df_chopped['line_str'].apply(classify_line_str)

In [33]:
# Define a function to extract the person (text before the timestamp) and timestamp
def extract_person_and_timestamp(line):
    # Use regex to match the format "[text] [timestamp]"
    match = re.search(r'(.+?)\s+(\d{2}:\d{2})', line)  # Adjusted to find any text before the timestamp
    if match:
        return match.group(1).strip(), match.group(2)
    else:
        return np.nan, np.nan
df_chopped['person'], df_chopped['timestamp'] = zip(*df_chopped.apply(
    lambda row: extract_person_and_timestamp(row['line_str']) if pd.notna(row['classification']) else (np.nan, np.nan), axis=1))

In [34]:
df_chopped['keep'] = df_chopped['classification'].apply(lambda x: 1 if pd.isna(x) else 0)
df_chopped['classification'] = df_chopped['classification'].ffill()
df_chopped['person'] = df_chopped['person'].ffill()
df_chopped['timestamp'] = df_chopped['timestamp'].ffill()
df_filtered = df_chopped[df_chopped['keep'] == 1]
df_filtered = df_filtered.drop(['keep'], axis=1)
df_filtered.reset_index(drop=True, inplace=True)

In [35]:
df_sentences = df_filtered.line_str.apply(lambda x: pd.Series(nltk.sent_tokenize(x))).stack().to_frame('sent_str')
df_sentences = df_sentences.reset_index(level=1, drop=True)
variables = ['filename', 'classification', 'person', 'timestamp','line_str']
for var in variables:
    df_sentences[var] = df_filtered[var].repeat(df_sentences.groupby(level=0).size()).values

In [36]:
#df_speaker = df_sentences[df_sentences['person'].str.contains('speaker', case=False, na=False)]

In [37]:
## Apply the conditions to locate the rows to be updated
#condition = (
#    (df_sentences['filename'] == "01_0101_G4_L03_transcript.txt") & 
#    (df_sentences['person'].str.contains('speaker', case=False, na=False))
#)
## Update the 'person' column
#df_sentences.loc[condition, 'person'] = "Teacher 01_0101"
## Update the 'classification' column to 1 for the same rows
#df_sentences.loc[condition, 'classification'] = 1

In [38]:
 df_sentences.head()

Unnamed: 0,sent_str,filename,classification,person,timestamp,line_str
0,You got it.,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,47:59,You got it. Nope which one which one? Sorry. Y...
0,Nope which one which one?,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,47:59,You got it. Nope which one which one? Sorry. Y...
0,Sorry.,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,47:59,You got it. Nope which one which one? Sorry. Y...
0,You guys are good.,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,47:59,You got it. Nope which one which one? Sorry. Y...
1,Which planets revolve around the sun?,01_0101_G4_L04_transcript.txt,1.0,Teacher 01_0101,36:14,Which planets revolve around the sun?


### Tokenize Sentences 

In [13]:
keep_whitespace = True
if keep_whitespace:
    # Return a tokenized copy of text
    # using NLTK's recommended word tokenizer.
    df_tokens = df_sentences.sent_str\
            .apply(lambda x: pd.Series(nltk.pos_tag(nltk.word_tokenize(x))))\
            .stack()\
            .to_frame('pos_tuple')
else:
    # Tokenize a string on whitespace (space, tab, newline).
    # In general, users should use the string ``split()`` method instead.
    # Returns fewer tokens.
    df_tokens = SENTS.sent_str\
            .apply(lambda x: pd.Series(nltk.pos_tag(nltk.WhitespaceTokenizer().tokenize(x))))\
            .stack()\
            .to_frame('pos_tuple')
df_tokens['pos'] = df_tokens.pos_tuple.apply(lambda x: x[1])
df_tokens['token_str'] = df_tokens.pos_tuple.apply(lambda x: x[0])
df_tokens['term_str'] = df_tokens.token_str.str.lower().str.replace(r"\W+", "", regex=True)
df_tokens['pos_group'] = df_tokens.pos.str[:2]

In [14]:
variables = ['filename', 'classification', 'person', 'timestamp','sent_str','line_str']
repeat_counts = df_sentences['sent_str'].apply(lambda x: len(nltk.word_tokenize(x)))
for var in variables:
    df_tokens[var] = np.repeat(df_sentences[var].values, repeat_counts)

In [15]:
df_tokens = df_tokens[df_tokens.term_str != '']

In [16]:
df_tokens.head(50)

Unnamed: 0,Unnamed: 1,pos_tuple,pos,token_str,term_str,pos_group,filename,classification,person,timestamp,sent_str,line_str
0,0,"(You, PRP)",PRP,You,you,PR,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,47:59,You got it.,You got it. Nope which one which one? Sorry. Y...
0,1,"(got, VBD)",VBD,got,got,VB,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,47:59,You got it.,You got it. Nope which one which one? Sorry. Y...
0,2,"(it, PRP)",PRP,it,it,PR,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,47:59,You got it.,You got it. Nope which one which one? Sorry. Y...
0,0,"(Nope, NN)",NN,Nope,nope,NN,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,47:59,Nope which one which one?,You got it. Nope which one which one? Sorry. Y...
0,1,"(which, WDT)",WDT,which,which,WD,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,47:59,Nope which one which one?,You got it. Nope which one which one? Sorry. Y...
0,2,"(one, CD)",CD,one,one,CD,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,47:59,Nope which one which one?,You got it. Nope which one which one? Sorry. Y...
0,3,"(which, WDT)",WDT,which,which,WD,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,47:59,Nope which one which one?,You got it. Nope which one which one? Sorry. Y...
0,4,"(one, CD)",CD,one,one,CD,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,47:59,Nope which one which one?,You got it. Nope which one which one? Sorry. Y...
0,0,"(Sorry, NNP)",NNP,Sorry,sorry,NN,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,47:59,Sorry.,You got it. Nope which one which one? Sorry. Y...
0,0,"(You, PRP)",PRP,You,you,PR,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,47:59,You guys are good.,You got it. Nope which one which one? Sorry. Y...


### Vocab Tables 

In [17]:
# Teacher Vocab 
df_tokens_teacher = df_tokens[df_tokens['classification'] == 1]
vocab_teacher = df_tokens_teacher['term_str'].value_counts().to_frame('n')
vocab_teacher.index.name = 'term_str'
vocab_teacher['p'] = vocab_teacher['n'] / vocab_teacher['n'].sum()  # Probability of each term
vocab_teacher['i'] = -np.log2(vocab_teacher['p'])  # Information content
vocab_teacher['n_chars'] = vocab_teacher.index.str.len()  # Number of characters in each term

In [18]:
#Student Vocab
df_tokens_student = df_tokens[df_tokens['classification'] == 2]
vocab_student = df_tokens_student['term_str'].value_counts().to_frame('n')
vocab_student.index.name = 'term_str'
vocab_student['p'] = vocab_student['n'] / vocab_student['n'].sum()  # Probability of each term
vocab_student['i'] = -np.log2(vocab_student['p'])  # Information content
vocab_student['n_chars'] = vocab_student.index.str.len()  # Number of characters in each term
vocab_student.head()

Unnamed: 0_level_0,n,p,i,n_chars
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
the,1627,0.045186,4.467991,3
it,1184,0.032882,4.926536,2
i,899,0.024967,5.323812,1
you,773,0.021468,5.541665,3
and,753,0.020913,5.579484,3


In [19]:
vocab_teacher.head(20)

Unnamed: 0_level_0,n,p,i,n_chars
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
you,6528,0.04061,4.622016,3
the,5288,0.032896,4.925935,3
it,4202,0.02614,5.257581,2
to,4196,0.026103,5.259642,2
i,3207,0.01995,5.647433,1
and,3003,0.018681,5.742253,3
that,2932,0.01824,5.776772,4
a,2588,0.0161,5.956819,1
s,2582,0.016062,5.960168,1
we,2536,0.015776,5.986102,2


In [20]:
vocab_student.head(20)

Unnamed: 0_level_0,n,p,i,n_chars
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
the,1627,0.045186,4.467991,3
it,1184,0.032882,4.926536,2
i,899,0.024967,5.323812,1
you,773,0.021468,5.541665,3
and,753,0.020913,5.579484,3
to,714,0.019829,5.65621,2
a,672,0.018663,5.743672,1
s,631,0.017524,5.834494,1
that,593,0.016469,5.924101,4
we,565,0.015691,5.993883,2


In [21]:
df_tokens_student.head(50)

Unnamed: 0,Unnamed: 1,pos_tuple,pos,token_str,term_str,pos_group,filename,classification,person,timestamp,sent_str,line_str
2,0,"(What, WP)",WP,What,what,WP,01_0101_G4_L04_transcript.txt,2.0,Unidentified Student,36:06,What does revolve mean?,What does revolve mean?
2,1,"(does, VBZ)",VBZ,does,does,VB,01_0101_G4_L04_transcript.txt,2.0,Unidentified Student,36:06,What does revolve mean?,What does revolve mean?
2,2,"(revolve, VB)",VB,revolve,revolve,VB,01_0101_G4_L04_transcript.txt,2.0,Unidentified Student,36:06,What does revolve mean?,What does revolve mean?
2,3,"(mean, VB)",VB,mean,mean,VB,01_0101_G4_L04_transcript.txt,2.0,Unidentified Student,36:06,What does revolve mean?,What does revolve mean?
3,0,"(It, PRP)",PRP,It,it,PR,01_0101_G4_L04_transcript.txt,2.0,Unidentified Student,36:07,It means go around.,It means go around.
3,1,"(means, VBZ)",VBZ,means,means,VB,01_0101_G4_L04_transcript.txt,2.0,Unidentified Student,36:07,It means go around.,It means go around.
3,2,"(go, VBP)",VBP,go,go,VB,01_0101_G4_L04_transcript.txt,2.0,Unidentified Student,36:07,It means go around.,It means go around.
3,3,"(around, RB)",RB,around,around,RB,01_0101_G4_L04_transcript.txt,2.0,Unidentified Student,36:07,It means go around.,It means go around.
5,0,"(Yay, NNP)",NNP,Yay,yay,NN,01_0101_G4_L04_transcript.txt,2.0,Whole Class,36:17,Yay (varied respones),Yay (varied respones)
5,2,"(varied, JJ)",JJ,varied,varied,JJ,01_0101_G4_L04_transcript.txt,2.0,Whole Class,36:17,Yay (varied respones),Yay (varied respones)


In [22]:
df_tokens_teacher.head(50)

Unnamed: 0,Unnamed: 1,pos_tuple,pos,token_str,term_str,pos_group,filename,classification,person,timestamp,sent_str,line_str
0,0,"(You, PRP)",PRP,You,you,PR,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,47:59,You got it.,You got it. Nope which one which one? Sorry. Y...
0,1,"(got, VBD)",VBD,got,got,VB,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,47:59,You got it.,You got it. Nope which one which one? Sorry. Y...
0,2,"(it, PRP)",PRP,it,it,PR,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,47:59,You got it.,You got it. Nope which one which one? Sorry. Y...
0,0,"(Nope, NN)",NN,Nope,nope,NN,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,47:59,Nope which one which one?,You got it. Nope which one which one? Sorry. Y...
0,1,"(which, WDT)",WDT,which,which,WD,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,47:59,Nope which one which one?,You got it. Nope which one which one? Sorry. Y...
0,2,"(one, CD)",CD,one,one,CD,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,47:59,Nope which one which one?,You got it. Nope which one which one? Sorry. Y...
0,3,"(which, WDT)",WDT,which,which,WD,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,47:59,Nope which one which one?,You got it. Nope which one which one? Sorry. Y...
0,4,"(one, CD)",CD,one,one,CD,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,47:59,Nope which one which one?,You got it. Nope which one which one? Sorry. Y...
0,0,"(Sorry, NNP)",NNP,Sorry,sorry,NN,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,47:59,Sorry.,You got it. Nope which one which one? Sorry. Y...
0,0,"(You, PRP)",PRP,You,you,PR,01_0101_G4_L03_transcript.txt,1.0,Teacher 01_0101,47:59,You guys are good.,You got it. Nope which one which one? Sorry. Y...


In [23]:
df_tokens.to_csv(f'{file_path}/df_tokens.csv', index=False)