## This notebook creates the library, task, sentence, corpus (token), and vocabulary tables

In [17]:
import pandas as pd
import nltk
import re
import numpy as np

### Config 

In [18]:
import configparser

# Settings file shared by the project, addressed relative to this notebook.
config_path = "../../env.ini"

config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so an empty result means the settings were never loaded.
# Fail here with a clear message instead of a KeyError on the lookups below.
if not config.read(config_path):
    raise FileNotFoundError(f"Could not read config file: {config_path}")

# Input and output directories used by the rest of the notebook.
data_home  = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']

### Read in Data 

In [19]:
# Load the transcript text data from the Stata file under data_home.
# NOTE(review): the original comment here trailed off ("Text Data was cleaned
# using ..."), so the upstream cleaning step is not documented — TODO confirm.
df = pd.read_stata(f"{data_home}/metamodeltranscripttextdata.dta")

df['text'] = (
    df['text']
    # Drop "SUMMARY KEYWORDS" plus the shortest run of characters up to the
    # first NN:NN pattern, keeping that NN:NN pattern itself.
    .str.replace("SUMMARY KEYWORDS.*?([0-9]{2}:[0-9]{2})", "\\1", regex=True)
    # Replace NN:NN markers (presumably mm:ss timestamps — verify) at the start
    # of the text, or surrounded by whitespace, with a single space.
    .str.replace(r'^\d{2}:\d{2}\s+|\s+\d{2}:\d{2}\s+', " ", regex=True)
)

In [20]:
# Index levels of the corpus hierarchy, from coarsest to finest.
OHCO = ['personid', 'task', 'sent_num', 'token_num']

# Library table: the descriptive columns of each transcript record.
library_columns = ['id','metadata','site','course','person','personid','task']
library = df[library_columns]

# Task table: the transcript text keyed by the first two OHCO levels
# (personid, task).
df_tasks = df[['personid','task','text']].set_index(OHCO[:2])

In [21]:
# Preview the task-level table, indexed by (personid, task).
df_tasks

Unnamed: 0_level_0,Unnamed: 1_level_0,text
personid,task,Unnamed: 2_level_1
JMU_F22_002,P1,"Okay, so today, we're going to do a word prob..."
JMU_F22_002,P2,"Okay, guys, so today we're gonna do a word pr..."
JMU_F22_002,P3,"Okay, old problems. So, today we are going to..."
JMU_F22_002,P4,"Okay, so today we're gonna do a word problem...."
JMU_F22_002,P5,"Okay, guys, so today we're gonna do a problem..."
...,...,...
UVA_S23_023,Placement,So now we will start a simple maths lesson an...
UVA_S23_024,Placement,Today we're going to work on a fraction word ...
UVA_S23_025,Placement,"So, you might notice, I am filming you guys. ..."
UVA_S23_026,Placement,"Okay, all right. I started my recording. Alri..."


### Chunk to Sentences

In [22]:
#df_tasks

In [23]:
# Split every task transcript into sentences with NLTK: each transcript becomes
# one row of sentence columns, and stack() then turns that wide layout into one
# row per sentence, adding the sentence position as a new innermost index level.
sentence_matrix = df_tasks['text'].apply(
    lambda transcript: pd.Series(nltk.sent_tokenize(transcript))
)
df_sentences = sentence_matrix.stack().to_frame('sent_str')

# Label the index levels: personid, task, sent_num.
df_sentences.index = df_sentences.index.set_names(OHCO[:3])

In [24]:
# Preview the sentence-level table, indexed by (personid, task, sent_num).
df_sentences 

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
personid,task,sent_num,Unnamed: 3_level_1
JMU_F22_002,P1,0,"Okay, so today, we're going to do a word prob..."
JMU_F22_002,P1,1,And we're going to try to make sense of this w...
JMU_F22_002,P1,2,Okay.
JMU_F22_002,P1,3,"So, our problem says that Ava's mom, so Ava's ..."
JMU_F22_002,P1,4,Okay.
...,...,...,...
UVA_S23_027,Placement,37,I don't know what world Cody's living in but t...
UVA_S23_027,Placement,38,I'm sorry $1 They can either sell all tomatoes...
UVA_S23_027,Placement,39,Let's say we're trying to be the most efficien...
UVA_S23_027,Placement,40,What is the smallest number of tomatoes and pu...


### Tokenize Sentences 

In [25]:
# Choose the tokenizer applied to each sentence.
# NOTE(review): despite its name, keep_whitespace=True selects NLTK's
# recommended word tokenizer, while False selects the whitespace tokenizer
# (splits on space, tab, newline; returns fewer tokens). The flag name is kept
# unchanged in case other cells rely on it.
keep_whitespace = True
if keep_whitespace:
    tokenize = nltk.word_tokenize
else:
    tokenize = nltk.WhitespaceTokenizer().tokenize

# Tokenize and POS-tag every sentence, then stack to one (token, tag) tuple per
# row. Both settings now read from df_sentences; the whitespace branch
# previously referenced an undefined name (SENTS) and raised a NameError.
df_tokens = (
    df_sentences.sent_str
    .apply(lambda sentence: pd.Series(nltk.pos_tag(tokenize(sentence))))
    .stack()
    .to_frame('pos_tuple')
)

In [26]:
# Label the token index levels: personid, task, sent_num, token_num.
df_tokens.index = df_tokens.index.set_names(OHCO[:4])

In [27]:
# Unpack each (token, tag) tuple and derive the normalized term and coarse tag.
df_tokens = df_tokens.assign(
    # Second tuple element: the POS tag.
    pos=lambda tokens: tokens['pos_tuple'].map(lambda pair: pair[1]),
    # First tuple element: the token as it appeared in the sentence.
    token_str=lambda tokens: tokens['pos_tuple'].map(lambda pair: pair[0]),
    # Lower-cased token with every run of non-word characters removed.
    term_str=lambda tokens: tokens['token_str'].str.lower().str.replace(r"\W+", "", regex=True),
    # First two characters of the tag, used as a coarse POS group.
    pos_group=lambda tokens: tokens['pos'].str[:2],
)

In [28]:
# Preview the token table with its tag, token, term and POS-group columns.
df_tokens

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos_tuple,pos,token_str,term_str,pos_group
personid,task,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
JMU_F22_002,P1,0,0,"(Okay, NNP)",NNP,Okay,okay,NN
JMU_F22_002,P1,0,1,"(,, ,)",",",",",,","
JMU_F22_002,P1,0,2,"(so, RB)",RB,so,so,RB
JMU_F22_002,P1,0,3,"(today, NN)",NN,today,today,NN
JMU_F22_002,P1,0,4,"(,, ,)",",",",",,","
...,...,...,...,...,...,...,...,...
UVA_S23_027,Placement,41,5,"(have, VBP)",VBP,have,have,VB
UVA_S23_027,Placement,41,6,"(been, VBN)",VBN,been,been,VB
UVA_S23_027,Placement,41,7,"(through, IN)",IN,through,through,IN
UVA_S23_027,Placement,41,8,"(it, PRP)",PRP,it,it,PR


### Handle Anomalies

In [29]:
# Tokens whose normalized term is the empty string.
df_tokens.loc[df_tokens['term_str'].eq('')]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos_tuple,pos,token_str,term_str,pos_group
personid,task,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
JMU_F22_002,P1,0,1,"(,, ,)",",",",",,","
JMU_F22_002,P1,0,4,"(,, ,)",",",",",,","
JMU_F22_002,P1,0,13,"(., .)",.,.,,.
JMU_F22_002,P1,1,13,"(,, ,)",",",",",,","
JMU_F22_002,P1,1,30,"(., .)",.,.,,.
...,...,...,...,...,...,...,...,...
UVA_S23_027,Placement,37,14,"(., .)",.,.,,.
UVA_S23_027,Placement,38,3,"($, $)",$,$,,$
UVA_S23_027,Placement,38,20,"(., .)",.,.,,.
UVA_S23_027,Placement,39,20,"(., .)",.,.,,.


In [30]:
# Count which original tokens produced an empty normalized term.
df_tokens.loc[df_tokens['term_str'].eq(''), 'token_str'].value_counts()

token_str
.    36220
,    25429
?     7883
$     1950
%       21
&        1
Name: count, dtype: int64

In [31]:
# Keep only the tokens whose normalized term is non-empty.
df_tokens = df_tokens.loc[df_tokens['term_str'].ne('')]

In [32]:
# Preview the token table after the empty-term rows were removed.
df_tokens

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos_tuple,pos,token_str,term_str,pos_group
personid,task,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
JMU_F22_002,P1,0,0,"(Okay, NNP)",NNP,Okay,okay,NN
JMU_F22_002,P1,0,2,"(so, RB)",RB,so,so,RB
JMU_F22_002,P1,0,3,"(today, NN)",NN,today,today,NN
JMU_F22_002,P1,0,5,"(we, PRP)",PRP,we,we,PR
JMU_F22_002,P1,0,6,"('re, VBP)",VBP,'re,re,VB
...,...,...,...,...,...,...,...,...
UVA_S23_027,Placement,41,5,"(have, VBP)",VBP,have,have,VB
UVA_S23_027,Placement,41,6,"(been, VBN)",VBN,been,been,VB
UVA_S23_027,Placement,41,7,"(through, IN)",IN,through,through,IN
UVA_S23_027,Placement,41,8,"(it, PRP)",PRP,it,it,PR


### Extract Vocabulary 

In [33]:
# One row per distinct term, with its number of occurrences in column 'n'.
vocab = df_tokens['term_str'].value_counts().to_frame('n').rename_axis('term_str')

# Relative frequency of each term among all tokens.
vocab['p'] = vocab['n'] / vocab['n'].sum()
# Negative base-2 log of the relative frequency.
vocab['i'] = -np.log2(vocab['p'])
# Character length of the term.
vocab['n_chars'] = vocab.index.str.len()

In [34]:
# Preview the vocabulary table (n, p, i, n_chars per term).
vocab

Unnamed: 0_level_0,n,p,i,n_chars
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
to,20721,0.035155,4.830128,2
we,20066,0.034044,4.876469,2
so,19632,0.033307,4.908015,2
the,19499,0.033082,4.917822,3
i,15502,0.026300,5.248767,1
...,...,...,...,...
jr,1,0.000002,19.168934,2
reaches,1,0.000002,19.168934,7
courthouses,1,0.000002,19.168934,11
2946,1,0.000002,19.168934,4


### Max POS 

In [35]:
# Spot-check how a single example term ('love') is distributed over POS groups
# and over full POS tags.
# Only the last expression of a cell is rendered automatically, so the
# POS-group breakdown is shown explicitly with display() (made available by
# IPython in notebook kernels); previously it was computed and discarded.
display(df_tokens[['term_str','pos_group']].value_counts().sort_index().loc['love'])
df_tokens[['term_str','pos']].value_counts().sort_index().loc['love']

pos
IN      3
NN      4
VB     12
VBP    36
VBZ     1
Name: count, dtype: int64

In [36]:
# Term-by-tag count tables (terms as rows, tags as columns, 0 where unseen).
pos_group_counts = df_tokens[['term_str','pos_group']].value_counts().unstack(fill_value=0)
pos_counts = df_tokens[['term_str','pos']].value_counts().unstack(fill_value=0)

# For every term, record the POS group / POS tag with the highest count.
vocab['max_pos_group'] = pos_group_counts.idxmax(axis=1)
vocab['max_pos'] = pos_counts.idxmax(axis=1)

In [37]:
# Number of rows (tokens) in the token table.
len(df_tokens)

589419

### POS ambiguity

In [38]:
# Occurrence count of every (term, POS group) pair.
term_group_counts = df_tokens[['term_str','pos_group']].value_counts()

# Number of distinct POS groups observed for each term.
vocab['n_pos_group'] = term_group_counts.unstack().count(axis=1)

# The set of POS groups observed for each term.
term_group_pairs = term_group_counts.to_frame('n').reset_index()
vocab['cat_pos_group'] = term_group_pairs.groupby('term_str')['pos_group'].apply(lambda groups: set(groups))

In [39]:
# Occurrence count of every (term, POS tag) pair.
term_tag_counts = df_tokens[['term_str','pos']].value_counts()

# Number of distinct POS tags observed for each term.
vocab['n_pos'] = term_tag_counts.unstack().count(axis=1)

# The set of POS tags observed for each term.
term_tag_pairs = term_tag_counts.to_frame('n').reset_index()
vocab['cat_pos'] = term_tag_pairs.groupby('term_str')['pos'].apply(lambda tags: set(tags))

In [40]:
# Show the vocabulary ordered by the number of distinct POS tags per term.
vocab.sort_values(by='n_pos')

Unnamed: 0_level_0,n,p,i,n_chars,max_pos_group,max_pos,n_pos_group,cat_pos_group,n_pos,cat_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
to,20721,0.035155,4.830128,2,TO,TO,1,{TO},1,{TO}
atoms,1,0.000002,19.168934,5,NN,NNS,1,{NN},1,{NNS}
lads,1,0.000002,19.168934,4,NN,NNP,1,{NN},1,{NNP}
storytellers,1,0.000002,19.168934,12,NN,NNS,1,{NN},1,{NNS}
hiding,1,0.000002,19.168934,6,VB,VBG,1,{VB},1,{VBG}
...,...,...,...,...,...,...,...,...,...,...
yes,348,0.000590,10.725991,3,UH,UH,7,"{RP, IN, NN, VB, JJ, UH, RB}",12,"{RP, IN, NN, JJR, NNP, VB, JJ, VBP, UH, VBZ, R..."
oh,402,0.000682,10.517882,2,UH,UH,7,"{RP, IN, NN, VB, JJ, UH, RB}",13,"{RP, IN, NN, NNPS, VBN, NNP, VB, JJ, VBP, UH, ..."
yeah,963,0.001634,9.257542,4,UH,UH,6,"{IN, NN, VB, JJ, UH, RB}",14,"{IN, NN, NNPS, VBN, JJR, NNP, VB, VBD, JJ, VBP..."
okay,3324,0.005639,7.470229,4,NN,NNP,9,"{RP, IN, NN, PR, VB, JJ, MD, UH, RB}",15,"{RP, IN, NN, VBN, MD, JJR, NNP, PRP, VB, JJ, V..."


### Stop Words 

In [41]:
# NLTK's English stop-word list as a lookup frame: one row per stop word,
# indexed by term_str, with a constant flag column stop = 1.
english_stopwords = nltk.corpus.stopwords.words('english')
sw = pd.DataFrame({'stop': 1}, index=pd.Index(english_stopwords, name='term_str'))

In [42]:
# Add the stop-word flag once; the guard keeps this cell safe to re-run.
stop_flag_missing = 'stop' not in vocab.columns
if stop_flag_missing:
    vocab_with_stop = vocab.join(sw)
    # Terms absent from the stop-word list come back as NaN; store them as 0.
    vocab_with_stop['stop'] = vocab_with_stop['stop'].fillna(0).astype('int')
    vocab = vocab_with_stop

### Add Stems 

In [43]:
from nltk.stem.porter import PorterStemmer

# Porter stem of every vocabulary term (the terms are the index of vocab).
stemmer = PorterStemmer()
vocab['porter_stem'] = [stemmer.stem(term) for term in vocab.index]

In [44]:
# Preview the vocabulary table with the stop-word flag and Porter stem added.
vocab

Unnamed: 0_level_0,n,p,i,n_chars,max_pos_group,max_pos,n_pos_group,cat_pos_group,n_pos,cat_pos,stop,porter_stem
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
to,20721,0.035155,4.830128,2,TO,TO,1,{TO},1,{TO},1,to
we,20066,0.034044,4.876469,2,PR,PRP,1,{PR},1,{PRP},1,we
so,19632,0.033307,4.908015,2,RB,RB,4,"{NN, IN, RB, VB}",9,"{IN, NN, NNPS, NNP, VB, VBD, VBP, RB, NNS}",1,so
the,19499,0.033082,4.917822,3,DT,DT,1,{DT},1,{DT},1,the
i,15502,0.026300,5.248767,1,PR,PRP,6,"{IN, NN, PR, VB, JJ, RB}",9,"{IN, NN, PRP, VB, JJ, VBP, RB, VBZ, NNS}",1,i
...,...,...,...,...,...,...,...,...,...,...,...,...
jr,1,0.000002,19.168934,2,NN,NNP,1,{NN},1,{NNP},0,jr
reaches,1,0.000002,19.168934,7,VB,VBZ,1,{VB},1,{VBZ},0,reach
courthouses,1,0.000002,19.168934,11,NN,NNS,1,{NN},1,{NNS},0,courthous
2946,1,0.000002,19.168934,4,CD,CD,1,{CD},1,{CD},0,2946


### Save Data

In [45]:
# Write every table to output_dir as CSV. Each entry is
# (table, destination path, whether to write the index); only the library
# table is written without its index.
csv_exports = (
    (library, f"{output_dir}/library.csv", False),
    (df_sentences, f"{output_dir}/sentences_raw.csv", True),
    (df_tasks, f"{output_dir}/tasks_raw.csv", True),
    (df_tokens, f"{output_dir}/corpus_raw.csv", True),
    (vocab, f"{output_dir}/vocab_raw.csv", True),
)
for table, destination, write_index in csv_exports:
    table.to_csv(destination, index=write_index)