## This notebook creates library, corpus, and vocabulary tables

In [342]:
import pandas as pd
import nltk
import re
import numpy as np

### Config 

In [343]:
import configparser
# Load machine-specific paths from the shared ini file.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check it here; otherwise a missing file only
# surfaces later as an opaque KeyError on 'data_home'.
if not config.read("../../env.ini"):
    raise FileNotFoundError("Could not read config file '../../env.ini'")
data_home  = config['DEFAULT']['data_home']    # directory holding the input .dta file
output_dir = config['DEFAULT']['output_dir']   # directory the CSV tables are written to

### Read in Data 

In [344]:
# Load the transcript table from Stata format.
# NOTE(review): the original comment trailed off ("Text Data was cleaned using ...");
# record the upstream cleaning step here once confirmed.
df = pd.read_stata(f"{data_home}/metamodeltranscripttextdata.dta")
df['text'] = (
    df['text']
    # Drop everything from "SUMMARY KEYWORDS" up to the first MM:SS stamp, keeping the stamp.
    .str.replace(r"SUMMARY KEYWORDS.*?([0-9]{2}:[0-9]{2})", r"\1", regex=True)
    # Replace MM:SS stamps (leading, or surrounded by whitespace) with a single space.
    .str.replace(r'^\d{2}:\d{2}\s+|\s+\d{2}:\d{2}\s+', " ", regex=True)
)

In [345]:
# Index levels, coarsest to finest: person -> task -> sentence -> token.
OHCO = ['personid', 'task', 'sent_num', 'token_num']

# Library table: one metadata row per transcript.
library = df[['id','metadata','site','course','person','personid','task']]

# Text table keyed by the first two OHCO levels (personid, task).
df_tasks = df[['personid','task','text']].set_index(OHCO[:2])

In [346]:
# Preview the transcript text table, indexed by (personid, task).
df_tasks

Unnamed: 0_level_0,Unnamed: 1_level_0,text
personid,task,Unnamed: 2_level_1
5,P1,"Okay, so today, we're going to do a word prob..."
5,P2,"Okay, guys, so today we're gonna do a word pr..."
5,P3,"Okay, old problems. So, today we are going to..."
5,P4,"Okay, so today we're gonna do a word problem...."
5,P5,"Okay, guys, so today we're gonna do a problem..."
...,...,...
96,Placement,So now we will start a simple maths lesson an...
100,Placement,Today we're going to work on a fraction word ...
103,Placement,"So, you might notice, I am filming you guys. ..."
106,Placement,"Okay, all right. I started my recording. Alri..."


### Chunk to Sentences

In [347]:
#df_tasks

In [348]:
# Split each transcript into sentences; stacking turns the per-document
# sentence columns into a third index level (sent_num).
df_sentences = (
    df_tasks['text']
    .apply(lambda doc: pd.Series(nltk.sent_tokenize(doc)))
    .stack()
    .to_frame('sent_str')
    .rename_axis(OHCO[:3])
)

In [349]:
#df_sentences 

### Tokenize Sentences 

In [350]:
# Choose the tokenizer, then run one shared tokenize -> POS-tag pipeline.
# NOTE(review): despite its name, keep_whitespace=True selects word_tokenize and
# False selects WhitespaceTokenizer — confirm the flag name reflects the intent.
keep_whitespace = True
if keep_whitespace:
    # NLTK's recommended word tokenizer; punctuation and clitics such as "'re"
    # come out as separate tokens.
    _tokenize = nltk.word_tokenize
else:
    # Split on whitespace only (space, tab, newline). Returns fewer tokens.
    _tokenize = nltk.WhitespaceTokenizer().tokenize

# One row per token holding a (token, POS tag) tuple; stacking adds the
# token position as a further index level. The previous else-branch read from
# an undefined name (SENTS); both paths now read from df_sentences.
df_tokens = (
    df_sentences['sent_str']
    .apply(lambda sent: pd.Series(nltk.pos_tag(_tokenize(sent))))
    .stack()
    .to_frame('pos_tuple')
)

In [351]:
# Label all four index levels of the token table with the OHCO names.
df_tokens = df_tokens.rename_axis(index=OHCO[:4])

In [352]:
# Unpack the (token, tag) tuples and derive normalized columns.
df_tokens = df_tokens.assign(
    # Full POS tag: second element of the tuple.
    pos=lambda t: t['pos_tuple'].map(lambda pair: pair[1]),
    # Surface token: first element of the tuple.
    token_str=lambda t: t['pos_tuple'].map(lambda pair: pair[0]),
    # Normalized term: lower-cased with all non-word characters removed.
    term_str=lambda t: t['token_str'].str.lower().str.replace(r"\W+", "", regex=True),
    # Coarse POS class: first two characters of the tag.
    pos_group=lambda t: t['pos'].str[:2],
)

In [353]:
# Preview the token table with its POS tags and normalized terms.
df_tokens

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos_tuple,pos,token_str,term_str,pos_group
personid,task,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5,P1,0,0,"(Okay, NNP)",NNP,Okay,okay,NN
5,P1,0,1,"(,, ,)",",",",",,","
5,P1,0,2,"(so, RB)",RB,so,so,RB
5,P1,0,3,"(today, NN)",NN,today,today,NN
5,P1,0,4,"(,, ,)",",",",",,","
...,...,...,...,...,...,...,...,...
110,Placement,41,5,"(have, VBP)",VBP,have,have,VB
110,Placement,41,6,"(been, VBN)",VBN,been,been,VB
110,Placement,41,7,"(through, IN)",IN,through,through,IN
110,Placement,41,8,"(it, PRP)",PRP,it,it,PR


### Handle Anomalies

In [354]:
# Rows whose normalized term is empty, i.e. the token held only non-word characters.
df_tokens[df_tokens.term_str == '']

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos_tuple,pos,token_str,term_str,pos_group
personid,task,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5,P1,0,1,"(,, ,)",",",",",,","
5,P1,0,4,"(,, ,)",",",",",,","
5,P1,0,13,"(., .)",.,.,,.
5,P1,1,13,"(,, ,)",",",",",,","
5,P1,1,30,"(., .)",.,.,,.
...,...,...,...,...,...,...,...,...
110,Placement,37,14,"(., .)",.,.,,.
110,Placement,38,3,"($, $)",$,$,,$
110,Placement,38,20,"(., .)",.,.,,.
110,Placement,39,20,"(., .)",.,.,,.


In [355]:
# Tally which raw tokens produced an empty term.
df_tokens[df_tokens.term_str == ''].token_str.value_counts()

token_str
.    36220
,    25429
?     7883
$     1950
%       21
&        1
Name: count, dtype: int64

In [356]:
# Keep only tokens that still have a non-empty normalized term.
df_tokens = df_tokens.loc[df_tokens['term_str'].ne('')]

In [357]:
# Preview the token table after the empty-term rows were removed.
df_tokens

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos_tuple,pos,token_str,term_str,pos_group
personid,task,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5,P1,0,0,"(Okay, NNP)",NNP,Okay,okay,NN
5,P1,0,2,"(so, RB)",RB,so,so,RB
5,P1,0,3,"(today, NN)",NN,today,today,NN
5,P1,0,5,"(we, PRP)",PRP,we,we,PR
5,P1,0,6,"('re, VBP)",VBP,'re,re,VB
...,...,...,...,...,...,...,...,...
110,Placement,41,5,"(have, VBP)",VBP,have,have,VB
110,Placement,41,6,"(been, VBN)",VBN,been,been,VB
110,Placement,41,7,"(through, IN)",IN,through,through,IN
110,Placement,41,8,"(it, PRP)",PRP,it,it,PR


### Extract Vocabulary 

In [358]:
# Vocabulary table: one row per distinct term.
vocab = (
    df_tokens['term_str']
    .value_counts()
    .to_frame('n')                                  # n: raw count
    .rename_axis('term_str')
    .assign(
        p=lambda v: v['n'] / v['n'].sum(),          # p: relative frequency
        i=lambda v: -np.log2(v['p']),               # i: -log2(p)
        n_chars=lambda v: v.index.str.len(),        # n_chars: term length in characters
    )
)

In [359]:
# Preview the vocabulary (n = count, p = relative frequency, i = -log2(p)).
vocab

Unnamed: 0_level_0,n,p,i,n_chars
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
to,20721,0.035155,4.830128,2
we,20066,0.034044,4.876469,2
so,19632,0.033307,4.908015,2
the,19499,0.033082,4.917822,3
i,15502,0.026300,5.248767,1
...,...,...,...,...
jr,1,0.000002,19.168934,2
reaches,1,0.000002,19.168934,7
courthouses,1,0.000002,19.168934,11
2946,1,0.000002,19.168934,4


### Max POS 

In [360]:
# Spot-check the POS distribution for one term ('love').
# Only the last expression of a cell is rendered, so the coarse-group breakdown
# was previously computed and thrown away; display() (a notebook built-in) shows it.
display(df_tokens[['term_str','pos_group']].value_counts().sort_index().loc['love'])
# Breakdown by full POS tag (rendered as the cell's result).
df_tokens[['term_str','pos']].value_counts().sort_index().loc['love']

pos
IN      3
NN      4
VB     12
VBP    36
VBZ     1
Name: count, dtype: int64

In [361]:
# Most frequent coarse POS group per term: build a term x group count matrix
# and take the column with the largest count in each row.
vocab['max_pos_group'] = (
    df_tokens[['term_str', 'pos_group']]
    .value_counts()
    .unstack(fill_value=0)
    .idxmax(axis=1)
)
# Same for the full POS tag.
vocab['max_pos'] = (
    df_tokens[['term_str', 'pos']]
    .value_counts()
    .unstack(fill_value=0)
    .idxmax(axis=1)
)

In [362]:
# Total number of token rows in the corpus table.
df_tokens.shape[0]

589419

### POS ambiguity

In [363]:
# Term x POS count matrices, left unfilled so never-observed combinations stay NaN.
TPM1 = (
    df_tokens[['term_str', 'pos_group']]
    .value_counts()
    .unstack()
)
# Number of distinct coarse POS groups seen per term (non-NaN cells in the row).
vocab['n_pos_group'] = TPM1.count(axis=1)

TPM2 = (
    df_tokens[['term_str', 'pos']]
    .value_counts()
    .unstack()
)
# Number of distinct full POS tags seen per term.
vocab['n_pos'] = TPM2.count(axis=1)

In [364]:
# Order terms by how many distinct POS tags they received (ascending).
vocab.sort_values('n_pos')

Unnamed: 0_level_0,n,p,i,n_chars,max_pos_group,max_pos,n_pos_group,n_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
to,20721,0.035155,4.830128,2,TO,TO,1,1
atoms,1,0.000002,19.168934,5,NN,NNS,1,1
lads,1,0.000002,19.168934,4,NN,NNP,1,1
storytellers,1,0.000002,19.168934,12,NN,NNS,1,1
hiding,1,0.000002,19.168934,6,VB,VBG,1,1
...,...,...,...,...,...,...,...,...
yes,348,0.000590,10.725991,3,UH,UH,7,12
oh,402,0.000682,10.517882,2,UH,UH,7,13
yeah,963,0.001634,9.257542,4,UH,UH,6,14
okay,3324,0.005639,7.470229,4,NN,NNP,9,15


### Stop Words 

In [365]:
# Stop-word lookup: NLTK's English list as the index, with a constant flag column.
sw = pd.DataFrame(
    {'stop': 1},
    index=pd.Index(nltk.corpus.stopwords.words('english'), name='term_str'),
)

In [366]:
# Attach the stop-word flag once; the guard keeps a re-run of this cell from
# attempting to join a second 'stop' column.
if 'stop' not in vocab.columns:
    vocab = (
        vocab
        .join(sw)
        .fillna({'stop': 0})          # terms absent from the stop list get 0
        .astype({'stop': 'int'})
    )

### Add Stems 

In [370]:
from nltk.stem.porter import PorterStemmer
# Porter-stem every vocabulary term; the terms are the index of vocab.
stemmer = PorterStemmer()
vocab['porter_stem'] = [stemmer.stem(term) for term in vocab.index]

In [371]:
# Preview the finished vocabulary table.
# NOTE(review): the stored output lists a 'p_stem' column that no visible cell
# creates — likely leftover kernel state; confirm with Restart Kernel -> Run All.
vocab

Unnamed: 0_level_0,n,p,i,n_chars,max_pos_group,max_pos,n_pos_group,n_pos,stop,p_stem,porter_stem
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
to,20721,0.035155,4.830128,2,TO,TO,1,1,1,to,to
we,20066,0.034044,4.876469,2,PR,PRP,1,1,1,we,we
so,19632,0.033307,4.908015,2,RB,RB,4,9,1,so,so
the,19499,0.033082,4.917822,3,DT,DT,1,1,1,the,the
i,15502,0.026300,5.248767,1,PR,PRP,6,9,1,i,i
...,...,...,...,...,...,...,...,...,...,...,...
jr,1,0.000002,19.168934,2,NN,NNP,1,1,0,jr,jr
reaches,1,0.000002,19.168934,7,VB,VBZ,1,1,0,reach,reach
courthouses,1,0.000002,19.168934,11,NN,NNS,1,1,0,courthous,courthous
2946,1,0.000002,19.168934,4,CD,CD,1,1,0,2946,2946


### Save Data

In [369]:
# Write the three tables as CSV; to_csv includes the index by default, so each
# file keeps its index as the leading column(s).
library.to_csv(f"{output_dir}/library.csv")
df_tokens.to_csv(f"{output_dir}/corpus_raw.csv")
vocab.to_csv(f"{output_dir}/vocab_raw.csv")