## This notebook creates the derived tables (BOW, DTCM, and add-ons to vocab table)

In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
from scipy.spatial.distance import pdist, squareform

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [2]:
# For ngrams 
import nltk
from nltk.lm import MLE
from nltk.lm import Vocabulary
from nltk.lm import NgramCounter
from nltk.lm.preprocessing import padded_everygram_pipeline
from collections import Counter

### Config

In [3]:
import configparser

# Read project paths from the shared env.ini.
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it did parse, so an empty result means the file was not found.
config = configparser.ConfigParser()
if not config.read("../../env.ini"):
    raise FileNotFoundError(
        "Could not read ../../env.ini (the path is resolved relative to the current working directory)."
    )
data_home  = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']

### Functions 

In [4]:
def create_bow(CORPUS, bag, item_type='term_str'):
    """Count how often each item occurs inside each bag.

    CORPUS    : token table; every name in `bag` and `item_type` must be usable
                as a groupby key on it.
    bag       : list of key names that together identify one bag (document).
    item_type : name of the column holding the items to count.

    Returns a one-column DataFrame (`n`) indexed by the bag keys plus `item_type`.
    """
    group_keys = bag + [item_type]
    counts = CORPUS.groupby(group_keys)[item_type].count()
    return counts.to_frame('n')

In [5]:
def get_tfidf(BOW, tf_method='max', df_method='standard', item_type='term_str'):
    """Compute TF-IDF and DF-IDF from a bag-of-words table.

    Parameters
    ----------
    BOW : DataFrame
        Has a count column `n` and a MultiIndex whose innermost level is the
        term; the remaining levels identify the document.
    tf_method : {'max', 'sum', 'log', 'raw', 'bool'}
        'sum'  - count divided by the document's total count
        'max'  - count divided by the document's largest count
        'log'  - log2(count + 1)
        'raw'  - the count itself
        'bool' - 1 if the term occurs in the document, else 0
    df_method : {'standard', 'textbook', 'sklearn', 'sklearn_smooth'}
        'standard'       - log2(N / DF)
        'textbook'       - log2(N / (DF + 1))
        'sklearn'        - log2(N / DF) + 1
        'sklearn_smooth' - log2((N + 1) / (DF + 1)) + 1
    item_type : str
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    (TFIDF, DFIDF)
        TFIDF is a document-by-term DataFrame with 0 where a term is absent.
        DFIDF is a Series indexed by term holding DF * IDF.

    Raises
    ------
    ValueError
        If `tf_method` or `df_method` is not one of the names above.
    """
    # Doc-term count matrix: the innermost index level becomes the columns.
    # Terms a document never uses are NaN here, which the DF count relies on.
    DTCM = BOW.n.unstack()

    if tf_method == 'sum':
        TF = DTCM.div(DTCM.sum(axis=1), axis=0)
    elif tf_method == 'max':
        TF = DTCM.div(DTCM.max(axis=1), axis=0)
    elif tf_method == 'log':
        TF = np.log2(DTCM + 1)
    elif tf_method == 'raw':
        TF = DTCM
    elif tf_method == 'bool':
        # Casting NaN to bool yields True, which would mark every term as
        # present in every document; treat missing cells as absent instead.
        TF = (DTCM.fillna(0) != 0).astype('int')
    else:
        raise ValueError(f"TF method {tf_method} not found.")

    DF = DTCM.count()  # Non-null cells per term; assumes absent terms are NULL, not 0
    N_docs = len(DTCM)

    if df_method == 'standard':
        IDF = np.log2(N_docs/DF)  # This is what the students were asked to use
    elif df_method == 'textbook':
        IDF = np.log2(N_docs/(DF + 1))
    elif df_method == 'sklearn':
        IDF = np.log2(N_docs/DF) + 1
    elif df_method == 'sklearn_smooth':
        IDF = np.log2((N_docs + 1)/(DF + 1)) + 1
    else:
        raise ValueError(f"DF method {df_method} not found.")

    # IDF is indexed by term, so the product broadcasts across document rows.
    TFIDF = (TF * IDF).fillna(0)
    DFIDF = DF * IDF

    return TFIDF, DFIDF

### Data 

In [6]:
# Index hierarchy for the corpus, ordered from the coarsest level to the finest.
OHCO = ['personid', 'session', 'task', 'sent_num', 'token_num']

# Bag definitions: each name holds the OHCO prefix that ends at that level.
personid, session, task, sent_num = (OHCO[:depth] for depth in range(1, 5))

# Vocabulary reduction settings
vocab_filter = 'dfidf'
n_terms = 1000
# Open categories with no proper nouns
pos_list = "NN NNS VB VBD VBG VBN VBP VBZ JJ JJR JJS RB RBR RBS".split()

In [7]:
# Token-level corpus; the OHCO columns are still ordinary columns at this point.
corpus = pd.read_csv(f"{output_dir}/corpus_raw.csv")

# Vocabulary table, keyed by term.
vocab = pd.read_csv(f"{output_dir}/vocab_raw.csv")
vocab = vocab.set_index('term_str')

In [8]:
# Create a time-grouping variable for when we reduce the TFIDF table.
# Tasks that match none of the conditions get NaN; because of that NaN default
# the column is float (0.0 / 1.0 / 2.0) rather than integer.
conditions = [
    corpus['task'].isin(['P1', 'P2', 'P3']),  # Task is P1, P2, or P3
    corpus['task'].isin(['P4', 'P5', 'P6']),  # Task is P4, P5, or P6
    corpus['task'] == 'Placement'             # Task is Placement
]

choices = [
    0,  # Value for tasks P1, P2, P3
    1,  # Value for tasks P4, P5, P6
    2   # Value for Placement
]

# Apply the conditions and choices to create a new column
corpus['session'] = np.select(conditions, choices, default=np.nan)

In [9]:
# Move the OHCO columns into the row MultiIndex.
# Not re-runnable on its own: a second run fails because those columns are gone.
corpus = corpus.set_index(OHCO)

In [10]:
corpus

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,pos_group
personid,session,task,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
JMU_F22_002,0.0,P1,0,0,"('Okay', 'NNP')",NNP,Okay,okay,NN
JMU_F22_002,0.0,P1,0,2,"('so', 'RB')",RB,so,so,RB
JMU_F22_002,0.0,P1,0,3,"('today', 'NN')",NN,today,today,NN
JMU_F22_002,0.0,P1,0,5,"('we', 'PRP')",PRP,we,we,PR
JMU_F22_002,0.0,P1,0,6,"(""'re"", 'VBP')",VBP,'re,re,VB
...,...,...,...,...,...,...,...,...,...
UVA_S23_027,2.0,Placement,41,5,"('have', 'VBP')",VBP,have,have,VB
UVA_S23_027,2.0,Placement,41,6,"('been', 'VBN')",VBN,been,been,VB
UVA_S23_027,2.0,Placement,41,7,"('through', 'IN')",IN,through,through,IN
UVA_S23_027,2.0,Placement,41,8,"('it', 'PRP')",PRP,it,it,PR


In [11]:
# Keep only tokens whose term is longer than one character.
# Rows with a missing term are dropped too, since the comparison is False for NaN.
term_lengths = corpus['term_str'].str.len()
corpus = corpus[term_lengths > 1]

In [12]:
# Remove the nouns that come from the word prompts.
# The phrase list is lowercased and split on whitespace, so multi-word prompt
# items (e.g. "Smart Speaker") end up as separate single-word entries.
nouns_to_remove = (
    "Ava Mom Years grandma cafeteria CeCe cookies sugar cup cookie dough recipe MP3 players "
    "Price Electronics CD Player MP3 player Smart Speaker Radio playground Diego cousin pie "
    "Mrs. Mrs Molloy STEM club coding robots Ring lights Robots Craft Kits Document Cameras students classrooms"
).lower().split()

In [13]:
# Drop every token whose term is one of the prompt nouns.
is_prompt_noun = corpus['term_str'].isin(nouns_to_remove)
corpus = corpus[~is_prompt_noun]

In [14]:
#corpus_raw

In [15]:
#vocab_raw

### BOW

In [16]:
# One bag per (personid, session, task): `task` holds the first three OHCO levels.
bow_tasks = create_bow(corpus, bag=task)

In [17]:
bow_tasks

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,n
personid,session,task,term_str,Unnamed: 4_level_1
JMU_F22_002,0.0,P1,10,14
JMU_F22_002,0.0,P1,1020,2
JMU_F22_002,0.0,P1,11,2
JMU_F22_002,0.0,P1,20,10
JMU_F22_002,0.0,P1,2122,2
...,...,...,...,...
UVA_S23_027,2.0,Placement,yeah,4
UVA_S23_027,2.0,Placement,you,42
UVA_S23_027,2.0,Placement,your,2
UVA_S23_027,2.0,Placement,yourself,1


### DTCM 

In [18]:
# Doc-Term Count Matrix: one row per bag, one column per term,
# with 0 (rather than NaN) where a term does not occur in a bag.
dtcm_tasks = bow_tasks['n'].unstack(fill_value=0)
dtcm_tasks

Unnamed: 0_level_0,Unnamed: 1_level_0,term_str,00,000,00000,001,004,008,0081,01,010,02,...,zeroes,zeros,zoe,zoey,zone,zones,zoo,zoom,zowie,zwiers
personid,session,task,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
JMU_F22_002,0.0,P1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
JMU_F22_002,0.0,P2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
JMU_F22_002,0.0,P3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
JMU_F22_002,1.0,P4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
JMU_F22_002,1.0,P5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UVA_S23_027,0.0,P3,0,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
UVA_S23_027,1.0,P4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
UVA_S23_027,1.0,P5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
UVA_S23_027,1.0,P6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### TFIDF

In [19]:
# Task-level TFIDF (max-normalised TF, log2(N/DF) IDF) plus the per-term DFIDF.
tfidf_tasks, dfidf_tasks = get_tfidf(bow_tasks, tf_method='max', df_method='standard')

In [20]:
tfidf_tasks

Unnamed: 0_level_0,Unnamed: 1_level_0,term_str,00,000,00000,001,004,008,0081,01,010,02,...,zeroes,zeros,zoe,zoey,zone,zones,zoo,zoom,zowie,zwiers
personid,session,task,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
JMU_F22_002,0.0,P1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
JMU_F22_002,0.0,P2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
JMU_F22_002,0.0,P3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
JMU_F22_002,1.0,P4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
JMU_F22_002,1.0,P5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UVA_S23_027,0.0,P3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.196358,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
UVA_S23_027,1.0,P4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
UVA_S23_027,1.0,P5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
UVA_S23_027,1.0,P6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


### Add DFIDF to vocab table 

In [21]:
# Assignment aligns on the term_str index; vocab terms missing from dfidf_tasks get NaN.
vocab['dfidf'] = dfidf_tasks

In [22]:
# Column-wise mean, i.e. each term's average TFIDF over all rows of tfidf_tasks
# (rows where the term is absent contribute 0); aligned to vocab by term.
vocab['mean_tfidf'] = tfidf_tasks.mean()

## Add TFIDF to BOW table 

In [23]:
# stack() reshapes the wide matrix back to long form; the assignment aligns on
# bow_tasks' index, so only (bag, term) pairs already present in the BOW receive a value.
bow_tasks['tfidf'] = tfidf_tasks.stack()

In [24]:
bow_tasks

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,n,tfidf
personid,session,task,term_str,Unnamed: 4_level_1,Unnamed: 5_level_1
JMU_F22_002,0.0,P1,10,14,0.218545
JMU_F22_002,0.0,P1,1020,2,0.129856
JMU_F22_002,0.0,P1,11,2,0.079636
JMU_F22_002,0.0,P1,20,10,0.259616
JMU_F22_002,0.0,P1,2122,2,0.129856
...,...,...,...,...,...
UVA_S23_027,2.0,Placement,yeah,4,0.227714
UVA_S23_027,2.0,Placement,you,42,0.384076
UVA_S23_027,2.0,Placement,your,2,0.058146
UVA_S23_027,2.0,Placement,yourself,1,0.102002


### TFIDF with L2 normalization

In [25]:
vocab.dfidf.sort_values(ascending=False).head(20) 

term_str
some       579.022457
put        579.022457
says       579.014685
looking    578.961985
solve      578.956098
them       578.956098
four       578.908604
total      578.876413
10         578.833162
than       578.833162
take       578.833162
five       578.543874
way        578.483531
an         578.419745
has        578.419745
plus       578.130331
trying     578.119531
she        578.034940
into       577.946532
could      577.946532
Name: dfidf, dtype: float64

In [26]:
# Reduced vocabulary: among terms whose max_pos is in pos_list, take the
# n_terms terms with the highest value in the vocab_filter column.
has_allowed_pos = vocab['max_pos'].isin(pos_list)
VIDX = (
    vocab.loc[has_allowed_pos]
    .sort_values(vocab_filter, ascending=False)
    .head(n_terms)
    .index
)

In [27]:
tfidf_tasks

Unnamed: 0_level_0,Unnamed: 1_level_0,term_str,00,000,00000,001,004,008,0081,01,010,02,...,zeroes,zeros,zoe,zoey,zone,zones,zoo,zoom,zowie,zwiers
personid,session,task,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
JMU_F22_002,0.0,P1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
JMU_F22_002,0.0,P2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
JMU_F22_002,0.0,P3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
JMU_F22_002,1.0,P4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
JMU_F22_002,1.0,P5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UVA_S23_027,0.0,P3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.196358,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
UVA_S23_027,1.0,P4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
UVA_S23_027,1.0,P5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
UVA_S23_027,1.0,P6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [28]:
# Average the task-level TFIDF rows within each (personid, session).
# Group by index *level*: the columns here are terms, so grouping by bare name
# would raise an ambiguity error if a selected term were called "personid" or "session".
M = tfidf_tasks[VIDX].fillna(0).groupby(level=['personid', 'session']).mean() # MUST FILLNA

In [29]:
M

Unnamed: 0_level_0,term_str,put,says,looking,solve,total,take,way,has,trying,little,...,others,symbol,meet,bat,tackle,fours,task,activity,ago,demonstrate
personid,session,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
JMU_F22_002,0.0,0.054181,0.042653,0.026125,0.000000,0.033948,0.010407,0.045062,0.015830,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
JMU_F22_002,1.0,0.012572,0.188128,0.022208,0.000000,0.000000,0.000000,0.051438,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
JMU_F22_005,0.0,0.038219,0.041330,0.172445,0.059627,0.090356,0.037855,0.127360,0.027926,0.072456,0.044204,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
JMU_F22_005,1.0,0.040987,0.042789,0.028740,0.110006,0.077317,0.000000,0.029691,0.009767,0.143276,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
JMU_F22_009,0.0,0.126461,0.080373,0.043110,0.000000,0.043433,0.139600,0.000000,0.040506,0.044864,0.022601,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UVA_S23_026,1.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
UVA_S23_026,2.0,0.010386,0.021084,0.021242,0.010283,0.053505,0.050903,0.020009,0.049898,0.011053,0.055683,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
UVA_S23_027,0.0,0.000000,0.000000,0.056107,0.031245,0.041020,0.000000,0.019175,0.051918,0.030561,0.049086,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
UVA_S23_027,1.0,0.026541,0.000000,0.000000,0.000000,0.042804,0.000000,0.035572,0.039919,0.050355,0.022273,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Scale every row of M to unit Euclidean length.
# An all-zero row has norm 0; dividing it by 1 instead keeps it as zeros
# rather than turning it into NaN through 0/0.
row_norms = pd.Series(norm(M.to_numpy(), axis=1), index=M.index)
tfidf_tasks_L2 = M.div(row_norms.where(row_norms > 0, 1.0), axis=0)

In [31]:
tfidf_tasks_L2

Unnamed: 0_level_0,term_str,put,says,looking,solve,total,take,way,has,trying,little,...,others,symbol,meet,bat,tackle,fours,task,activity,ago,demonstrate
personid,session,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
JMU_F22_002,0.0,0.052812,0.041575,0.025465,0.000000,0.033090,0.010144,0.043924,0.015430,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
JMU_F22_002,1.0,0.011206,0.167684,0.019795,0.000000,0.000000,0.000000,0.045848,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
JMU_F22_005,0.0,0.039944,0.043195,0.180225,0.062317,0.094433,0.039563,0.133106,0.029186,0.075725,0.046198,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
JMU_F22_005,1.0,0.037452,0.039099,0.026261,0.100520,0.070650,0.000000,0.027131,0.008925,0.130921,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
JMU_F22_009,0.0,0.105650,0.067147,0.036015,0.000000,0.036286,0.116627,0.000000,0.033840,0.037481,0.018881,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UVA_S23_026,1.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
UVA_S23_026,2.0,0.007951,0.016142,0.016263,0.007873,0.040964,0.038972,0.015319,0.038203,0.008463,0.042631,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
UVA_S23_027,0.0,0.000000,0.000000,0.043658,0.024313,0.031919,0.000000,0.014921,0.040399,0.023780,0.038195,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
UVA_S23_027,1.0,0.020625,0.000000,0.000000,0.000000,0.033263,0.000000,0.027643,0.031021,0.039131,0.017309,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## NGRAMS 

In [32]:
#ngram_order = 3 # 3 means trigrams
#pads = ["<s>", "</s>"]
#widx = [f"w{i}" for i in range(ngram_order)]
#ohco = ['sent_num', 'token_num']

In [33]:
#train_df = corpus
#train_ohco = OHCO

In [34]:
#train_tokens = train_df.groupby(train_ohco[:4]).term_str.apply(list).values.tolist()

In [35]:
#ngram_args = dict(pad_right=True, pad_left=True, left_pad_symbol=pads[0], right_pad_symbol=pads[1])
#train_ngrams = [[] for i in range(ngram_order)]
#for j in range(ngram_order):
#    train_ngrams[j] = [nltk.ngrams(sent, n=j+1, **ngram_args) for sent in train_tokens]
#
#ng_cols = ['sent_num', 'token_num', 'word_pos', 'token']
#ng_data = [[] for n in range(ngram_order)]
#ng_df = [None for n in range(ngram_order)]
#for n in range(ngram_order):
#    for i, z in enumerate(train_ngrams[n]):
#        for j, x in enumerate(list(z)):
#            for k, token in enumerate(list(x)):
#                ng_data[n].append((i, j, f"w{k}", token))
#    ng_df[n] = pd.DataFrame(ng_data[n], columns=ng_cols).set_index(ng_cols[:-1]).unstack()
#    ng_df[n].columns = ng_df[n].columns.droplevel(0)

In [36]:
#ng_df[2].loc[5].tail(50)

In [37]:
#ng_counts = []
#K = .01
#for n in range(ngram_order):
#    ng_counts.append(ng_df[n].value_counts().to_frame('n'))
#    if n == 0:
#        V = len(ng_counts[0])
#    ng_counts[n]['mle'] = ng_counts[n].n / ng_counts[n].n.sum()
#    ng_counts[n]['p'] = (ng_counts[n].n + K) / (ng_counts[n].n.sum() + K)
#    if n > 0:
#        ng_counts[n]['cp'] = (ng_counts[n].n + K) / (ng_counts[n-1].n + V * k)
#        ng_counts[n]['ci'] = -np.log2(ng_counts[n].cp)
#    ng_counts[n] = ng_counts[n].sort_index()

In [38]:
#ng_counts

In [39]:
#ng_counts[2].head(50)

### Save Data 

In [40]:
# Write every derived table to output_dir, keeping the index as leading column(s).
tables_to_save = [
    (vocab, "vocab.csv"),
    (corpus, "corpus.csv"),
    (bow_tasks, "bow.csv"),
    (dtcm_tasks, "dtcm.csv"),
    (tfidf_tasks, "tfidf.csv"),
    (tfidf_tasks_L2, "tfidf_reduced_L2.csv"),
]
for table, filename in tables_to_save:
    table.to_csv(f"{output_dir}/{filename}", index=True)