In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load rating.txt; header=None means the first line is treated as data,
# so pandas labels the columns with default integers.
table = pd.read_csv(
    'rating.txt',
    header=None,
)

In [3]:
# Show the column labels pandas assigned on load (the file was read with header=None).
table.columns

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')

In [4]:
# Give the ten unlabeled columns descriptive names, in file order.
# NOTE(review): the meaning of each field is inferred from its name only --
# confirm against whatever wrote rating.txt.
table.columns = [
    'token_scheme',
    'n',
    'all_until_n',
    'max_vocab_size',
    'emb_dim',
    'optim',
    'lr_schedule',
    'train_acc',
    'val_acc',
    'test_acc',
]

In [5]:
# Five rows with the highest val_acc, best first.
# Sort on the column label instead of the positional lookup table.columns[8]:
# the label is self-describing and keeps working if the column order changes.
# 'all_until_n' is not part of the displayed columns in this cell.
(
    table
    .sort_values(by='val_acc', ascending=False)
    .head(5)
    .loc[:, ['token_scheme', 'n', 'max_vocab_size', 'emb_dim', 'optim',
             'lr_schedule', 'train_acc', 'val_acc', 'test_acc']]
)

Unnamed: 0,token_scheme,n,max_vocab_size,emb_dim,optim,lr_schedule,train_acc,val_acc,test_acc
197,en_core_web_sm,1,25000,10,adam,linear_annealing,99.995,86.38,81.948
196,en_core_web_sm,1,25000,10,adam,constant,100.0,85.76,82.236
201,en_core_web_sm,1,25000,100,adam,linear_annealing,100.0,85.66,81.604
200,en_core_web_sm,1,25000,100,adam,constant,100.0,85.46,81.708
204,en_core_web_sm,1,25000,150,adam,constant,100.0,85.22,81.412


In [19]:
# Five rows with the lowest val_acc. The frame is sorted in descending order
# and the tail is taken, so the worst row appears last.
# Sort on the column label instead of the positional lookup table.columns[8]:
# the label is self-describing and keeps working if the column order changes.
(
    table
    .sort_values(by='val_acc', ascending=False)
    .tail(5)
    .loc[:, ['token_scheme', 'n', 'all_until_n', 'max_vocab_size', 'emb_dim',
             'optim', 'lr_schedule', 'train_acc', 'val_acc', 'test_acc']]
)

Unnamed: 0,token_scheme,n,all_until_n,max_vocab_size,emb_dim,optim,lr_schedule,train_acc,val_acc,test_acc
819,en_core_web_sm,4,False,5000,100,sgd,linear_annealing,20.505,19.96,20.084
843,en_core_web_sm,4,False,10000,10,sgd,linear_annealing,20.5,19.96,20.056
822,en_core_web_sm,4,False,5000,150,sgd,constant,18.69,19.92,20.032
890,en_core_web_sm,4,False,25000,300,sgd,constant,11.32,10.84,10.288
887,en_core_web_sm,4,False,25000,250,sgd,linear_annealing,10.27,9.88,9.376


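### Tokenization and N-Grams

Hold the optimizer (adam), learning-rate schedule (linear annealing), embedding dimension (100) and maximum vocabulary size (25000) fixed, and compare tokenization scheme and n-gram settings by validation accuracy.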

In [7]:
# Keep only the runs that share one optimizer / schedule / embedding size /
# vocabulary size, so the remaining rows differ in tokenization and n-gram settings.
df = table.loc[
    table['optim'].eq('adam')
    & table['lr_schedule'].eq('linear_annealing')
    & table['emb_dim'].eq(100)
    & table['max_vocab_size'].eq(25000)
]

In [8]:
# Five best rows of the filtered frame by validation accuracy, best first.
(
    df.loc[:, ['token_scheme', 'n', 'all_until_n', 'train_acc', 'val_acc']]
    .sort_values(by='val_acc', ascending=False)
    .head(5)
)

Unnamed: 0,token_scheme,n,all_until_n,train_acc,val_acc
201,en_core_web_sm,1,False,100.0,85.66
313,en_core_web_sm,2,True,100.0,83.78
425,en_core_web_sm,2,False,100.0,83.34
537,en_core_web_sm,3,True,99.76,83.22
761,en_core_web_sm,4,True,99.885,83.1


In [9]:
# Five worst rows of the filtered frame by validation accuracy, worst first.
(
    df.loc[:, ['token_scheme', 'n', 'all_until_n', 'train_acc', 'val_acc']]
    .sort_values(by='val_acc', ascending=True)
    .head(5)
)

Unnamed: 0,token_scheme,n,all_until_n,train_acc,val_acc
873,en_core_web_sm,4,False,98.905,27.56
89,en_core_web_sm,1,True,99.99,37.18
649,en_core_web_sm,3,False,99.995,77.08
761,en_core_web_sm,4,True,99.885,83.1
537,en_core_web_sm,3,True,99.76,83.22


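### Embedding Dimension

Hold the optimizer (adam), learning-rate schedule (linear annealing), n-gram size (n = 1, `all_until_n` False), tokenizer (`en_core_web_sm`) and maximum vocabulary size (25000) fixed, and vary only `emb_dim`.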

In [10]:
# Pin every setting except emb_dim so the remaining rows differ only in
# embedding dimension.
df = table.loc[
    table['optim'].eq('adam')
    & table['lr_schedule'].eq('linear_annealing')
    & table['n'].eq(1)
    & table['max_vocab_size'].eq(25000)
    & table['token_scheme'].eq('en_core_web_sm')
    & table['all_until_n'].eq(False)
]

In [11]:
# All filtered rows, ranked by validation accuracy (best first).
(
    df.loc[:, ['emb_dim', 'train_acc', 'val_acc']]
    .sort_values(by='val_acc', ascending=False)
)

Unnamed: 0,emb_dim,train_acc,val_acc
197,10,99.995,86.38
201,100,100.0,85.66
205,150,100.0,85.2
217,300,99.915,84.88
221,500,99.76,84.88
209,200,99.755,84.38
213,250,99.765,84.06


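### Maximum Vocabulary Size

Hold the optimizer (adam), learning-rate schedule (linear annealing), n-gram size (n = 1, `all_until_n` False), tokenizer (`en_core_web_sm`) and embedding dimension (100) fixed, and vary only `max_vocab_size`.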

In [12]:
# Pin every setting except max_vocab_size so the remaining rows differ only
# in vocabulary size.
df = table.loc[
    table['optim'].eq('adam')
    & table['lr_schedule'].eq('linear_annealing')
    & table['n'].eq(1)
    & table['emb_dim'].eq(100)
    & table['token_scheme'].eq('en_core_web_sm')
    & table['all_until_n'].eq(False)
]

In [13]:
# All filtered rows, ranked by validation accuracy (best first).
(
    df.loc[:, ['max_vocab_size', 'train_acc', 'val_acc']]
    .sort_values(by='val_acc', ascending=False)
)

Unnamed: 0,max_vocab_size,train_acc,val_acc
201,25000,100.0,85.66
145,5000,95.66,83.66
173,10000,99.475,83.66
117,1000,85.565,82.86


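### Optimizer

Hold the learning-rate schedule (linear annealing), n-gram size (n = 1, `all_until_n` False), tokenizer (`en_core_web_sm`), embedding dimension (100) and maximum vocabulary size (25000) fixed, and vary only `optim`.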

In [14]:
# Pin every setting except optim so the remaining rows differ only in the
# optimizer.
df = table.loc[
    table['max_vocab_size'].eq(25000)
    & table['lr_schedule'].eq('linear_annealing')
    & table['n'].eq(1)
    & table['emb_dim'].eq(100)
    & table['token_scheme'].eq('en_core_web_sm')
    & table['all_until_n'].eq(False)
]

In [15]:
# All filtered rows, ranked by validation accuracy (best first).
(
    df.loc[:, ['optim', 'train_acc', 'val_acc']]
    .sort_values(by='val_acc', ascending=False)
)

Unnamed: 0,optim,train_acc,val_acc
201,adam,100.0,85.66
203,sgd,64.28,63.5


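### Learning-Rate Schedule

Hold the optimizer (adam), n-gram size (n = 1, `all_until_n` False), tokenizer (`en_core_web_sm`), embedding dimension (100) and maximum vocabulary size (25000) fixed, and vary only `lr_schedule`.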

In [16]:
# Pin every setting except lr_schedule so the remaining rows differ only in
# the learning-rate schedule.
df = table.loc[
    table['optim'].eq('adam')
    & table['max_vocab_size'].eq(25000)
    & table['n'].eq(1)
    & table['emb_dim'].eq(100)
    & table['token_scheme'].eq('en_core_web_sm')
    & table['all_until_n'].eq(False)
]

In [17]:
# All filtered rows, ranked by validation accuracy (best first).
(
    df.loc[:, ['lr_schedule', 'train_acc', 'val_acc']]
    .sort_values(by='val_acc', ascending=False)
)

Unnamed: 0,lr_schedule,train_acc,val_acc
201,linear_annealing,100.0,85.66
200,constant,100.0,85.46
