## SEQ2Sql

Seq2SQL implemented using BERT encodings instead of GloVe embeddings.

### Import Libraries

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import json
from transformers import BertTokenizer, glue_convert_examples_to_features, BertModel
import numpy as np

### Import Preprocessed Dataset

The preprocessed, tokenized WikiSQL data is stored in `./data/tokenized_*`.

In [4]:
# Load the tokenized dev queries (one JSON object per line) into a DataFrame
# and preview the first rows.
dev_set_tokenized = pd.read_json('./data/tokenized_dev.jsonl',lines = True)
dev_set_tokenized.head()

Unnamed: 0,phase,table_id,question,sql,tokenized_query,query,tokenized_question
0,1,1-10015132-11,What position does the player who played for b...,"{'sel': 3, 'conds': [[5, 0, 'Butler CC (KS)']]...","[SELECT, position, FROM, table_, WHERE, school...",SELECT position FROM table_ WHERE school/club ...,"[what, position, does, the, player, who, playe..."
1,1,1-10015132-11,How many schools did player number 3 play at?,"{'sel': 5, 'conds': [[1, 0, '3']], 'agg': 3}","[SELECT, COUNT, (, school/club, team, ), FROM,...",SELECT COUNT ( school/club team ) FROM table_ ...,"[how, many, schools, did, player, number, 3, p..."
2,1,1-10015132-11,What school did player number 21 play for?,"{'sel': 5, 'conds': [[1, 0, '21']], 'agg': 0}","[SELECT, school/club, team, FROM, table_, WHER...",SELECT school/club team FROM table_ WHERE no ....,"[what, school, did, player, number, 21, play, ..."
3,1,1-10015132-11,Who is the player that wears number 42?,"{'sel': 0, 'conds': [[1, 0, '42']], 'agg': 0}","[SELECT, player, FROM, table_, WHERE, no, ., E...",SELECT player FROM table_ WHERE no . EQL 42,"[who, is, the, player, that, wears, number, 42..."
4,1,1-10015132-11,What player played guard for toronto in 1996-97?,"{'sel': 0, 'conds': [[3, 0, 'Guard'], [4, 0, '...","[SELECT, player, FROM, table_, WHERE, position...",SELECT player FROM table_ WHERE position EQL g...,"[what, player, played, guard, for, toronto, in..."


In [5]:
# Load the dev queries from './data/dev.jsonl' (one JSON object per line)
# and preview the first rows.
dev_set = pd.read_json('./data/dev.jsonl',lines = True)
dev_set.head()

Unnamed: 0,phase,table_id,question,sql
0,1,1-10015132-11,What position does the player who played for b...,"{'sel': 3, 'conds': [[5, 0, 'Butler CC (KS)']]..."
1,1,1-10015132-11,How many schools did player number 3 play at?,"{'sel': 5, 'conds': [[1, 0, '3']], 'agg': 3}"
2,1,1-10015132-11,What school did player number 21 play for?,"{'sel': 5, 'conds': [[1, 0, '21']], 'agg': 0}"
3,1,1-10015132-11,Who is the player that wears number 42?,"{'sel': 0, 'conds': [[1, 0, '42']], 'agg': 0}"
4,1,1-10015132-11,What player played guard for toronto in 1996-97?,"{'sel': 0, 'conds': [[3, 0, 'Guard'], [4, 0, '..."


In [6]:
# Load the dev table definitions (one JSON object per line) and preview the first rows.
dev_tables = pd.read_json('./data/dev.tables.jsonl',lines = True)
dev_tables.head()

Unnamed: 0,header,page_title,types,id,section_title,caption,rows,name,page_id
0,"[Player, No., Nationality, Position, Years in ...",Toronto Raptors all-time roster,"[text, text, text, text, text, text]",1-10015132-11,L,L,"[[Antonio Lang, 21, United States, Guard-Forwa...",table_10015132_11,
1,"[Player, No., Nationality, Position, Years in ...",Toronto Raptors all-time roster,"[text, text, text, text, text, text]",1-10015132-9,J,J,"[[Jarrett Jack, 1, United States, Guard, 2009-...",table_10015132_9,
2,[Entered office as Head of State or Government...,List of longest-serving G8 leaders,"[text, text, text, text, text]",1-10026563-1,Chronology of longest serving G8 Leaders,Chronology of longest serving G8 Leaders,"[[16 October 1964, 15 November 1975, 5 April 1...",table_10026563_1,
3,"[Country ( exonym ), Capital ( exonym ), Count...",,"[text, text, text, text, text]",1-1008653-1,,,"[[Afghanistan, Kabul, Afghanestan افغانستان, K...",table_1008653_1,
4,"[No. in set, No. in series, Title, Directed by...",,"[real, real, text, text, text, text, text]",1-10088101-1,,,"[[1, 174, "" Per Manum "", Kim Manners, Chris Ca...",table_10088101_1,


In [7]:
# Load the tokenized dev table definitions; the preview shows an extra
# `tokenized_header` column holding one token list per column name.
dev_tables_tokenized = pd.read_json('./data/tokenized_dev.tables.jsonl',lines = True)
dev_tables_tokenized.head()

Unnamed: 0,header,page_title,types,id,section_title,caption,rows,name,page_id,tokenized_header
0,"[Player, No., Nationality, Position, Years in ...",Toronto Raptors all-time roster,"[text, text, text, text, text, text]",1-10015132-11,L,L,"[[Antonio Lang, 21, United States, Guard-Forwa...",table_10015132_11,,"[[player], [no, .], [nationality], [position],..."
1,"[Player, No., Nationality, Position, Years in ...",Toronto Raptors all-time roster,"[text, text, text, text, text, text]",1-10015132-9,J,J,"[[Jarrett Jack, 1, United States, Guard, 2009-...",table_10015132_9,,"[[player], [no, .], [nationality], [position],..."
2,[Entered office as Head of State or Government...,List of longest-serving G8 leaders,"[text, text, text, text, text]",1-10026563-1,Chronology of longest serving G8 Leaders,Chronology of longest serving G8 Leaders,"[[16 October 1964, 15 November 1975, 5 April 1...",table_10026563_1,,"[[entered, office, as, head, of, state, or, go..."
3,"[Country ( exonym ), Capital ( exonym ), Count...",,"[text, text, text, text, text]",1-1008653-1,,,"[[Afghanistan, Kabul, Afghanestan افغانستان, K...",table_1008653_1,,"[[country, (, exonym, )], [capital, (, exonym,..."
4,"[No. in set, No. in series, Title, Directed by...",,"[real, real, text, text, text, text, text]",1-10088101-1,,,"[[1, 174, "" Per Manum "", Kim Manners, Chris Ca...",table_10088101_1,,"[[no, ., in, set], [no, ., in, series], [title..."


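Load the data into `sql_data` and `table_data`. `sql_data` is a list of query records, while `table_data` must be a dictionary keyed by table id so that each query's table can be looked up directly.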

In [8]:
# Read the tokenized dev queries into a list, one dict per JSON line.
# UTF-8 is requested explicitly so decoding does not depend on the platform
# default, and blank lines are skipped because json.loads('') raises.
# The comprehension keeps loop temporaries out of the notebook's globals.
with open('./data/tokenized_dev.jsonl', encoding='utf-8') as lines:
    sql_data = [json.loads(text) for text in map(str.strip, lines) if text]

In [9]:
# Read the tokenized dev tables into a dictionary keyed by each table's 'id'.
# UTF-8 is requested explicitly so decoding does not depend on the platform
# default, and blank lines are skipped because json.loads('') raises.
with open('./data/tokenized_dev.tables.jsonl', encoding='utf-8') as lines:
    table_data = {
        table['id']: table
        for table in (json.loads(text) for text in map(str.strip, lines) if text)
    }

In [10]:
# Number of entries in the table dictionary.
print(len(table_data))

2716


#### Universal function for loading the data

Pass `train`, `dev` or `test` to load the training, validation or test dataset, respectively.

In [11]:
def load_data_set(dataset_name, data_dir='./data'):
    """
    Universal function to load the dataset.

    Input: dataset_name - str {train, test, dev}: split whose files are read.
           data_dir - str: directory holding `tokenized_<split>.jsonl` and
                      `tokenized_<split>.tables.jsonl` (defaults to './data').
    Output: (sql_data, table_data) - (list, dictionary): the tokenized queries in
            file order, and the tables keyed by their 'id' field.
    """
    print(f"Loading {dataset_name} dataset")
    sql_file = f"{data_dir}/tokenized_{dataset_name}.jsonl"
    tables_file = f"{data_dir}/tokenized_{dataset_name}.tables.jsonl"

    def read_records(path):
        # Decode as UTF-8 regardless of the platform default, and skip blank
        # lines (e.g. a trailing newline) because json.loads('') raises.
        with open(path, encoding='utf-8') as lines:
            return [json.loads(text) for text in map(str.strip, lines) if text]

    sql_data = read_records(sql_file)
    table_records = read_records(tables_file)
    # Later records win if two tables share an id; the count below still
    # reports every record read from the file.
    table_data = {tab['id']: tab for tab in table_records}

    print(f"Loaded {len(sql_data)} queries and {len(table_records)} tables")
    return sql_data,table_data

In [12]:
# Load the training split, then print the first query record followed by the
# table record that its 'table_id' points to.
train_sql, train_table = load_data_set('train')
print(f"{train_sql[0]}\n{train_table[train_sql[0]['table_id']]}")

Loading train dataset
Loaded 56355 queries and 18585 tables
{'phase': 1, 'table_id': '1-1000181-1', 'question': 'Tell me what the notes are for South Australia ', 'sql': {'sel': 5, 'conds': [[3, 0, 'SOUTH AUSTRALIA']], 'agg': 0}, 'tokenized_query': ['SELECT', 'notes', 'FROM', 'table_', 'WHERE', 'current', 'slogan', 'EQL', 'south', 'australia'], 'query': 'SELECT notes FROM table_ WHERE current slogan EQL south australia', 'tokenized_question': ['tell', 'me', 'what', 'the', 'notes', 'are', 'for', 'south', 'australia']}
{'id': '1-1000181-1', 'header': ['State/territory', 'Text/background colour', 'Format', 'Current slogan', 'Current series', 'Notes'], 'types': ['text', 'text', 'text', 'text', 'text', 'text'], 'rows': [['Australian Capital Territory', 'blue/white', 'Yaa·nna', 'ACT · CELEBRATION OF A CENTURY 2013', 'YIL·00A', 'Slogan screenprinted on plate'], ['New South Wales', 'black/yellow', 'aa·nn·aa', 'NEW SOUTH WALES', 'BX·99·HI', 'No slogan on current series'], ['New South Wales', 'b

In [13]:
# Show the condition entries of the first training query as a tuple.
# NOTE(review): each entry looks like [column index, operator index, value] — confirm against the WikiSQL format.
tuple(x for x in train_sql[0]['sql']['conds'])
#val_sql[0]['sql']['sel']

([3, 0, 'SOUTH AUSTRALIA'],)

#### Function to create batch sequences.

This function assembles one batch of sequences from the dataset.  
`idxes` holds the example indices, and the batch is built from positions `start` to `end` (exclusive) of `idxes`.

(This way of loading data is discouraged; it should be replaced with `torch.utils.data.Dataset` and `DataLoader`.)

In [14]:
def gen_batch_sequence(sql_data, table_data,idxes,start, end):
    """
    Collect the per-example fields for one batch.

    Input: sql_data: list of query records.
           table_data: dictionary of table records keyed by table id.
           idxes: sequence of indices into `sql_data`.
           start, end: the batch covers positions start..end-1 of `idxes`.
    Output: a 7-tuple of parallel lists, one entry per example:
            tokenized questions, tokenized headers, number of header columns,
            answer tuples (agg, sel, number of conditions, first element of each
            condition, second element of each condition), tokenized queries,
            the raw condition lists, and (question, header, query) tuples.
    """
    question_seq, column_seq, number_of_col = [], [], []
    answer_seq, query_seq, ground_truth_cond_seq, raw_data = [], [], [], []

    for position in range(start, end):
        example = sql_data[idxes[position]]
        table = table_data[example['table_id']]
        labels = example['sql']
        conditions = labels['conds']

        question_seq.append(example['tokenized_question'])
        column_seq.append(table['tokenized_header'])
        number_of_col.append(len(table['header']))
        answer_seq.append((
            labels['agg'],
            labels['sel'],
            len(conditions),
            tuple(cond[0] for cond in conditions),
            tuple(cond[1] for cond in conditions),
        ))
        query_seq.append(example['tokenized_query'])
        # The condition list is stored as-is (same object, not a copy).
        ground_truth_cond_seq.append(conditions)
        raw_data.append((example['question'], table['header'], example['query']))

    return (question_seq, column_seq, number_of_col, answer_seq, query_seq, ground_truth_cond_seq,raw_data)

-----


#### Import BERT tokenizer 

In [15]:
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
bertModel = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

# BERT is used as a fixed feature extractor here: switch off gradient
# tracking on every one of its weights.
for params in bertModel.parameters():
    params.requires_grad_(False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# Build a three-example batch from hand-picked training indices (23, 4, 654).
q_seq,col_seq, num_col, answer_seq, query_seq,gt_cond,rd = gen_batch_sequence(train_sql,train_table,[23,4,654],0,3)

In [17]:
# Tokenized header of the first example in the batch: one token list per column.
col_seq[0]

[['player'],
 ['no', '.'],
 ['nationality'],
 ['position'],
 ['years', 'in', 'toronto'],
 ['school/club', 'team']]

In [18]:
# Flatten every table header into a single token list, appending a '[SEP]'
# marker after the tokens of each column.
for i, (one_q, one_col) in enumerate(zip(q_seq, col_seq)):
    one_col_all = []
    for toks in one_col:
        one_col_all.extend(toks)
        one_col_all.append('[SEP]')
    print(one_col_all)

['player', '[SEP]', 'no', '.', '[SEP]', 'nationality', '[SEP]', 'position', '[SEP]', 'years', 'in', 'toronto', '[SEP]', 'school/club', 'team', '[SEP]']
['order', 'year', '[SEP]', 'manufacturer', '[SEP]', 'model', '[SEP]', 'fleet', 'series', '(', 'quantity', ')', '[SEP]', 'powertrain', '(', 'engine/transmission', ')', '[SEP]', 'fuel', 'propulsion', '[SEP]']
['#', '[SEP]', 'episode', '[SEP]', 'air', 'date', '[SEP]', 'rating', '[SEP]', 'share', '[SEP]', '18–49', '(', 'rating/share', ')', '[SEP]', 'viewers', '(', 'm', ')', '[SEP]', 'weekly', 'rank', '[SEP]']


In [19]:
# Probe how the tokenizer handles a literal '[SEP]' in the input text; the
# recorded output shows it maps to the same id (102) as the separator that
# the tokenizer appends on its own.
text = ['[SEP]']
encode = tokenizer.encode_plus(
    text=text,
    add_special_tokens=True,
    return_token_type_ids=True,
    padding='longest',
    return_attention_mask=True,
    return_tensors='pt',
)
encode

{'input_ids': tensor([[101, 102, 102]]), 'token_type_ids': tensor([[0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1]])}

In [20]:
print(q_seq)
# Re-join each tokenized question into one string, encode the whole batch
# padded to its longest member, and run it through BERT.
batch_queries = [' '.join(tokens) for tokens in q_seq]
q_encode = tokenizer.batch_encode_plus(
    batch_text_or_text_pairs=batch_queries,
    add_special_tokens=True,
    return_token_type_ids=True,
    padding='longest',
    return_attention_mask=True,
    return_tensors='pt',
)
bert_op = bertModel(**q_encode)
print(type(bert_op))

[['what', 'number', 'is', 'the', 'player', 'that', 'played', '1998-2001'], ['how', 'many', 'times', 'is', 'the', 'fuel', 'propulsion', 'is', 'cng', '?'], ['what', 'is', 'the', 'episode', 'where', '18-49', 'has', 'a', 'rating/share', 'of', '3.5/9']]
<class 'transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions'>


In [21]:
# List the fields available on the BERT output object.
print(bert_op.keys())

odict_keys(['last_hidden_state', 'pooler_output'])


In [22]:
# Converting this:
#   [['player'], ['no', '.'], ['nationality'], ['position'],
#    ['years', 'in', 'toronto'], ['school/club', 'team']]
# into:
#   player [SEP] no . [SEP] nationality [SEP] position [SEP] years in toronto [SEP] school/club team [SEP]
# for each element in the batch.
#
# The string keeps a trailing [SEP] and the tokenizer appends its own, so the
# encoded ids end with two consecutive separator ids.

separator = '[SEP]'
header_batch_list = [
    ' '.join(token for column in header for token in column + [separator])
    for header in col_seq
]
print(header_batch_list[0])

col_encode = tokenizer.batch_encode_plus(
    batch_text_or_text_pairs=header_batch_list,
    add_special_tokens=True,
    return_token_type_ids=True,
    padding='longest',
    return_attention_mask=True,
    return_tensors='pt',
)
col_encode['input_ids']

player [SEP] no . [SEP] nationality [SEP] position [SEP] years in toronto [SEP] school/club team [SEP]


tensor([[  101,  2447,   102,  2053,  1012,   102, 10662,   102,  2597,   102,
          2086,  1999,  4361,   102,  2082,  1013,  2252,  2136,   102,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2344,  2095,   102,  7751,   102,  2944,   102,  4170,  2186,
          1006, 11712,  1007,   102,  2373, 23654,  1006,  3194,  1013,  6726,
          1007,   102,  4762, 16404,   102,   102,     0,     0,     0,     0],
        [  101,  1001,   102,  2792,   102,  2250,  3058,   102,  5790,   102,
          3745,   102,  2324,  1516,  4749,  1006,  5790,  1013,  3745,  1007,
           102,  7193,  1006,  1049,  1007,   102,  4882,  4635,   102,   102]])

In [23]:
# Each table header such as
#   [['player'], ['no', '.'], ['nationality'], ['position'],
#    ['years', 'in', 'toronto'], ['school/club', 'team']]
# is flattened into tokens with a [SEP] after every column:
#   player [SEP] no . [SEP] nationality [SEP] position [SEP] years in toronto [SEP] school/club team [SEP]
#
# The question tokens are then placed in front, followed by a [SEP], and the
# header's last [SEP] is dropped (the tokenizer adds the closing one itself):
#   what number is the player that played 1998-2001 [SEP] player [SEP] no . [SEP] nationality [SEP] position [SEP] years in toronto [SEP] school/club team

separator = '[SEP]'
header_batch_list = [
    [token for column in header for token in column + [separator]]
    for header in col_seq
]

input_string = [
    ' '.join(q_seq[k] + [separator] + header_tokens[:-1])
    for k, header_tokens in enumerate(header_batch_list)
]

print(input_string[0])
tokenizer_args = dict(
    add_special_tokens=True,
    return_token_type_ids=True,
    padding='longest',
    return_attention_mask=True,
    return_tensors='pt',
)
col_encode = tokenizer.batch_encode_plus(batch_text_or_text_pairs=input_string, **tokenizer_args)
col_encode['input_ids']

what number is the player that played 1998-2001 [SEP] player [SEP] no . [SEP] nationality [SEP] position [SEP] years in toronto [SEP] school/club team


tensor([[  101,  2054,  2193,  2003,  1996,  2447,  2008,  2209,  2687,  1011,
          2541,   102,  2447,   102,  2053,  1012,   102, 10662,   102,  2597,
           102,  2086,  1999,  4361,   102,  2082,  1013,  2252,  2136,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2129,  2116,  2335,  2003,  1996,  4762, 16404,  2003, 27166,
          2290,  1029,   102,  2344,  2095,   102,  7751,   102,  2944,   102,
          4170,  2186,  1006, 11712,  1007,   102,  2373, 23654,  1006,  3194,
          1013,  6726,  1007,   102,  4762, 16404,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2054,  2003,  1996,  2792,  2073,  2324,  1011,  4749,  2038,
          1037,  5790,  1013,  3745,  1997,  1017,  1012,  1019,  1013,  1023,
           102,  1001,   102,  2792,   102,  2250,  3058,   102,

In [24]:
# Shape of the encoded id tensor; the recorded output is (3, 49).
col_encode['input_ids'].shape

torch.Size([3, 49])

In [25]:
# Run BERT on the encoded batch held in `col_encode`.
bert_op = bertModel(**col_encode)


In [40]:
# Inspect the last hidden state of the first batch element; the recorded
# shape is (49, 768).
print(bert_op.last_hidden_state[0])
bert_op.last_hidden_state[0].shape

tensor([[-0.7509, -0.0451, -0.3663,  ..., -0.3276,  0.1047,  0.8885],
        [-0.4485,  0.3220, -0.4430,  ...,  0.2473,  0.1332, -0.1755],
        [ 0.1347, -0.5952,  0.0330,  ..., -0.5421,  0.0289,  0.8840],
        ...,
        [-0.6576,  0.1654, -0.0034,  ..., -0.1518, -0.1862,  0.2657],
        [-0.6042,  0.2060, -0.0119,  ...,  0.0314,  0.0453,  0.3622],
        [-0.3505,  0.1392, -0.1125,  ..., -0.2700, -0.1290,  0.4474]])


torch.Size([49, 768])

In [27]:
# Field names returned by the tokenizer for the encoded batch.
print(list(col_encode.keys()))

['input_ids', 'token_type_ids', 'attention_mask']


In [28]:
# Small demo of torch.cat: joining a (2, 3) and a (2, 4) tensor along
# dim=1 gives a (2, 7) tensor.
x, y = torch.randn(2, 3), torch.randn(2, 4)
z = torch.cat([x, y], dim=1)
print(f"X: {x}\nY: {y}\nZ: {z}")

X: tensor([[-0.1272,  1.5787, -0.4264],
        [-0.7108,  1.7789, -0.0619]])
Y: tensor([[-2.1488, -0.1765, -0.2232,  0.1911],
        [-0.8833,  1.1074,  1.6308,  0.7582]])
Z: tensor([[-0.1272,  1.5787, -0.4264, -2.1488, -0.1765, -0.2232,  0.1911],
        [-0.7108,  1.7789, -0.0619, -0.8833,  1.1074,  1.6308,  0.7582]])


In [29]:
# Experiment: concatenate the two encoded id tensors along dim 1. The printed
# result shows the first tensor's zero padding sitting between the two
# sequences, each of which starts with its own id 101.
con_cat_string = torch.cat((q_encode['input_ids'],col_encode['input_ids']),dim = 1)
print(con_cat_string)

tensor([[  101,  2054,  2193,  2003,  1996,  2447,  2008,  2209,  2687,  1011,
          2541,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,   101,  2054,  2193,  2003,  1996,  2447,  2008,  2209,  2687,
          1011,  2541,   102,  2447,   102,  2053,  1012,   102, 10662,   102,
          2597,   102,  2086,  1999,  4361,   102,  2082,  1013,  2252,  2136,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2129,  2116,  2335,  2003,  1996,  4762, 16404,  2003, 27166,
          2290,  1029,   102,     0,     0,     0,     0,     0,     0,     0,
             0,   101,  2129,  2116,  2335,  2003,  1996,  4762, 16404,  2003,
         27166,  2290,  1029,   102,  2344,  2095,   102,  7751,   102,  2944,
           102,  4170,  2186,  1006, 11712,  1007,   102,  2373, 23654,  1006,
          3194,  1013,  6726,  1007,   102,  4762, 

In [30]:
# Map the ids of the first batch element back to token strings to check the encoding.
tokenizer.convert_ids_to_tokens(col_encode['input_ids'][0])

['[CLS]',
 'what',
 'number',
 'is',
 'the',
 'player',
 'that',
 'played',
 '1998',
 '-',
 '2001',
 '[SEP]',
 'player',
 '[SEP]',
 'no',
 '.',
 '[SEP]',
 'nationality',
 '[SEP]',
 'position',
 '[SEP]',
 'years',
 'in',
 'toronto',
 '[SEP]',
 'school',
 '/',
 'club',
 'team',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

In [31]:
## function to train the model
def epoch_train(model, optim, batch_size, sql_data, table_data):
    """
    Put `model` into training mode.

    NOTE(review): this looks like an unfinished stub. Only `model.train()` is
    called; `optim`, `batch_size`, `sql_data` and `table_data` are not used
    and nothing is returned.
    """
    model.train()

## WordEmbedding Module

In [32]:
class WordEmbedding(nn.Module):
    """
    Encodes (question, table header) pairs with a frozen pre-trained BERT model.

    Each question is joined with the column names of its table, using the
    separator token between the question and every column, and the resulting
    strings are tokenized and passed through BERT as one padded batch.
    """

    def __init__(self, bert_encoder, max_length=None, separator = '[SEP]'):
        """
        Input: bert_encoder: name or path given to `from_pretrained` for both the
                             tokenizer and the model.
               max_length: optional int. When set, every sequence is padded or
                           truncated to this many tokens; otherwise the batch is
                           padded to its longest member.
               separator: token string placed between the question and each column.
        """
        super(WordEmbedding, self).__init__()
        self.bert_tokenizer = BertTokenizer.from_pretrained(bert_encoder)
        self.bert_model = BertModel.from_pretrained(bert_encoder)
        self.max_length = max_length
        self.gpu = torch.cuda.is_available()
        # Attribute spelling kept as-is so existing references keep working.
        self.seperator = separator

        self.bert_args = { 'add_special_tokens' : True,
                           'return_token_type_ids' : True,
                           'padding' : 'longest',
                           'return_attention_mask' : True,
                           'return_tensors' : 'pt'}

        if self.max_length is not None:
            # These must update self.bert_args; the bare name `bert_args` does
            # not exist and raised NameError for any non-None max_length.
            self.bert_args['max_length'] = self.max_length
            self.bert_args['padding'] = 'max_length'
            # Without truncation, longer inputs would exceed max_length and the
            # batch could not be stacked into one tensor.
            self.bert_args['truncation'] = True

        # BERT is used as a fixed feature extractor: freeze all of its weights.
        for param in self.bert_model.parameters():
            param.requires_grad = False
        assert not any(p.requires_grad for p in self.bert_model.parameters())

        if self.gpu:
            self.bert_model = self.bert_model.to('cuda')

    def gen_x_batch(self,q_batch,col_batch):
        '''
        Input: q_batch: list of tokenized query string i.e. List[List].
               col_batch: list of tokenized header of the corresponding table header. List[List[List]]
        Output: the BERT model output for the batch, exposing `last_hidden_state`
                and `pooler_output`.
        '''
        # Use the separator configured on this instance, not a notebook global.
        sep = self.seperator

        if not q_batch:
            # Diagnostic for an unexpectedly empty batch.
            print(q_batch, col_batch)

        input_string_list = []
        for question, header in zip(q_batch, col_batch):
            # Flatten the header with a separator after every column.
            header_tokens = [x for tok in header for x in tok + [sep]]
            # Drop the final separator: the tokenizer appends its own.
            input_string_list.append(' '.join(question + [sep] + header_tokens[:-1]))

        inp_encode = self.bert_tokenizer.batch_encode_plus(batch_text_or_text_pairs=input_string_list ,**self.bert_args)
        if self.gpu:
            for key in inp_encode.keys():
                inp_encode[key] = inp_encode[key].to('cuda')
        bert_op = self.bert_model(**inp_encode)

        # odict_keys(['last_hidden_state', 'pooler_output'])
        return bert_op
        

In [33]:
#train_sql, train_table = load_data_set(data_set)
#def test_wordembed_module(data_set, batch_size = 32):
def test_wordembed_module(train_sql,train_table,batch_size = 32): 
    """
    Smoke-test WordEmbedding by encoding the dataset batch by batch.

    The examples are visited in a random order. For every batch the output
    sizes are asserted (hidden size 768, at most `batch_size` rows), and every
    tenth batch is printed as a sanity check.
    """
    word_emb = WordEmbedding('bert-base-uncased')
    num_x = len(train_sql)
    idxes = np.random.permutation(num_x)

    for i, start in enumerate(range(0, num_x, batch_size)):
        # The last batch may be shorter than batch_size.
        end = min(start + batch_size, num_x)
        ret_tuple = gen_batch_sequence(train_sql,train_table,idxes, start, end)
        questions, headers = ret_tuple[0], ret_tuple[1]
        if questions == []:
            print(start, end)

        bert_op = word_emb.gen_x_batch(questions, headers)
        last_hidden_state = bert_op.last_hidden_state
        pooler_output = bert_op.pooler_output

        hidden_shape, pooled_shape = last_hidden_state.size(), pooler_output.size()
        assert hidden_shape[2] == 768
        assert hidden_shape[0] <= batch_size
        assert pooled_shape[1] == 768
        assert pooled_shape[0] <= batch_size

        if i % 10 == 0:
            print(f"\ni={i}\nSanity Check!!\n")
            print(f"Start: {start}\nEnd: {end}\n")
            print(f"Size of LastHidden Layer Size:{last_hidden_state.size()}\nPooler output size:{pooler_output.size()}\n\
            LastHidden Layer: {last_hidden_state}\nPooler output: {pooler_output}\n")
    
# Run the smoke test over the full training set with the default batch size.
test_wordembed_module(train_sql,train_table)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



i=0
Sanity Check!!

Start: 0
End: 32

Size of LastHidden Layer Size:torch.Size([32, 91, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.1757,  0.1176, -0.0385,  ..., -0.3337,  0.2855,  0.7696],
         [-0.1743, -0.1953, -0.2888,  ...,  0.3705,  0.0293, -0.4341],
         [-0.2986, -0.3153, -0.0748,  ..., -0.0767, -0.0712,  0.3809],
         ...,
         [-0.1913, -0.2420, -0.2525,  ...,  0.0772,  0.4887,  0.3546],
         [-0.1194, -0.1350, -0.3206,  ...,  0.0404,  0.4210,  0.3224],
         [-0.3075,  0.2801, -0.0397,  ..., -0.1871, -0.0273,  0.3829]],

        [[-0.8465, -0.1847, -0.0270,  ..., -0.2296,  0.4143,  0.7087],
         [-0.9341, -0.2153,  0.6237,  ..., -0.1480,  0.5412, -0.0200],
         [-0.4342,  0.2370, -0.1530,  ..., -0.6994, -0.3456,  0.2828],
         ...,
         [-0.6433, -0.2636,  0.1400,  ..., -0.1164,  0.1008,  0.3192],
         [-0.5058, -0.1829,  0.0858,  ...,  0.0956,  0.2623, -0.1172],
         [-0.6582,  0.5


i=30
Sanity Check!!

Start: 960
End: 992

Size of LastHidden Layer Size:torch.Size([32, 64, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.5032e-01,  8.3251e-02, -3.5419e-02,  ..., -4.3575e-01,
           1.6364e-01,  6.8620e-01],
         [-2.2287e-01,  7.1706e-02, -1.4899e-01,  ...,  3.5431e-01,
           4.7144e-01, -8.8550e-02],
         [-7.6863e-01, -3.5141e-01, -1.3466e-01,  ...,  2.1026e-01,
          -2.2978e-01,  3.8933e-01],
         ...,
         [-6.5899e-02,  1.3975e-01,  3.2272e-01,  ..., -3.4987e-01,
           8.9843e-02, -6.9096e-02],
         [-2.0162e-01,  8.0221e-02, -4.2225e-01,  ...,  1.2996e-01,
           3.6398e-01, -4.8759e-02],
         [-1.5393e-01,  5.0784e-01, -1.8692e-01,  ..., -1.8612e-01,
           1.1236e-01, -1.8552e-02]],

        [[-3.6107e-01, -6.4294e-01,  4.0478e-01,  ..., -6.3596e-01,
           4.5443e-01,  6.3289e-01],
         [ 1.7928e-01, -1.2804e-02, -3.0635e-01,  ...,  2.3680e-01,
           


i=50
Sanity Check!!

Start: 1600
End: 1632

Size of LastHidden Layer Size:torch.Size([32, 88, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-3.9204e-01,  1.5092e-01, -2.7518e-02,  ..., -2.7190e-01,
           2.1748e-01,  9.4026e-01],
         [-2.3785e-01,  1.0356e-01,  1.5788e-01,  ...,  3.2609e-01,
           1.3817e-01, -1.3266e-01],
         [ 3.0454e-01, -4.5316e-01, -8.7580e-02,  ..., -5.7605e-01,
           3.1317e-01,  5.3209e-01],
         ...,
         [-1.8486e-01,  1.7464e-01,  1.4626e-01,  ..., -1.0801e-01,
           9.3258e-02,  3.6555e-01],
         [-1.7714e-01,  1.4709e-01,  1.3602e-01,  ..., -1.6471e-01,
           8.1176e-02,  2.3896e-01],
         [-1.2492e-01,  4.6101e-03,  2.9193e-02,  ..., -2.4863e-01,
           8.2629e-02,  5.5214e-01]],

        [[-9.4299e-01, -1.0643e-01, -7.6496e-02,  ..., -2.4270e-01,
           2.4956e-01,  7.2425e-01],
         [-3.7517e-02,  2.4295e-01, -4.0724e-01,  ...,  2.4250e-01,
         


i=80
Sanity Check!!

Start: 2560
End: 2592

Size of LastHidden Layer Size:torch.Size([32, 61, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-3.9861e-01,  4.1456e-02, -6.0846e-02,  ..., -2.9052e-01,
           1.2840e-01,  5.6053e-01],
         [-4.1043e-01, -2.1747e-01,  4.9763e-01,  ...,  3.0608e-01,
           3.3339e-01, -2.1527e-01],
         [-7.0603e-01, -3.0564e-01,  3.3954e-01,  ...,  2.3865e-02,
           2.8492e-01,  3.3936e-01],
         ...,
         [-2.8855e-01, -5.4686e-02,  3.7506e-02,  ..., -7.3083e-02,
           1.7431e-01,  3.2567e-01],
         [-7.8247e-02, -1.1127e-01,  2.1244e-01,  ..., -1.2989e-02,
           1.7113e-01, -4.4578e-03],
         [-1.3658e-01,  1.4455e-01,  6.4740e-01,  ...,  1.9113e-01,
           2.0852e-01, -1.7026e-01]],

        [[-7.1321e-01,  2.7847e-02, -1.6643e-01,  ..., -5.4106e-01,
           1.0992e-01,  7.1278e-01],
         [-5.0586e-01, -2.9268e-01, -7.9197e-02,  ..., -5.4014e-02,
         


i=100
Sanity Check!!

Start: 3200
End: 3232

Size of LastHidden Layer Size:torch.Size([32, 56, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.5407, -0.1883, -0.0791,  ..., -0.2421, -0.1870,  0.5953],
         [-0.5841, -0.1362,  0.6354,  ..., -0.2228,  0.2085, -0.4341],
         [-1.4396,  0.1799,  1.4966,  ..., -0.4199, -1.0012, -0.4716],
         ...,
         [-0.4089, -0.0761,  0.1643,  ..., -0.1814, -0.5154,  0.2009],
         [-0.3758, -0.2300, -0.0094,  ..., -0.1135, -0.3426, -0.0566],
         [-0.3511, -0.0275,  0.1900,  ..., -0.2666, -0.5219,  0.1957]],

        [[-0.7112,  0.0532, -0.0684,  ..., -0.2296,  0.2746,  0.7525],
         [-0.7405, -0.0359,  0.4372,  ..., -0.0308,  0.4109, -0.2172],
         [-0.3846, -0.1543,  0.1304,  ...,  0.2197, -0.4250,  0.0248],
         ...,
         [-0.2378,  0.0998, -0.0783,  ..., -0.0648,  0.0315,  0.1655],
         [-0.3519,  0.0046,  0.1498,  ..., -0.1203,  0.1001,  0.1507],
         [-0.536


i=130
Sanity Check!!

Start: 4160
End: 4192

Size of LastHidden Layer Size:torch.Size([32, 58, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-3.6851e-01, -1.7635e-01, -3.2756e-01,  ..., -5.1762e-01,
           1.9138e-01,  4.9186e-01],
         [ 8.8573e-03, -1.6553e-01, -3.9244e-01,  ...,  4.8362e-02,
           3.8275e-01, -1.6635e-02],
         [ 3.6731e-01, -1.0243e+00,  3.3449e-01,  ..., -1.7142e-01,
           2.1253e-01, -2.2612e-01],
         ...,
         [-1.3713e-01, -1.7063e-01,  8.2732e-02,  ..., -3.4467e-01,
          -3.5600e-02,  1.5887e-01],
         [-3.7010e-01, -5.3580e-01, -2.0228e-01,  ..., -2.2010e-01,
           1.0247e-01, -6.9871e-02],
         [-7.5472e-02, -2.2154e-01,  2.2006e-01,  ..., -2.8728e-01,
          -1.2920e-01,  6.3186e-02]],

        [[-7.9580e-01, -5.8499e-01, -9.8436e-02,  ..., -2.9836e-01,
           9.0210e-01,  1.0409e+00],
         [-8.1466e-01, -3.8990e-01, -4.1463e-01,  ..., -2.3824e-01,
        


i=160
Sanity Check!!

Start: 5120
End: 5152

Size of LastHidden Layer Size:torch.Size([32, 55, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-1.0385, -0.0589, -0.2207,  ..., -0.4742,  0.1669,  0.9470],
         [-0.6123,  0.0551, -0.4137,  ...,  0.3623,  0.2170, -0.5368],
         [-0.5880, -0.2529, -0.6270,  ..., -0.3053, -0.6572,  0.6553],
         ...,
         [-1.1608, -0.3158,  0.0298,  ..., -0.2755,  0.2371,  0.7061],
         [-1.0335, -0.1571,  0.0496,  ..., -0.3619,  0.1663,  0.7505],
         [-1.1570, -0.3373, -0.0508,  ..., -0.3044,  0.2689,  0.7291]],

        [[-0.7769, -0.4015, -0.0151,  ..., -0.5293,  0.1038,  0.6284],
         [-0.9821, -0.0109, -0.2757,  ...,  0.2216, -0.3266, -0.7629],
         [-1.1879, -1.1415, -0.3884,  ...,  0.0350, -0.5550,  0.3232],
         ...,
         [ 1.0115,  0.0236, -0.1339,  ...,  0.0986, -0.6843, -0.3044],
         [-0.6883,  0.0846, -0.2764,  ..., -0.2814, -0.1368,  0.3350],
         [-0.507


i=190
Sanity Check!!

Start: 6080
End: 6112

Size of LastHidden Layer Size:torch.Size([32, 78, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-1.8426e-01,  2.6336e-02, -1.9078e-01,  ..., -6.7485e-01,
           2.7415e-01,  6.3516e-01],
         [-5.6560e-01, -5.4848e-01, -3.3339e-01,  ..., -4.5721e-03,
           8.2544e-02, -4.7639e-01],
         [-4.2768e-01, -6.4452e-01,  3.0989e-01,  ..., -4.1666e-01,
          -2.2852e-01,  3.7063e-01],
         ...,
         [ 6.3901e-02, -4.3655e-01, -1.9988e-01,  ..., -2.2618e-01,
           2.8089e-01,  1.0264e-01],
         [ 7.4794e-02, -4.5447e-01, -2.2930e-01,  ..., -2.1121e-01,
           2.8436e-01,  8.1686e-02],
         [ 1.3895e-01, -3.4350e-01, -4.9020e-02,  ..., -2.9674e-01,
           2.1215e-01,  1.2991e-01]],

        [[-4.5371e-01, -1.6133e-01,  1.4527e-01,  ..., -4.5571e-01,
           3.2885e-02,  5.2027e-01],
         [-1.2058e-01,  2.9000e-02, -4.6150e-01,  ..., -5.9860e-02,
        


i=210
Sanity Check!!

Start: 6720
End: 6752

Size of LastHidden Layer Size:torch.Size([32, 77, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.6114, -0.0942, -0.2229,  ..., -0.7003, -0.1774,  0.8391],
         [-0.5253, -0.2982, -0.5157,  ...,  0.3088, -0.1242, -0.7477],
         [-0.1605,  0.0927, -0.2141,  ..., -0.3832, -0.9447,  0.5658],
         ...,
         [-0.4930, -0.2779,  0.3935,  ..., -0.1207, -0.2425,  0.4277],
         [-0.4781, -0.3086,  0.3646,  ..., -0.0949, -0.2222,  0.4022],
         [-0.3581, -0.1894,  0.2696,  ..., -0.2189, -0.4653,  0.3404]],

        [[-0.8000,  0.0304, -0.0154,  ..., -0.2538,  0.2116,  0.6811],
         [-1.6115,  0.0197,  0.4648,  ..., -0.4217,  0.4744,  0.0390],
         [-0.9712, -0.9469,  0.2533,  ...,  0.5054, -0.6891,  0.3783],
         ...,
         [-0.5779, -0.0965,  0.1102,  ...,  0.2301,  0.0034,  0.4958],
         [-0.8195, -0.0475, -0.1672,  ...,  0.1893,  0.2326,  0.4424],
         [-0.764


i=240
Sanity Check!!

Start: 7680
End: 7712

Size of LastHidden Layer Size:torch.Size([32, 51, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-1.0458e+00, -1.0628e-01, -1.1403e-01,  ..., -3.3301e-01,
           1.5707e-01,  8.0826e-01],
         [-1.6744e-01,  2.8436e-01, -3.7335e-01,  ...,  4.4782e-01,
           3.4333e-01, -2.6818e-01],
         [-2.9782e-01, -2.0879e-01, -5.6301e-01,  ...,  9.0915e-02,
          -8.3747e-01,  7.0463e-01],
         ...,
         [-1.0306e+00, -1.1673e-01, -4.3234e-02,  ..., -2.3864e-02,
           1.8721e-01,  5.1191e-01],
         [-1.0339e+00, -1.2324e-01, -1.2909e-02,  ..., -7.6469e-02,
           1.5718e-01,  5.5246e-01],
         [-1.0381e+00, -1.2230e-01,  4.1622e-02,  ..., -7.3615e-02,
           9.3366e-02,  6.0354e-01]],

        [[-5.5461e-01, -5.2334e-02,  8.0509e-02,  ..., -5.9875e-01,
          -1.8166e-02,  5.4621e-01],
         [-2.8818e-01, -3.1481e-01, -1.7637e-01,  ..., -1.0579e-01,
        


i=270
Sanity Check!!

Start: 8640
End: 8672

Size of LastHidden Layer Size:torch.Size([32, 72, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-5.1044e-01,  2.5956e-01, -9.8170e-02,  ..., -1.9662e-01,
           1.9670e-02,  7.0197e-01],
         [-2.6930e-01,  3.2980e-01, -3.2816e-01,  ...,  2.9128e-01,
          -6.4383e-05, -1.2415e-01],
         [-1.7804e-01,  9.4337e-02, -5.5340e-01,  ...,  1.9665e-01,
          -7.5629e-01,  4.2805e-01],
         ...,
         [-3.2297e-01,  3.6658e-01,  7.5106e-02,  ..., -3.8632e-01,
          -1.4929e-01,  4.9836e-01],
         [-3.7628e-01,  3.4066e-01,  1.2079e-01,  ..., -4.2372e-01,
          -1.4008e-01,  4.7909e-01],
         [-2.0655e-01,  5.4683e-01,  8.1279e-02,  ..., -1.8226e-01,
          -3.4001e-02,  3.6650e-01]],

        [[-6.7789e-01,  1.3824e-01, -1.1593e-01,  ..., -5.4599e-01,
           3.5177e-01,  6.5447e-01],
         [-1.2595e+00,  4.6688e-02, -3.5359e-01,  ..., -4.1794e-01,
        


i=290
Sanity Check!!

Start: 9280
End: 9312

Size of LastHidden Layer Size:torch.Size([32, 93, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-7.0182e-01, -1.0372e-01, -1.0942e-01,  ..., -3.0655e-01,
           2.4022e-01,  6.5197e-01],
         [-3.0493e-01, -2.1523e-01, -4.4609e-02,  ..., -5.7514e-01,
           4.8282e-01,  2.6021e-01],
         [-1.5933e+00, -5.8655e-02, -4.6729e-01,  ..., -5.2678e-01,
           4.4663e-01,  1.4048e-01],
         ...,
         [-7.7375e-01, -3.8733e-01, -8.3082e-03,  ...,  2.5699e-01,
           1.9252e-01,  4.2824e-01],
         [-4.6434e-01, -6.8407e-01,  1.9672e-01,  ..., -3.4061e-02,
           9.7991e-02,  1.4383e-03],
         [-9.5152e-01, -3.8264e-01, -2.1029e-01,  ...,  1.9314e-01,
           4.1332e-01,  8.7183e-02]],

        [[-7.1320e-01, -2.6008e-02, -4.8000e-02,  ..., -5.0848e-01,
           2.6074e-01,  7.5674e-01],
         [-1.6751e+00,  2.2989e-01, -9.8849e-02,  ..., -5.2662e-01,
        


i=320
Sanity Check!!

Start: 10240
End: 10272

Size of LastHidden Layer Size:torch.Size([32, 121, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-7.0105e-01, -1.5830e-01, -1.3167e-01,  ..., -4.1999e-01,
           4.6624e-01,  7.3059e-01],
         [-1.8485e+00, -6.5665e-01, -6.7281e-01,  ..., -1.1871e+00,
           3.6636e-01,  3.9678e-01],
         [-1.7161e+00, -1.4116e+00, -6.2805e-01,  ..., -4.7884e-01,
           2.6721e-01,  1.8511e-01],
         ...,
         [-8.9862e-01, -1.3966e-01, -1.1325e-01,  ..., -1.0714e-01,
           4.7307e-01,  5.1362e-01],
         [-8.8880e-01, -2.2512e-01, -9.5947e-02,  ..., -1.7964e-02,
           5.1794e-01,  4.4780e-01],
         [-6.7190e-01,  3.1397e-02,  9.7428e-02,  ..., -1.8424e-01,
           3.3445e-01,  4.2666e-01]],

        [[-1.0745e+00, -3.0908e-02, -6.5444e-02,  ..., -2.9204e-01,
           9.3536e-02,  7.5665e-01],
         [-7.4232e-01,  1.9367e-01, -3.7592e-01,  ...,  1.4761e-01,
     


i=340
Sanity Check!!

Start: 10880
End: 10912

Size of LastHidden Layer Size:torch.Size([32, 62, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-8.9471e-01, -4.5690e-01, -1.6442e-01,  ..., -5.2708e-01,
           1.2391e-02,  8.0426e-01],
         [-8.3844e-01, -2.0485e-01,  6.5510e-01,  ..., -2.0929e-01,
           1.7163e-01, -2.7698e-01],
         [-4.7397e-01, -1.6748e-01,  4.4007e-01,  ..., -4.3778e-01,
          -6.0541e-01,  3.1741e-02],
         ...,
         [-4.8852e-01, -3.1963e-02,  2.7036e-01,  ..., -2.4629e-01,
          -5.5271e-01,  5.7929e-01],
         [-4.5559e-01, -6.4210e-02,  2.8931e-01,  ..., -3.4483e-01,
          -5.5817e-01,  5.6081e-01],
         [-6.0079e-01, -4.5027e-01, -9.4216e-02,  ..., -1.2746e-01,
          -1.6609e-01,  2.3286e-01]],

        [[-7.9370e-01,  6.3372e-03,  2.6952e-02,  ..., -4.5878e-01,
           3.4522e-01,  7.9600e-01],
         [-1.7017e-01,  5.8033e-02, -3.2689e-01,  ...,  1.7457e-01,
      


i=370
Sanity Check!!

Start: 11840
End: 11872

Size of LastHidden Layer Size:torch.Size([32, 157, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.3840, -0.3138,  0.0044,  ..., -0.5113,  0.1621,  0.7111],
         [-0.5920, -0.5052, -0.3544,  ...,  0.1405,  0.1117, -0.6706],
         [-0.5027, -1.0168,  0.2036,  ..., -0.6317,  0.3673, -0.1867],
         ...,
         [ 0.4869, -0.3513,  0.1128,  ...,  0.2599,  0.1676,  0.0737],
         [ 0.0110, -0.3105,  0.4906,  ...,  0.0924,  0.1114, -0.1293],
         [-0.2628, -0.5238, -0.2459,  ...,  0.2996,  0.3294,  0.0399]],

        [[-0.5768, -0.0625, -0.0298,  ..., -0.7080, -0.1294,  0.7297],
         [-0.5935, -0.3153, -0.1232,  ...,  0.1273, -0.4302, -0.7476],
         [-0.7494, -0.2266, -0.2478,  ..., -0.4037, -0.7976,  0.6048],
         ...,
         [ 0.1438,  0.0411,  0.2643,  ..., -0.1070,  0.0988,  0.0875],
         [ 0.1534, -0.0156,  0.2400,  ..., -0.0570,  0.1176,  0.0493],
         [ 0.


i=400
Sanity Check!!

Start: 12800
End: 12832

Size of LastHidden Layer Size:torch.Size([32, 62, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.4349, -0.0980, -0.0724,  ..., -0.3075,  0.1956,  0.6879],
         [ 0.6530, -0.0118, -0.3181,  ...,  0.0699,  0.4332,  0.4812],
         [-0.9740, -0.4506, -0.3855,  ..., -0.2554,  0.9207,  0.0911],
         ...,
         [-0.6009, -0.3216,  0.3503,  ..., -0.1332,  0.0933,  0.3115],
         [-0.7384, -0.4118,  0.3166,  ..., -0.2255,  0.1518,  0.1051],
         [-0.8775, -0.5703,  0.2033,  ..., -0.1726,  0.0482,  0.3445]],

        [[-0.6482,  0.0234, -0.1515,  ..., -0.6169,  0.2064,  0.6384],
         [-0.6139, -0.2500, -0.2066,  ..., -0.0844, -0.0953, -0.6487],
         [-1.2786, -1.3796, -0.1562,  ...,  0.3241, -0.5949,  0.1493],
         ...,
         [-0.3164, -0.3550,  0.2753,  ..., -0.0509,  0.1775,  0.3853],
         [-0.3253, -0.3820,  0.2967,  ..., -0.0804,  0.2301,  0.3935],
         [-0.1


i=430
Sanity Check!!

Start: 13760
End: 13792

Size of LastHidden Layer Size:torch.Size([32, 55, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-1.1565, -0.2316, -0.0648,  ..., -0.3265,  0.3057,  0.7285],
         [-0.6884,  0.2730, -0.0643,  ...,  0.4553,  0.2956, -0.3904],
         [-1.1467, -0.3180, -0.1798,  ..., -0.8209,  0.0817,  0.5833],
         ...,
         [-0.8055, -0.0813,  0.2275,  ..., -0.2365,  0.1716,  0.3070],
         [-1.0958, -0.1759,  0.0375,  ..., -0.0088,  0.4802,  0.4101],
         [-0.8657, -0.0755,  0.1197,  ..., -0.2284,  0.1875,  0.2915]],

        [[-0.6125, -0.0989,  0.0086,  ..., -0.4685, -0.2599,  0.4973],
         [ 0.0671,  0.5857, -0.1310,  ..., -0.0332,  0.3463,  0.6768],
         [-1.3064, -0.2810,  0.2832,  ..., -0.4908, -0.2553,  0.1754],
         ...,
         [-0.4610, -0.0422, -0.1713,  ..., -0.0444, -0.2656, -0.0235],
         [-0.2857, -0.0134,  0.3259,  ..., -0.3653, -0.3505,  0.3810],
         [-0.3


i=460
Sanity Check!!

Start: 14720
End: 14752

Size of LastHidden Layer Size:torch.Size([32, 64, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.8574e-01,  2.5543e-01,  8.2443e-02,  ..., -5.6760e-01,
           3.2169e-01,  6.2745e-01],
         [-4.3550e-01, -1.5425e-01, -2.0920e-01,  ...,  1.0799e-01,
           3.2585e-01, -3.8009e-01],
         [-5.4818e-01, -3.8700e-01, -6.8562e-04,  ..., -3.2214e-01,
          -4.8770e-01,  2.3965e-01],
         ...,
         [-6.7127e-01, -7.9403e-02, -7.9453e-02,  ..., -1.8231e-02,
           3.2337e-01,  2.4693e-01],
         [-4.1220e-01,  6.0150e-03,  2.5431e-01,  ..., -1.8356e-01,
           1.7778e-01,  3.6775e-01],
         [-6.7984e-01, -5.5175e-02,  2.1939e-02,  ..., -3.2563e-02,
           3.3580e-01,  3.1594e-01]],

        [[-7.6900e-01, -3.7137e-01, -1.4626e-01,  ..., -5.2905e-01,
           1.2397e-02,  6.0203e-01],
         [ 1.0766e-01,  2.7443e-01, -1.8320e-01,  ..., -1.9085e-01,
      


i=490
Sanity Check!!

Start: 15680
End: 15712

Size of LastHidden Layer Size:torch.Size([32, 74, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.6071,  0.1039, -0.0865,  ..., -0.2233,  0.3017,  0.7571],
         [-0.1177,  0.3455, -0.3289,  ...,  0.1739,  0.5607,  0.1940],
         [-0.5201, -0.6856,  0.2280,  ..., -0.1405, -0.0705,  0.3270],
         ...,
         [-0.3764, -0.0448,  0.1253,  ...,  0.0052,  0.1715,  0.0720],
         [-0.7156, -0.2107, -0.4138,  ...,  0.0827,  0.3494,  0.3338],
         [-0.3842, -0.1230,  0.3732,  ..., -0.2120, -0.0200,  0.2501]],

        [[-0.6798, -0.1606, -0.1484,  ..., -0.6139,  0.1900,  0.6986],
         [-0.4686, -0.1147, -0.6488,  ...,  0.0901,  0.2099, -0.6667],
         [-0.8580, -0.9000, -0.4983,  ...,  0.1862, -0.5751,  0.4405],
         ...,
         [-0.4490, -0.0838, -0.0651,  ..., -0.0981,  0.2146,  0.5167],
         [-0.7573, -0.1052, -0.1232,  ..., -0.2409,  0.2377,  0.7198],
         [-0.7


i=520
Sanity Check!!

Start: 16640
End: 16672

Size of LastHidden Layer Size:torch.Size([32, 178, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.3856, -0.4400,  0.1131,  ..., -0.5672,  0.1942,  0.8324],
         [-1.0612, -0.2118, -0.2252,  ..., -0.2531,  0.8687, -0.2933],
         [ 0.8045,  0.0452,  0.5721,  ..., -1.1052,  0.4675,  0.3729],
         ...,
         [-0.5146, -0.5319,  0.2989,  ...,  0.0449,  0.4059,  0.2204],
         [-0.4014, -0.4719,  0.3478,  ..., -0.1972,  0.3439,  0.2963],
         [-0.2570, -0.2242,  0.4271,  ..., -0.3485,  0.2301,  0.4521]],

        [[-0.6491,  0.2041,  0.1020,  ..., -0.5624,  0.4115,  0.5122],
         [-0.3673, -0.1752, -0.0488,  ..., -0.0256,  0.2852, -0.4850],
         [-0.5732, -0.7321, -0.0092,  ...,  0.3376, -0.2303,  0.0296],
         ...,
         [-0.7961,  0.0089,  0.1095,  ..., -0.0564,  0.4727,  0.4679],
         [-0.8125,  0.3094,  0.2263,  ..., -0.2695,  0.5196,  0.4026],
         [-0.


i=550
Sanity Check!!

Start: 17600
End: 17632

Size of LastHidden Layer Size:torch.Size([32, 126, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.6025, -0.1284,  0.1070,  ..., -0.4238,  0.2832,  0.7278],
         [-1.2784, -0.0333,  0.5105,  ..., -0.1377,  0.4344,  0.0403],
         [-0.4766, -0.9164,  0.4460,  ...,  0.1564, -0.1768,  0.6309],
         ...,
         [-0.2780, -0.2377,  0.7518,  ..., -0.0266,  0.1950,  0.3561],
         [-0.5109, -0.2295,  0.5128,  ..., -0.0293,  0.1137,  0.3480],
         [-0.5983, -0.2878,  0.2501,  ...,  0.2048,  0.1810,  0.2056]],

        [[-0.4468, -0.6349,  0.3946,  ..., -0.5179,  0.5743,  0.7372],
         [-0.2748, -0.2997, -0.0971,  ...,  0.1943,  0.6248, -0.0923],
         [-0.4326, -1.0090,  0.0965,  ...,  0.1306,  0.0076,  0.8604],
         ...,
         [-0.3643, -0.7156,  0.5945,  ...,  0.0350,  0.4467, -0.0308],
         [-0.4682, -0.7300,  0.5702,  ...,  0.0168,  0.4905, -0.0086],
         [-0.


i=580
Sanity Check!!

Start: 18560
End: 18592

Size of LastHidden Layer Size:torch.Size([32, 49, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.8586, -0.0870, -0.0492,  ..., -0.6205,  0.0265,  0.7149],
         [-0.2690,  0.4785, -0.1045,  ..., -0.3519,  0.4245,  0.6271],
         [-1.3915,  0.1357,  0.1334,  ..., -0.3890, -0.6722,  0.0748],
         ...,
         [-0.4460, -0.1355, -0.2590,  ..., -0.1375, -0.1488,  0.3124],
         [-0.3960, -0.0196, -0.2695,  ..., -0.3466, -0.2559,  0.4134],
         [-0.4634, -0.0678, -0.3527,  ..., -0.3151, -0.2371,  0.3643]],

        [[-0.8126, -0.2996, -0.1693,  ..., -0.4504,  0.3764,  0.6756],
         [-0.3392, -0.1303, -0.1542,  ...,  0.3452,  0.1566, -0.1275],
         [-0.8099, -0.1322,  0.1668,  ...,  0.3278,  0.4241, -0.2619],
         ...,
         [-0.8789, -0.2364, -0.0438,  ..., -0.1877,  0.2008,  0.7208],
         [-0.8661, -0.1859,  0.0954,  ..., -0.1928,  0.1435,  0.5812],
         [-0.8


i=610
Sanity Check!!

Start: 19520
End: 19552

Size of LastHidden Layer Size:torch.Size([32, 59, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.7582, -0.3954, -0.2314,  ..., -0.5988,  0.1831,  0.6965],
         [-1.0707,  0.0176, -0.1382,  ..., -0.9379,  0.3276,  0.1480],
         [-0.7622, -0.4670, -0.1561,  ..., -0.0696, -0.0593, -0.3166],
         ...,
         [-1.3789, -0.8500, -0.2800,  ..., -0.0993,  0.1523, -0.7338],
         [ 0.0159, -0.5851,  0.4067,  ..., -0.0120, -0.2552, -0.7507],
         [ 0.8909,  0.0400, -0.3029,  ..., -0.1407, -0.6476, -0.2766]],

        [[-0.2096,  0.1922,  0.1984,  ..., -0.2428,  0.0749,  0.6120],
         [-0.2720,  0.0160,  0.3532,  ...,  0.2402,  0.2084, -0.2871],
         [ 0.0172,  0.3474, -0.2799,  ...,  0.3310,  0.3791, -0.2648],
         ...,
         [-0.1487,  0.1773,  0.2198,  ..., -0.0848,  0.1582,  0.2323],
         [-0.2900,  0.0552,  0.0025,  ..., -0.0339,  0.3015,  0.3732],
         [-0.2


i=640
Sanity Check!!

Start: 20480
End: 20512

Size of LastHidden Layer Size:torch.Size([32, 65, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-8.8919e-01, -2.1111e-01, -1.3825e-01,  ..., -3.1496e-01,
           3.5024e-01,  5.5515e-01],
         [-1.1856e-01, -8.1421e-02, -4.9086e-01,  ...,  1.7575e-01,
           3.5064e-01, -3.7184e-01],
         [-6.1724e-01, -2.9024e-01, -2.6990e-01,  ...,  2.7172e-01,
          -4.0004e-01,  1.3799e-01],
         ...,
         [-1.5215e-01, -2.4185e-01,  2.9874e-01,  ...,  4.5513e-02,
           1.6611e-01, -4.0005e-03],
         [-3.7348e-01, -2.1292e-01,  6.8730e-02,  ..., -1.7428e-01,
           1.8009e-01,  1.5641e-01],
         [-6.2955e-01, -3.2927e-01, -2.6536e-01,  ...,  2.5406e-02,
           4.8154e-01,  3.7297e-02]],

        [[-3.4381e-01,  1.5147e-01,  1.7093e-01,  ..., -3.5072e-01,
           1.0163e-03,  5.0659e-01],
         [ 3.9908e-01,  4.2492e-01, -5.1101e-01,  ..., -3.0612e-01,
      


i=670
Sanity Check!!

Start: 21440
End: 21472

Size of LastHidden Layer Size:torch.Size([32, 46, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.5482,  0.1531, -0.0325,  ..., -0.2368,  0.3114,  0.5780],
         [-0.5182,  0.0722, -0.3994,  ...,  0.1284,  0.5764, -0.4575],
         [-0.8272, -0.5118, -0.1421,  ..., -0.0980, -0.4322,  0.5020],
         ...,
         [-0.9206,  0.0598, -0.1008,  ..., -0.0110,  0.4921,  0.4287],
         [-0.8470,  0.0825, -0.0834,  ..., -0.0708,  0.3978,  0.4255],
         [-0.9258,  0.0779, -0.1101,  ..., -0.0160,  0.5262,  0.4243]],

        [[-0.6971, -0.1110,  0.3491,  ..., -0.6433, -0.1218,  0.4802],
         [-0.2627, -0.0723, -0.1540,  ...,  0.4648,  0.2963, -0.3172],
         [-0.3353, -0.6371,  0.8470,  ..., -0.6795,  0.3906, -0.1182],
         ...,
         [ 0.7697,  0.2631, -0.1439,  ...,  0.0111, -0.8528, -0.5180],
         [-0.1696,  0.4908,  0.5220,  ..., -0.3003, -0.3390, -0.1690],
         [ 0.7


i=700
Sanity Check!!

Start: 22400
End: 22432

Size of LastHidden Layer Size:torch.Size([32, 54, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-3.4260e-01, -1.7847e-01,  1.2786e-02,  ..., -5.3199e-01,
           7.7723e-02,  6.2680e-01],
         [ 3.0781e-01,  3.1528e-01, -3.2027e-01,  ..., -2.5945e-01,
           5.3598e-01,  7.2836e-01],
         [-7.3802e-01, -8.0848e-01,  2.6737e-01,  ..., -3.0033e-01,
           2.9663e-01,  2.4784e-01],
         ...,
         [ 2.0142e-01, -1.6454e-01,  5.3259e-01,  ..., -1.9755e-01,
           2.1492e-01,  4.4487e-01],
         [-1.8717e-01, -4.5115e-01,  4.2075e-01,  ..., -1.8847e-01,
           2.3844e-01,  1.3590e-01],
         [-1.0494e-01, -2.6563e-01,  3.2757e-01,  ..., -2.8104e-01,
           4.1323e-02,  3.6657e-01]],

        [[-4.5847e-01,  7.2996e-02,  2.5392e-01,  ..., -2.4383e-01,
           5.1651e-01,  9.2655e-01],
         [-3.7698e-01, -1.1542e-02,  2.9655e-01,  ...,  4.4148e-01,
      


i=730
Sanity Check!!

Start: 23360
End: 23392

Size of LastHidden Layer Size:torch.Size([32, 54, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.4964, -0.2548, -0.0273,  ..., -0.4493,  0.3050,  0.6459],
         [-0.5844, -0.0481,  0.1850,  ...,  0.2343,  0.3798, -0.7178],
         [-0.1918, -1.4765, -0.0488,  ...,  0.1484,  0.4405,  0.1487],
         ...,
         [-0.3069, -0.1196,  0.1491,  ..., -0.2802,  0.0819, -0.2880],
         [-0.7685, -0.3835, -0.3650,  ...,  0.0487,  0.0666, -0.0717],
         [-0.3144, -0.2271,  0.1689,  ..., -0.3821,  0.0490, -0.0983]],

        [[-0.4070, -0.2504,  0.2028,  ..., -0.1209,  0.4791,  0.7790],
         [-0.4855, -0.1734, -0.3468,  ...,  0.1618,  0.8458, -0.3735],
         [-0.8957, -1.0732, -0.3880,  ..., -0.0397,  0.3672,  0.1829],
         ...,
         [-0.0506, -0.7292,  0.0784,  ..., -0.5543,  0.4490,  0.0282],
         [-0.6961, -0.1674,  0.5702,  ..., -0.3035,  0.2318, -0.4092],
         [ 0.7


i=760
Sanity Check!!

Start: 24320
End: 24352

Size of LastHidden Layer Size:torch.Size([32, 157, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-3.1406e-01, -1.1503e-01,  2.0439e-01,  ..., -5.2112e-01,
           5.7873e-01,  4.4215e-01],
         [-3.0616e-01, -6.5781e-01,  6.9594e-01,  ..., -4.0687e-01,
           1.1758e+00, -1.7259e-01],
         [-2.2321e-02, -4.0710e-01,  5.4194e-01,  ...,  1.9886e-02,
           7.4175e-01, -4.4117e-01],
         ...,
         [-9.1141e-03, -2.6979e-01,  5.4035e-01,  ..., -2.9509e-01,
           2.6430e-01, -1.2934e-01],
         [-9.3962e-02, -2.5946e-01,  4.7241e-01,  ..., -3.5882e-02,
           3.3961e-01,  6.1968e-02],
         [ 3.7562e-02, -1.4864e-01,  5.1067e-01,  ..., -1.7262e-01,
           4.0031e-01, -4.7240e-01]],

        [[-7.2209e-01, -1.2147e-01, -4.1180e-02,  ..., -6.4908e-01,
           1.1659e-01,  7.8851e-01],
         [-2.3081e-01, -1.9763e-01, -5.1529e-01,  ...,  3.6168e-01,
     


i=780
Sanity Check!!

Start: 24960
End: 24992

Size of LastHidden Layer Size:torch.Size([32, 85, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.3867, -0.3017,  0.2878,  ..., -0.5713,  0.1893,  0.8794],
         [-1.0402, -0.1316, -0.0807,  ..., -0.3147,  0.8190, -0.1614],
         [ 0.8903,  0.2052,  0.7018,  ..., -1.1122,  0.4477,  0.4266],
         ...,
         [-0.2217, -0.5113,  0.4246,  ..., -0.2355,  0.1546,  0.4062],
         [-0.4382, -0.4742,  0.2284,  ..., -0.0975,  0.3220,  0.2814],
         [-0.3268, -0.3965,  0.3214,  ..., -0.2461,  0.2071,  0.3810]],

        [[-0.6994,  0.0035, -0.1176,  ..., -0.5362,  0.2290,  0.5962],
         [-0.1614,  0.0909, -0.4840,  ...,  0.1359,  0.1012, -0.5507],
         [-0.8235, -0.6393, -0.3419,  ...,  0.6906, -0.7518,  0.0267],
         ...,
         [-0.7656, -0.0460, -0.2956,  ..., -0.0776,  0.1885,  0.3994],
         [-0.7285, -0.1157, -0.2458,  ..., -0.0644,  0.1582,  0.4126],
         [-0.7


i=810
Sanity Check!!

Start: 25920
End: 25952

Size of LastHidden Layer Size:torch.Size([32, 82, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-1.0334e+00,  2.1744e-02, -1.6816e-01,  ..., -5.6257e-01,
           3.4470e-01,  6.7114e-01],
         [-2.4786e-01, -4.4949e-01, -3.1058e-01,  ...,  5.7018e-02,
           2.5160e-01, -3.4874e-01],
         [-6.8280e-01, -7.3880e-01, -1.5215e-01,  ..., -3.5380e-01,
           1.0838e-02,  3.8454e-01],
         ...,
         [-8.4410e-01,  5.6006e-02, -2.0149e-01,  ...,  1.9148e-01,
           4.7725e-01,  2.0436e-01],
         [-8.6316e-01,  4.6012e-02, -2.2254e-01,  ...,  1.9815e-01,
           4.5246e-01,  1.9892e-01],
         [-9.0864e-01,  4.2815e-02, -2.5461e-01,  ...,  2.0279e-01,
           4.9271e-01,  1.7576e-01]],

        [[-5.3689e-01, -9.5764e-02,  5.7873e-02,  ..., -2.0173e-01,
           4.0464e-01,  7.0484e-01],
         [-6.6473e-01, -1.8282e-01,  2.4298e-01,  ..., -9.2248e-02,
      


i=840
Sanity Check!!

Start: 26880
End: 26912

Size of LastHidden Layer Size:torch.Size([32, 64, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.6447, -0.0322,  0.1828,  ..., -0.3102,  0.2747,  0.5223],
         [-0.5051, -0.2148,  0.0153,  ..., -0.0664,  0.6372,  0.2068],
         [-0.6147, -0.0681,  0.1740,  ...,  0.0241,  0.2459, -0.1795],
         ...,
         [-0.4859, -0.2922,  0.3788,  ..., -0.0779,  0.3994,  0.2297],
         [-0.3786, -0.2675,  0.4699,  ..., -0.1962,  0.1477,  0.1257],
         [-0.0882, -0.3680,  0.4594,  ..., -0.1526,  0.0140, -0.0592]],

        [[-0.5423,  0.1631,  0.1754,  ..., -0.2529,  0.0592,  0.6346],
         [-1.4173, -0.3368,  0.8835,  ..., -0.2827,  0.4100,  0.0603],
         [-0.6889, -0.5210,  0.6215,  ..., -0.1890, -0.4419,  0.5127],
         ...,
         [-0.6864, -0.5841,  0.3309,  ..., -0.1829,  0.5023,  0.3868],
         [-0.7480, -0.4684, -0.1194,  ...,  0.2384,  0.3251,  0.3147],
         [-0.7


i=870
Sanity Check!!

Start: 27840
End: 27872

Size of LastHidden Layer Size:torch.Size([32, 136, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.7263, -0.0573,  0.0541,  ..., -0.6255,  0.1943,  0.8095],
         [-0.3264, -0.0803,  0.0555,  ...,  0.1385, -0.0318, -0.1863],
         [-0.9163, -0.0388,  0.7242,  ..., -0.5255, -0.9898,  0.6789],
         ...,
         [-0.7069, -0.0816, -0.0521,  ..., -0.2805,  0.0480,  0.5529],
         [-0.3720,  0.2044,  0.1154,  ...,  0.3102,  0.0225,  0.4131],
         [-0.4193,  0.2579, -0.0104,  ...,  0.4244,  0.0676,  0.4221]],

        [[-0.8130, -0.1295, -0.1634,  ..., -0.4206,  0.1246,  0.5721],
         [-0.2765, -0.4270, -0.3563,  ...,  0.0027, -0.0398, -0.4728],
         [-0.9178, -0.9605, -0.2353,  ...,  0.4381, -0.5637,  0.0063],
         ...,
         [-0.6698, -0.2993, -0.0767,  ...,  0.1260,  0.2210,  0.3414],
         [-0.6227, -0.3364, -0.1030,  ...,  0.0546,  0.3026,  0.3518],
         [-0.


i=900
Sanity Check!!

Start: 28800
End: 28832

Size of LastHidden Layer Size:torch.Size([32, 66, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.6605,  0.0580, -0.2841,  ..., -0.4750,  0.1617,  0.6419],
         [-0.4120,  0.0198, -0.6282,  ...,  0.3911,  0.4052, -0.2207],
         [-0.4301, -0.0116, -0.1681,  ..., -0.7174, -0.6654,  0.4887],
         ...,
         [-0.5066, -0.0112, -0.3780,  ..., -0.0175,  0.1519,  0.5206],
         [-0.2690,  0.0354, -0.0058,  ...,  0.0559,  0.3299,  0.5687],
         [-0.2539,  0.2036,  0.0269,  ...,  0.0359,  0.3750,  0.5089]],

        [[-0.6701, -0.0879, -0.3725,  ..., -0.5598,  0.0151,  0.6658],
         [-0.5183, -0.5105, -0.1137,  ...,  0.2031,  0.1899, -0.3468],
         [-0.4624, -0.6917, -0.8760,  ..., -0.4744,  0.2368, -0.4634],
         ...,
         [-0.8413, -0.3279, -0.3228,  ..., -0.0179,  0.4414,  0.4866],
         [-0.5232, -0.0142,  0.1238,  ..., -0.5176,  0.2231,  0.5785],
         [-0.8


i=930
Sanity Check!!

Start: 29760
End: 29792

Size of LastHidden Layer Size:torch.Size([32, 77, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.4932,  0.2459,  0.1000,  ..., -0.2030,  0.1783,  0.5376],
         [-0.1417,  0.3187, -0.4951,  ...,  0.3170,  0.6903,  0.1487],
         [-0.9220, -0.2804,  0.0818,  ..., -0.0754,  0.2651,  0.0807],
         ...,
         [-0.4026, -0.1658,  0.1106,  ..., -0.1190,  0.5133,  0.0963],
         [-0.5436, -0.2660, -0.1193,  ..., -0.2820,  0.2223,  0.0936],
         [-0.5646, -0.2445,  0.1065,  ..., -0.2311,  0.4052,  0.0915]],

        [[-0.6120, -0.6412, -0.0064,  ..., -0.5721,  0.5470,  0.7421],
         [-0.2382, -0.1846, -0.3204,  ...,  0.2435,  0.6455, -0.2035],
         [-0.3867, -0.9245, -0.1218,  ...,  0.0101, -0.0348,  0.6733],
         ...,
         [ 0.0560, -0.7962,  0.7421,  ..., -0.5363,  0.4941,  0.5134],
         [ 0.4565,  0.0581,  0.1526,  ..., -0.0373, -0.4393, -0.3688],
         [ 0.7


i=960
Sanity Check!!

Start: 30720
End: 30752

Size of LastHidden Layer Size:torch.Size([32, 68, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.2255e-01, -9.9337e-02,  1.5933e-01,  ..., -2.2806e-01,
           4.8828e-01,  1.2479e-01],
         [-2.4068e-01, -5.7934e-02, -2.3981e-01,  ...,  1.4214e-01,
           9.1153e-01, -2.2239e-01],
         [-9.6767e-01, -1.0407e+00, -2.6407e-01,  ...,  1.1707e-02,
           8.2056e-01,  2.9453e-01],
         ...,
         [-8.2127e-01, -4.0485e-01,  2.0223e-01,  ..., -2.6722e-01,
           4.1031e-01, -5.8630e-01],
         [-8.2342e-01, -5.1540e-01,  5.0046e-02,  ..., -2.3356e-01,
           5.0891e-01, -7.2245e-01],
         [ 6.1564e-01, -2.6349e-02, -1.7492e-01,  ...,  4.4974e-02,
          -4.2688e-01, -4.2527e-01]],

        [[-5.0986e-01,  1.6670e-02, -2.9679e-01,  ..., -3.9821e-01,
           4.0192e-01,  1.1365e+00],
         [-1.3596e-01, -2.9313e-01, -5.5789e-01,  ...,  5.4640e-01,
      


i=990
Sanity Check!!

Start: 31680
End: 31712

Size of LastHidden Layer Size:torch.Size([32, 51, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.8882, -0.1326, -0.1932,  ..., -0.5164,  0.0863,  0.5620],
         [-0.6317, -0.1677, -0.2892,  ..., -0.1107,  0.2496, -0.2703],
         [-0.7274, -0.6397,  0.0506,  ..., -0.1210, -0.5593,  0.6794],
         ...,
         [-0.3919,  0.0068,  0.1866,  ...,  0.0046, -0.4257,  0.5910],
         [-0.5144, -0.0538,  0.0892,  ..., -0.0623, -0.4877,  0.5447],
         [-0.4036, -0.0141, -0.0896,  ...,  0.0361, -0.1300,  0.2024]],

        [[-0.6816,  0.1327, -0.3034,  ..., -0.3731, -0.2357,  0.6703],
         [ 0.3664,  0.5140, -0.4840,  ...,  0.0192, -0.0846,  0.5983],
         [-0.8501, -0.0817, -0.8997,  ..., -0.5118, -0.4674, -0.0262],
         ...,
         [-0.3409,  0.1634, -0.1351,  ..., -0.3195, -0.5147,  0.4332],
         [-0.3369,  0.1565, -0.1626,  ..., -0.3180, -0.5321,  0.4410],
         [-0.2


i=1020
Sanity Check!!

Start: 32640
End: 32672

Size of LastHidden Layer Size:torch.Size([32, 98, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.5494, -0.0609,  0.2910,  ..., -0.7101,  0.1122,  0.8669],
         [ 0.0647,  0.1542, -0.4089,  ...,  0.1526,  0.5978,  0.2509],
         [-0.0127, -0.8061,  0.2436,  ..., -0.5599, -0.1285,  0.0081],
         ...,
         [-0.5230, -0.1559,  0.0162,  ..., -0.2653,  0.1894,  0.0218],
         [-0.4737, -0.2007, -0.0477,  ..., -0.2960,  0.1882,  0.0588],
         [-0.4941, -0.1361,  0.0395,  ..., -0.2078,  0.2426,  0.1770]],

        [[-0.6087,  0.0365,  0.0261,  ..., -0.4011,  0.2426,  0.8588],
         [-0.4934,  0.0366, -0.4897,  ..., -0.0596,  0.3419, -0.4602],
         [-0.4660, -0.6738, -0.3374,  ..., -0.2909, -0.5046,  0.4633],
         ...,
         [-0.6887,  0.0240, -0.5939,  ...,  0.0478,  0.4814,  0.5984],
         [-0.2656,  0.0159,  0.1219,  ..., -0.0606,  0.1549,  0.5299],
         [-0.


i=1050
Sanity Check!!

Start: 33600
End: 33632

Size of LastHidden Layer Size:torch.Size([32, 90, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-8.2205e-01, -6.7106e-04, -2.2508e-01,  ..., -4.0820e-01,
           1.8438e-01,  8.2713e-01],
         [-2.9534e-01, -1.3764e-01, -6.5512e-01,  ...,  2.7496e-01,
           3.6796e-01, -2.7961e-01],
         [-3.8708e-01, -4.0220e-01, -7.8645e-01,  ..., -4.3828e-01,
          -7.0148e-01,  7.2752e-01],
         ...,
         [-7.4751e-01,  3.6292e-02, -1.3017e-01,  ..., -1.3245e-01,
           1.2183e-01,  4.5263e-01],
         [-2.4665e-01,  6.7151e-02, -6.6097e-02,  ..., -2.6129e-02,
           9.0201e-02,  1.5917e-01],
         [-4.0651e-01,  1.2563e-01, -9.0126e-02,  ..., -1.4714e-01,
          -6.1179e-02,  1.5494e-01]],

        [[-2.3792e-01,  5.9340e-02,  1.8279e-01,  ..., -2.4654e-01,
           7.1408e-02,  2.6571e-01],
         [-4.9791e-01, -5.5461e-02,  2.4533e-01,  ..., -3.8322e-02,
     


i=1070
Sanity Check!!

Start: 34240
End: 34272

Size of LastHidden Layer Size:torch.Size([32, 77, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.6230,  0.0049,  0.1654,  ..., -0.4398,  0.3874,  0.6703],
         [-0.0144,  0.4472, -0.1288,  ...,  0.0182,  0.6391,  0.1995],
         [-0.1354, -0.5795,  0.5223,  ..., -0.4228,  0.3710,  0.0362],
         ...,
         [-0.0586, -0.2469,  0.2107,  ..., -0.0104,  0.1300, -0.0033],
         [-0.5497, -0.4557, -0.1278,  ...,  0.1448,  0.1977,  0.0693],
         [-0.1643, -0.2678,  0.5210,  ..., -0.1589,  0.1278,  0.0192]],

        [[-0.3383, -0.0883, -0.0857,  ..., -0.2378,  0.2481,  0.5131],
         [-0.7871, -0.2826,  0.3289,  ..., -0.2875,  0.6952, -0.1690],
         [-0.1717, -0.5827,  0.4355,  ...,  0.3657,  0.3469,  0.3779],
         ...,
         [-0.3929, -0.4864, -0.2008,  ...,  0.0185,  0.4222,  0.2296],
         [-0.4847, -0.5329, -0.2364,  ...,  0.0688,  0.4807,  0.2455],
         [-0.


i=1100
Sanity Check!!

Start: 35200
End: 35232

Size of LastHidden Layer Size:torch.Size([32, 61, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.4988,  0.0881, -0.1748,  ..., -0.3612,  0.1701,  0.7158],
         [-0.4806, -0.5311, -0.2564,  ...,  0.2872, -0.0708, -0.2578],
         [-0.1675, -0.7001, -0.1036,  ..., -0.0773, -0.5385,  0.4432],
         ...,
         [-0.3424, -0.1725,  0.3659,  ...,  0.2599, -0.0227,  0.2934],
         [-0.1785, -0.0505,  0.3318,  ...,  0.1571,  0.0127,  0.0837],
         [-0.3200, -0.0327,  0.2519,  ..., -0.0852,  0.0113,  0.2097]],

        [[-0.9807, -0.4234,  0.0955,  ..., -0.5225, -0.1053,  0.5595],
         [-0.2257, -0.0755, -0.3904,  ...,  0.0846,  0.4642, -0.0734],
         [-0.7503, -0.6172, -0.2953,  ...,  0.0601, -0.5896,  0.3525],
         ...,
         [-0.8897, -0.0885,  0.1663,  ..., -0.2298,  0.0055,  0.0629],
         [-0.8473, -0.0622,  0.0938,  ..., -0.2061, -0.0292,  0.0545],
         [-0.


i=1130
Sanity Check!!

Start: 36160
End: 36192

Size of LastHidden Layer Size:torch.Size([32, 68, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.6381,  0.0649, -0.0233,  ..., -0.2405,  0.2814,  0.7601],
         [-0.4724,  0.0539, -0.4586,  ...,  0.1554,  0.1738, -0.2417],
         [-0.1979, -0.6795, -0.2055,  ...,  0.0945, -0.2510,  0.7334],
         ...,
         [-0.6476, -0.2061, -0.1159,  ...,  0.2095,  0.4520,  0.6502],
         [-0.8595, -0.0736,  0.1311,  ...,  0.1078,  0.2826,  0.8310],
         [-0.9573, -0.0053,  0.1469,  ...,  0.2728,  0.4425,  0.7096]],

        [[-0.7131,  0.0025, -0.1195,  ..., -0.2875,  0.1965,  0.7268],
         [-0.7310, -0.1504, -0.2367,  ...,  0.0074,  0.4387, -0.2793],
         [-0.8788,  0.1985, -0.8487,  ...,  0.0628,  0.7793,  0.3540],
         ...,
         [-0.7239, -0.1218, -0.1890,  ..., -0.3412,  0.2123,  0.3209],
         [-0.7815, -0.0039, -0.2891,  ..., -0.2542,  0.2346,  0.3572],
         [-0.


i=1160
Sanity Check!!

Start: 37120
End: 37152

Size of LastHidden Layer Size:torch.Size([32, 58, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.4109, -0.2800, -0.1777,  ..., -0.5103,  0.3612,  0.6155],
         [-0.6712, -0.1728,  0.0613,  ...,  0.3953,  0.0624, -0.3535],
         [-1.2132, -0.6457, -0.0266,  ..., -0.6435, -0.2769,  0.0770],
         ...,
         [-0.2460, -0.2804,  0.2692,  ..., -0.0235,  0.0887,  0.0758],
         [-0.4438, -0.5296, -0.1727,  ...,  0.0954,  0.3861, -0.0254],
         [-0.5437, -0.3588,  0.3246,  ..., -0.2123,  0.3075,  0.3966]],

        [[-0.5625, -0.1758, -0.0798,  ..., -0.5280,  0.1653,  0.6785],
         [-1.4047, -0.5689,  0.2812,  ..., -0.3133,  0.8388, -0.0184],
         [-0.6201, -0.7005,  0.7009,  ..., -0.3948,  0.1441,  0.2348],
         ...,
         [-0.5106, -0.5206, -0.2360,  ..., -0.1560,  0.3126, -0.0669],
         [-0.4143, -0.0804,  0.2470,  ..., -0.2541,  0.1231, -0.1510],
         [-0.


i=1190
Sanity Check!!

Start: 38080
End: 38112

Size of LastHidden Layer Size:torch.Size([32, 59, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-7.1647e-01,  1.0371e-01, -9.7118e-02,  ..., -3.5351e-01,
           4.9089e-02,  6.0381e-01],
         [-6.5449e-01, -1.5733e-01,  8.1011e-02,  ...,  3.5246e-01,
          -1.7736e-01, -2.8478e-01],
         [-6.7061e-01, -2.9000e-01,  4.4362e-01,  ...,  7.8580e-02,
          -3.7732e-01,  4.0368e-01],
         ...,
         [-8.5166e-01, -1.7392e-01,  2.1422e-01,  ..., -2.3422e-01,
           2.8199e-01,  5.2295e-01],
         [-7.5988e-01, -8.8623e-02,  1.8677e-01,  ..., -1.2829e-01,
           1.7598e-01,  6.2661e-01],
         [-6.7107e-01, -2.1467e-01,  1.5403e-01,  ...,  2.9876e-02,
           1.0898e-01,  6.4369e-01]],

        [[-6.9504e-01, -2.2503e-01, -2.3614e-01,  ..., -5.0572e-01,
           1.2753e-01,  5.3633e-01],
         [-9.1450e-01, -2.1947e-01, -6.7010e-01,  ..., -1.9363e-01,
     


i=1210
Sanity Check!!

Start: 38720
End: 38752

Size of LastHidden Layer Size:torch.Size([32, 60, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.7900e-01, -1.4528e-01, -5.8352e-02,  ..., -5.9505e-01,
           1.6514e-01,  8.0405e-01],
         [-1.6432e+00,  1.0397e-01, -2.4718e-02,  ..., -4.8659e-01,
           3.1034e-01,  5.0479e-01],
         [-4.9842e-01, -1.0960e+00, -6.1840e-01,  ...,  4.9655e-01,
          -5.7086e-01,  9.0783e-01],
         ...,
         [-5.5970e-01, -1.6454e-01, -3.2302e-01,  ...,  1.6783e-01,
           2.2308e-01,  4.6994e-01],
         [-7.2556e-01, -3.2465e-01, -2.9545e-01,  ..., -1.2602e-01,
           2.6500e-01,  5.5395e-01],
         [-9.2896e-01, -2.3905e-01, -9.4758e-02,  ..., -2.7404e-01,
           9.8047e-02,  4.6502e-01]],

        [[-6.1633e-01,  2.8110e-01,  9.8834e-02,  ..., -5.0503e-01,
           4.2570e-01,  5.6265e-01],
         [-7.5517e-01,  5.1767e-01, -9.2886e-02,  ..., -4.6568e-02,
     


i=1240
Sanity Check!!

Start: 39680
End: 39712

Size of LastHidden Layer Size:torch.Size([32, 58, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.5309,  0.0994,  0.0623,  ..., -0.4465,  0.1659,  0.7863],
         [-0.3239,  0.1193, -0.1706,  ...,  0.0259,  0.6095, -0.1747],
         [-0.4806, -0.4444,  0.0306,  ..., -0.0948, -0.1130,  0.5717],
         ...,
         [-0.4634, -0.1067,  0.0450,  ..., -0.2027,  0.3477,  0.3980],
         [-0.3627, -0.0094,  0.3430,  ..., -0.3089,  0.1777,  0.3333],
         [-0.4382, -0.1341,  0.0392,  ..., -0.1663,  0.3693,  0.3099]],

        [[-0.6645,  0.0185, -0.1486,  ..., -0.4050,  0.1838,  0.6886],
         [-0.9582, -0.2339,  0.3442,  ..., -0.2926,  0.3869,  0.0857],
         [-0.4346, -0.7242,  0.0621,  ...,  0.0366, -0.3080,  0.3903],
         ...,
         [-0.6719, -0.2155,  0.1093,  ..., -0.3974,  0.0909,  0.3649],
         [-0.6436, -0.2920,  0.4110,  ..., -0.2529,  0.0763,  0.1257],
         [-0.


i=1270
Sanity Check!!

Start: 40640
End: 40672

Size of LastHidden Layer Size:torch.Size([32, 66, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-8.3901e-01, -6.0832e-02,  1.8417e-02,  ..., -4.5740e-01,
           2.4223e-01,  8.3075e-01],
         [-7.2517e-01, -1.6663e-01, -1.4014e-01,  ..., -3.0285e-01,
          -1.8649e-02, -5.7735e-01],
         [-9.9798e-01, -1.4595e+00, -1.3537e-01,  ...,  2.2926e-01,
          -4.7153e-01,  1.0730e-01],
         ...,
         [-4.0850e-01, -4.6639e-01,  2.6258e-01,  ...,  1.3646e-01,
           4.9213e-01,  2.2985e-01],
         [-6.2139e-01, -9.4177e-02,  2.5886e-01,  ..., -9.3729e-02,
           4.3386e-01,  5.3313e-01],
         [-5.7484e-01, -1.1733e-01,  3.5234e-01,  ..., -9.2358e-02,
           3.8620e-01,  5.5251e-01]],

        [[-6.7364e-01, -6.6553e-02, -2.0845e-01,  ..., -4.1093e-01,
           3.3889e-01,  6.4099e-01],
         [-3.9605e-01,  3.1210e-02, -8.5474e-01,  ..., -3.2403e-01,
     


i=1300
Sanity Check!!

Start: 41600
End: 41632

Size of LastHidden Layer Size:torch.Size([32, 184, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.8472, -0.1626,  0.1265,  ..., -0.1283,  0.3911,  0.6085],
         [-0.6226, -0.2218,  0.1892,  ..., -0.3327,  0.9989,  0.0369],
         [-0.0273,  0.1495,  0.4986,  ..., -0.7340,  0.0092,  0.2241],
         ...,
         [-0.1173, -0.0028,  0.2514,  ..., -0.1447,  0.3594,  0.0869],
         [ 0.0015,  0.0121,  0.2631,  ...,  0.0397,  0.1686, -0.1575],
         [ 0.0057, -0.0045,  0.3149,  ..., -0.0166,  0.2332, -0.1008]],

        [[-0.6123,  0.1495,  0.0337,  ..., -0.5303,  0.4206,  0.6966],
         [-0.9329, -0.1379,  0.1983,  ..., -0.0830,  0.6204, -0.2423],
         [-0.1117, -0.5847,  0.0804,  ..., -0.2989,  0.5042, -0.0709],
         ...,
         [-0.2578, -0.3840,  0.0658,  ..., -0.1035,  0.1299,  0.1408],
         [-0.3566, -0.1160, -0.0991,  ..., -0.2444,  0.3354,  0.2841],
         [-0


i=1330
Sanity Check!!

Start: 42560
End: 42592

Size of LastHidden Layer Size:torch.Size([32, 65, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-8.1769e-01, -2.4100e-02, -1.1036e-01,  ..., -4.6146e-01,
           3.7456e-01,  7.6575e-01],
         [-1.7605e+00, -1.1707e-01,  2.0399e-01,  ..., -3.4710e-01,
           4.5335e-01, -4.3018e-03],
         [-1.5707e+00, -1.4670e+00,  5.0378e-02,  ...,  4.2141e-01,
          -4.0947e-01,  4.9433e-01],
         ...,
         [-7.5631e-01, -2.5963e-01, -5.2189e-02,  ..., -5.7213e-02,
           6.0084e-01,  4.4471e-01],
         [-4.8362e-01, -5.4934e-01,  1.8307e-01,  ..., -1.3486e-01,
           3.5535e-01,  4.8424e-01],
         [-7.4608e-01, -2.9791e-01, -1.8422e-01,  ...,  1.3049e-01,
           5.4326e-01,  5.0409e-01]],

        [[-9.0040e-01,  4.0764e-02, -1.5261e-01,  ..., -2.9888e-01,
           3.3891e-01,  7.8114e-01],
         [-4.8351e-01, -1.3271e-01, -2.3243e-01,  ..., -3.0948e-02,
     


i=1350
Sanity Check!!

Start: 43200
End: 43232

Size of LastHidden Layer Size:torch.Size([32, 48, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.9502, -0.1186, -0.0228,  ..., -0.3081,  0.1519,  0.6586],
         [-0.1211, -0.1237, -0.4732,  ...,  0.1800,  0.6144, -0.1691],
         [-0.4730, -0.8354, -0.4105,  ...,  0.2314, -0.6248,  0.6081],
         ...,
         [-0.9558,  0.1215,  0.1497,  ..., -0.1768,  0.0671,  0.1695],
         [-0.9040,  0.0869, -0.0066,  ..., -0.0151,  0.1313,  0.1336],
         [-0.9544, -0.0348,  0.1387,  ..., -0.2219, -0.1716,  0.2332]],

        [[-0.8119, -0.3034, -0.2660,  ..., -0.3256,  0.3329,  0.8650],
         [-0.4134,  0.0476, -0.7934,  ...,  0.1725,  0.8596, -0.4426],
         [-0.5810, -0.4436, -0.6572,  ..., -0.4978, -0.1265,  0.4752],
         ...,
         [-0.5896, -0.1930,  0.0610,  ..., -0.1783,  0.4456,  0.6089],
         [-0.5414, -0.1767, -0.0441,  ..., -0.1781,  0.4690,  0.5386],
         [-0.


i=1380
Sanity Check!!

Start: 44160
End: 44192

Size of LastHidden Layer Size:torch.Size([32, 67, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.3899,  0.0575, -0.0777,  ..., -0.6237,  0.1208,  0.5176],
         [-0.2992, -0.0710, -0.2025,  ..., -0.2092,  0.5112, -0.1388],
         [-0.4939,  0.3492,  0.5231,  ..., -0.5630, -0.0021,  0.3522],
         ...,
         [ 0.1145,  0.3117, -0.0842,  ..., -0.3492,  0.0194,  0.0784],
         [ 0.0497,  0.3290, -0.2110,  ..., -0.0217,  0.0252, -0.0545],
         [ 0.1598,  0.1125, -0.1643,  ..., -0.1805,  0.2008, -0.0968]],

        [[-0.3367,  0.2711,  0.3976,  ..., -0.6181,  0.3119,  0.7163],
         [-0.8418,  0.0609,  0.6143,  ..., -0.1507,  0.6068, -0.0937],
         [-0.3925, -0.8737,  0.5662,  ..., -0.2401,  0.6935,  0.0164],
         ...,
         [-0.9591, -0.2248, -0.2311,  ..., -0.1497,  0.4747,  0.1418],
         [-0.8833, -0.2674, -0.1900,  ..., -0.0263,  0.5026,  0.1306],
         [-0.


i=1410
Sanity Check!!

Start: 45120
End: 45152

Size of LastHidden Layer Size:torch.Size([32, 80, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.4878, -0.1365, -0.2895,  ..., -0.3928, -0.2632,  0.8450],
         [ 0.4771,  0.2374, -0.4225,  ..., -0.1489,  0.2404,  0.6230],
         [-0.3239,  0.1567, -0.4438,  ...,  0.2751, -0.2191,  0.3192],
         ...,
         [-0.4353, -0.2596,  0.3547,  ..., -0.0790, -0.3010,  0.3464],
         [-0.4115, -0.2705,  0.2425,  ..., -0.1265, -0.3777,  0.3512],
         [-0.4799, -0.3115,  0.2683,  ..., -0.1532, -0.2933,  0.3443]],

        [[-0.6940, -0.2543,  0.1019,  ..., -0.8024,  0.1793,  0.8662],
         [-0.1063, -0.1262, -0.4528,  ...,  0.2892,  0.1462,  0.0617],
         [ 0.2388, -0.7937, -0.6016,  ..., -0.0921, -1.0666,  1.0366],
         ...,
         [-0.3700,  0.0408, -0.0670,  ...,  0.2296, -0.0202,  0.4029],
         [-0.6082, -0.1639, -0.0699,  ..., -0.0772,  0.0569,  0.5332],
         [-0.


i=1440
Sanity Check!!

Start: 46080
End: 46112

Size of LastHidden Layer Size:torch.Size([32, 66, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.8896, -0.1210, -0.0334,  ..., -0.2704,  0.5653,  0.7527],
         [-0.4286, -0.4369, -0.2711,  ..., -0.1129,  0.2861, -0.1658],
         [-1.0351, -0.9451, -0.0865,  ..., -0.6983, -0.0471,  0.6698],
         ...,
         [-0.3474, -0.4825,  0.3376,  ..., -0.0865,  0.0145,  0.2103],
         [-0.3441, -0.6164,  0.2149,  ..., -0.0308,  0.0744,  0.2240],
         [-0.2769, -0.4041,  0.2555,  ..., -0.2151,  0.2434,  0.0479]],

        [[-0.9804, -0.0251, -0.2452,  ..., -0.3338,  0.0297,  0.6775],
         [-0.9957, -0.2679,  0.2128,  ..., -0.0936,  0.2118, -0.2444],
         [-0.3036, -0.8568, -0.4523,  ...,  0.4171, -0.4421, -0.7979],
         ...,
         [-0.5796, -0.2345,  0.2397,  ..., -0.1600,  0.3345,  0.2216],
         [-0.8947, -0.3602, -0.4613,  ...,  0.1266,  0.3522,  0.4717],
         [-0.


i=1470
Sanity Check!!

Start: 47040
End: 47072

Size of LastHidden Layer Size:torch.Size([32, 68, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.9023, -0.3609,  0.4113,  ..., -0.3136,  0.3842,  0.8609],
         [-0.0632, -0.2997, -0.3410,  ...,  0.5007,  0.4058, -0.0253],
         [-0.4875, -1.1390,  0.1259,  ...,  0.4543, -0.3633,  0.3868],
         ...,
         [-0.4801, -0.7315, -0.2998,  ...,  0.1749,  0.4084,  0.2788],
         [-0.4598, -0.1573,  0.9284,  ...,  0.0529,  0.3788,  0.2967],
         [-0.2415, -0.1758,  0.5732,  ..., -0.2093,  0.2344,  0.4817]],

        [[-0.9271, -0.1745, -0.2706,  ..., -0.4002,  0.1054,  0.9272],
         [-0.6835, -0.2255,  0.2709,  ..., -0.1171,  0.5527, -0.0438],
         [-0.4613, -0.3378, -0.2614,  ..., -0.5849,  0.3018,  0.5320],
         ...,
         [-1.1153, -0.2452,  0.0578,  ..., -0.1025,  0.1587,  0.6801],
         [-0.9718, -0.2908,  0.0039,  ..., -0.1813,  0.0648,  0.6052],
         [-1.


i=1500
Sanity Check!!

Start: 48000
End: 48032

Size of LastHidden Layer Size:torch.Size([32, 62, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-1.3304e+00, -3.1090e-01, -8.6710e-02,  ..., -1.6120e-01,
           1.3672e-01,  6.8795e-01],
         [-6.6246e-03, -1.1119e-01, -5.6143e-01,  ...,  3.0563e-01,
           7.2244e-01, -8.9547e-02],
         [-5.7751e-01, -8.5772e-01, -4.3705e-01,  ...,  2.4839e-01,
          -1.6567e-01,  6.7185e-01],
         ...,
         [-8.4674e-01, -4.3207e-01, -2.0864e-01,  ...,  2.1646e-01,
          -1.6898e-01, -1.7817e-01],
         [-7.9565e-01, -3.1957e-01,  1.2268e-01,  ...,  1.5966e-02,
          -3.3872e-01,  4.1784e-02],
         [-2.6317e-01, -1.0160e-01,  2.3703e-01,  ..., -6.0204e-02,
          -2.9165e-01,  2.6999e-02]],

        [[-6.4526e-01,  3.6773e-02, -2.9103e-01,  ..., -5.8741e-01,
          -1.1723e-01,  6.0012e-01],
         [-3.5680e-01, -3.5518e-01, -2.9456e-01,  ...,  8.0911e-03,
     


i=1520
Sanity Check!!

Start: 48640
End: 48672

Size of LastHidden Layer Size:torch.Size([32, 53, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.7845,  0.1314,  0.0563,  ..., -0.3783,  0.2487,  0.6989],
         [-0.4444, -0.0937, -0.1496,  ..., -0.3337,  0.1358, -0.4323],
         [-0.7460, -0.7664, -0.0636,  ...,  0.2510, -0.5522,  0.1325],
         ...,
         [-0.8396,  0.0029,  0.1519,  ..., -0.0416,  0.4944,  0.5522],
         [-0.6919, -0.0839,  0.2482,  ..., -0.0909,  0.2685,  0.3452],
         [-0.8991, -0.0521,  0.1087,  ..., -0.0956,  0.4462,  0.5078]],

        [[-0.5765,  0.1146,  0.0631,  ..., -0.3240,  0.1307,  0.5008],
         [-0.2299, -0.2371, -0.2588,  ...,  0.3653, -0.1610, -0.6195],
         [-1.1053, -0.9678, -0.5569,  ...,  0.5347, -0.6263,  0.2177],
         ...,
         [-0.7081, -0.1309,  0.0498,  ...,  0.1993,  0.3732,  0.3184],
         [-0.7081, -0.1447, -0.0085,  ...,  0.1072,  0.3730,  0.3468],
         [-0.


i=1550
Sanity Check!!

Start: 49600
End: 49632

Size of LastHidden Layer Size:torch.Size([32, 50, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-3.9458e-01,  6.0687e-02,  2.1428e-01,  ..., -3.4807e-01,
           1.8222e-01,  8.0902e-01],
         [-7.1753e-01,  3.3801e-03, -3.1765e-01,  ...,  9.3013e-02,
           3.9178e-01, -4.2832e-01],
         [-7.9315e-01, -6.7195e-01, -5.1592e-02,  ...,  1.2888e-01,
          -1.0215e-01,  4.0435e-01],
         ...,
         [-7.3621e-02, -1.3215e-01,  5.6388e-01,  ..., -3.1404e-01,
           3.0490e-01,  2.9868e-01],
         [-4.6773e-01, -2.6727e-01,  1.2141e-01,  ..., -1.0618e-01,
           6.1350e-01,  4.0665e-01],
         [-5.5966e-01, -1.8747e-01, -4.1320e-02,  ...,  3.8120e-03,
           6.0536e-01,  3.3958e-01]],

        [[-8.6677e-01, -1.2217e-01, -2.4839e-01,  ..., -6.3889e-01,
           1.6794e-01,  6.4635e-01],
         [-6.8030e-01, -3.2511e-01,  4.2940e-01,  ...,  2.1264e-02,
     


i=1580
Sanity Check!!

Start: 50560
End: 50592

Size of LastHidden Layer Size:torch.Size([32, 88, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-8.5145e-01, -2.9317e-01, -1.1406e-01,  ..., -2.7876e-01,
          -1.0920e-01,  6.0793e-01],
         [-3.1402e-01, -6.9358e-02, -1.9470e-01,  ...,  6.8338e-02,
          -2.0482e-01, -4.8810e-01],
         [-7.5292e-01, -9.8899e-01, -3.4432e-01,  ...,  5.6604e-01,
          -9.3493e-01,  4.8376e-02],
         ...,
         [-7.5234e-01, -2.2502e-01, -2.4000e-01,  ..., -1.9604e-02,
          -7.5711e-02,  1.0091e-01],
         [-7.4455e-01, -3.7436e-01,  7.9506e-03,  ..., -2.2577e-01,
          -1.7191e-01,  2.6441e-01],
         [-7.3119e-01, -1.8534e-01, -1.7892e-01,  ...,  4.3876e-02,
          -5.1311e-02,  9.5469e-02]],

        [[-4.0878e-01, -2.0288e-01, -1.2428e-01,  ..., -5.2065e-01,
           2.2601e-01,  5.0037e-01],
         [-6.1660e-01, -1.8791e-01, -1.4672e-01,  ..., -3.4310e-01,
     


i=1610
Sanity Check!!

Start: 51520
End: 51552

Size of LastHidden Layer Size:torch.Size([32, 57, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-4.6948e-01,  2.6907e-02, -8.1429e-02,  ..., -3.5378e-01,
           3.8732e-01,  9.3099e-01],
         [-1.4911e-01, -2.6814e-01, -7.0940e-01,  ...,  3.3446e-01,
           2.9993e-01, -4.3096e-01],
         [-3.8518e-01, -3.4466e-01, -2.6478e-01,  ..., -2.8228e-02,
          -2.2258e-01,  2.5097e-01],
         ...,
         [-2.5157e-01,  3.4394e-02,  2.3011e-01,  ..., -9.8209e-02,
           3.4013e-01,  3.9683e-01],
         [-2.8799e-01, -4.4786e-01, -3.0863e-01,  ...,  1.1505e-01,
           5.3096e-01,  3.1582e-01],
         [-1.6104e-01,  2.3614e-01,  1.4101e-01,  ..., -1.9540e-01,
           3.7636e-01,  5.4590e-01]],

        [[-9.2697e-01,  5.1199e-02, -1.3728e-01,  ..., -4.4171e-01,
           3.8184e-01,  5.7334e-01],
         [-4.6086e-01, -1.9561e-01, -4.2939e-01,  ...,  3.1893e-01,
     


i=1630
Sanity Check!!

Start: 52160
End: 52192

Size of LastHidden Layer Size:torch.Size([32, 82, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-3.7820e-01, -3.8736e-01, -2.4625e-01,  ..., -4.6982e-01,
           1.7771e-02,  7.8919e-01],
         [-2.8635e-01, -4.1686e-01, -1.7904e-01,  ...,  5.4352e-01,
          -2.4891e-02, -1.1368e-01],
         [-3.7061e-01, -8.9633e-01,  1.7913e-01,  ...,  3.7755e-01,
          -6.6888e-01,  4.2602e-01],
         ...,
         [-4.2918e-02, -1.3481e-01,  1.5751e-01,  ..., -1.9536e-01,
          -3.6656e-01,  2.1199e-01],
         [-5.1297e-03, -2.8631e-01,  3.0772e-01,  ..., -1.4717e-01,
          -3.2365e-01,  5.4629e-02],
         [-9.0575e-02, -5.0921e-01, -4.0664e-01,  ...,  5.9931e-03,
          -1.0875e-01,  1.2801e-01]],

        [[-6.3423e-01, -2.0048e-01,  4.8541e-02,  ..., -5.4413e-01,
           3.6800e-01,  6.5813e-01],
         [-3.4322e-01, -7.0865e-02, -5.3570e-01,  ..., -5.4789e-02,
     


i=1660
Sanity Check!!

Start: 53120
End: 53152

Size of LastHidden Layer Size:torch.Size([32, 60, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-4.3045e-01, -1.3224e-01,  1.5999e-02,  ...,  8.1874e-02,
           9.9387e-02,  6.8630e-01],
         [ 3.3749e-01,  2.7138e-01, -6.0015e-01,  ...,  1.2368e-01,
           7.9495e-01,  1.9569e-01],
         [-5.7073e-01, -2.2291e-01, -5.8471e-01,  ...,  7.7933e-02,
           2.7576e-01,  3.2575e-02],
         ...,
         [-4.9287e-01, -1.8175e-01, -3.6304e-01,  ...,  4.0978e-01,
           2.2334e-01,  1.4097e-01],
         [-1.7148e-01, -1.7890e-01, -5.2890e-02,  ..., -5.1425e-02,
           1.2072e-01,  4.3270e-01],
         [-5.0337e-01, -5.9548e-02, -3.4959e-01,  ...,  4.1006e-01,
           2.7307e-01,  1.5290e-01]],

        [[-5.8260e-01,  4.0926e-01,  8.8262e-02,  ..., -2.3078e-01,
          -2.0260e-02,  6.1611e-01],
         [-4.5830e-01, -6.7089e-04, -1.2193e-01,  ...,  2.9247e-01,
     


i=1690
Sanity Check!!

Start: 54080
End: 54112

Size of LastHidden Layer Size:torch.Size([32, 63, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.5406,  0.1907,  0.0799,  ..., -0.1729,  0.2667,  0.6248],
         [ 0.0793,  0.1356, -0.3039,  ...,  0.5282,  0.7393, -0.1243],
         [-0.2988, -0.6647, -0.0056,  ...,  0.4174, -0.3269,  0.6095],
         ...,
         [-0.7627,  0.0322, -0.4370,  ...,  0.0810,  0.3819,  0.1935],
         [-0.5483,  0.1538,  0.0884,  ..., -0.0130,  0.1160,  0.0330],
         [-0.6646,  0.1582, -0.0832,  ...,  0.0304,  0.3399,  0.1292]],

        [[-0.6381, -0.4934, -0.0599,  ..., -0.6137,  0.0619,  0.6873],
         [-1.1445,  0.0095, -0.2501,  ..., -0.2385,  1.2918,  0.0570],
         [-0.4971, -1.3105, -0.0801,  ..., -0.2207, -0.3179,  0.5722],
         ...,
         [-0.4495, -0.8509,  0.5800,  ..., -0.0156,  0.0924,  0.3013],
         [-0.4419, -0.8851,  0.5872,  ..., -0.1594,  0.0085,  0.3560],
         [-0.


i=1720
Sanity Check!!

Start: 55040
End: 55072

Size of LastHidden Layer Size:torch.Size([32, 53, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.4273e-01,  9.8706e-02,  1.6834e-01,  ..., -4.1928e-01,
           7.8924e-02,  5.5039e-01],
         [-4.5838e-01,  2.9079e-02, -3.8853e-01,  ..., -3.1082e-02,
           2.4599e-01, -3.9891e-01],
         [-8.6715e-01, -2.4406e-01, -4.7569e-01,  ...,  5.7921e-02,
          -8.4018e-01,  2.1257e-01],
         ...,
         [-5.6851e-01, -5.4883e-02,  9.3904e-02,  ..., -8.1067e-02,
           2.5551e-01,  3.6979e-01],
         [-4.8647e-01, -2.9806e-02,  2.3521e-01,  ..., -2.4283e-01,
          -2.3816e-01,  2.5221e-01],
         [-6.2145e-01, -1.6952e-02,  1.8345e-01,  ..., -7.2951e-02,
           1.1519e-01,  2.5187e-01]],

        [[-3.1976e-01,  1.6316e-02,  5.4453e-02,  ..., -1.4589e-01,
           1.3574e-01,  5.3316e-01],
         [-2.9432e-01, -1.4468e-01, -4.4212e-01,  ...,  1.8994e-01,
     


i=1750
Sanity Check!!

Start: 56000
End: 56032

Size of LastHidden Layer Size:torch.Size([32, 107, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.5385e-01,  3.7598e-01,  8.0945e-02,  ..., -6.0115e-01,
           3.9073e-01,  6.4526e-01],
         [-2.7967e-01,  1.0394e-01, -2.5296e-01,  ...,  3.1094e-01,
           6.3427e-01, -8.7004e-02],
         [-3.3243e-01, -1.5565e-01,  5.6576e-02,  ..., -1.7243e-01,
          -1.8934e-01,  7.7372e-01],
         ...,
         [-4.0225e-01,  1.5062e-01,  3.6517e-01,  ..., -2.0617e-01,
           1.4784e-01,  3.4815e-01],
         [-5.1598e-01, -9.0487e-02,  3.9711e-02,  ...,  5.2704e-02,
           5.1585e-01,  5.5409e-01],
         [-5.9812e-01, -9.3608e-02,  1.1047e-02,  ..., -5.7418e-02,
           4.5501e-01,  4.5921e-01]],

        [[-6.4023e-01,  7.2433e-03,  1.6474e-02,  ..., -5.3677e-01,
           1.5203e-01,  5.6179e-01],
         [-1.0615e+00, -2.3915e-01,  5.1171e-02,  ..., -2.1272e-01,
    

In [1]:
# Project-local loader imported from the `utils.utils` module (defined outside this notebook).
# NOTE(review): this import lives mid-notebook; consider moving it to the top import cell.
from utils.utils import load_data_set
# Load the 'train' split. The recorded output for this cell reports
# "Loaded 56355 queries and 18585 tables"; presumably `train_sql` holds the queries and
# `train_table` the tables -- TODO confirm against `load_data_set` in `utils.utils`.
train_sql, train_table = load_data_set('train')

Loading train dataset
Loaded 56355 queries and 18585 tables


In [2]:
# Project-local check for the word-embedding module, imported from `model.wordEmbedding`
# (defined outside this notebook).
from model.wordEmbedding import test_wordembed_module

# Runs the check on `train_sql` / `train_table`, which must already exist in the kernel:
# they are created by the cell that calls `load_data_set('train')`, so run that cell first.
# Per the recorded output, this run initializes a BertModel from the `bert-base-uncased`
# checkpoint and prints a "Sanity Check!!" report for some batch indices (i=0, 30, 60, ...),
# each with Start/End offsets, the last-hidden-layer size, the pooler-output size and a
# truncated tensor dump.
# NOTE(review): the printed log is very large; consider wrapping the cell in `%%capture`
# or reducing the printing inside the helper before sharing the notebook.
test_wordembed_module(train_sql, train_table)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



i=0
Sanity Check!!

Start: 0
End: 32

Size of LastHidden Layer Size:torch.Size([32, 60, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.7360, -0.3183, -0.1671,  ..., -0.0541,  0.3680,  0.6488],
         [ 0.1146,  0.1187, -0.3189,  ...,  0.4162,  0.3891,  0.2190],
         [-0.8834, -0.8451,  0.4053,  ...,  0.1687, -0.0132,  0.3803],
         ...,
         [ 0.2565, -0.5440,  0.3062,  ...,  0.2203, -0.1344, -0.6219],
         [-0.0052,  0.3801,  0.2257,  ...,  0.0346, -0.6065,  0.1337],
         [ 0.8852,  0.0332, -0.2031,  ...,  0.2007, -0.5840, -0.3072]],

        [[-0.5216,  0.2069, -0.3126,  ..., -0.2208, -0.1085,  0.4717],
         [-0.5596,  0.3364, -0.4079,  ...,  0.3152,  0.2703, -0.0337],
         [-0.4433, -0.4492, -0.2297,  ...,  0.1253, -0.3978,  0.7244],
         ...,
         [-0.5026,  0.2304,  0.1200,  ...,  0.0251, -0.4091,  0.3194],
         [-0.6771, -0.0647,  0.2924,  ...,  0.2989, -0.4582,  0.5327],
         [-0.5909,  0.2


i=30
Sanity Check!!

Start: 960
End: 992

Size of LastHidden Layer Size:torch.Size([32, 60, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.3537, -0.4447,  0.0796,  ..., -0.6434,  0.1948,  0.8515],
         [-0.9852, -0.1915, -0.1561,  ..., -0.1698,  0.9630, -0.2320],
         [ 0.8798,  0.1429,  0.6082,  ..., -1.0878,  0.5556,  0.3765],
         ...,
         [-0.1552, -0.3071,  0.3599,  ..., -0.3503,  0.1202,  0.3466],
         [-0.3092, -0.4811,  0.2938,  ..., -0.2282,  0.2688,  0.3767],
         [ 0.0138, -0.3922,  0.3843,  ..., -0.2030,  0.1247,  0.4370]],

        [[-0.4347, -0.6686,  0.0100,  ..., -0.0589,  0.4466,  0.8045],
         [-0.5453, -1.2426, -0.6027,  ...,  0.2512, -0.0025, -0.6876],
         [-0.6054, -1.0845,  0.7297,  ..., -0.3916, -0.0925,  0.3934],
         ...,
         [-0.2433, -0.2602,  0.3958,  ...,  0.0854,  0.1304,  0.2709],
         [-0.3735, -0.4544,  0.4562,  ...,  0.0743,  0.3404,  0.3983],
         [-0.3145, 


i=60
Sanity Check!!

Start: 1920
End: 1952

Size of LastHidden Layer Size:torch.Size([32, 56, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.4186, -0.0572,  0.0276,  ...,  0.1255,  0.0612,  0.7230],
         [ 0.3679,  0.5419, -0.6098,  ...,  0.1080,  0.7428,  0.3347],
         [-0.6119,  0.4308, -0.9248,  ..., -0.0860,  0.3031,  0.1657],
         ...,
         [-0.5177, -0.1298, -0.5078,  ...,  0.4387,  0.2018,  0.2584],
         [-0.2327,  0.0586, -0.2301,  ..., -0.0572,  0.0420,  0.4678],
         [-0.2754, -0.0618, -0.3754,  ..., -0.0844,  0.0877,  0.5161]],

        [[-0.5108,  0.0236,  0.1160,  ..., -0.3224,  0.1250,  0.7706],
         [-0.5650,  0.1232, -0.3174,  ...,  0.5099,  0.2544, -0.2093],
         [-0.4858, -0.2105, -0.1697,  ...,  0.1047, -0.4189,  0.6615],
         ...,
         [-0.5259, -0.2331,  0.4895,  ..., -0.1473,  0.0127,  0.2249],
         [-0.5382, -0.2782,  0.4199,  ..., -0.0694,  0.0462,  0.1749],
         [-0.5001


i=90
Sanity Check!!

Start: 2880
End: 2912

Size of LastHidden Layer Size:torch.Size([32, 76, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.8624, -0.0187, -0.1579,  ..., -0.4395,  0.2138,  0.7921],
         [-0.0390,  0.2538, -0.2520,  ...,  0.2850,  0.4865,  0.3346],
         [-0.4315, -0.8027, -0.0808,  ..., -0.2743,  0.1257,  0.3221],
         ...,
         [-0.9073, -0.1401,  0.1069,  ...,  0.1155,  0.4105,  0.3667],
         [-0.3261,  0.1077, -0.0609,  ...,  0.0315, -0.2177,  0.5651],
         [-0.8243, -0.1224,  0.0633,  ...,  0.0428,  0.3990,  0.4129]],

        [[-0.7829, -0.0038, -0.1921,  ..., -0.5213,  0.0922,  0.8662],
         [ 0.1274, -0.1997, -0.2169,  ...,  0.1693,  0.4594, -0.1525],
         [-0.1129, -0.3617, -0.4215,  ..., -0.2722, -0.8605,  0.5087],
         ...,
         [-0.3882,  0.1002,  0.2162,  ..., -0.2754, -0.2753,  0.3368],
         [-0.3701,  0.1535,  0.1294,  ..., -0.1883, -0.2319,  0.3708],
         [-0.9301


i=120
Sanity Check!!

Start: 3840
End: 3872

Size of LastHidden Layer Size:torch.Size([32, 60, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.3948, -0.5427,  0.1586,  ..., -0.5660,  0.2053,  0.8219],
         [-1.1501, -0.3278, -0.1114,  ..., -0.3997,  0.6982, -0.3293],
         [ 0.7502,  0.0768,  0.1487,  ..., -0.7399, -0.5405,  0.0103],
         ...,
         [-0.5051, -0.6034,  0.2577,  ..., -0.0563,  0.4751,  0.1759],
         [-0.2960, -0.4927,  0.3569,  ..., -0.2643,  0.3156,  0.5073],
         [-0.0766, -0.4147,  0.4691,  ..., -0.3306,  0.2041,  0.4186]],

        [[-0.7253, -0.0551,  0.1401,  ..., -0.3754,  0.0924,  0.6613],
         [ 0.0786,  0.0625, -0.4453,  ...,  0.2793,  0.6067, -0.2657],
         [-0.3025, -0.6640, -0.2896,  ...,  0.3652, -0.5787,  0.4859],
         ...,
         [-0.8421,  0.0559, -0.0869,  ...,  0.0471,  0.3065,  0.0776],
         [-0.5852,  0.1226,  0.2084,  ..., -0.1577, -0.1306,  0.2824],
         [-0.764


i=150
Sanity Check!!

Start: 4800
End: 4832

Size of LastHidden Layer Size:torch.Size([32, 105, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-3.4253e-01, -8.6975e-02,  3.6155e-01,  ..., -2.2380e-01,
           4.0058e-01,  7.5443e-01],
         [-4.0917e-01,  4.0057e-02, -3.0581e-01,  ...,  2.6498e-01,
           7.3313e-01, -1.6402e-01],
         [-6.2069e-01, -7.6083e-01, -5.7245e-01,  ...,  4.4086e-01,
          -1.2565e-02,  3.5093e-01],
         ...,
         [-3.0202e-01, -2.1803e-01,  5.7064e-01,  ...,  2.7582e-02,
           2.9641e-01,  2.7669e-02],
         [-2.2543e-01, -1.1438e-01,  3.3806e-01,  ..., -1.1727e-01,
           5.8577e-01,  8.6509e-02],
         [-5.6543e-01, -3.5831e-01,  8.0651e-03,  ...,  2.7779e-01,
           7.7613e-01, -1.7268e-02]],

        [[-5.7425e-01, -4.1227e-01,  4.0035e-01,  ..., -6.6335e-02,
           9.9156e-01,  3.8805e-01],
         [-1.3457e-01, -6.3196e-01, -1.9743e-01,  ...,  4.2630e-01,
       


i=180
Sanity Check!!

Start: 5760
End: 5792

Size of LastHidden Layer Size:torch.Size([32, 55, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.4363, -0.0332, -0.0328,  ..., -0.5426,  0.2267,  0.9964],
         [-0.4078,  0.0218, -0.4376,  ...,  0.1637,  0.3922,  0.0331],
         [-0.5302, -0.4966, -0.2366,  ..., -0.1519, -0.1707,  0.3473],
         ...,
         [ 0.6975,  0.2356, -0.3238,  ...,  0.0394, -0.6425, -0.3234],
         [-0.6081,  0.0951, -0.1261,  ..., -0.6081,  0.1326,  0.1859],
         [ 0.7041,  0.2305, -0.3287,  ...,  0.0363, -0.6333, -0.3266]],

        [[-0.7087, -0.0854, -0.2203,  ..., -0.5064, -0.2378,  0.7105],
         [-0.3732, -0.3434, -0.2664,  ...,  0.4716, -0.3841, -0.3570],
         [ 0.0086, -0.5215, -0.3138,  ..., -0.1640, -0.8351,  0.8068],
         ...,
         [-0.6661, -0.2615,  0.2164,  ...,  0.1655, -0.1906,  0.3871],
         [-0.6947, -0.3279,  0.1532,  ...,  0.0583, -0.1989,  0.4291],
         [-0.455


i=210
Sanity Check!!

Start: 6720
End: 6752

Size of LastHidden Layer Size:torch.Size([32, 67, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.5145,  0.1573,  0.0535,  ..., -0.2117,  0.0764,  0.4801],
         [-0.0304,  0.2422,  0.3203,  ...,  0.0713,  0.6923,  0.3812],
         [-0.3512, -0.1021, -0.0320,  ..., -0.0925, -0.3584,  0.6901],
         ...,
         [-0.8079,  0.0408,  0.1762,  ...,  0.1758,  0.3430,  0.6220],
         [-0.9688, -0.0300,  0.1934,  ...,  0.0635,  0.3545,  0.6855],
         [-0.8163,  0.1393,  0.3476,  ...,  0.2252,  0.3642,  0.7053]],

        [[-0.7769, -0.3943, -0.1693,  ..., -0.3609,  0.3050,  0.6691],
         [-0.7632, -0.0372, -0.7481,  ...,  0.0511,  0.2125,  1.0245],
         [-0.8173, -0.3167, -0.8009,  ...,  0.5744, -0.2053,  0.7937],
         ...,
         [-0.9817, -0.1895,  0.1847,  ..., -0.3031,  0.4132,  0.5537],
         [-0.8962, -0.0496,  0.1055,  ..., -0.0879,  0.6232,  0.5733],
         [-0.435


i=240
Sanity Check!!

Start: 7680
End: 7712

Size of LastHidden Layer Size:torch.Size([32, 88, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.1707e-01, -5.1129e-02,  3.5682e-01,  ..., -2.6782e-01,
           1.8480e-01,  7.0959e-01],
         [-4.7106e-01,  2.0449e-02, -2.9024e-01,  ...,  3.8921e-01,
           4.6555e-01, -1.2668e-01],
         [-7.9470e-01, -7.4755e-01, -2.9854e-01,  ...,  5.4106e-01,
          -2.0614e-01,  5.8338e-01],
         ...,
         [-4.9607e-01,  8.0248e-03,  1.4669e-01,  ..., -2.3045e-01,
           4.9948e-01,  1.3716e-01],
         [-3.2579e-01, -1.7671e-02,  1.8631e-01,  ..., -1.6404e-01,
           2.8632e-02,  1.0202e-01],
         [-3.6871e-01, -1.8099e-01,  1.3341e-01,  ..., -5.1001e-02,
           3.8075e-01,  4.6004e-02]],

        [[-2.5108e-01,  1.1162e-01, -7.3489e-03,  ..., -2.2489e-01,
           2.2215e-01,  3.5225e-01],
         [-6.9244e-01, -2.7181e-01, -4.0518e-01,  ...,  1.2016e-03,
        


i=270
Sanity Check!!

Start: 8640
End: 8672

Size of LastHidden Layer Size:torch.Size([32, 91, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-7.4658e-01, -8.9169e-02,  1.5627e-02,  ..., -1.6727e-01,
           1.5573e-01,  6.6239e-01],
         [-3.2453e-01,  1.9813e-01, -4.8500e-01,  ..., -9.8879e-02,
           4.2817e-01,  1.5102e-02],
         [-4.9896e-01, -1.3785e-01,  3.5641e-01,  ..., -7.9018e-02,
           1.1855e-01, -6.6620e-02],
         ...,
         [-2.0610e-01,  2.4267e-01,  2.8147e-01,  ...,  3.9738e-02,
           3.9852e-01,  1.5022e-01],
         [-2.8501e-01, -3.8770e-02,  1.7191e-01,  ...,  7.8340e-02,
           3.1193e-01, -1.7292e-01],
         [-2.9828e-01,  9.0853e-02,  2.3115e-01,  ...,  1.5333e-01,
           2.7582e-01,  4.2379e-02]],

        [[-7.1974e-01, -3.1592e-02,  5.0772e-02,  ..., -1.0127e-01,
           1.9937e-01,  5.0063e-01],
         [ 2.3679e-01,  4.4096e-01, -3.2389e-01,  ..., -9.9903e-02,
        


i=300
Sanity Check!!

Start: 9600
End: 9632

Size of LastHidden Layer Size:torch.Size([32, 73, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.3685,  0.1023, -0.1839,  ..., -0.4142,  0.3850,  0.7769],
         [-0.0167, -0.3020, -0.5505,  ...,  0.2999,  0.3360, -0.2119],
         [-0.1345, -0.5262, -0.1045,  ..., -0.5378,  0.1235,  0.9993],
         ...,
         [-0.6214, -0.4552, -0.1632,  ..., -0.0301,  0.4347,  0.4931],
         [-0.5699, -0.5243, -0.3519,  ...,  0.1020,  0.3311,  0.3272],
         [-0.2018,  0.0385,  0.0798,  ..., -0.2344,  0.2813,  0.4695]],

        [[-0.6822, -0.5049, -0.2099,  ..., -0.1331, -0.2372,  0.8407],
         [ 0.3383,  0.3539, -0.4475,  ..., -0.1557,  0.5274,  0.7261],
         [-1.4048, -0.3300, -0.4731,  ...,  0.2106, -0.4514,  0.0940],
         ...,
         [-0.6597, -0.4524, -0.2481,  ...,  0.1178, -0.1335,  0.2183],
         [-0.6991, -0.3189, -0.2325,  ...,  0.0780, -0.1264,  0.1938],
         [-0.716


i=330
Sanity Check!!

Start: 10560
End: 10592

Size of LastHidden Layer Size:torch.Size([32, 78, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.4689e-01, -2.2892e-01, -2.0777e-02,  ..., -3.7563e-01,
          -2.4517e-01,  7.1734e-01],
         [ 2.2723e-01,  2.8927e-01,  3.7502e-01,  ..., -6.1775e-01,
          -3.1285e-02,  3.0190e-01],
         [-9.7581e-01, -4.5677e-01,  3.7328e-02,  ...,  5.5114e-02,
          -3.1954e-01, -1.8019e-01],
         ...,
         [-6.0512e-01,  5.3864e-02, -1.3480e-01,  ..., -2.3965e-02,
          -2.4135e-01,  2.0487e-01],
         [-6.5372e-01, -9.7695e-02,  2.2858e-01,  ..., -9.1234e-03,
          -9.0559e-01,  2.3000e-01],
         [-6.3993e-01,  1.0021e-01, -7.5812e-03,  ..., -1.4195e-01,
          -3.4213e-01,  3.7010e-01]],

        [[-3.8393e-01, -1.3259e-01, -5.1040e-02,  ..., -6.0035e-01,
           2.2477e-01,  6.5014e-01],
         [-1.6727e-03,  1.5005e-01, -2.7926e-01,  ..., -5.2127e-04,
      


i=350
Sanity Check!!

Start: 11200
End: 11232

Size of LastHidden Layer Size:torch.Size([32, 62, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.8865e-01, -6.4229e-02, -3.3400e-01,  ..., -4.4815e-01,
           1.6199e-01,  5.6508e-01],
         [-7.2627e-02,  7.6649e-01, -3.3161e-01,  ...,  2.0412e-02,
           3.1197e-01,  8.1912e-01],
         [-1.0343e+00,  1.0374e-01, -2.2187e-01,  ..., -6.9695e-02,
          -3.0865e-01,  4.7673e-01],
         ...,
         [-5.6188e-01,  1.9403e-02, -2.2244e-01,  ..., -3.5854e-02,
           1.5855e-02,  3.9948e-01],
         [-6.8961e-01, -3.4538e-01,  4.7767e-02,  ..., -8.8189e-02,
          -6.2900e-02,  2.8185e-01],
         [-4.3203e-01, -7.0142e-02, -1.9778e-01,  ...,  8.2808e-02,
           1.1623e-01,  3.6614e-01]],

        [[-3.5241e-01, -1.0204e-01, -1.2148e-01,  ..., -3.1171e-01,
           2.3960e-01,  5.0130e-01],
         [-1.9378e-01, -3.9851e-01,  1.7858e-01,  ...,  1.5763e-01,
      


i=380
Sanity Check!!

Start: 12160
End: 12192

Size of LastHidden Layer Size:torch.Size([32, 84, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.7490e-01,  9.3026e-02,  8.7999e-02,  ..., -1.9884e-01,
           1.4974e-01,  6.6295e-01],
         [-8.7457e-01,  1.2701e-01, -1.9938e-02,  ...,  1.7423e-01,
           1.4250e-01, -3.9643e-01],
         [-6.2101e-01, -3.8663e-01,  1.5335e-01,  ..., -5.3572e-02,
          -3.1947e-01,  3.6939e-01],
         ...,
         [-7.5123e-01, -3.8037e-02,  6.1378e-03,  ...,  1.4114e-01,
           2.9152e-01,  2.9977e-01],
         [-5.5455e-01,  1.0724e-01,  3.8258e-01,  ..., -1.0300e-01,
          -3.4989e-01,  3.4894e-01],
         [-7.2601e-01, -3.2045e-02,  9.5943e-02,  ...,  1.0484e-01,
           1.9446e-01,  3.4714e-01]],

        [[-8.1336e-01,  5.5805e-03, -2.1696e-01,  ..., -4.2471e-01,
           7.7279e-03,  7.9507e-01],
         [-3.5190e-01,  7.9109e-02, -5.4653e-01,  ...,  5.4823e-01,
      


i=410
Sanity Check!!

Start: 13120
End: 13152

Size of LastHidden Layer Size:torch.Size([32, 51, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.1004e-01, -2.3838e-01,  1.8480e-02,  ..., -6.9152e-01,
           3.7035e-01,  6.4048e-01],
         [-1.2951e-01, -8.4925e-02, -4.4197e-01,  ..., -1.4863e-01,
           4.2659e-01,  3.2371e-01],
         [-3.6287e-01, -6.3949e-01,  4.7558e-01,  ...,  2.7888e-02,
           6.7924e-02, -2.8631e-01],
         ...,
         [-9.4184e-02, -5.5279e-01,  3.8446e-01,  ..., -6.9522e-03,
           1.2555e-01,  2.2916e-01],
         [-2.2193e-02, -2.5196e-01,  2.6493e-01,  ...,  8.3343e-02,
           2.9426e-01,  2.4809e-01],
         [-2.6984e-01, -3.7147e-01, -1.9434e-04,  ...,  4.8089e-02,
           4.4263e-01,  3.0640e-01]],

        [[-1.1435e+00, -2.0285e-01, -1.9592e-01,  ..., -3.6590e-01,
           2.7447e-01,  9.7083e-01],
         [-7.7828e-01,  1.4346e-01,  5.6957e-02,  ..., -2.0743e-02,
      


i=430
Sanity Check!!

Start: 13760
End: 13792

Size of LastHidden Layer Size:torch.Size([32, 55, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.9116e-01, -8.3254e-02, -1.5907e-01,  ..., -4.1790e-01,
           2.1644e-01,  7.0909e-01],
         [-4.2181e-01,  5.4307e-02, -1.1267e-01,  ..., -8.0470e-02,
           5.6047e-01,  3.8433e-01],
         [-8.6728e-01, -3.9440e-01,  4.7616e-01,  ..., -6.8217e-03,
           9.3223e-02, -2.2557e-01],
         ...,
         [-4.7549e-01, -2.7204e-01, -9.0004e-02,  ..., -5.3651e-02,
           2.3935e-01,  4.8654e-01],
         [-5.4101e-01, -2.0349e-01, -1.6630e-01,  ...,  1.3313e-01,
           3.2549e-01,  3.5619e-01],
         [-4.5742e-01, -1.1362e-01,  8.9052e-02,  ..., -8.1360e-02,
           1.2071e-01,  5.3570e-01]],

        [[-6.7815e-01,  2.8150e-02, -1.9139e-01,  ..., -3.3927e-01,
           2.3021e-01,  7.2686e-01],
         [ 6.2822e-01,  5.7001e-01, -2.5505e-01,  ..., -1.1474e-02,
      


i=450
Sanity Check!!

Start: 14400
End: 14432

Size of LastHidden Layer Size:torch.Size([32, 52, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.2934, -0.4237,  0.0899,  ..., -0.5639,  0.2857,  0.8358],
         [ 0.1178,  0.0471, -0.1684,  ..., -0.0018,  0.5432,  0.0887],
         [ 0.5006, -0.8013,  0.8435,  ..., -0.2393,  0.5072,  0.1207],
         ...,
         [ 1.2750, -0.9866,  1.0765,  ..., -0.6156,  1.1242,  0.3498],
         [-0.4114, -0.0090,  0.5199,  ..., -0.7006,  0.2373, -0.2317],
         [ 0.6939,  0.1513, -0.1636,  ..., -0.0476, -0.6430, -0.1974]],

        [[-0.7598,  0.0644,  0.0405,  ..., -0.5643,  0.5305,  0.7499],
         [-0.5251,  0.0150, -0.1476,  ..., -0.1987,  0.9172,  0.2710],
         [-0.7352, -0.6019,  0.1267,  ..., -0.3661,  0.1133,  0.7675],
         ...,
         [-0.4344,  0.0500,  0.2333,  ...,  0.3111,  0.8358,  0.0574],
         [-0.6685, -0.1161,  0.1875,  ..., -0.0039,  0.5802,  0.3203],
         [-0.4


i=480
Sanity Check!!

Start: 15360
End: 15392

Size of LastHidden Layer Size:torch.Size([32, 57, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.7208, -0.0117, -0.2320,  ..., -0.2778, -0.0112,  0.7107],
         [-1.4350, -0.5163, -1.5149,  ..., -0.0649,  0.1336,  0.6820],
         [ 0.2364, -0.1422,  0.1291,  ...,  0.0500, -0.8533,  0.6045],
         ...,
         [-0.8141,  0.0056, -0.5310,  ..., -0.2093, -0.1436,  0.3807],
         [-0.7006,  0.1726, -0.1895,  ..., -0.2836, -0.1335,  0.4645],
         [-0.7662,  0.1630, -0.2515,  ..., -0.3533, -0.1792,  0.4732]],

        [[-0.6522,  0.1041,  0.0326,  ..., -0.4520,  0.2143,  0.7205],
         [-0.7343,  0.1931,  0.4312,  ..., -0.3786,  0.5287,  0.2138],
         [-0.6108,  0.5184, -0.0807,  ...,  0.0059,  0.3140,  0.4423],
         ...,
         [-0.6813, -0.0727, -0.0937,  ..., -0.0376,  0.5101,  0.4804],
         [-0.4068,  0.1707,  0.0695,  ..., -0.1342,  0.1399,  0.4522],
         [-0.4


i=510
Sanity Check!!

Start: 16320
End: 16352

Size of LastHidden Layer Size:torch.Size([32, 67, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-1.2689e+00,  1.5835e-01,  3.8957e-02,  ..., -1.8984e-01,
          -6.2165e-02,  5.8116e-01],
         [ 1.5065e-01,  2.1353e-01, -5.6814e-01,  ..., -2.3314e-01,
           5.1182e-01,  1.2819e-01],
         [-8.2052e-01, -1.4622e-01, -1.6462e-01,  ..., -1.4244e-02,
           1.7444e-01,  3.4858e-01],
         ...,
         [-3.3360e-01,  2.1487e-01,  1.0465e-01,  ...,  2.6260e-01,
           4.3676e-02,  7.9604e-02],
         [-3.8968e-01,  1.4511e-01,  1.3608e-01,  ...,  1.9547e-01,
          -8.3537e-03,  1.4921e-02],
         [-4.7328e-01,  3.3583e-01,  2.1328e-01,  ..., -7.3196e-02,
          -2.6742e-01, -3.5352e-02]],

        [[-5.9584e-01, -4.3120e-02, -1.5499e-02,  ..., -4.0456e-01,
           7.5800e-02,  7.7500e-01],
         [ 1.4020e-01,  4.8057e-01, -1.6450e-01,  ..., -4.0382e-01,
      


i=530
Sanity Check!!

Start: 16960
End: 16992

Size of LastHidden Layer Size:torch.Size([32, 92, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-4.5187e-01, -4.7868e-01, -8.1698e-03,  ..., -6.3223e-01,
           1.8331e-01,  7.3355e-01],
         [ 1.4521e-01, -3.1421e-02, -4.3061e-01,  ..., -5.0091e-02,
           3.4634e-01,  3.5879e-01],
         [-6.5119e-02, -8.1290e-01,  2.9917e-02,  ..., -2.0097e-01,
           7.3581e-01,  8.4453e-02],
         ...,
         [-2.1956e-01, -4.5792e-01,  6.7917e-02,  ..., -2.2767e-01,
           4.4429e-01,  1.5168e-01],
         [-3.4852e-01, -6.2772e-01,  3.2500e-02,  ..., -1.6199e-01,
           4.0549e-01,  1.4126e-01],
         [-6.2427e-02, -3.4626e-01,  4.1247e-01,  ..., -2.7393e-01,
           1.8245e-01,  1.3574e-01]],

        [[-7.8414e-01, -1.7414e-01, -1.2652e-01,  ..., -3.4939e-01,
          -4.8765e-02,  5.2948e-01],
         [-1.5965e-01,  3.9632e-02, -3.2969e-01,  ...,  1.3889e-01,
      


i=550
Sanity Check!!

Start: 17600
End: 17632

Size of LastHidden Layer Size:torch.Size([32, 63, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-1.2519, -0.2461, -0.7374,  ..., -0.5334,  0.1491,  0.6979],
         [-1.1263, -0.7646, -0.7510,  ..., -0.2056,  0.7825,  0.1118],
         [ 0.6509, -0.0640, -0.1624,  ..., -0.1239, -0.9303, -0.1234],
         ...,
         [-0.1727, -0.1833, -0.1239,  ...,  0.1863,  0.0947,  0.0108],
         [-0.4987, -0.4408, -0.6262,  ...,  0.0810,  0.1735,  0.1172],
         [-0.2208,  0.0281, -0.2776,  ...,  0.0390, -0.0073,  0.2537]],

        [[-0.9474, -0.0138, -0.1542,  ..., -0.3065,  0.2882,  0.6728],
         [-0.8817, -1.0922, -0.1372,  ...,  0.2770, -0.0605, -0.5506],
         [-0.9674, -1.0002,  0.1031,  ..., -0.2880,  0.0119,  0.4500],
         ...,
         [-0.5163,  0.0059,  0.4684,  ...,  0.1393, -0.0993, -0.0389],
         [-0.3314, -0.0111,  0.5664,  ...,  0.1767,  0.0760, -0.0469],
         [-0.6


i=580
Sanity Check!!

Start: 18560
End: 18592

Size of LastHidden Layer Size:torch.Size([32, 56, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.2182e-01,  1.3252e-01,  7.9171e-02,  ..., -4.5673e-01,
           3.6067e-01,  6.9661e-01],
         [-5.2764e-01, -3.9448e-01, -3.6188e-01,  ..., -1.8608e-02,
          -1.1335e-01, -4.1660e-01],
         [-2.1581e-01, -7.8538e-01, -1.6379e-01,  ..., -1.8591e-01,
          -5.3375e-01,  3.4372e-01],
         ...,
         [-3.3662e-01,  2.1938e-01, -1.4513e-02,  ..., -4.5299e-01,
           6.4824e-03,  5.0337e-01],
         [-2.8732e-01,  2.2311e-01,  1.9276e-01,  ..., -5.0168e-01,
           1.7748e-01,  1.5640e-01],
         [-4.4903e-01,  3.8597e-02, -2.5146e-01,  ..., -1.7940e-01,
           3.0083e-01,  4.0589e-01]],

        [[-7.1501e-01, -9.5493e-03,  4.9564e-03,  ..., -4.0974e-01,
           7.5957e-02,  8.8390e-01],
         [-7.0525e-01,  2.1025e-01, -1.2967e-01,  ...,  1.8587e-01,
      


i=610
Sanity Check!!

Start: 19520
End: 19552

Size of LastHidden Layer Size:torch.Size([32, 46, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-4.2763e-01,  9.7609e-02, -4.2317e-02,  ..., -3.6094e-01,
           1.9713e-01,  5.0451e-01],
         [-5.7598e-01,  2.1665e-01, -5.2619e-01,  ...,  1.1638e-01,
          -1.3582e-01, -5.9129e-01],
         [-5.0675e-01, -6.4629e-02, -4.6631e-01,  ..., -1.0678e-01,
          -1.3274e-01,  3.0048e-01],
         ...,
         [-7.6753e-01, -3.0906e-02, -2.2497e-01,  ..., -1.4940e-01,
           3.1892e-01,  4.2274e-01],
         [-8.1126e-01,  1.1688e-03, -2.0302e-01,  ..., -1.0779e-01,
           3.2383e-01,  4.2646e-01],
         [-7.4502e-01, -8.6422e-02, -2.3620e-01,  ..., -1.8611e-01,
           3.0534e-01,  4.6836e-01]],

        [[-8.7616e-01, -1.0216e-01, -2.0519e-01,  ..., -6.5229e-01,
           2.0573e-01,  9.7243e-01],
         [-1.8859e-01,  2.4751e-01, -5.3206e-01,  ...,  1.0851e-01,
      


i=630
Sanity Check!!

Start: 20160
End: 20192

Size of LastHidden Layer Size:torch.Size([32, 55, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.9284e-01, -1.7260e-01, -6.2130e-02,  ..., -7.2328e-01,
           1.7515e-01,  8.1155e-01],
         [-6.1198e-01,  3.1680e-02, -3.0264e-02,  ..., -1.4901e-02,
           2.9216e-01,  2.0513e-01],
         [-4.3745e-01, -5.1479e-03,  5.8426e-02,  ..., -4.6014e-01,
          -6.4342e-01,  3.8801e-01],
         ...,
         [-5.3527e-01,  1.0828e-01, -8.0396e-02,  ...,  1.0981e-01,
          -3.0975e-02,  3.4374e-01],
         [-4.7475e-01, -2.1264e-02, -2.6597e-01,  ...,  1.4951e-01,
           2.1173e-01,  3.7328e-01],
         [-4.9986e-01, -3.1714e-02, -2.7112e-01,  ...,  2.2055e-01,
           2.6655e-01,  4.0405e-01]],

        [[-4.9245e-01,  2.8129e-01, -2.0923e-01,  ..., -4.1040e-01,
          -2.6276e-02,  6.9327e-01],
         [-3.1913e-02,  3.9439e-01, -7.6065e-01,  ...,  4.8595e-01,
      


i=660
Sanity Check!!

Start: 21120
End: 21152

Size of LastHidden Layer Size:torch.Size([32, 70, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-5.0157e-01, -2.4420e-01, -8.0009e-02,  ..., -2.9284e-01,
           2.4821e-01,  6.7549e-01],
         [-9.6741e-02, -5.4171e-02, -3.1891e-01,  ..., -9.9838e-02,
           1.3756e-01,  1.2959e-01],
         [-4.3143e-01, -8.9267e-01,  3.7268e-01,  ..., -3.3054e-01,
          -1.4517e-01,  2.0780e-01],
         ...,
         [-5.3563e-01, -3.3017e-01, -5.4493e-01,  ..., -1.8576e-02,
           2.9100e-01,  4.0978e-01],
         [-5.1682e-01, -2.9475e-01, -5.2284e-01,  ..., -3.2301e-02,
           2.6933e-01,  4.1800e-01],
         [-6.1838e-01, -2.7843e-01, -5.5605e-01,  ..., -3.8772e-02,
           2.3184e-01,  3.9419e-01]],

        [[-4.5520e-01, -8.4083e-02,  5.5470e-02,  ..., -5.5541e-01,
           1.5400e-01,  7.0571e-01],
         [-3.4733e-01, -2.4224e-01, -9.8030e-02,  ...,  9.1836e-02,
      


i=690
Sanity Check!!

Start: 22080
End: 22112

Size of LastHidden Layer Size:torch.Size([32, 56, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.5544, -0.0862,  0.3035,  ..., -0.5572,  0.2782,  0.5866],
         [-0.0227, -0.1152, -0.2762,  ...,  0.4527,  0.4248, -0.0783],
         [ 0.0859, -1.1757,  0.6135,  ..., -0.1682,  0.6311,  0.5046],
         ...,
         [-0.1259,  0.0040,  0.4058,  ..., -0.1239,  0.2477,  0.1930],
         [-0.2208, -0.0702,  0.2952,  ..., -0.1382,  0.2254,  0.2475],
         [-0.1962, -0.5045,  0.0349,  ...,  0.1523,  0.4372, -0.0334]],

        [[-0.5521, -0.0022, -0.1304,  ..., -0.6259, -0.0921,  0.6032],
         [-1.2351,  0.1128, -0.2470,  ..., -0.0883,  0.5505, -0.0752],
         [-0.0691,  0.0080,  0.9783,  ..., -0.5221, -0.5862,  0.3752],
         ...,
         [-0.2616,  0.2798,  0.0680,  ..., -0.1643, -0.4003,  0.6029],
         [-0.4072,  0.1535,  0.1256,  ..., -0.1117, -0.3703,  0.4917],
         [-0.4


i=720
Sanity Check!!

Start: 23040
End: 23072

Size of LastHidden Layer Size:torch.Size([32, 63, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-7.4793e-01, -5.4592e-02, -1.8705e-01,  ..., -6.6345e-01,
           9.2141e-02,  3.0593e-01],
         [-3.2005e-01, -2.8747e-01, -9.6497e-02,  ..., -2.2126e-01,
          -2.0548e-01, -8.1939e-01],
         [-3.2362e-01, -4.3532e-01,  3.0008e-02,  ..., -6.2746e-01,
          -1.6421e-01, -4.4233e-01],
         ...,
         [-2.0181e-02, -2.7271e-01,  9.2865e-02,  ..., -2.5014e-01,
          -2.0712e-01, -7.5459e-02],
         [-7.4339e-02, -1.8348e-01, -2.2512e-02,  ..., -3.0033e-01,
          -2.0430e-01, -8.1034e-02],
         [-4.3949e-01, -7.5993e-02, -5.9639e-01,  ..., -9.8065e-02,
           1.5419e-01, -7.0344e-02]],

        [[-3.9597e-01, -1.8806e-01, -1.0178e-02,  ..., -2.0390e-01,
           3.6693e-01,  4.5783e-01],
         [-3.2628e-01, -2.6784e-01,  4.6617e-01,  ...,  1.5881e-01,
      


i=750
Sanity Check!!

Start: 24000
End: 24032

Size of LastHidden Layer Size:torch.Size([32, 67, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-3.7402e-01,  4.3308e-02, -6.6685e-02,  ..., -6.5389e-01,
           5.1593e-02,  6.8153e-01],
         [ 1.8149e-01,  1.6206e-01, -3.9024e-01,  ...,  1.0464e-01,
           4.1374e-01,  7.4728e-01],
         [-6.5649e-01, -6.8560e-02, -1.2311e-01,  ..., -6.0962e-02,
          -3.1717e-01, -1.1512e-01],
         ...,
         [-2.7148e-01, -1.5590e-01, -7.9930e-01,  ..., -3.4327e-01,
           1.0197e-01,  5.4398e-01],
         [-3.2222e-03, -1.1584e-01,  2.6238e-01,  ..., -4.3047e-01,
          -1.0137e-01,  4.8702e-01],
         [-2.5144e-02, -1.4785e-01,  3.0187e-01,  ..., -5.1759e-01,
          -1.1151e-01,  5.0865e-01]],

        [[-5.0102e-01, -4.4239e-02,  3.6887e-02,  ..., -5.7559e-01,
           3.6614e-01,  7.5838e-01],
         [-3.0216e-01, -1.6106e-01, -1.7722e-01,  ...,  6.4014e-02,
      


i=780
Sanity Check!!

Start: 24960
End: 24992

Size of LastHidden Layer Size:torch.Size([32, 67, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.6996,  0.2371,  0.0776,  ..., -0.5329,  0.3657,  0.7073],
         [-0.1888,  0.1941, -0.4671,  ...,  0.2279,  0.8240, -0.1400],
         [-0.4439, -0.1115,  0.0114,  ..., -0.4619, -0.0622,  0.8055],
         ...,
         [-0.5240, -0.2119, -0.0296,  ..., -0.1586,  0.5180,  0.3207],
         [-0.3518, -0.1135,  0.3366,  ..., -0.2022,  0.3211,  0.3271],
         [-0.3507, -0.0266,  0.2969,  ..., -0.2359,  0.3305,  0.3525]],

        [[-0.9942, -0.3386, -0.2105,  ..., -0.6238,  0.4393,  0.6749],
         [-0.0983, -0.1010, -0.5727,  ...,  0.4600,  0.3373, -0.1439],
         [ 0.1739, -1.0021, -0.7929,  ..., -0.0462, -0.3520,  1.1194],
         ...,
         [-0.3225, -0.3267,  0.1132,  ...,  0.0595,  0.1490,  0.5010],
         [-0.9101, -0.0221,  0.1321,  ..., -0.3652,  0.1430,  0.5137],
         [-0.3


i=810
Sanity Check!!

Start: 25920
End: 25952

Size of LastHidden Layer Size:torch.Size([32, 98, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.6753, -0.0316, -0.1101,  ..., -0.5460,  0.2093,  0.5896],
         [-0.4369, -0.1960, -0.3619,  ..., -0.0196,  0.3588, -0.4975],
         [-0.6205, -0.4559, -0.2417,  ..., -0.5645, -0.6316,  0.1413],
         ...,
         [-0.9499, -0.4859, -0.1570,  ..., -0.0503,  0.3984,  0.2075],
         [-0.8849, -0.3692,  0.1228,  ..., -0.1600,  0.3753,  0.3264],
         [-0.6170, -0.2830,  0.3279,  ..., -0.2348,  0.1247,  0.2210]],

        [[-0.8228,  0.1090, -0.2560,  ..., -0.4830,  0.0456,  0.7387],
         [-0.5810, -0.2467,  0.0514,  ..., -0.0437,  0.5543,  0.3042],
         [ 1.0057,  0.2365,  0.8537,  ..., -0.9338, -0.2186, -0.1554],
         ...,
         [-0.6539, -0.3964, -0.0341,  ...,  0.2380, -0.0187,  0.3090],
         [-0.5006, -0.1090,  0.2862,  ...,  0.0741, -0.4147,  0.4088],
         [-0.5


i=840
Sanity Check!!

Start: 26880
End: 26912

Size of LastHidden Layer Size:torch.Size([32, 60, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.6549, -0.0911, -0.0333,  ..., -0.6225,  0.1812,  0.8505],
         [-1.4364,  0.1012,  0.2203,  ..., -0.4098,  0.3273,  0.5678],
         [ 0.0284, -0.4819, -0.5480,  ..., -0.0800, -0.8731,  0.5752],
         ...,
         [-0.8966, -0.0466, -0.1139,  ..., -0.4049,  0.1083,  0.4612],
         [-0.4735, -0.1424, -0.3844,  ...,  0.2309,  0.3315,  0.4804],
         [-0.9110, -0.1361, -0.0666,  ..., -0.4527,  0.1485,  0.3071]],

        [[-0.6834, -0.2080, -0.1242,  ..., -0.6004,  0.1837,  0.7721],
         [-0.3887, -0.3368, -0.5011,  ...,  0.0032, -0.0233, -0.3083],
         [-0.4691, -1.2033, -0.7990,  ...,  0.5192, -0.5523,  0.7580],
         ...,
         [-0.5489, -0.2913, -0.1344,  ..., -0.1783,  0.1902,  0.4423],
         [-0.8319, -0.1275, -0.0862,  ..., -0.4158,  0.2393,  0.6567],
         [-0.4


i=870
Sanity Check!!

Start: 27840
End: 27872

Size of LastHidden Layer Size:torch.Size([32, 64, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.5510, -0.0735,  0.0200,  ..., -0.6219, -0.0414,  0.7024],
         [-0.3995,  0.1709, -0.3352,  ...,  0.2251,  0.2006, -0.5227],
         [-0.6704, -0.5289, -0.1030,  ..., -0.3984, -0.3925,  0.2137],
         ...,
         [-0.5209,  0.0744,  0.0405,  ..., -0.4775, -0.0514,  0.3705],
         [-0.5824, -0.0767, -0.2957,  ..., -0.1912,  0.2108,  0.2796],
         [-0.3703,  0.1963,  0.1349,  ..., -0.4434, -0.1651,  0.2807]],

        [[-0.3792,  0.1205,  0.0820,  ..., -0.3049,  0.1475,  0.5297],
         [ 0.0305, -0.1417, -0.0348,  ...,  0.3236,  0.0901, -0.4602],
         [-0.2770, -0.5386,  0.1332,  ..., -0.1336, -0.4657,  0.4168],
         ...,
         [-0.4264, -0.3156,  0.0743,  ..., -0.2030,  0.1924,  0.3405],
         [-0.4070, -0.2292, -0.2603,  ..., -0.0094,  0.2092,  0.2308],
         [-0.4


i=900
Sanity Check!!

Start: 28800
End: 28832

Size of LastHidden Layer Size:torch.Size([32, 121, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.5546, -0.0686, -0.0761,  ..., -0.6410, -0.0711,  0.6206],
         [-0.5235,  0.2351, -0.2528,  ...,  0.2732,  0.8567, -0.0634],
         [-0.2002,  0.2905, -0.1305,  ..., -0.3725,  0.1052,  0.5147],
         ...,
         [-0.3973, -0.0734, -0.1290,  ..., -0.1235, -0.2886,  0.4334],
         [-0.4093, -0.0735, -0.1004,  ..., -0.2032, -0.3269,  0.5302],
         [-0.3930, -0.0570, -0.1455,  ..., -0.0943, -0.2525,  0.4612]],

        [[-0.8712, -0.2206, -0.1919,  ..., -0.5202,  0.1806,  0.6663],
         [-1.3486,  0.3273, -0.2350,  ..., -0.6384,  0.4293,  0.4913],
         [-0.4411, -0.5780, -0.0215,  ..., -0.3075,  0.0890,  0.1518],
         ...,
         [-0.7479, -0.0179, -0.1745,  ..., -0.1626, -0.0592,  0.6146],
         [-0.4240,  0.0858, -0.0412,  ...,  0.0777, -0.0953,  0.4568],
         [-0.


i=930
Sanity Check!!

Start: 29760
End: 29792

Size of LastHidden Layer Size:torch.Size([32, 103, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.3422e-01, -6.1571e-02,  1.3881e-01,  ..., -5.9975e-01,
           2.1337e-01,  6.9611e-01],
         [-1.6799e-01, -1.6073e-01, -3.5239e-01,  ...,  2.7622e-01,
           1.8548e-01, -6.2681e-01],
         [-2.1982e-01, -1.0437e+00, -5.5631e-02,  ..., -1.3531e-02,
          -7.7794e-02,  7.8645e-01],
         ...,
         [-6.6869e-01, -4.2251e-01,  6.4977e-01,  ..., -2.5572e-01,
           2.9270e-01,  3.3959e-01],
         [-5.9237e-01, -3.8768e-01,  6.8801e-01,  ..., -3.1650e-01,
           1.3646e-01,  4.2197e-01],
         [-5.9169e-01, -3.9689e-01,  6.3499e-01,  ..., -2.6760e-01,
           2.6878e-01,  3.7454e-01]],

        [[-6.1267e-01,  9.8380e-02, -5.1189e-02,  ..., -2.7395e-01,
          -2.3371e-01,  2.7958e-01],
         [-3.2867e-02,  4.8202e-01, -1.0388e-01,  ..., -4.8361e-01,
     


i=950
Sanity Check!!

Start: 30400
End: 30432

Size of LastHidden Layer Size:torch.Size([32, 54, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-8.6350e-01, -1.0649e-01, -9.1566e-02,  ..., -3.0436e-01,
           2.8528e-01,  6.9944e-01],
         [-6.6115e-01, -1.3250e-01,  1.2076e-01,  ..., -8.7917e-02,
           1.1717e+00, -6.1525e-02],
         [-4.3092e-01, -2.6390e-01, -9.2921e-02,  ..., -7.1557e-01,
           8.6590e-02,  4.5634e-01],
         ...,
         [-6.3431e-01, -8.6238e-02,  1.9405e-01,  ..., -1.6403e-01,
           3.2501e-01,  3.0597e-01],
         [-8.5509e-01, -2.7676e-01,  1.1056e-02,  ..., -3.3138e-03,
           4.6937e-01,  1.4024e-01],
         [-7.3483e-01, -1.2878e-01,  1.6423e-01,  ..., -1.7160e-01,
           3.2976e-01,  2.5747e-01]],

        [[-7.5037e-01, -2.1419e-01, -8.7746e-02,  ..., -6.0839e-01,
           1.5990e-01,  8.2410e-01],
         [-3.3706e-01, -2.7237e-01, -5.0518e-01,  ...,  3.3242e-01,
      


i=980
Sanity Check!!

Start: 31360
End: 31392

Size of LastHidden Layer Size:torch.Size([32, 70, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.4733, -0.2363, -0.4666,  ..., -0.1925,  0.1640,  0.7539],
         [ 0.9300,  0.3350, -0.4082,  ...,  0.0535,  0.5373,  0.1835],
         [-0.4143, -0.2628, -0.4959,  ...,  0.1765,  0.0266,  0.5150],
         ...,
         [ 0.1305, -0.2322,  0.2333,  ..., -0.2571, -0.0317, -0.1426],
         [-0.2277, -0.4853,  0.0424,  ..., -0.0581,  0.0614, -0.3402],
         [-0.0060, -0.0338,  0.3887,  ..., -0.0411,  0.1503,  0.0231]],

        [[-0.7333, -0.3657,  0.0400,  ..., -0.5049, -0.0601,  0.6154],
         [-0.3084, -0.2976, -0.3099,  ...,  0.0491, -0.0493, -0.7028],
         [-0.9084, -0.1046,  0.2507,  ..., -1.2244, -0.4566,  0.1071],
         ...,
         [-0.6952, -0.0765, -0.2569,  ...,  0.0584, -0.1120,  0.2929],
         [-0.4941, -0.0279, -0.2598,  ..., -0.2103, -0.3296,  0.3499],
         [-0.5


i=1010
Sanity Check!!

Start: 32320
End: 32352

Size of LastHidden Layer Size:torch.Size([32, 46, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.0661,  0.3709, -0.3496,  ..., -0.2674,  0.4942,  0.6756],
         [-0.0607, -0.1605, -0.2078,  ...,  0.0601,  0.7931, -0.0491],
         [ 1.0879,  0.1972, -0.3481,  ..., -0.2121,  0.1214,  0.0914],
         ...,
         [ 0.0160,  0.4967, -0.1353,  ...,  0.0141,  0.2799,  0.2212],
         [-0.0914,  0.0746, -0.3434,  ..., -0.0330,  0.4028,  0.1104],
         [ 0.0649,  0.4614, -0.1300,  ..., -0.0548,  0.2700,  0.1690]],

        [[-0.6065,  0.0715, -0.2232,  ..., -0.3653,  0.1194,  0.7291],
         [-0.7233,  0.0098, -0.2632,  ..., -0.0414,  0.3244, -0.1837],
         [-1.3194, -0.2831,  0.0865,  ..., -0.1463,  0.5342, -0.1411],
         ...,
         [-0.5822,  0.0652, -0.2352,  ..., -0.3043,  0.0034,  0.5040],
         [-0.3366,  0.0496,  0.0442,  ..., -0.2906, -0.0862,  0.4667],
         [-0.


i=1040
Sanity Check!!

Start: 33280
End: 33312

Size of LastHidden Layer Size:torch.Size([32, 60, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-8.9056e-01, -2.4773e-01, -1.5268e-01,  ..., -2.4516e-01,
           4.5618e-01,  7.1414e-01],
         [-9.4613e-01, -2.4371e-01,  2.8584e-01,  ...,  2.7697e-02,
           4.9655e-01,  1.7265e-01],
         [-3.4171e-01,  2.6006e-01, -3.1270e-01,  ..., -6.9505e-01,
          -4.2627e-02,  4.5531e-01],
         ...,
         [-5.6813e-01, -2.2191e-01,  3.8553e-01,  ..., -2.2888e-01,
           3.5154e-02,  3.4145e-01],
         [-8.3204e-01, -3.3962e-01, -2.9289e-01,  ..., -4.2345e-02,
           1.9423e-01,  2.6765e-01],
         [-5.7426e-01, -2.5253e-01,  2.0428e-01,  ..., -1.7179e-01,
          -3.6972e-03,  2.0634e-01]],

        [[-1.1230e+00, -3.1389e-01, -2.5617e-01,  ..., -4.3964e-01,
           2.7945e-01,  9.4211e-01],
         [-3.1844e-01,  8.4205e-02, -1.1565e-01,  ...,  4.2429e-01,
     


i=1060
Sanity Check!!

Start: 33920
End: 33952

Size of LastHidden Layer Size:torch.Size([32, 74, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.7811, -0.1746, -0.0828,  ..., -0.1909,  0.4543,  0.5978],
         [-0.3392, -0.3710, -0.2171,  ...,  0.2427,  0.1501, -0.2993],
         [-0.7133, -0.9621, -0.2947,  ...,  0.1892, -0.4193,  0.4224],
         ...,
         [-0.4749, -0.2092,  0.2200,  ..., -0.3465, -0.1156,  0.2681],
         [-0.6577, -0.0860,  0.0948,  ..., -0.2047,  0.2455, -0.2062],
         [-0.6853,  0.0167,  0.2019,  ..., -0.2328,  0.1010,  0.1774]],

        [[-0.7716, -0.0492,  0.2389,  ..., -0.6263,  0.0793,  0.6792],
         [-0.7327,  0.5957,  0.0320,  ..., -0.3638,  0.1472,  0.2833],
         [-1.2207, -0.4054, -0.2337,  ..., -0.1281, -0.2610, -0.3671],
         ...,
         [-0.2331,  0.0892,  0.3759,  ..., -0.2457, -0.1630, -0.1680],
         [-0.2253, -0.1028,  0.4844,  ..., -0.1923, -0.0621, -0.2043],
         [-0.


i=1090
Sanity Check!!

Start: 34880
End: 34912

Size of LastHidden Layer Size:torch.Size([32, 110, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.5346, -0.0287,  0.0455,  ..., -0.3310,  0.1547,  0.6433],
         [-0.2356, -0.2806,  0.0981,  ...,  0.6524,  0.1309, -0.3512],
         [-0.5928, -0.7125,  0.3123,  ...,  0.1428, -0.5968,  0.5875],
         ...,
         [-0.4837, -0.3830,  0.4304,  ...,  0.1084,  0.3211,  0.0844],
         [-0.5217, -0.4277,  0.4004,  ...,  0.0512,  0.3410,  0.1286],
         [-0.3310, -0.4718,  0.4764,  ..., -0.1126,  0.1147,  0.0254]],

        [[-0.6101,  0.3387,  0.0294,  ..., -0.5164,  0.3316,  0.4851],
         [ 0.3006,  0.4952, -0.2298,  ..., -0.0530,  0.4976,  0.3256],
         [-1.2340,  0.2244,  0.4906,  ..., -0.4362, -0.4538, -0.1107],
         ...,
         [-0.4705,  0.0805,  0.1688,  ..., -0.0906,  0.1764,  0.3594],
         [-0.5660, -0.0728, -0.1001,  ...,  0.0698,  0.2469,  0.3293],
         [-0


i=1120
Sanity Check!!

Start: 35840
End: 35872

Size of LastHidden Layer Size:torch.Size([32, 50, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.7018,  0.0224, -0.2131,  ..., -0.2097,  0.0560,  0.7255],
         [-1.0541,  0.3161, -0.5020,  ...,  0.1372, -0.2823, -0.3396],
         [-0.6146,  0.3071, -0.2367,  ...,  0.1578, -0.6429,  0.4434],
         ...,
         [-0.5783,  0.1790,  0.0770,  ...,  0.1073, -0.0509,  0.1999],
         [-0.5806,  0.1259, -0.0626,  ...,  0.1364,  0.0458,  0.1618],
         [-0.3343,  0.1390, -0.0062,  ..., -0.1621, -0.1318,  0.4002]],

        [[-0.6607, -0.2863,  0.2746,  ..., -0.2786,  0.4615,  0.5041],
         [-0.3607, -0.0378, -0.1817,  ...,  0.0640,  0.6757, -0.2034],
         [-0.2646, -0.6142,  0.4455,  ..., -0.3579,  0.5913,  0.3836],
         ...,
         [-0.3859, -0.4000,  0.1571,  ...,  0.0585,  0.9155, -0.0489],
         [-0.6130, -0.7293, -0.1990,  ...,  0.2424,  0.9035, -0.1814],
         [-0.


i=1150
Sanity Check!!

Start: 36800
End: 36832

Size of LastHidden Layer Size:torch.Size([32, 66, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-9.7991e-01, -2.6461e-01,  5.2714e-01,  ..., -6.3770e-01,
           3.2011e-01,  4.0133e-01],
         [-1.0146e+00, -5.3041e-01,  6.5516e-01,  ..., -3.9999e-02,
           8.0878e-01, -2.6136e-01],
         [-3.5836e-02, -7.9684e-01, -1.3930e-02,  ..., -1.1992e-01,
          -8.6087e-02,  4.9555e-01],
         ...,
         [-4.8866e-01, -3.9162e-01,  1.3946e+00,  ...,  4.0874e-02,
           5.7251e-02, -8.1174e-01],
         [-2.3838e-01, -4.8952e-02,  8.0714e-01,  ..., -3.7226e-01,
          -1.9144e-01, -9.3012e-01],
         [ 6.8861e-01, -3.3536e-01, -8.6276e-02,  ...,  1.6297e-01,
          -8.5190e-01, -3.9093e-01]],

        [[-6.8040e-01, -1.5861e-01, -5.7293e-02,  ..., -5.0460e-01,
           1.5769e-01,  7.5312e-01],
         [-8.9516e-02, -5.0021e-02, -4.3693e-01,  ...,  5.2375e-02,
     


i=1180
Sanity Check!!

Start: 37760
End: 37792

Size of LastHidden Layer Size:torch.Size([32, 62, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-3.8957e-01, -1.1079e-02, -2.6238e-01,  ..., -4.3576e-01,
           1.2776e-01,  4.5516e-01],
         [-3.6270e-01, -4.9385e-01, -2.6048e-01,  ...,  3.6345e-01,
          -1.1859e-01, -4.0362e-01],
         [-1.9654e-01, -1.0224e+00, -9.1131e-02,  ..., -2.9252e-01,
          -2.0556e-01,  2.5045e-01],
         ...,
         [-2.1814e-01,  2.9772e-03,  2.3897e-01,  ..., -1.2083e-01,
           8.5316e-02, -2.4416e-01],
         [-2.3140e-01,  2.6695e-02,  2.7457e-01,  ..., -1.7601e-01,
           1.5180e-01, -1.5527e-01],
         [-2.7843e-01,  4.6965e-01,  6.6236e-01,  ...,  1.0585e-01,
           1.1752e-01,  9.5961e-01]],

        [[-5.4814e-01, -2.9283e-02, -7.4734e-02,  ..., -5.4170e-01,
          -1.7777e-01,  5.6601e-01],
         [-2.0291e-01,  1.1982e-01, -1.1110e-01,  ...,  4.9770e-01,
     


i=1210
Sanity Check!!

Start: 38720
End: 38752

Size of LastHidden Layer Size:torch.Size([32, 104, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-5.9672e-01, -9.6384e-02,  3.0960e-02,  ..., -3.3153e-01,
           1.7950e-01,  5.5102e-01],
         [-7.7510e-01, -1.2687e-01, -4.6250e-01,  ...,  7.1049e-02,
          -1.1950e-01, -6.9801e-01],
         [-1.0357e+00, -3.3568e-01, -3.8522e-01,  ...,  2.1100e-01,
          -6.6955e-01,  1.5271e-01],
         ...,
         [-7.1934e-01, -1.2917e-02, -1.6626e-01,  ..., -4.1359e-01,
           2.2745e-01,  2.9131e-01],
         [-7.7392e-01, -2.6172e-01, -5.2168e-01,  ..., -1.0007e-01,
           4.0436e-01,  4.0681e-01],
         [-5.6491e-01, -2.1241e-02, -1.1367e-01,  ..., -3.9772e-01,
           2.0837e-01,  2.4052e-01]],

        [[-6.8826e-01, -5.6386e-02, -1.4726e-01,  ..., -5.7291e-01,
           2.0407e-01,  5.6978e-01],
         [-6.7055e-01, -1.5583e-01, -2.8633e-01,  ..., -2.3679e-02,
    


i=1240
Sanity Check!!

Start: 39680
End: 39712

Size of LastHidden Layer Size:torch.Size([32, 75, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-8.1833e-01, -6.5693e-03, -9.6273e-02,  ..., -5.9616e-01,
           2.6384e-01,  5.7008e-01],
         [-1.0244e+00,  2.5134e-01, -4.8872e-01,  ..., -3.4336e-01,
           8.4424e-01,  2.6877e-01],
         [-4.5525e-01, -6.4733e-01, -2.3941e-01,  ..., -2.2253e-01,
           5.2076e-02,  1.5825e-01],
         ...,
         [-4.6633e-01,  2.7864e-01, -6.4257e-03,  ..., -3.2443e-01,
           3.0815e-01,  3.4871e-01],
         [-5.1736e-01,  6.0960e-01,  2.1568e-01,  ..., -3.4207e-01,
           4.4657e-01,  2.2499e-01],
         [-5.3730e-01,  1.5965e-01, -3.9313e-02,  ..., -3.8832e-01,
           4.2769e-01,  1.3002e-01]],

        [[-3.3439e-01, -6.8260e-02,  2.2243e-01,  ..., -1.3759e-01,
           3.3524e-01,  8.7434e-01],
         [ 4.4146e-02,  1.5315e-02, -3.3160e-01,  ...,  4.2670e-01,
     


i=1270
Sanity Check!!

Start: 40640
End: 40672

Size of LastHidden Layer Size:torch.Size([32, 56, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-1.0761e+00, -2.2096e-01, -2.2628e-01,  ..., -3.4045e-01,
           2.9129e-01,  8.1764e-01],
         [-1.6989e-02,  2.3019e-01, -4.7108e-01,  ...,  2.6307e-01,
           5.5510e-01,  2.1838e-01],
         [-6.4820e-01, -7.7813e-01, -2.9025e-01,  ..., -4.7809e-01,
          -1.9668e-01,  6.0799e-01],
         ...,
         [-9.7618e-01, -2.2849e-01, -2.1521e-02,  ..., -1.2122e-01,
           2.4973e-01,  3.8975e-01],
         [-9.7805e-01, -2.6074e-01, -9.6309e-02,  ..., -7.5972e-02,
           2.4273e-01,  4.0527e-01],
         [-8.0418e-01, -9.7077e-02,  5.5713e-02,  ..., -2.6904e-01,
           7.5156e-02,  4.0306e-01]],

        [[-5.0606e-01, -4.3157e-02, -5.0550e-03,  ..., -3.3520e-01,
           1.5792e-01,  5.8491e-01],
         [-4.1915e-01, -3.7846e-01,  3.1821e-03,  ...,  5.8046e-01,
     


i=1290
Sanity Check!!

Start: 41280
End: 41312

Size of LastHidden Layer Size:torch.Size([32, 71, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-7.6944e-01, -2.9009e-02, -2.8337e-01,  ..., -5.1143e-01,
           1.5050e-01,  6.0310e-01],
         [-5.0464e-01, -3.2198e-01, -2.5615e-01,  ..., -7.8674e-03,
          -1.1970e-01, -6.4515e-01],
         [-1.1811e+00, -1.1481e+00, -2.9844e-01,  ...,  4.7719e-01,
          -5.8136e-01,  1.6838e-01],
         ...,
         [-9.2278e-01, -5.4667e-01, -5.4345e-01,  ...,  1.7444e-01,
           3.0097e-01,  5.3920e-01],
         [-5.2231e-01, -1.9766e-01, -4.7799e-02,  ..., -2.4515e-01,
           8.7660e-02,  4.6572e-01],
         [-7.4922e-01, -4.0131e-01, -3.3336e-01,  ...,  1.1784e-01,
           2.7728e-01,  4.4553e-01]],

        [[-3.1525e-01,  3.2395e-02,  1.4782e-01,  ..., -1.8034e-01,
           1.7422e-01,  6.8808e-01],
         [-3.8154e-01, -9.6758e-02, -2.4818e-01,  ...,  2.1266e-01,
     


i=1320
Sanity Check!!

Start: 42240
End: 42272

Size of LastHidden Layer Size:torch.Size([32, 100, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.7191,  0.0447, -0.0604,  ..., -0.5074,  0.2318,  0.8542],
         [-0.4827, -0.4323, -0.0425,  ...,  0.1140,  0.3774, -0.1805],
         [-0.3889, -0.7376, -0.0084,  ..., -0.3323, -0.2551,  0.6064],
         ...,
         [-0.7111, -0.2596,  0.2206,  ..., -0.0768,  0.4631,  0.7050],
         [-0.5745, -0.1447,  0.3250,  ..., -0.2265,  0.1652,  0.6396],
         [-0.7649, -0.2068,  0.2286,  ..., -0.0812,  0.4320,  0.6892]],

        [[-0.8756, -0.1103,  0.0508,  ..., -0.3089,  0.4871,  0.6464],
         [-0.3440, -0.0275, -0.1925,  ...,  0.1305,  0.4683, -0.1438],
         [-0.4983, -0.4669, -0.3348,  ...,  0.0722, -0.4181,  0.7149],
         ...,
         [-0.8094, -0.2663, -0.2483,  ...,  0.1695,  0.5847,  0.2483],
         [-0.6446, -0.0217,  0.3770,  ..., -0.2346,  0.2190,  0.2467],
         [-0


i=1350
Sanity Check!!

Start: 43200
End: 43232

Size of LastHidden Layer Size:torch.Size([32, 51, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.1279,  0.0843,  0.0767,  ..., -0.3324,  0.1529,  0.5432],
         [ 0.3386,  0.1888,  0.1408,  ...,  0.1467,  1.0326,  0.1923],
         [-0.1351, -0.2231, -0.0135,  ..., -0.0375, -0.1797,  0.5736],
         ...,
         [-0.1549, -0.2134, -0.2191,  ..., -0.1233,  0.3011,  0.1477],
         [-0.1662,  0.1402,  0.2217,  ..., -0.1241,  0.2685, -0.1108],
         [-0.2232, -0.1534,  0.0375,  ..., -0.0511,  0.4632,  0.2461]],

        [[-0.8655, -0.0786, -0.1779,  ..., -0.6099,  0.1803,  0.8392],
         [-0.1410, -0.0315, -0.3360,  ...,  0.1464,  0.5014,  0.0198],
         [-0.1385, -0.8733, -0.4977,  ..., -0.3828, -0.7765,  0.9127],
         ...,
         [-0.6431,  0.0463,  0.2480,  ..., -0.1841,  0.1262,  0.4203],
         [-0.5767,  0.1027,  0.2052,  ..., -0.2389,  0.1663,  0.4567],
         [-0.


i=1380
Sanity Check!!

Start: 44160
End: 44192

Size of LastHidden Layer Size:torch.Size([32, 58, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-4.8234e-01, -7.1443e-04,  1.3396e-01,  ..., -6.5841e-01,
           1.0859e-01,  4.7485e-01],
         [-9.0338e-01,  1.4877e-01,  1.3340e-01,  ..., -7.6993e-01,
           4.6966e-01, -1.6064e-01],
         [-2.9121e-01, -9.3190e-01,  7.3921e-01,  ..., -6.6983e-01,
          -6.8631e-02, -7.3289e-01],
         ...,
         [-5.8987e-01, -3.8426e-01,  3.7528e-01,  ..., -3.9990e-01,
           6.9107e-02,  3.0319e-01],
         [-4.7416e-01, -9.4258e-02,  4.5738e-01,  ..., -4.8488e-01,
          -3.6460e-03,  2.3290e-01],
         [-4.1032e-01, -1.1918e-01,  4.6158e-01,  ..., -4.1204e-01,
          -1.5748e-02,  1.2822e-01]],

        [[-2.8812e-01,  8.7883e-02, -1.5393e-01,  ..., -5.6507e-01,
           5.4471e-01,  3.6270e-01],
         [ 1.2300e-01, -7.0247e-01, -1.4626e-01,  ...,  3.4969e-01,
     


i=1400
Sanity Check!!

Start: 44800
End: 44832

Size of LastHidden Layer Size:torch.Size([32, 49, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.6398,  0.1266,  0.2515,  ..., -0.6232, -0.0795,  0.6765],
         [-0.2187, -0.2616, -0.2510,  ...,  0.1645,  0.0411, -0.5107],
         [-0.3472, -0.5463, -0.3894,  ..., -0.4424, -0.5993,  0.1541],
         ...,
         [-0.3631, -0.4585,  0.1410,  ..., -0.4038, -0.2558,  0.4822],
         [-0.4346, -0.2071,  0.1216,  ..., -0.5589,  0.0216,  0.4251],
         [-0.2307, -0.3364,  0.2222,  ..., -0.3180, -0.2996,  0.3928]],

        [[-0.7528, -0.1274,  0.1637,  ..., -0.3352, -0.0060,  0.6301],
         [-0.1781,  0.1099, -0.2105,  ...,  0.2213,  0.1806, -0.1416],
         [-0.9587, -1.0314,  0.4606,  ...,  0.0407, -0.0961, -0.1961],
         ...,
         [ 0.7897,  0.0854, -0.1296,  ...,  0.1313, -0.8169, -0.3233],
         [-0.3569, -0.1287, -0.1734,  ..., -0.2598,  0.0880,  0.4759],
         [-0.


i=1430
Sanity Check!!

Start: 45760
End: 45792

Size of LastHidden Layer Size:torch.Size([32, 92, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-8.3053e-01, -3.8574e-01, -9.1089e-02,  ..., -4.6015e-01,
           1.0230e-01,  8.5229e-01],
         [-7.6038e-01,  1.0503e-02,  2.0329e-01,  ...,  1.8308e-01,
           2.7244e-01,  3.2038e-02],
         [-7.8100e-01, -6.1500e-01,  6.3108e-02,  ..., -2.9259e-01,
          -5.1301e-01,  2.4917e-01],
         ...,
         [-2.1522e-01, -1.6738e-01,  4.0801e-02,  ..., -1.6584e-01,
          -5.2119e-01,  3.4314e-01],
         [-4.1508e-01, -4.1343e-01, -2.1907e-01,  ..., -8.6683e-02,
          -1.5667e-01,  1.8290e-01],
         [-5.3129e-01,  7.8200e-02,  1.6427e-01,  ...,  1.1310e-01,
          -5.9707e-01,  5.4062e-01]],

        [[-9.1256e-02,  1.2465e-01,  1.1536e-03,  ..., -1.6702e-01,
           1.4748e-01,  7.1859e-01],
         [ 5.5565e-02,  1.3305e-01, -3.4385e-01,  ...,  1.3803e-02,
     


i=1460
Sanity Check!!

Start: 46720
End: 46752

Size of LastHidden Layer Size:torch.Size([32, 72, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-5.7956e-01, -2.5352e-02, -2.3352e-01,  ..., -3.2217e-01,
           4.8143e-01,  4.3101e-01],
         [-8.9917e-01,  3.5145e-01, -3.3309e-01,  ...,  1.5649e-01,
           1.0117e+00, -6.7885e-02],
         [-6.0139e-01, -5.5835e-03,  1.3494e-02,  ..., -4.9870e-01,
           1.2715e-01, -5.8347e-01],
         ...,
         [-5.6902e-01,  1.9674e-01, -2.2185e-02,  ...,  1.9598e-01,
           1.6839e-01,  2.1197e-02],
         [-1.6564e-01,  9.6809e-02,  9.9824e-02,  ...,  1.0567e-01,
           1.1265e-01,  2.0028e-01],
         [-1.5340e-02,  4.3002e-02,  1.9808e-01,  ...,  2.7864e-02,
           6.8000e-02,  3.2022e-01]],

        [[-4.5377e-01,  2.5766e-01, -2.9201e-01,  ..., -3.5785e-01,
           1.9100e-01,  7.5715e-01],
         [-3.4400e-01,  1.2431e-01, -3.0891e-01,  ...,  2.9804e-01,
     


i=1480
Sanity Check!!

Start: 47360
End: 47392

Size of LastHidden Layer Size:torch.Size([32, 102, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-4.5501e-01,  7.7588e-02,  5.1225e-02,  ..., -3.3230e-01,
           6.7065e-02,  6.0445e-01],
         [-2.5025e-02,  1.2213e-01,  6.5487e-03,  ..., -3.3382e-01,
           6.1467e-01,  4.0685e-01],
         [-1.0263e+00,  1.6866e-01,  2.9988e-01,  ..., -4.5915e-01,
           7.9558e-01, -1.9854e-01],
         ...,
         [-1.9654e-01, -1.8484e-01,  3.7896e-01,  ..., -2.3717e-02,
           2.4490e-01,  3.6392e-01],
         [-5.1619e-01, -3.0521e-01, -5.5961e-05,  ...,  8.7033e-02,
           5.2843e-01,  4.1074e-01],
         [-4.2174e-01, -2.3850e-01, -7.4062e-02,  ...,  8.5486e-02,
           5.2303e-01,  3.5411e-01]],

        [[-6.0603e-01, -1.3281e-01, -1.0819e-01,  ..., -4.0802e-01,
           2.0320e-01,  6.4036e-01],
         [ 6.0212e-02,  1.7285e-02, -4.4371e-01,  ...,  3.5749e-02,
    


i=1510
Sanity Check!!

Start: 48320
End: 48352

Size of LastHidden Layer Size:torch.Size([32, 93, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.7974,  0.0159, -0.1375,  ..., -0.2411,  0.1950,  0.6480],
         [-0.6984, -0.0983,  0.3681,  ..., -0.1204,  0.6509,  0.0813],
         [-1.3527,  0.0903,  0.0537,  ..., -0.8324, -0.3807,  0.4424],
         ...,
         [-0.8613, -0.3148, -0.1256,  ...,  0.0151,  0.5969,  0.3103],
         [-0.5640, -0.0653,  0.2158,  ..., -0.0497,  0.3570,  0.3002],
         [-0.8713, -0.3430, -0.4374,  ...,  0.2538,  0.3809,  0.5075]],

        [[-0.4512,  0.0344,  0.0059,  ..., -0.6786,  0.0738,  0.7177],
         [-1.2196, -0.4344, -0.5986,  ..., -0.6089,  0.2068,  0.6893],
         [-0.4137, -0.0482,  0.1648,  ..., -0.5006, -0.1223, -0.5408],
         ...,
         [-0.5269,  0.0715,  0.0603,  ..., -0.3261, -0.3808,  0.0890],
         [-0.4794,  0.0103,  0.1496,  ..., -0.3132,  0.0318,  0.1285],
         [-0.


i=1540
Sanity Check!!

Start: 49280
End: 49312

Size of LastHidden Layer Size:torch.Size([32, 62, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.3190,  0.1757,  0.1896,  ..., -0.2686,  0.1675,  0.6816],
         [-0.2615,  0.3141, -0.3414,  ...,  0.3395,  0.5541, -0.1766],
         [-0.4394, -0.2354, -0.4885,  ...,  0.2190,  0.0179,  0.5489],
         ...,
         [ 0.0728, -0.1139,  0.4968,  ..., -0.0593,  0.0987,  0.1801],
         [ 0.0286, -0.0271,  0.3678,  ..., -0.0816,  0.1032,  0.2268],
         [-0.0057,  0.1244,  0.2630,  ..., -0.1065,  0.1259,  0.1631]],

        [[-0.4181, -0.2917,  0.2976,  ..., -0.2125,  0.3785,  0.6532],
         [-0.0699, -0.4342,  0.0471,  ...,  0.2439,  0.2992, -0.1495],
         [-0.5512, -0.7803,  0.1459,  ..., -0.1884,  0.2235,  0.2346],
         ...,
         [ 0.0556, -0.4301,  0.5568,  ...,  0.0435, -0.0528, -0.0768],
         [ 0.0920, -0.0397,  0.5349,  ...,  0.2280,  0.1494, -0.0830],
         [-0.


i=1570
Sanity Check!!

Start: 50240
End: 50272

Size of LastHidden Layer Size:torch.Size([32, 67, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-3.1681e-01, -4.6182e-01,  5.9422e-02,  ..., -3.8880e-01,
           9.4569e-02,  5.1601e-01],
         [-7.1660e-01,  1.4279e-01, -2.6924e-01,  ..., -1.7123e-02,
           7.9799e-01,  5.6780e-02],
         [-1.2790e-01, -1.1909e+00,  1.7800e-01,  ..., -5.2491e-01,
           1.6898e-01, -5.5409e-01],
         ...,
         [-1.6252e-01, -4.4478e-01,  6.1244e-02,  ..., -9.0693e-02,
           4.5602e-01,  2.0640e-01],
         [-2.4827e-01, -5.5354e-01,  4.0187e-01,  ..., -8.8766e-02,
           2.0735e-01, -1.4757e-01],
         [ 2.3500e-01, -7.6196e-03,  2.2865e-01,  ...,  7.9277e-04,
          -5.5860e-02,  3.7836e-01]],

        [[-5.3244e-01, -4.6835e-02,  4.0743e-02,  ..., -4.1343e-01,
           2.1669e-02,  6.3845e-01],
         [ 4.7353e-01,  5.4547e-01, -4.6258e-01,  ..., -4.7148e-02,
     


i=1600
Sanity Check!!

Start: 51200
End: 51232

Size of LastHidden Layer Size:torch.Size([32, 66, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-4.1089e-01, -1.6821e-01,  2.0879e-01,  ..., -2.9169e-01,
           2.6159e-01,  8.2176e-01],
         [-2.2692e-01, -2.1332e-01,  6.2322e-03,  ...,  1.4835e-01,
           3.0534e-01,  5.5722e-02],
         [-4.0558e-01, -8.0826e-01,  1.7385e-01,  ..., -1.9845e-01,
          -3.5662e-01,  6.7372e-01],
         ...,
         [-2.5199e-01, -2.6697e-01,  6.1807e-01,  ..., -3.0749e-02,
           1.1068e-01,  5.2697e-01],
         [-1.7385e-01, -6.8262e-02,  5.9062e-01,  ..., -3.4655e-02,
           3.7276e-02,  5.0793e-01],
         [-3.0631e-01, -4.4486e-01,  4.6461e-01,  ...,  1.2827e-01,
           2.7509e-01,  5.6538e-01]],

        [[-1.0362e+00, -7.4121e-02, -1.0906e-01,  ..., -3.3829e-01,
           4.0497e-01,  8.3473e-01],
         [-3.4419e-01, -4.7054e-01, -5.2448e-01,  ...,  1.1484e-01,
     


i=1630
Sanity Check!!

Start: 52160
End: 52192

Size of LastHidden Layer Size:torch.Size([32, 61, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.0269e-01,  7.1275e-04,  4.6279e-02,  ..., -4.5669e-01,
           2.3427e-01,  6.5363e-01],
         [-5.0927e-01, -5.3110e-02, -1.3220e-01,  ..., -7.7408e-02,
           1.9144e-01, -4.2693e-01],
         [-5.1627e-01, -4.8831e-01,  1.3056e-01,  ..., -2.0895e-01,
          -5.4963e-01,  5.5507e-01],
         ...,
         [-3.9472e-01, -2.4220e-01,  4.0400e-01,  ..., -4.0355e-02,
          -6.7121e-02,  1.5441e-01],
         [-1.7485e-01,  2.4590e-03,  4.0965e-01,  ..., -4.1821e-02,
           9.4490e-02, -2.0762e-01],
         [-1.0930e-01, -1.2356e-02,  3.8546e-01,  ..., -3.1004e-02,
           1.2122e-01, -1.9480e-01]],

        [[-3.2275e-01, -5.1727e-01,  1.1121e-01,  ..., -5.9681e-01,
           3.2106e-01,  7.8569e-01],
         [ 2.2852e-01, -6.1362e-02, -9.7381e-02,  ..., -3.6339e-02,
     


i=1660
Sanity Check!!

Start: 53120
End: 53152

Size of LastHidden Layer Size:torch.Size([32, 85, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-4.3408e-01,  7.4592e-02, -6.4419e-02,  ..., -4.8164e-01,
           2.6781e-01,  5.5573e-01],
         [-3.9077e-01,  9.2923e-03, -1.6465e-01,  ..., -1.0893e-01,
           2.8301e-01, -5.8216e-01],
         [-1.9799e-01, -4.6346e-01,  1.5239e-01,  ..., -2.9148e-01,
          -4.7083e-01, -1.5661e-01],
         ...,
         [-8.2186e-01,  2.4676e-02, -2.9425e-01,  ..., -1.1999e-01,
           2.6301e-01,  3.9343e-01],
         [-8.6009e-01, -1.5682e-01, -3.4591e-01,  ..., -8.6983e-03,
           3.3206e-01,  3.1158e-01],
         [-8.9876e-01, -4.5643e-02, -4.0072e-01,  ..., -1.2384e-01,
           2.7741e-01,  3.0819e-01]],

        [[-8.7794e-01, -4.0558e-02, -7.0775e-02,  ..., -2.3561e-01,
           6.7390e-02,  7.3793e-01],
         [-1.0536e+00,  3.9628e-01,  5.4010e-02,  ...,  1.1054e-01,
     


i=1680
Sanity Check!!

Start: 53760
End: 53792

Size of LastHidden Layer Size:torch.Size([32, 56, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.6800,  0.1472, -0.0612,  ..., -0.0862,  0.0815,  0.5751],
         [-0.1573,  0.2973, -0.5203,  ...,  0.2956,  0.3587,  0.1714],
         [-0.7481, -0.0746,  0.3875,  ...,  0.3198, -0.0905, -0.3032],
         ...,
         [-0.7036,  0.0768, -0.4195,  ...,  0.0624,  0.0120,  0.4629],
         [-0.6132, -0.0554,  0.1923,  ...,  0.2306, -0.2806,  0.2096],
         [-0.7201,  0.1226, -0.3066,  ..., -0.0124, -0.0835,  0.4601]],

        [[-0.2427, -0.2362, -0.0333,  ..., -0.3565,  0.2436,  0.5166],
         [ 0.1475,  0.0067, -0.2569,  ...,  0.0457,  0.3437,  0.1695],
         [-0.1664, -0.6126,  0.0745,  ...,  0.0389,  0.1448, -0.3017],
         ...,
         [-0.0808, -0.2543,  0.2518,  ..., -0.2044,  0.1870,  0.0999],
         [-0.1664, -0.4297, -0.1914,  ..., -0.0775,  0.2719,  0.1007],
         [-0.


i=1710
Sanity Check!!

Start: 54720
End: 54752

Size of LastHidden Layer Size:torch.Size([32, 66, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.5811, -0.0481, -0.0222,  ..., -0.4722,  0.0084,  0.8056],
         [-0.7959, -0.3710, -0.2735,  ...,  0.0794,  0.3657, -0.7167],
         [-0.5715, -0.2303, -0.0551,  ..., -0.2161, -0.2024,  0.8684],
         ...,
         [-0.4412, -0.4625,  0.1441,  ...,  0.0023,  0.2790,  0.1590],
         [ 0.2225, -0.3506,  0.2633,  ..., -0.1192,  0.2365,  0.2484],
         [-0.3227, -0.6951,  0.0816,  ...,  0.0335,  0.3247, -0.0191]],

        [[-0.8617,  0.0148, -0.1581,  ..., -0.3013,  0.1721,  0.6167],
         [-0.5601,  0.0648, -0.2381,  ..., -0.0861, -0.0984, -0.5380],
         [-1.0396, -0.6927, -0.4200,  ...,  0.4761, -0.9501,  0.3220],
         ...,
         [-0.8117, -0.2397,  0.1281,  ..., -0.1736,  0.2829,  0.4800],
         [-0.7296, -0.1101, -0.0423,  ..., -0.0731,  0.2793,  0.4489],
         [-0.


i=1740
Sanity Check!!

Start: 55680
End: 55712

Size of LastHidden Layer Size:torch.Size([32, 50, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-1.0093, -0.1271, -0.2306,  ..., -0.5149,  0.1722,  1.0066],
         [-0.5753,  0.0229, -0.3885,  ...,  0.1471,  0.0945, -0.2974],
         [-0.4477, -0.3775, -0.6069,  ..., -0.3100, -0.5189,  0.6909],
         ...,
         [-1.0947, -0.1750,  0.0806,  ..., -0.2064,  0.2015,  0.8802],
         [-1.0987, -0.2349,  0.0762,  ..., -0.2285,  0.2484,  0.8604],
         [-0.9940, -0.1528,  0.0529,  ..., -0.3231,  0.0141,  0.7753]],

        [[-0.0446, -0.0382,  0.2009,  ..., -0.1000,  0.6883,  0.4686],
         [ 0.1332, -0.4041,  0.0233,  ...,  0.3974,  0.4271, -0.1816],
         [-0.0781, -0.6564,  0.8106,  ..., -0.3620, -0.2321,  0.0646],
         ...,
         [-0.1395, -0.1115,  0.1745,  ...,  0.4431,  0.4560, -0.4498],
         [-0.1269, -0.0154,  0.4235,  ...,  0.3803,  0.4507, -0.0520],
         [-0.