### Import Libraries

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import json
from transformers import BertTokenizer, glue_convert_examples_to_features, BertModel
import numpy as np

### Import Preprocessed Dataset

The preprocessed, tokenized WikiSQL data is stored in `./data/tokenized*`.

In [2]:
# Preview the tokenized dev queries; lines=True reads the file as one JSON record per line.
dev_set_tokenized = pd.read_json('./data/tokenized_dev.jsonl',lines = True)
dev_set_tokenized.head()

Unnamed: 0,phase,table_id,question,sql,tokenized_query,query,tokenized_question
0,1,1-10015132-11,What position does the player who played for b...,"{'sel': 3, 'conds': [[5, 0, 'Butler CC (KS)']]...","[SELECT, position, FROM, table_, WHERE, school...",SELECT position FROM table_ WHERE school/club ...,"[what, position, does, the, player, who, playe..."
1,1,1-10015132-11,How many schools did player number 3 play at?,"{'sel': 5, 'conds': [[1, 0, '3']], 'agg': 3}","[SELECT, COUNT, (, school/club, team, ), FROM,...",SELECT COUNT ( school/club team ) FROM table_ ...,"[how, many, schools, did, player, number, 3, p..."
2,1,1-10015132-11,What school did player number 21 play for?,"{'sel': 5, 'conds': [[1, 0, '21']], 'agg': 0}","[SELECT, school/club, team, FROM, table_, WHER...",SELECT school/club team FROM table_ WHERE no ....,"[what, school, did, player, number, 21, play, ..."
3,1,1-10015132-11,Who is the player that wears number 42?,"{'sel': 0, 'conds': [[1, 0, '42']], 'agg': 0}","[SELECT, player, FROM, table_, WHERE, no, ., E...",SELECT player FROM table_ WHERE no . EQL 42,"[who, is, the, player, that, wears, number, 42..."
4,1,1-10015132-11,What player played guard for toronto in 1996-97?,"{'sel': 0, 'conds': [[3, 0, 'Guard'], [4, 0, '...","[SELECT, player, FROM, table_, WHERE, position...",SELECT player FROM table_ WHERE position EQL g...,"[what, player, played, guard, for, toronto, in..."


In [3]:
# Preview the dev queries from './data/dev.jsonl' for comparison with the tokenized file.
dev_set = pd.read_json('./data/dev.jsonl',lines = True)
dev_set.head()

Unnamed: 0,phase,table_id,question,sql
0,1,1-10015132-11,What position does the player who played for b...,"{'sel': 3, 'conds': [[5, 0, 'Butler CC (KS)']]..."
1,1,1-10015132-11,How many schools did player number 3 play at?,"{'sel': 5, 'conds': [[1, 0, '3']], 'agg': 3}"
2,1,1-10015132-11,What school did player number 21 play for?,"{'sel': 5, 'conds': [[1, 0, '21']], 'agg': 0}"
3,1,1-10015132-11,Who is the player that wears number 42?,"{'sel': 0, 'conds': [[1, 0, '42']], 'agg': 0}"
4,1,1-10015132-11,What player played guard for toronto in 1996-97?,"{'sel': 0, 'conds': [[3, 0, 'Guard'], [4, 0, '..."


In [4]:
# Preview the dev tables from './data/dev.tables.jsonl'.
dev_tables = pd.read_json('./data/dev.tables.jsonl',lines = True)
dev_tables.head()

Unnamed: 0,header,page_title,types,id,section_title,caption,rows,name,page_id
0,"[Player, No., Nationality, Position, Years in ...",Toronto Raptors all-time roster,"[text, text, text, text, text, text]",1-10015132-11,L,L,"[[Antonio Lang, 21, United States, Guard-Forwa...",table_10015132_11,
1,"[Player, No., Nationality, Position, Years in ...",Toronto Raptors all-time roster,"[text, text, text, text, text, text]",1-10015132-9,J,J,"[[Jarrett Jack, 1, United States, Guard, 2009-...",table_10015132_9,
2,[Entered office as Head of State or Government...,List of longest-serving G8 leaders,"[text, text, text, text, text]",1-10026563-1,Chronology of longest serving G8 Leaders,Chronology of longest serving G8 Leaders,"[[16 October 1964, 15 November 1975, 5 April 1...",table_10026563_1,
3,"[Country ( exonym ), Capital ( exonym ), Count...",,"[text, text, text, text, text]",1-1008653-1,,,"[[Afghanistan, Kabul, Afghanestan افغانستان, K...",table_1008653_1,
4,"[No. in set, No. in series, Title, Directed by...",,"[real, real, text, text, text, text, text]",1-10088101-1,,,"[[1, 174, "" Per Manum "", Kim Manners, Chris Ca...",table_10088101_1,


In [5]:
# Preview the tokenized dev tables; the output shows an extra 'tokenized_header' column.
dev_tables_tokenized = pd.read_json('./data/tokenized_dev.tables.jsonl',lines = True)
dev_tables_tokenized.head()

Unnamed: 0,header,page_title,types,id,section_title,caption,rows,name,page_id,tokenized_header
0,"[Player, No., Nationality, Position, Years in ...",Toronto Raptors all-time roster,"[text, text, text, text, text, text]",1-10015132-11,L,L,"[[Antonio Lang, 21, United States, Guard-Forwa...",table_10015132_11,,"[[player], [no, .], [nationality], [position],..."
1,"[Player, No., Nationality, Position, Years in ...",Toronto Raptors all-time roster,"[text, text, text, text, text, text]",1-10015132-9,J,J,"[[Jarrett Jack, 1, United States, Guard, 2009-...",table_10015132_9,,"[[player], [no, .], [nationality], [position],..."
2,[Entered office as Head of State or Government...,List of longest-serving G8 leaders,"[text, text, text, text, text]",1-10026563-1,Chronology of longest serving G8 Leaders,Chronology of longest serving G8 Leaders,"[[16 October 1964, 15 November 1975, 5 April 1...",table_10026563_1,,"[[entered, office, as, head, of, state, or, go..."
3,"[Country ( exonym ), Capital ( exonym ), Count...",,"[text, text, text, text, text]",1-1008653-1,,,"[[Afghanistan, Kabul, Afghanestan افغانستان, K...",table_1008653_1,,"[[country, (, exonym, )], [capital, (, exonym,..."
4,"[No. in set, No. in series, Title, Directed by...",,"[real, real, text, text, text, text, text]",1-10088101-1,,,"[[1, 174, "" Per Manum "", Kim Manners, Chris Ca...",table_10088101_1,,"[[no, ., in, set], [no, ., in, series], [title..."


Load the data into `sql_data` and `table_data`. `table_data` must be a dictionary (keyed by table id) so that the table of each query can be looked up directly.

In [6]:
# Parse the tokenized dev queries: one JSON object per line, kept in file order.
sql_data = []
# Explicit UTF-8 so the result does not depend on the platform's default encoding.
with open('./data/tokenized_dev.jsonl', encoding='utf-8') as lines:
    for line in lines:
        data = json.loads(line.strip())
        sql_data.append(data)

In [7]:
# Parse the tokenized dev tables into a dictionary keyed by each record's 'id'.
table_data = {}
# Explicit UTF-8 so the result does not depend on the platform's default encoding.
with open('./data/tokenized_dev.tables.jsonl', encoding='utf-8') as lines:
    for line in lines:
        data = json.loads(line.strip())
        table_data[data['id']] = data

In [8]:
# Number of distinct table ids loaded from the dev tables file.
print(len(table_data))

2716


#### Universal function for loading the data

Pass `train`, `dev` or `test` to load the training, validation or test dataset, respectively.

In [9]:
def load_data_set(dataset_name):
    """
    Universal function to load one split of the tokenized dataset.

    Input: dataset_name - str {train, test, dev}; selects the files
           './data/tokenized_<dataset_name>.jsonl' and
           './data/tokenized_<dataset_name>.tables.jsonl'.
    Output: (sql_data, table_data) - (list, dictionary): the query records in
            file order, and the table records keyed by their 'id' field.
    Raises: OSError if a file cannot be opened, json.JSONDecodeError on a
            malformed line, KeyError if a table record has no 'id'.
    """
    print(f"Loading {dataset_name} dataset")
    sql_file = './data/tokenized_'+dataset_name+'.jsonl'
    tables_file = './data/tokenized_'+dataset_name+'.tables.jsonl'

    sql_data = _read_jsonl(sql_file)
    table_records = _read_jsonl(tables_file)
    # Later records overwrite earlier ones that share the same id.
    table_data = {tab['id']: tab for tab in table_records}

    # The table count reports records read, not distinct ids.
    print(f"Loaded {len(sql_data)} queries and {len(table_records)} tables")
    return sql_data,table_data


def _read_jsonl(path):
    """Return the JSON objects stored one per line in `path`, skipping blank lines."""
    # Explicit UTF-8: the data contains non-ASCII text, and the platform default
    # encoding is not guaranteed to be UTF-8.
    with open(path, encoding='utf-8') as lines:
        stripped = (line.strip() for line in lines)  # remove white space before and after
        return [json.loads(text) for text in stripped if text]

In [10]:
# Load the training split, then print the first query record followed by the
# table record it points to through its 'table_id'.
train_sql, train_table = load_data_set('train')
print(f"{train_sql[0]}\n{train_table[train_sql[0]['table_id']]}")

Loading train dataset
Loaded 56355 queries and 18585 tables
{'phase': 1, 'table_id': '1-1000181-1', 'question': 'Tell me what the notes are for South Australia ', 'sql': {'sel': 5, 'conds': [[3, 0, 'SOUTH AUSTRALIA']], 'agg': 0}, 'tokenized_query': ['SELECT', 'notes', 'FROM', 'table_', 'WHERE', 'current', 'slogan', 'EQL', 'south', 'australia'], 'query': 'SELECT notes FROM table_ WHERE current slogan EQL south australia', 'tokenized_question': ['tell', 'me', 'what', 'the', 'notes', 'are', 'for', 'south', 'australia']}
{'id': '1-1000181-1', 'header': ['State/territory', 'Text/background colour', 'Format', 'Current slogan', 'Current series', 'Notes'], 'types': ['text', 'text', 'text', 'text', 'text', 'text'], 'rows': [['Australian Capital Territory', 'blue/white', 'Yaa·nna', 'ACT · CELEBRATION OF A CENTURY 2013', 'YIL·00A', 'Slogan screenprinted on plate'], ['New South Wales', 'black/yellow', 'aa·nn·aa', 'NEW SOUTH WALES', 'BX·99·HI', 'No slogan on current series'], ['New South Wales', 'b

In [11]:
# Conditions of the first training query, displayed as a tuple.
tuple(train_sql[0]['sql']['conds'])

([3, 0, 'SOUTH AUSTRALIA'],)

#### Function to create batch sequences.

Given a batch size, this function creates batch sequences.  
The required indices are passed via `idxes`.

(This method of loading data is discouraged; it should be replaced with an implementation based on `torch.utils.data.Dataset` and `DataLoader`.)

In [12]:
def gen_batch_sequence(sql_data, table_data, idxes, start, end):
    """
    Gather the fields of the examples selected by idxes[start], ..., idxes[end - 1].

    Input: sql_data - list of query records.
           table_data - dictionary of table records keyed by table id.
           idxes - sequence of positions into sql_data.
           start, end - half-open range of positions into idxes.
    Output: a 7-tuple of parallel lists, one entry per selected example:
            tokenized questions, tokenized headers, number of columns,
            (agg, sel, number of conditions, first items of each condition,
            second items of each condition), tokenized queries, the raw
            condition lists, and (question, header, query) triples.
    """
    questions, headers_tok, col_counts = [], [], []
    answers, queries, cond_lists, raw_triples = [], [], [], []

    for pos in range(start, end):
        example = sql_data[idxes[pos]]
        table_id = example['table_id']
        questions.append(example['tokenized_question'])

        table = table_data[table_id]
        headers_tok.append(table['tokenized_header'])
        col_counts.append(len(table['header']))

        parsed = example['sql']
        agg, sel, conds = parsed['agg'], parsed['sel'], parsed['conds']
        first_items = tuple(cond[0] for cond in conds)
        second_items = tuple(cond[1] for cond in conds)
        answers.append((agg, sel, len(conds), first_items, second_items))

        queries.append(example['tokenized_query'])
        cond_lists.append(conds)
        raw_triples.append((example['question'], table['header'], example['query']))

    return (questions, headers_tok, col_counts, answers, queries, cond_lists, raw_triples)

-----


#### Load the BERT tokenizer and model

In [13]:
# Tokenizer and encoder are loaded from the same pre-trained checkpoint name.
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
bertModel = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Freeze the bertModel: turn off gradient tracking for every parameter.
for params in bertModel.parameters():
    params.requires_grad = False

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# Build a three-example batch from training examples 4, 23 and 654 to experiment with below.
q_seq,col_seq, num_col, answer_seq, query_seq,gt_cond,rd = gen_batch_sequence(train_sql,train_table,[4,23,654],0,3)

In [15]:
# Tokenized header of the first example in the sample batch: one token list per column.
col_seq[0]

[['order', 'year'],
 ['manufacturer'],
 ['model'],
 ['fleet', 'series', '(', 'quantity', ')'],
 ['powertrain', '(', 'engine/transmission', ')'],
 ['fuel', 'propulsion']]

In [16]:
# Flatten each example's header into a single token list, appending '[SEP]' after every column.
for i, (one_q, one_col) in enumerate(zip(q_seq, col_seq)):
    #print(f"{i}: {one_q} || {one_col}")
    one_col_all = [x for toks in one_col for x in toks + ['[SEP]'] ]
    print(one_col_all)

['order', 'year', '[SEP]', 'manufacturer', '[SEP]', 'model', '[SEP]', 'fleet', 'series', '(', 'quantity', ')', '[SEP]', 'powertrain', '(', 'engine/transmission', ')', '[SEP]', 'fuel', 'propulsion', '[SEP]']
['player', '[SEP]', 'no', '.', '[SEP]', 'nationality', '[SEP]', 'position', '[SEP]', 'years', 'in', 'toronto', '[SEP]', 'school/club', 'team', '[SEP]']
['#', '[SEP]', 'episode', '[SEP]', 'air', 'date', '[SEP]', 'rating', '[SEP]', 'share', '[SEP]', '18–49', '(', 'rating/share', ')', '[SEP]', 'viewers', '(', 'm', ')', '[SEP]', 'weekly', 'rank', '[SEP]']


In [17]:
# Encode a lone '[SEP]' token to see how the tokenizer treats it. In the output
# below, id 102 appears twice after the leading 101: once for this token and
# once for the special token added because add_special_tokens is True.
text = ['[SEP]']
encode = tokenizer.encode_plus(text=text,add_special_tokens = True,
                                #truncation=True,
                                return_token_type_ids = True,
                                padding = 'longest', 
                                return_attention_mask = True,
                                return_tensors = 'pt')
encode

{'input_ids': tensor([[101, 102, 102]]), 'token_type_ids': tensor([[0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1]])}

In [18]:
# Encode the questions alone: join each token list back into a string, pad the
# batch to its longest member, and run the frozen BERT model on the result.
print(q_seq)
batch_queries = [' '.join(x) for x in q_seq]
q_encode = tokenizer.batch_encode_plus(batch_text_or_text_pairs=batch_queries ,
                                #max_length = 60,
                                add_special_tokens = True,
                                #truncation=True,
                                return_token_type_ids = True,
                                padding = 'longest', 
                                return_attention_mask = True,
                                return_tensors = 'pt')
# NOTE: `bert_op` is reassigned by a later cell that encodes question + header.
bert_op = bertModel(**q_encode)
print(type(bert_op))
#print(q_encode["input_ids"])

[['how', 'many', 'times', 'is', 'the', 'fuel', 'propulsion', 'is', 'cng', '?'], ['what', 'number', 'is', 'the', 'player', 'that', 'played', '1998-2001'], ['what', 'is', 'the', 'episode', 'where', '18-49', 'has', 'a', 'rating/share', 'of', '3.5/9']]
<class 'transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions'>


In [19]:
# List the fields available on the model output.
print(bert_op.keys())

odict_keys(['last_hidden_state', 'pooler_output'])


In [20]:
'''
Converting This: [['player'],
 ['no', '.'],
 ['nationality'],
 ['position'],
 ['years', 'in', 'toronto'],
 ['school/club', 'team']]
 
 Into : player | no . | nationality | position | years in toronto | school/club team |
 
 for each element in the batch
'''

# NOTE: the example above draws '|' between columns, but the separator actually
# inserted by the code is the '[SEP]' token (see the printed string below).
separator = '[SEP]'
# One string per example: every column's tokens followed by the separator.
header_batch_list=list(
                   map(lambda col: ' '.join([x for tok in col for x in tok + [separator]]), col_seq)
                    )
print(header_batch_list[0])

# for col in col_seq:
#     print(' '.join([x for tok in col for x in tok + [separator]]))
# Each string already ends with a separator and add_special_tokens is on, which
# matches the two consecutive 102 ids before the padding in the output below.
col_encode = tokenizer.batch_encode_plus(batch_text_or_text_pairs= header_batch_list,
                                 # max_length = 60,
                                add_special_tokens = True,
                                #truncation=True,
                                return_token_type_ids = True,
                                padding = 'longest', 
                                return_attention_mask = True,
                                return_tensors = 'pt')
col_encode['input_ids']

order year [SEP] manufacturer [SEP] model [SEP] fleet series ( quantity ) [SEP] powertrain ( engine/transmission ) [SEP] fuel propulsion [SEP]


tensor([[  101,  2344,  2095,   102,  7751,   102,  2944,   102,  4170,  2186,
          1006, 11712,  1007,   102,  2373, 23654,  1006,  3194,  1013,  6726,
          1007,   102,  4762, 16404,   102,   102,     0,     0,     0,     0],
        [  101,  2447,   102,  2053,  1012,   102, 10662,   102,  2597,   102,
          2086,  1999,  4361,   102,  2082,  1013,  2252,  2136,   102,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  1001,   102,  2792,   102,  2250,  3058,   102,  5790,   102,
          3745,   102,  2324,  1516,  4749,  1006,  5790,  1013,  3745,  1007,
           102,  7193,  1006,  1049,  1007,   102,  4882,  4635,   102,   102]])

In [21]:
'''
Converting This: [['player'],
 ['no', '.'],
 ['nationality'],
 ['position'],
 ['years', 'in', 'toronto'],
 ['school/club', 'team']]
 
 Into : player [SEP] no . [SEP] nationality [SEP] position [SEP] years in toronto [SEP] school/club team [SEP]
 
 for each element in the batch
 
 and combining the question with the above table header
 Eg:
 ['what', 'number', 'is', 'the', 'player', 'that', 'played', '1998-2001']
     and
 ['player', '[SEP]', 'no', '.', '[SEP]', 'nationality', '[SEP]', 'position', '[SEP]', 'years', 'in', 'toronto', '[SEP]', 'school/club', 'team', '[SEP]']
 
 into: what number is the player that played 1998-2001 [SEP] player [SEP] no . [SEP] nationality [SEP] position [SEP] years in toronto [SEP] school/club team [SEP]
'''

separator = '[SEP]'
# Unlike the previous cell, each entry here is a flat token list, not a joined string.
header_batch_list=list(
                   map(lambda col: [x for tok in col for x in tok + [separator]], col_seq)
                    )
# print(header_batch_list[0])
# print(q_seq)

input_string = []
input_lengths = []
for i in range(len(header_batch_list)):
    # [:-1] drops the trailing separator of the flattened header.
    merged_list = q_seq[i] +[separator]+ header_batch_list[i][:-1]
    # Length in whitespace-separated tokens, measured before the BERT tokenizer
    # runs; the encoded rows below are longer than these counts.
    input_lengths.append(len(merged_list))
    input_string.append(' '.join(merged_list))

print(input_string)    
print(input_lengths)
tokenizer_args = { 'add_special_tokens' : True,
                   'return_token_type_ids' : True,
                   'padding' : 'longest', 
                   'return_attention_mask' : True,
                   'return_tensors' : 'pt'}
# for col in col_seq:
#     print(' '.join([x for tok in col for x in tok + [separator]]))
# Overwrites the header-only `col_encode` from the previous cell with the
# question + header encoding.
col_encode = tokenizer.batch_encode_plus(batch_text_or_text_pairs= input_string, **tokenizer_args)
col_encode['input_ids']

['how many times is the fuel propulsion is cng ? [SEP] order year [SEP] manufacturer [SEP] model [SEP] fleet series ( quantity ) [SEP] powertrain ( engine/transmission ) [SEP] fuel propulsion', 'what number is the player that played 1998-2001 [SEP] player [SEP] no . [SEP] nationality [SEP] position [SEP] years in toronto [SEP] school/club team', 'what is the episode where 18-49 has a rating/share of 3.5/9 [SEP] # [SEP] episode [SEP] air date [SEP] rating [SEP] share [SEP] 18–49 ( rating/share ) [SEP] viewers ( m ) [SEP] weekly rank']
[31, 24, 35]


tensor([[  101,  2129,  2116,  2335,  2003,  1996,  4762, 16404,  2003, 27166,
          2290,  1029,   102,  2344,  2095,   102,  7751,   102,  2944,   102,
          4170,  2186,  1006, 11712,  1007,   102,  2373, 23654,  1006,  3194,
          1013,  6726,  1007,   102,  4762, 16404,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2054,  2193,  2003,  1996,  2447,  2008,  2209,  2687,  1011,
          2541,   102,  2447,   102,  2053,  1012,   102, 10662,   102,  2597,
           102,  2086,  1999,  4361,   102,  2082,  1013,  2252,  2136,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2054,  2003,  1996,  2792,  2073,  2324,  1011,  4749,  2038,
          1037,  5790,  1013,  3745,  1997,  1017,  1012,  1019,  1013,  1023,
           102,  1001,   102,  2792,   102,  2250,  3058,   102,

In [22]:
# Indices of the batch ordered from the longest to the shortest input.
sort_perm = np.array(sorted(range(len(input_lengths)), key=lambda k: input_lengths[k], reverse=True))
print(input_lengths)
print(sort_perm)

[31, 24, 35]
[2 0 1]


In [23]:
# Reorder the batch with sort_perm and show its second entry.
# NOTE(review): at this point `bert_op` still holds the question-only encoding,
# while sort_perm was computed from the question + header lengths; confirm this
# pairing is intended.
bert_op.last_hidden_state[sort_perm][1]

tensor([[-0.0578,  0.2445,  0.3791,  ..., -0.2576,  0.5997,  0.4238],
        [ 0.4633,  0.2480, -0.1827,  ..., -0.1066,  0.4124,  0.4206],
        [ 0.4549, -0.4792,  0.5284,  ..., -0.2278,  0.3549, -0.0176],
        ...,
        [ 0.3265,  0.2410,  0.2214,  ...,  0.3473,  0.1328,  0.2675],
        [ 0.2006,  0.0144,  0.2987,  ...,  0.2660,  0.2688,  0.1953],
        [ 0.1585, -0.0806,  0.3209,  ...,  0.2431,  0.2044,  0.0794]])

In [24]:
# Shape of the padded question + header id tensor.
col_encode['input_ids'].shape

torch.Size([3, 49])

In [25]:
# Run BERT on the question + header encoding; this replaces the question-only `bert_op`.
bert_op = bertModel(**col_encode)


In [26]:
# Hidden states of the first example, then the shape of the whole batch output.
print(bert_op.last_hidden_state[0])
bert_op.last_hidden_state.shape

tensor([[-0.4704, -0.1517,  0.2459,  ..., -0.3243,  0.0083,  0.5512],
        [ 0.0544,  0.0891, -0.2584,  ...,  0.0502,  0.5777,  0.2143],
        [ 0.0880, -0.7884,  0.5079,  ...,  0.0542,  0.2422, -0.4345],
        ...,
        [-0.1529, -0.2589, -0.1672,  ..., -0.1298,  0.3174,  0.0245],
        [ 0.0591,  0.0180,  0.2980,  ..., -0.3232,  0.0631, -0.0081],
        [-0.1199, -0.2840, -0.0588,  ..., -0.1264,  0.2774,  0.0207]])


torch.Size([3, 49, 768])

In [27]:
# Names of the tensors returned by the tokenizer.
print(list(col_encode.keys()))

['input_ids', 'token_type_ids', 'attention_mask']


In [28]:
# Toy example of torch.cat: joining a (2, 3) and a (2, 4) tensor along dim=1
# yields a (2, 7) tensor. Values are random and no seed is set, so they change
# on every run.
x = torch.randn(2,3)
y = torch.randn(2,4)
z = torch.cat((x,y), dim=1)
print(f"X: {x}\nY: {y}\nZ: {z}")

X: tensor([[ 0.7017,  0.4421,  0.2288],
        [-0.9537,  1.6103,  0.4782]])
Y: tensor([[-0.5404,  1.4974,  0.4391,  0.5706],
        [-0.2123, -0.5529, -0.6614,  0.8204]])
Z: tensor([[ 0.7017,  0.4421,  0.2288, -0.5404,  1.4974,  0.4391,  0.5706],
        [-0.9537,  1.6103,  0.4782, -0.2123, -0.5529, -0.6614,  0.8204]])


In [29]:
# Concatenate the padded question ids with the question + header ids along the
# token axis.
# NOTE(review): `col_encode` already begins with the question, so each row of
# the result contains the question twice, with the first copy's padding in
# between (visible in the output below). Confirm this is only an experiment.
con_cat_string = torch.cat((q_encode['input_ids'],col_encode['input_ids']),dim = 1)
print(con_cat_string)

tensor([[  101,  2129,  2116,  2335,  2003,  1996,  4762, 16404,  2003, 27166,
          2290,  1029,   102,     0,     0,     0,     0,     0,     0,     0,
             0,   101,  2129,  2116,  2335,  2003,  1996,  4762, 16404,  2003,
         27166,  2290,  1029,   102,  2344,  2095,   102,  7751,   102,  2944,
           102,  4170,  2186,  1006, 11712,  1007,   102,  2373, 23654,  1006,
          3194,  1013,  6726,  1007,   102,  4762, 16404,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2054,  2193,  2003,  1996,  2447,  2008,  2209,  2687,  1011,
          2541,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,   101,  2054,  2193,  2003,  1996,  2447,  2008,  2209,  2687,
          1011,  2541,   102,  2447,   102,  2053,  1012,   102, 10662,   102,
          2597,   102,  2086,  1999,  4361,   102,  2082,  1013,  2252,  2136,
           102,     0,     0,     0,     0,     0, 

In [30]:
# Map the ids of the first encoded example back to tokens to inspect the word
# pieces, special tokens and padding.
tokenizer.convert_ids_to_tokens(col_encode['input_ids'][0])

['[CLS]',
 'how',
 'many',
 'times',
 'is',
 'the',
 'fuel',
 'propulsion',
 'is',
 'cn',
 '##g',
 '?',
 '[SEP]',
 'order',
 'year',
 '[SEP]',
 'manufacturer',
 '[SEP]',
 'model',
 '[SEP]',
 'fleet',
 'series',
 '(',
 'quantity',
 ')',
 '[SEP]',
 'power',
 '##train',
 '(',
 'engine',
 '/',
 'transmission',
 ')',
 '[SEP]',
 'fuel',
 'propulsion',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

In [31]:
## function to train the model
def epoch_train(model, optim, batch_size, sql_data, table_data):
    """
    Placeholder for one training epoch.

    Currently this only calls `model.train()`; `optim`, `batch_size`,
    `sql_data` and `table_data` are accepted but not used yet.
    Returns None.
    """
    model.train()

## WordEmbedding Module

In [32]:
#class WordEmbedding(nn.Module):
class WordEmbedding:
    """
    Encode (question, table header) pairs with a frozen pre-trained BERT model.

    This is a plain Python class, not an nn.Module. The BERT parameters have
    gradient tracking turned off in the constructor, and the model is moved to
    CUDA when it is available.
    """

    def __init__(self, bert_encoder, max_length=None, separator = '[SEP]'):
        '''
        Input: bert_encoder: name or path given to `from_pretrained` for both
                   the tokenizer and the model.
               max_length: optional fixed input length. When given, every input
                   is padded or truncated to exactly this many tokens;
                   otherwise each batch is padded to its longest member.
               separator: token placed between the question and the first
                   column, and between consecutive columns.
        '''
        self.bert_tokenizer = BertTokenizer.from_pretrained(bert_encoder)
        self.bert_model = BertModel.from_pretrained(bert_encoder)
        self.max_length = max_length
        self.gpu = torch.cuda.is_available()
        self.separator = separator
        # The attribute was originally misspelled; keep the old name as an alias
        # so existing code that reads it keeps working.
        self.seperator = separator

        self.bert_args = { 'add_special_tokens' : True,
                           'return_token_type_ids' : True,
                           'padding' : 'longest',
                           'return_attention_mask' : True,
                           'return_tensors' : 'pt'}

        if self.max_length is not None:
            self.bert_args['max_length'] = self.max_length
            self.bert_args['padding'] = 'max_length'
            # Without truncation, an input longer than max_length would keep its
            # full length and the batch could not be stacked into one tensor.
            self.bert_args['truncation'] = True

        for param in self.bert_model.parameters():
            param.requires_grad = False
        assert not any(param.requires_grad for param in self.bert_model.parameters())

        if self.gpu:
            self.bert_model = self.bert_model.to('cuda')

    def gen_x_batch(self,q_batch,col_batch):
        '''
        Input: q_batch: list of tokenized query string i.e. List[List].
               col_batch: list of tokenized headers of the corresponding tables. List[List[List]]
        Output: bert_op: whatever `self.bert_model` returns for the encoded
                batch (for BertModel, the notebook output shows the fields
                'last_hidden_state' and 'pooler_output').
        Raises: ValueError if the batch is empty or the two lists differ in length.
        '''
        if len(q_batch) == 0:
            raise ValueError("gen_x_batch received an empty batch")
        if len(q_batch) != len(col_batch):
            raise ValueError(
                f"q_batch has {len(q_batch)} entries but col_batch has {len(col_batch)}"
            )

        # Use the separator configured on this instance, not a notebook global.
        sep = self.separator
        input_string_list = []
        for question, columns in zip(q_batch, col_batch):
            # "<question> SEP <col 1> SEP <col 2> ..." with no trailing
            # separator after the last column.
            header_tokens = []
            for position, col_tokens in enumerate(columns):
                if position:
                    header_tokens.append(sep)
                header_tokens.extend(col_tokens)
            input_string_list.append(' '.join(list(question) + [sep] + header_tokens))

        inp_encode = self.bert_tokenizer.batch_encode_plus(batch_text_or_text_pairs=input_string_list ,**self.bert_args)
        if self.gpu:
            for key in list(inp_encode.keys()):
                inp_encode[key] = inp_encode[key].to('cuda')
        bert_op = self.bert_model(**inp_encode)

        # odict_keys(['last_hidden_state', 'pooler_output'])
        return bert_op
        

In [33]:
#train_sql, train_table = load_data_set(data_set)
#def test_wordembed_module(data_set, batch_size = 32):
def test_wordembed_module(train_sql, train_table, batch_size=32):
    """
    Smoke-test WordEmbedding on every example of a dataset.

    The examples are visited in a random order, `batch_size` at a time. Each
    batch is encoded and the output sizes are checked: both outputs must have
    768 features and at most `batch_size` rows. Every tenth batch prints its
    range, the output sizes and the output tensors.

    Input: train_sql - list of query records.
           train_table - dictionary of table records keyed by table id.
           batch_size - number of examples per batch (the last batch may be smaller).
    """
    embedder = WordEmbedding('bert-base-uncased')
    total = len(train_sql)
    order = np.random.permutation(total)

    batch_no = 0
    start = 0
    while start < total:
        end = min(start + batch_size, total)
        batch = gen_batch_sequence(train_sql, train_table, order, start, end)
        questions, headers = batch[0], batch[1]
        if questions == []:
            print(start, end)

        encoded = embedder.gen_x_batch(questions, headers)
        last_hidden_state = encoded.last_hidden_state
        pooler_output = encoded.pooler_output

        assert last_hidden_state.size()[2] == 768
        assert last_hidden_state.size()[0] <= batch_size
        assert pooler_output.size()[1] == 768
        assert pooler_output.size()[0] <= batch_size

        if batch_no % 10 == 0:
            print(f"\ni={batch_no}\nSanity Check!!\n")
            print(f"Start: {start}\nEnd: {end}\n")
            print(
                f"Size of LastHidden Layer Size:{last_hidden_state.size()}\nPooler output size:{pooler_output.size()}\n"
                f"            LastHidden Layer: {last_hidden_state}\nPooler output: {pooler_output}\n"
            )

        batch_no += 1
        start = end
    
# Run the smoke test on the training split loaded above, with the default batch size.
test_wordembed_module(train_sql,train_table)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



i=0
Sanity Check!!

Start: 0
End: 32

Size of LastHidden Layer Size:torch.Size([32, 67, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.6544,  0.1763, -0.0774,  ..., -0.1759,  0.3545,  0.6723],
         [-0.6663, -0.3331,  0.3918,  ...,  0.0515,  0.7160, -0.5079],
         [ 0.0083,  0.0929,  0.1484,  ..., -0.2990, -0.1425, -0.4201],
         ...,
         [-0.9517, -0.2704, -0.1495,  ...,  0.2189,  0.3806,  0.2235],
         [-0.9437, -0.2753, -0.0475,  ...,  0.1511,  0.3644,  0.2012],
         [-0.8771, -0.2010,  0.0540,  ...,  0.2396,  0.4265,  0.1825]],

        [[-0.7824, -0.0397, -0.0327,  ..., -0.6223,  0.1949,  0.7265],
         [-0.5587, -0.1078, -0.4789,  ...,  0.0247,  0.3517, -0.5973],
         [-0.9256, -0.9540, -0.2178,  ...,  0.1710, -0.5906,  0.4124],
         ...,
         [-0.5134, -0.0889,  0.0447,  ..., -0.2877,  0.1543,  0.4718],
         [-0.7539, -0.0421, -0.0520,  ..., -0.3541,  0.1782,  0.5109],
         [-0.7074, -0.0


i=30
Sanity Check!!

Start: 960
End: 992

Size of LastHidden Layer Size:torch.Size([32, 63, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-4.8250e-01,  3.0641e-01, -1.8792e-01,  ..., -3.4128e-01,
           1.1040e-01,  6.5085e-01],
         [-5.0780e-01,  2.7914e-02, -4.6818e-01,  ...,  2.8315e-01,
           3.0015e-03, -1.3070e-01],
         [-3.4365e-01, -2.9274e-02, -3.3346e-01,  ..., -3.9273e-01,
          -2.1480e-01, -1.2021e+00],
         ...,
         [ 8.5065e-02, -9.6070e-02, -5.8596e-02,  ...,  3.8597e-02,
           1.9581e-01, -1.1540e-01],
         [-6.6192e-02,  2.2644e-02, -8.3045e-02,  ..., -2.8487e-02,
           6.1404e-02,  1.1860e-03],
         [-1.0697e-01, -6.6165e-02, -7.4189e-02,  ..., -1.1979e-01,
          -5.7429e-02, -1.6055e-01]],

        [[-6.4885e-01,  2.4711e-01, -4.4432e-02,  ..., -5.0441e-01,
           3.3087e-01,  5.7659e-01],
         [-6.0321e-01, -3.6916e-02,  3.4033e-01,  ...,  2.9304e-01,
           


i=50
Sanity Check!!

Start: 1600
End: 1632

Size of LastHidden Layer Size:torch.Size([32, 65, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.6655, -0.1104,  0.0052,  ..., -0.5679, -0.0335,  0.4598],
         [-0.7863,  0.1062,  0.4520,  ..., -0.0572,  0.7872, -0.1568],
         [-1.1098, -0.0267,  0.3401,  ..., -0.2735,  0.3232, -0.3017],
         ...,
         [-0.6829,  0.0628, -0.3940,  ..., -0.1659,  0.1507,  0.1710],
         [-0.5380, -0.3249,  0.1763,  ..., -0.2522,  0.1322,  0.0969],
         [-0.6480,  0.0035, -0.4986,  ..., -0.0430,  0.3606,  0.1168]],

        [[-0.9729, -0.2691, -0.0927,  ..., -0.2892,  0.4451,  0.7431],
         [-0.9952, -0.2738,  0.4534,  ..., -0.0809,  0.3953, -0.0135],
         [-0.8419,  0.3287, -0.4063,  ..., -1.0277, -0.4503,  0.4446],
         ...,
         [-0.7841, -0.2444, -0.0218,  ..., -0.1421,  0.1883,  0.2540],
         [-0.2802, -0.0548,  0.0064,  ...,  0.0764,  0.1111,  0.1645],
         [-0.3485


i=80
Sanity Check!!

Start: 2560
End: 2592

Size of LastHidden Layer Size:torch.Size([32, 54, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.7106,  0.2916,  0.0200,  ..., -0.5641,  0.2253,  0.6684],
         [-0.5073,  0.0120, -0.0700,  ..., -0.0720,  0.0440, -0.3624],
         [-0.5261, -0.1712, -0.2129,  ..., -0.2552, -0.5007,  0.4942],
         ...,
         [-0.6103,  0.2189,  0.1630,  ..., -0.1194,  0.1482,  0.1868],
         [-0.7592,  0.0770, -0.1229,  ..., -0.1041,  0.2830,  0.2881],
         [-0.6621,  0.2316,  0.1960,  ..., -0.1098,  0.1640,  0.1628]],

        [[-0.9021, -0.1009, -0.1488,  ..., -0.3801,  0.2503,  0.7511],
         [-0.0695,  0.2081, -0.4150,  ...,  0.2271,  0.7234,  0.0318],
         [-1.0229, -0.9129, -0.0829,  ..., -0.0282,  0.0384,  0.3136],
         ...,
         [-0.2729,  0.1720, -0.1481,  ..., -0.0356, -0.3312,  0.6410],
         [-0.3786,  0.1456, -0.0037,  ..., -0.0830, -0.2518,  0.5861],
         [-0.8359


i=110
Sanity Check!!

Start: 3520
End: 3552

Size of LastHidden Layer Size:torch.Size([32, 65, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-7.8744e-01, -6.5829e-01,  7.9162e-02,  ..., -5.7185e-01,
           3.1987e-01,  6.6133e-01],
         [-4.7247e-01, -7.6275e-01, -6.3956e-01,  ...,  2.2838e-01,
           6.8348e-01, -5.1802e-01],
         [-9.4420e-01, -7.9984e-01,  6.4435e-01,  ..., -1.1825e-01,
           3.8240e-01, -5.1363e-02],
         ...,
         [-3.6194e-01, -1.7337e-01,  4.6551e-01,  ...,  2.1366e-02,
           8.1949e-02,  2.2231e-01],
         [-2.4440e-01, -6.8566e-01,  3.0970e-02,  ...,  2.1470e-01,
          -7.1505e-03, -8.1751e-02],
         [-3.0408e-01, -2.2205e-01,  4.7771e-01,  ..., -3.5451e-02,
           8.3040e-02,  2.4144e-01]],

        [[-4.1975e-01,  2.1394e-01,  4.0870e-01,  ..., -4.4266e-01,
           2.0367e-01,  6.9864e-01],
         [-2.3704e-01,  1.1989e-01, -1.9770e-01,  ...,  1.4869e-01,
        


i=130
Sanity Check!!

Start: 4160
End: 4192

Size of LastHidden Layer Size:torch.Size([32, 73, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.5183,  0.1778,  0.4014,  ..., -0.1838,  0.2109,  0.6424],
         [ 0.0087,  0.1525, -0.2785,  ...,  0.3053,  0.6374, -0.1929],
         [-0.3339, -0.3729,  0.1007,  ...,  0.3984, -0.5759,  0.4431],
         ...,
         [-0.9405, -0.0893, -0.3348,  ...,  0.1340,  0.6362,  0.2349],
         [-0.7252,  0.0160, -0.2474,  ...,  0.0522,  0.4437, -0.0327],
         [-0.8509, -0.0177, -0.2901,  ...,  0.0956,  0.5830,  0.1988]],

        [[-0.3994,  0.0800,  0.1591,  ..., -0.5285,  0.0929,  0.4767],
         [-1.1025, -0.2821, -0.1114,  ..., -0.3353,  0.0413,  0.5910],
         [-0.8313, -0.4825,  0.7334,  ...,  0.1044,  0.2419, -0.2507],
         ...,
         [-0.6274, -0.2133,  0.0998,  ..., -0.3631,  0.0691,  0.2109],
         [-0.5783, -0.0819,  0.2197,  ..., -0.2845, -0.0336,  0.0315],
         [-0.617


i=160
Sanity Check!!

Start: 5120
End: 5152

Size of LastHidden Layer Size:torch.Size([32, 63, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.5154,  0.1846, -0.1332,  ..., -0.4628,  0.0095,  0.6848],
         [-0.4385, -0.0258, -0.1169,  ...,  0.1905,  0.0585, -0.4031],
         [-0.0530, -0.4451, -0.3632,  ..., -0.0037, -0.6789,  0.4187],
         ...,
         [-0.3418, -0.1270,  0.3109,  ..., -0.0838, -0.1058,  0.1816],
         [-0.3145, -0.1702,  0.2714,  ..., -0.1717, -0.0714,  0.2474],
         [-0.3632, -0.1317,  0.0294,  ..., -0.1573,  0.0783,  0.2690]],

        [[-0.4776,  0.0997,  0.1340,  ..., -0.1651,  0.4404,  0.5136],
         [-0.1036,  0.0702, -0.1528,  ...,  0.0801,  0.8248, -0.2189],
         [-0.1498, -0.6552, -0.4859,  ...,  0.2859,  0.3606,  0.3500],
         ...,
         [-0.1493, -0.4183,  0.3924,  ...,  0.0769,  0.5445,  0.1846],
         [-0.3478, -0.0598,  0.1502,  ...,  0.0114,  0.4163,  0.1934],
         [-0.299


i=190
Sanity Check!!

Start: 6080
End: 6112

Size of LastHidden Layer Size:torch.Size([32, 106, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.6486e-01, -9.6664e-03, -4.0180e-01,  ..., -5.4172e-01,
          -3.1315e-02,  6.2706e-01],
         [-1.1220e-01, -7.4371e-02, -3.7634e-01,  ...,  1.2972e-02,
           2.5879e-02,  7.8749e-02],
         [-2.0305e-01, -7.5597e-01, -3.2899e-01,  ..., -2.8869e-01,
          -3.4807e-01,  5.8388e-02],
         ...,
         [-5.9063e-01, -2.5051e-01, -5.8845e-01,  ...,  1.4929e-01,
          -6.3785e-02,  5.3506e-01],
         [-4.0864e-01, -5.1599e-01, -2.6577e-01,  ...,  5.0970e-02,
          -4.2284e-01,  3.0388e-01],
         [-6.2299e-01, -1.8831e-01, -4.4749e-01,  ...,  1.0028e-01,
          -2.2488e-01,  6.2438e-01]],

        [[-9.9521e-01, -2.4038e-01, -1.2455e-01,  ..., -1.8489e-03,
           3.4978e-01,  5.7734e-01],
         [-5.7769e-01, -2.5797e-01, -8.0803e-02,  ...,  4.2803e-04,
       


i=210
Sanity Check!!

Start: 6720
End: 6752

Size of LastHidden Layer Size:torch.Size([32, 80, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-5.8878e-01,  1.4640e-02, -1.2933e-01,  ..., -4.2932e-01,
           1.5445e-02,  8.3867e-01],
         [-3.1739e-01,  1.1470e-01, -3.7646e-01,  ...,  5.2762e-01,
           1.5049e-01, -2.3638e-01],
         [-7.2468e-02, -6.3404e-01, -2.5631e-01,  ...,  3.0938e-02,
          -5.5309e-01,  1.1991e+00],
         ...,
         [-6.1202e-01, -4.4113e-01, -1.1862e-02,  ...,  2.2220e-02,
          -2.2108e-01,  3.0238e-01],
         [-2.6328e-01, -6.4517e-02,  1.9324e-01,  ..., -3.1076e-02,
          -3.5139e-01,  2.4813e-01],
         [-4.7583e-01, -3.4741e-01,  1.2751e-01,  ..., -6.5509e-02,
          -3.5368e-01,  3.2202e-01]],

        [[-6.0836e-01,  5.2222e-02, -1.0164e-01,  ..., -6.1873e-01,
           2.1357e-01,  6.2955e-01],
         [-1.5922e-01, -2.0577e-01, -1.3190e-01,  ...,  9.4161e-02,
        


i=240
Sanity Check!!

Start: 7680
End: 7712

Size of LastHidden Layer Size:torch.Size([32, 69, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-7.2036e-01, -8.2321e-02,  1.8743e-02,  ..., -4.1983e-01,
           4.6255e-02,  4.6361e-01],
         [-1.8132e-01,  2.5457e-02, -3.3861e-01,  ...,  1.8428e-01,
           4.9951e-01, -3.9192e-02],
         [-7.2695e-01, -7.9376e-01, -2.7185e-01,  ...,  1.9565e-01,
           7.5550e-02,  1.9363e-01],
         ...,
         [-2.9376e-02, -1.5518e-01,  5.9072e-01,  ..., -6.2033e-02,
          -5.9472e-02, -8.6379e-02],
         [-6.6445e-02, -2.2476e-01,  5.7061e-01,  ..., -8.6170e-02,
          -1.3331e-02,  1.6166e-02],
         [-2.5421e-01, -4.6175e-01, -2.4202e-01,  ...,  8.5062e-02,
           3.9872e-01, -1.1004e-01]],

        [[-4.5240e-01,  1.9977e-01, -2.9100e-01,  ..., -4.0320e-01,
           4.0687e-02,  8.8993e-01],
         [-3.6599e-01,  5.7631e-02, -5.5171e-01,  ...,  1.7346e-01,
        


i=260
Sanity Check!!

Start: 8320
End: 8352

Size of LastHidden Layer Size:torch.Size([32, 66, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.6446,  0.2056,  0.2623,  ..., -0.1862,  0.3153,  0.7798],
         [ 0.0164,  0.1102, -0.2113,  ...,  0.1709,  0.9146,  0.4173],
         [-0.5031, -0.4582,  0.6620,  ..., -0.0706,  0.8978,  0.0746],
         ...,
         [-1.2441,  0.0594,  0.3696,  ...,  0.0707,  0.6920,  0.0231],
         [ 0.2232, -0.1208,  0.2135,  ..., -0.4139,  0.1751,  0.0171],
         [ 0.5909,  0.2206, -0.0706,  ...,  0.1443, -0.4801, -0.3163]],

        [[-0.9824, -0.1078, -0.1567,  ..., -0.5332,  0.3663,  0.6928],
         [-0.5176, -0.1259, -0.5978,  ..., -0.1016,  0.8224, -0.2818],
         [-0.5809, -0.6578, -0.3824,  ..., -0.4860, -0.0529,  0.6465],
         ...,
         [-0.7383, -0.2385, -0.2253,  ...,  0.1071,  0.3387,  0.3938],
         [-0.5693, -0.1536,  0.2124,  ..., -0.2009,  0.4381,  0.6790],
         [-0.475


i=290
Sanity Check!!

Start: 9280
End: 9312

Size of LastHidden Layer Size:torch.Size([32, 79, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-7.2358e-01,  9.6913e-02, -3.3328e-01,  ..., -5.7797e-01,
          -1.5562e-01,  6.7621e-01],
         [-7.3952e-01,  8.4995e-02, -3.8188e-01,  ...,  7.2847e-03,
          -3.3560e-01, -4.0583e-01],
         [-9.8045e-01,  1.3405e-01, -5.7386e-02,  ..., -4.8138e-01,
          -3.6455e-01,  5.3562e-01],
         ...,
         [-4.8390e-01, -2.1700e-01, -3.0873e-01,  ..., -7.4060e-02,
          -4.9563e-01,  1.7564e-01],
         [-7.0461e-01, -2.8499e-01, -6.5214e-01,  ..., -1.1630e-04,
          -1.6083e-01,  4.8819e-01],
         [-8.1029e-01, -3.7945e-01, -6.0406e-01,  ..., -1.1547e-01,
          -2.1085e-01,  5.5048e-01]],

        [[-5.2019e-01, -6.0232e-01,  2.7475e-01,  ..., -2.8102e-01,
           1.9722e-01,  8.3594e-01],
         [-3.5611e-03, -2.7076e-01, -1.3316e-01,  ...,  5.2563e-02,
        


i=320
Sanity Check!!

Start: 10240
End: 10272

Size of LastHidden Layer Size:torch.Size([32, 51, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.6257, -0.1306, -0.0982,  ..., -0.5181,  0.1724,  0.5468],
         [-0.1390, -0.0632, -0.5378,  ...,  0.2271,  0.3382, -0.1145],
         [-0.6480, -1.2413,  0.5555,  ...,  0.0157, -0.0349,  0.0773],
         ...,
         [-0.6056, -0.1468, -0.0363,  ..., -0.0274, -0.0552,  0.3150],
         [-0.5809, -0.1949,  0.1554,  ..., -0.1374, -0.1414,  0.2921],
         [-0.5113, -0.0884,  0.0296,  ..., -0.0351, -0.0493,  0.3312]],

        [[-0.6732, -0.5383, -0.1167,  ..., -0.0375,  0.1428,  0.4354],
         [-0.3263, -0.5342, -0.5709,  ...,  0.1274,  0.0874, -0.5796],
         [-0.5050, -0.5725,  0.1207,  ..., -0.2399,  0.0064, -0.0611],
         ...,
         [-0.1271, -0.3037,  0.0158,  ...,  0.1497,  0.3264,  0.0212],
         [-0.1998, -0.3244, -0.0601,  ...,  0.1944,  0.3080,  0.1572],
         [-0.0


i=350
Sanity Check!!

Start: 11200
End: 11232

Size of LastHidden Layer Size:torch.Size([32, 62, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-4.7836e-01, -4.2136e-02, -1.4850e-02,  ..., -3.4994e-01,
           1.6469e-01,  4.2705e-01],
         [ 3.5652e-01,  3.2122e-01, -4.7606e-01,  ..., -6.3397e-01,
           6.4985e-01,  6.8086e-01],
         [-1.0340e+00, -3.1993e-02, -3.5375e-01,  ..., -7.7124e-01,
           1.8546e-01,  9.0902e-02],
         ...,
         [-3.3273e-01, -1.5511e-02,  1.3103e-01,  ..., -1.2009e-01,
           2.7219e-01,  1.4378e-01],
         [-3.6579e-01, -8.1302e-02,  1.5436e-01,  ..., -9.3683e-02,
           2.7716e-01, -3.8173e-02],
         [-2.8875e-01,  2.8363e-02,  1.9354e-01,  ..., -3.0347e-01,
           2.5991e-01,  1.0980e-01]],

        [[-6.2695e-01,  3.3978e-01,  9.2654e-02,  ..., -4.9175e-01,
           2.7325e-01,  5.1470e-01],
         [-5.7881e-01, -3.8549e-01,  2.1506e-01,  ...,  5.8247e-01,
      


i=370
Sanity Check!!

Start: 11840
End: 11872

Size of LastHidden Layer Size:torch.Size([32, 73, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.8284, -0.0948, -0.1975,  ..., -0.4572,  0.2128,  0.7586],
         [-1.0498,  0.0907, -0.4129,  ..., -0.5953,  0.8395,  0.4497],
         [-0.9536, -0.4527,  0.1310,  ..., -0.1738,  0.1438,  0.1199],
         ...,
         [-1.1236, -0.0703,  0.0549,  ..., -0.1222,  0.1159,  0.7175],
         [-0.9840, -0.1634,  0.0055,  ..., -0.1151,  0.0347,  0.6669],
         [-1.0575, -0.0924,  0.2049,  ..., -0.1310,  0.0237,  0.5372]],

        [[-0.6484, -0.0880,  0.0055,  ..., -0.5838,  0.2235,  0.7765],
         [-0.4526, -0.2714, -0.5831,  ..., -0.0437,  0.8217,  0.3647],
         [ 0.1720, -0.7354, -0.6153,  ...,  0.5275, -0.3171, -0.1742],
         ...,
         [-0.8537,  0.1173, -0.0061,  ..., -0.1880, -0.0327,  0.6746],
         [-0.8854,  0.0863,  0.1791,  ..., -0.1360, -0.0448,  0.4803],
         [-0.4


i=400
Sanity Check!!

Start: 12800
End: 12832

Size of LastHidden Layer Size:torch.Size([32, 157, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-7.5336e-01, -7.9393e-02,  1.4092e-01,  ..., -4.3536e-01,
           2.8206e-01,  1.0704e+00],
         [-1.3320e+00,  2.0237e-01, -4.1544e-01,  ..., -6.4812e-02,
           4.3589e-01,  5.0323e-02],
         [-8.2579e-01, -2.8702e-01,  3.9109e-01,  ..., -2.3783e-01,
           5.1469e-02, -1.2131e-01],
         ...,
         [-7.1417e-01, -1.4054e-01, -1.4926e-01,  ...,  3.2812e-01,
           4.3755e-01,  5.5819e-01],
         [-5.9507e-01, -4.8992e-02,  3.3455e-01,  ...,  2.2682e-01,
           3.2286e-01,  5.8980e-01],
         [-5.1784e-01, -1.4000e-02,  1.9121e-01,  ...,  3.9831e-01,
           3.8110e-01,  4.7147e-01]],

        [[-6.4956e-01,  1.5274e-01, -1.3106e-01,  ..., -6.4578e-01,
           7.3786e-02,  6.7631e-01],
         [-3.7207e-01, -4.8302e-02, -8.5486e-03,  ..., -2.0063e-01,
     


i=420
Sanity Check!!

Start: 13440
End: 13472

Size of LastHidden Layer Size:torch.Size([32, 87, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-1.0229e+00, -6.2088e-01, -7.4822e-02,  ..., -2.3288e-01,
          -1.1014e-01,  7.8447e-01],
         [-3.5020e-02,  5.6579e-01, -3.6392e-01,  ..., -3.7057e-02,
           2.8819e-01,  5.7251e-01],
         [-7.7835e-01,  5.4278e-01, -7.1000e-01,  ..., -2.4642e-01,
          -2.4851e-01,  4.7915e-01],
         ...,
         [-9.9867e-01,  1.0193e-02, -5.1722e-01,  ...,  7.5485e-02,
           1.0920e-01,  8.1876e-02],
         [-1.0115e+00,  3.4564e-02, -4.6214e-01,  ...,  1.0930e-01,
           1.1017e-01,  9.6574e-02],
         [-8.5966e-01, -2.2405e-02,  2.2693e-01,  ..., -1.9861e-01,
          -2.4457e-01,  5.3571e-02]],

        [[-6.4363e-01,  3.1688e-02,  1.0253e-01,  ..., -2.8949e-01,
           3.7047e-02,  3.7897e-01],
         [-8.1277e-01,  1.0759e-01, -1.9506e-01,  ...,  2.6632e-01,
      


i=450
Sanity Check!!

Start: 14400
End: 14432

Size of LastHidden Layer Size:torch.Size([32, 70, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-4.9411e-01, -9.0705e-03, -6.1931e-03,  ..., -3.1465e-01,
           4.7208e-01,  4.8580e-01],
         [-2.0093e-01,  1.5919e-01, -6.4522e-01,  ...,  2.4033e-01,
           4.3840e-01, -4.7699e-01],
         [-6.6151e-01,  1.1900e-01, -1.9804e-01,  ..., -3.2953e-02,
          -1.7790e-01, -1.9002e-01],
         ...,
         [ 2.1046e-01, -1.4188e-04,  4.3260e-01,  ...,  2.2942e-02,
           2.0951e-01, -9.9417e-02],
         [-1.6965e-01, -6.8797e-02,  5.1491e-02,  ...,  2.4028e-01,
           2.0248e-01,  9.1105e-03],
         [-2.6190e-01,  3.2382e-01,  1.1686e-01,  ...,  4.7529e-01,
           2.8768e-01, -2.7627e-02]],

        [[-7.4042e-01, -6.3020e-02, -1.1922e-01,  ..., -5.0777e-01,
           4.7810e-01,  6.3372e-01],
         [-3.3422e-01, -5.8386e-02, -6.0272e-01,  ...,  2.6079e-01,
      


i=480
Sanity Check!!

Start: 15360
End: 15392

Size of LastHidden Layer Size:torch.Size([32, 76, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.4074,  0.1622,  0.0246,  ..., -0.3775, -0.0852,  0.7205],
         [ 0.1871,  0.2394, -0.1570,  ..., -0.3463,  0.6545,  0.3020],
         [-1.0902,  0.0349,  0.3767,  ..., -0.0520,  0.5391,  0.1205],
         ...,
         [-0.3405, -0.1537,  0.0212,  ...,  0.0415, -0.0655,  0.0992],
         [-0.3573, -0.2714, -0.0484,  ...,  0.1486, -0.1153,  0.0886],
         [-0.3705, -0.3240, -0.0772,  ...,  0.0963,  0.1768,  0.0249]],

        [[-0.4834,  0.1407,  0.1929,  ..., -0.1840,  0.1091,  0.6879],
         [-0.9774,  0.0871,  0.5783,  ..., -0.0291,  0.6503,  0.0798],
         [-0.1533, -0.0886,  0.2792,  ..., -0.0734, -0.0945,  0.4756],
         ...,
         [-0.8166, -0.2514,  0.1202,  ...,  0.2441,  0.5425,  0.3709],
         [-0.7095, -0.1606,  0.1995,  ...,  0.1243,  0.4553,  0.3673],
         [-0.7


i=510
Sanity Check!!

Start: 16320
End: 16352

Size of LastHidden Layer Size:torch.Size([32, 53, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.5715e-01, -8.7918e-02, -2.1813e-01,  ..., -5.1912e-01,
           1.7754e-01,  6.4598e-01],
         [-1.8966e-02, -1.5254e-01, -5.3028e-01,  ...,  1.7634e-01,
           1.7252e-01,  2.5443e-01],
         [-1.4451e-01, -5.1548e-01,  1.1425e-01,  ..., -1.3466e-03,
           2.5978e-01,  3.5424e-01],
         ...,
         [-6.0659e-01, -2.9674e-01, -2.1943e-01,  ..., -4.1108e-02,
           2.6882e-01,  4.4356e-01],
         [-6.5886e-01, -2.9680e-01, -2.9442e-01,  ...,  8.7556e-02,
           4.3240e-01,  4.0438e-01],
         [-6.6592e-01, -3.2528e-01, -3.3044e-01,  ...,  6.9426e-02,
           4.1867e-01,  3.6082e-01]],

        [[-4.3944e-01,  2.5199e-02,  1.0244e-01,  ..., -3.5370e-01,
           2.2836e-01,  7.8099e-01],
         [ 4.7694e-01,  3.9115e-01, -1.4811e-01,  ..., -2.3310e-02,
      


i=530
Sanity Check!!

Start: 16960
End: 16992

Size of LastHidden Layer Size:torch.Size([32, 54, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.7542,  0.0033, -0.0573,  ..., -0.2506,  0.1430,  0.5222],
         [-0.9670, -0.4293,  0.3533,  ..., -0.1744,  0.6891,  0.0272],
         [-1.2715, -0.2418,  0.1069,  ...,  0.0062,  0.1988,  0.0821],
         ...,
         [-0.7463, -0.3394, -0.0371,  ...,  0.2264,  0.4703,  0.3129],
         [-0.7449, -0.1859,  0.0137,  ...,  0.0174,  0.3417,  0.4012],
         [-0.7550, -0.2949, -0.0835,  ...,  0.1375,  0.4629,  0.3282]],

        [[-0.7889, -0.6095, -0.3345,  ..., -0.0227,  0.0501,  0.7520],
         [ 0.3178, -0.3925, -0.4277,  ..., -0.0153,  0.4599,  0.5804],
         [-0.7159,  0.1326, -0.8179,  ..., -0.2647, -1.1056,  0.3668],
         ...,
         [-0.4935, -0.4214, -0.1498,  ...,  0.1328,  0.0605,  0.2005],
         [-0.3746, -0.3161, -0.0040,  ..., -0.1877,  0.0152,  0.2728],
         [ 0.2


i=560
Sanity Check!!

Start: 17920
End: 17952

Size of LastHidden Layer Size:torch.Size([32, 63, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-1.7368e-01,  1.4560e-02,  4.7910e-02,  ..., -5.7962e-01,
           2.3045e-01,  6.0330e-01],
         [-2.3579e-01, -2.7237e-01, -3.2412e-01,  ..., -1.0958e-02,
           3.6087e-01, -2.8241e-01],
         [-2.0728e-01, -8.4258e-01, -5.1371e-04,  ..., -2.0510e-01,
           3.1879e-01,  5.3414e-01],
         ...,
         [-8.3722e-02,  8.7403e-02,  4.9732e-01,  ..., -1.3933e-01,
           5.7755e-01, -5.5962e-02],
         [-1.4532e-01,  1.1269e-01,  1.4471e-01,  ..., -2.3983e-01,
           3.9691e-01,  1.8180e-01],
         [-2.6400e-01, -2.1553e-01,  1.2981e-01,  ..., -3.8016e-01,
           5.1338e-01,  2.3265e-01]],

        [[-4.9803e-01,  9.7129e-02, -2.7229e-01,  ..., -2.1478e-01,
           1.1545e-01,  5.7420e-01],
         [-3.3785e-01, -4.7124e-02, -2.2153e-01,  ..., -9.3102e-02,
      


i=590
Sanity Check!!

Start: 18880
End: 18912

Size of LastHidden Layer Size:torch.Size([32, 73, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.6368, -0.0211,  0.0603,  ..., -0.3585,  0.3406,  0.7414],
         [-0.8024, -0.2266,  0.5016,  ...,  0.0765,  0.4678, -0.1432],
         [-0.6453, -0.3865,  0.3542,  ..., -0.5052,  0.1045,  0.5412],
         ...,
         [-0.8978, -0.1722,  0.0422,  ...,  0.0705,  0.4161,  0.5131],
         [-0.8975, -0.2015, -0.0279,  ...,  0.1242,  0.5131,  0.4389],
         [-0.9073, -0.1953,  0.0284,  ...,  0.0471,  0.4368,  0.4912]],

        [[-0.7123, -0.1914, -0.1102,  ..., -0.5869,  0.1738,  0.6746],
         [-0.0897,  0.2311, -0.7076,  ...,  0.3869,  0.1522, -0.3682],
         [-0.6047, -0.8501, -0.5145,  ...,  0.4342, -0.8413,  0.1316],
         ...,
         [-0.8354, -0.1942, -0.2070,  ..., -0.3060,  0.1676,  0.6509],
         [-0.3586, -0.0679, -0.2133,  ..., -0.1244,  0.2174,  0.5570],
         [-0.7

KeyboardInterrupt: 

In [None]:
# Load one dataset split with the project helper and unpack its two return
# values into module-level names that the following cell passes on.
# NOTE(review): the split requested is 'test', yet the results are bound to
# train_* names -- confirm which split is actually intended here.
# NOTE(review): the structure of the two returned objects is not visible in
# this notebook; presumably query records and table metadata -- verify against
# utils.utils.load_data_set.
# NOTE(review): this import sits mid-notebook rather than in the top import
# cell, so a fresh kernel only gets `load_data_set` once this cell has run.
from utils.utils import load_data_set
train_sql, train_table = load_data_set('test')

In [2]:
# Call test_wordembed_module (imported from model.wordEmbedding) on the two
# objects produced by load_data_set.
# train_sql and train_table are NOT defined in this cell: they must already
# exist in the kernel, i.e. the preceding cell has to be run first.
# NOTE(review): in the saved notebook the preceding cell shows `In [None]`
# (no recorded execution) while this one shows `In [2]`, so the stored output
# came from kernel state that the notebook no longer reproduces top-to-bottom.
# Verify with Restart Kernel -> Run All.
# NOTE(review): the recorded output of this cell prints, every few batches, a
# batch index, start/end offsets, tensor sizes and a truncated tensor dump --
# presumably emitted inside the helper. Consider clearing or trimming that
# output before committing; it dominates the notebook file.
from model.wordEmbedding import test_wordembed_module

test_wordembed_module(train_sql, train_table)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



i=0
Sanity Check!!

Start: 0
End: 32

Size of LastHidden Layer Size:torch.Size([32, 60, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.7360, -0.3183, -0.1671,  ..., -0.0541,  0.3680,  0.6488],
         [ 0.1146,  0.1187, -0.3189,  ...,  0.4162,  0.3891,  0.2190],
         [-0.8834, -0.8451,  0.4053,  ...,  0.1687, -0.0132,  0.3803],
         ...,
         [ 0.2565, -0.5440,  0.3062,  ...,  0.2203, -0.1344, -0.6219],
         [-0.0052,  0.3801,  0.2257,  ...,  0.0346, -0.6065,  0.1337],
         [ 0.8852,  0.0332, -0.2031,  ...,  0.2007, -0.5840, -0.3072]],

        [[-0.5216,  0.2069, -0.3126,  ..., -0.2208, -0.1085,  0.4717],
         [-0.5596,  0.3364, -0.4079,  ...,  0.3152,  0.2703, -0.0337],
         [-0.4433, -0.4492, -0.2297,  ...,  0.1253, -0.3978,  0.7244],
         ...,
         [-0.5026,  0.2304,  0.1200,  ...,  0.0251, -0.4091,  0.3194],
         [-0.6771, -0.0647,  0.2924,  ...,  0.2989, -0.4582,  0.5327],
         [-0.5909,  0.2


i=30
Sanity Check!!

Start: 960
End: 992

Size of LastHidden Layer Size:torch.Size([32, 60, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.3537, -0.4447,  0.0796,  ..., -0.6434,  0.1948,  0.8515],
         [-0.9852, -0.1915, -0.1561,  ..., -0.1698,  0.9630, -0.2320],
         [ 0.8798,  0.1429,  0.6082,  ..., -1.0878,  0.5556,  0.3765],
         ...,
         [-0.1552, -0.3071,  0.3599,  ..., -0.3503,  0.1202,  0.3466],
         [-0.3092, -0.4811,  0.2938,  ..., -0.2282,  0.2688,  0.3767],
         [ 0.0138, -0.3922,  0.3843,  ..., -0.2030,  0.1247,  0.4370]],

        [[-0.4347, -0.6686,  0.0100,  ..., -0.0589,  0.4466,  0.8045],
         [-0.5453, -1.2426, -0.6027,  ...,  0.2512, -0.0025, -0.6876],
         [-0.6054, -1.0845,  0.7297,  ..., -0.3916, -0.0925,  0.3934],
         ...,
         [-0.2433, -0.2602,  0.3958,  ...,  0.0854,  0.1304,  0.2709],
         [-0.3735, -0.4544,  0.4562,  ...,  0.0743,  0.3404,  0.3983],
         [-0.3145, 


i=60
Sanity Check!!

Start: 1920
End: 1952

Size of LastHidden Layer Size:torch.Size([32, 56, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.4186, -0.0572,  0.0276,  ...,  0.1255,  0.0612,  0.7230],
         [ 0.3679,  0.5419, -0.6098,  ...,  0.1080,  0.7428,  0.3347],
         [-0.6119,  0.4308, -0.9248,  ..., -0.0860,  0.3031,  0.1657],
         ...,
         [-0.5177, -0.1298, -0.5078,  ...,  0.4387,  0.2018,  0.2584],
         [-0.2327,  0.0586, -0.2301,  ..., -0.0572,  0.0420,  0.4678],
         [-0.2754, -0.0618, -0.3754,  ..., -0.0844,  0.0877,  0.5161]],

        [[-0.5108,  0.0236,  0.1160,  ..., -0.3224,  0.1250,  0.7706],
         [-0.5650,  0.1232, -0.3174,  ...,  0.5099,  0.2544, -0.2093],
         [-0.4858, -0.2105, -0.1697,  ...,  0.1047, -0.4189,  0.6615],
         ...,
         [-0.5259, -0.2331,  0.4895,  ..., -0.1473,  0.0127,  0.2249],
         [-0.5382, -0.2782,  0.4199,  ..., -0.0694,  0.0462,  0.1749],
         [-0.5001


i=90
Sanity Check!!

Start: 2880
End: 2912

Size of LastHidden Layer Size:torch.Size([32, 76, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.8624, -0.0187, -0.1579,  ..., -0.4395,  0.2138,  0.7921],
         [-0.0390,  0.2538, -0.2520,  ...,  0.2850,  0.4865,  0.3346],
         [-0.4315, -0.8027, -0.0808,  ..., -0.2743,  0.1257,  0.3221],
         ...,
         [-0.9073, -0.1401,  0.1069,  ...,  0.1155,  0.4105,  0.3667],
         [-0.3261,  0.1077, -0.0609,  ...,  0.0315, -0.2177,  0.5651],
         [-0.8243, -0.1224,  0.0633,  ...,  0.0428,  0.3990,  0.4129]],

        [[-0.7829, -0.0038, -0.1921,  ..., -0.5213,  0.0922,  0.8662],
         [ 0.1274, -0.1997, -0.2169,  ...,  0.1693,  0.4594, -0.1525],
         [-0.1129, -0.3617, -0.4215,  ..., -0.2722, -0.8605,  0.5087],
         ...,
         [-0.3882,  0.1002,  0.2162,  ..., -0.2754, -0.2753,  0.3368],
         [-0.3701,  0.1535,  0.1294,  ..., -0.1883, -0.2319,  0.3708],
         [-0.9301


i=120
Sanity Check!!

Start: 3840
End: 3872

Size of LastHidden Layer Size:torch.Size([32, 60, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.3948, -0.5427,  0.1586,  ..., -0.5660,  0.2053,  0.8219],
         [-1.1501, -0.3278, -0.1114,  ..., -0.3997,  0.6982, -0.3293],
         [ 0.7502,  0.0768,  0.1487,  ..., -0.7399, -0.5405,  0.0103],
         ...,
         [-0.5051, -0.6034,  0.2577,  ..., -0.0563,  0.4751,  0.1759],
         [-0.2960, -0.4927,  0.3569,  ..., -0.2643,  0.3156,  0.5073],
         [-0.0766, -0.4147,  0.4691,  ..., -0.3306,  0.2041,  0.4186]],

        [[-0.7253, -0.0551,  0.1401,  ..., -0.3754,  0.0924,  0.6613],
         [ 0.0786,  0.0625, -0.4453,  ...,  0.2793,  0.6067, -0.2657],
         [-0.3025, -0.6640, -0.2896,  ...,  0.3652, -0.5787,  0.4859],
         ...,
         [-0.8421,  0.0559, -0.0869,  ...,  0.0471,  0.3065,  0.0776],
         [-0.5852,  0.1226,  0.2084,  ..., -0.1577, -0.1306,  0.2824],
         [-0.764


i=150
Sanity Check!!

Start: 4800
End: 4832

Size of LastHidden Layer Size:torch.Size([32, 105, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-3.4253e-01, -8.6975e-02,  3.6155e-01,  ..., -2.2380e-01,
           4.0058e-01,  7.5443e-01],
         [-4.0917e-01,  4.0057e-02, -3.0581e-01,  ...,  2.6498e-01,
           7.3313e-01, -1.6402e-01],
         [-6.2069e-01, -7.6083e-01, -5.7245e-01,  ...,  4.4086e-01,
          -1.2565e-02,  3.5093e-01],
         ...,
         [-3.0202e-01, -2.1803e-01,  5.7064e-01,  ...,  2.7582e-02,
           2.9641e-01,  2.7669e-02],
         [-2.2543e-01, -1.1438e-01,  3.3806e-01,  ..., -1.1727e-01,
           5.8577e-01,  8.6509e-02],
         [-5.6543e-01, -3.5831e-01,  8.0651e-03,  ...,  2.7779e-01,
           7.7613e-01, -1.7268e-02]],

        [[-5.7425e-01, -4.1227e-01,  4.0035e-01,  ..., -6.6335e-02,
           9.9156e-01,  3.8805e-01],
         [-1.3457e-01, -6.3196e-01, -1.9743e-01,  ...,  4.2630e-01,
       


i=180
Sanity Check!!

Start: 5760
End: 5792

Size of LastHidden Layer Size:torch.Size([32, 55, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.4363, -0.0332, -0.0328,  ..., -0.5426,  0.2267,  0.9964],
         [-0.4078,  0.0218, -0.4376,  ...,  0.1637,  0.3922,  0.0331],
         [-0.5302, -0.4966, -0.2366,  ..., -0.1519, -0.1707,  0.3473],
         ...,
         [ 0.6975,  0.2356, -0.3238,  ...,  0.0394, -0.6425, -0.3234],
         [-0.6081,  0.0951, -0.1261,  ..., -0.6081,  0.1326,  0.1859],
         [ 0.7041,  0.2305, -0.3287,  ...,  0.0363, -0.6333, -0.3266]],

        [[-0.7087, -0.0854, -0.2203,  ..., -0.5064, -0.2378,  0.7105],
         [-0.3732, -0.3434, -0.2664,  ...,  0.4716, -0.3841, -0.3570],
         [ 0.0086, -0.5215, -0.3138,  ..., -0.1640, -0.8351,  0.8068],
         ...,
         [-0.6661, -0.2615,  0.2164,  ...,  0.1655, -0.1906,  0.3871],
         [-0.6947, -0.3279,  0.1532,  ...,  0.0583, -0.1989,  0.4291],
         [-0.455


i=210
Sanity Check!!

Start: 6720
End: 6752

Size of LastHidden Layer Size:torch.Size([32, 67, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.5145,  0.1573,  0.0535,  ..., -0.2117,  0.0764,  0.4801],
         [-0.0304,  0.2422,  0.3203,  ...,  0.0713,  0.6923,  0.3812],
         [-0.3512, -0.1021, -0.0320,  ..., -0.0925, -0.3584,  0.6901],
         ...,
         [-0.8079,  0.0408,  0.1762,  ...,  0.1758,  0.3430,  0.6220],
         [-0.9688, -0.0300,  0.1934,  ...,  0.0635,  0.3545,  0.6855],
         [-0.8163,  0.1393,  0.3476,  ...,  0.2252,  0.3642,  0.7053]],

        [[-0.7769, -0.3943, -0.1693,  ..., -0.3609,  0.3050,  0.6691],
         [-0.7632, -0.0372, -0.7481,  ...,  0.0511,  0.2125,  1.0245],
         [-0.8173, -0.3167, -0.8009,  ...,  0.5744, -0.2053,  0.7937],
         ...,
         [-0.9817, -0.1895,  0.1847,  ..., -0.3031,  0.4132,  0.5537],
         [-0.8962, -0.0496,  0.1055,  ..., -0.0879,  0.6232,  0.5733],
         [-0.435


i=240
Sanity Check!!

Start: 7680
End: 7712

Size of LastHidden Layer Size:torch.Size([32, 88, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.1707e-01, -5.1129e-02,  3.5682e-01,  ..., -2.6782e-01,
           1.8480e-01,  7.0959e-01],
         [-4.7106e-01,  2.0449e-02, -2.9024e-01,  ...,  3.8921e-01,
           4.6555e-01, -1.2668e-01],
         [-7.9470e-01, -7.4755e-01, -2.9854e-01,  ...,  5.4106e-01,
          -2.0614e-01,  5.8338e-01],
         ...,
         [-4.9607e-01,  8.0248e-03,  1.4669e-01,  ..., -2.3045e-01,
           4.9948e-01,  1.3716e-01],
         [-3.2579e-01, -1.7671e-02,  1.8631e-01,  ..., -1.6404e-01,
           2.8632e-02,  1.0202e-01],
         [-3.6871e-01, -1.8099e-01,  1.3341e-01,  ..., -5.1001e-02,
           3.8075e-01,  4.6004e-02]],

        [[-2.5108e-01,  1.1162e-01, -7.3489e-03,  ..., -2.2489e-01,
           2.2215e-01,  3.5225e-01],
         [-6.9244e-01, -2.7181e-01, -4.0518e-01,  ...,  1.2016e-03,
        


i=270
Sanity Check!!

Start: 8640
End: 8672

Size of LastHidden Layer Size:torch.Size([32, 91, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-7.4658e-01, -8.9169e-02,  1.5627e-02,  ..., -1.6727e-01,
           1.5573e-01,  6.6239e-01],
         [-3.2453e-01,  1.9813e-01, -4.8500e-01,  ..., -9.8879e-02,
           4.2817e-01,  1.5102e-02],
         [-4.9896e-01, -1.3785e-01,  3.5641e-01,  ..., -7.9018e-02,
           1.1855e-01, -6.6620e-02],
         ...,
         [-2.0610e-01,  2.4267e-01,  2.8147e-01,  ...,  3.9738e-02,
           3.9852e-01,  1.5022e-01],
         [-2.8501e-01, -3.8770e-02,  1.7191e-01,  ...,  7.8340e-02,
           3.1193e-01, -1.7292e-01],
         [-2.9828e-01,  9.0853e-02,  2.3115e-01,  ...,  1.5333e-01,
           2.7582e-01,  4.2379e-02]],

        [[-7.1974e-01, -3.1592e-02,  5.0772e-02,  ..., -1.0127e-01,
           1.9937e-01,  5.0063e-01],
         [ 2.3679e-01,  4.4096e-01, -3.2389e-01,  ..., -9.9903e-02,
        


i=300
Sanity Check!!

Start: 9600
End: 9632

Size of LastHidden Layer Size:torch.Size([32, 73, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.3685,  0.1023, -0.1839,  ..., -0.4142,  0.3850,  0.7769],
         [-0.0167, -0.3020, -0.5505,  ...,  0.2999,  0.3360, -0.2119],
         [-0.1345, -0.5262, -0.1045,  ..., -0.5378,  0.1235,  0.9993],
         ...,
         [-0.6214, -0.4552, -0.1632,  ..., -0.0301,  0.4347,  0.4931],
         [-0.5699, -0.5243, -0.3519,  ...,  0.1020,  0.3311,  0.3272],
         [-0.2018,  0.0385,  0.0798,  ..., -0.2344,  0.2813,  0.4695]],

        [[-0.6822, -0.5049, -0.2099,  ..., -0.1331, -0.2372,  0.8407],
         [ 0.3383,  0.3539, -0.4475,  ..., -0.1557,  0.5274,  0.7261],
         [-1.4048, -0.3300, -0.4731,  ...,  0.2106, -0.4514,  0.0940],
         ...,
         [-0.6597, -0.4524, -0.2481,  ...,  0.1178, -0.1335,  0.2183],
         [-0.6991, -0.3189, -0.2325,  ...,  0.0780, -0.1264,  0.1938],
         [-0.716


i=330
Sanity Check!!

Start: 10560
End: 10592

Size of LastHidden Layer Size:torch.Size([32, 78, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.4689e-01, -2.2892e-01, -2.0777e-02,  ..., -3.7563e-01,
          -2.4517e-01,  7.1734e-01],
         [ 2.2723e-01,  2.8927e-01,  3.7502e-01,  ..., -6.1775e-01,
          -3.1285e-02,  3.0190e-01],
         [-9.7581e-01, -4.5677e-01,  3.7328e-02,  ...,  5.5114e-02,
          -3.1954e-01, -1.8019e-01],
         ...,
         [-6.0512e-01,  5.3864e-02, -1.3480e-01,  ..., -2.3965e-02,
          -2.4135e-01,  2.0487e-01],
         [-6.5372e-01, -9.7695e-02,  2.2858e-01,  ..., -9.1234e-03,
          -9.0559e-01,  2.3000e-01],
         [-6.3993e-01,  1.0021e-01, -7.5812e-03,  ..., -1.4195e-01,
          -3.4213e-01,  3.7010e-01]],

        [[-3.8393e-01, -1.3259e-01, -5.1040e-02,  ..., -6.0035e-01,
           2.2477e-01,  6.5014e-01],
         [-1.6727e-03,  1.5005e-01, -2.7926e-01,  ..., -5.2127e-04,
      


i=350
Sanity Check!!

Start: 11200
End: 11232

Size of LastHidden Layer Size:torch.Size([32, 62, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.8865e-01, -6.4229e-02, -3.3400e-01,  ..., -4.4815e-01,
           1.6199e-01,  5.6508e-01],
         [-7.2627e-02,  7.6649e-01, -3.3161e-01,  ...,  2.0412e-02,
           3.1197e-01,  8.1912e-01],
         [-1.0343e+00,  1.0374e-01, -2.2187e-01,  ..., -6.9695e-02,
          -3.0865e-01,  4.7673e-01],
         ...,
         [-5.6188e-01,  1.9403e-02, -2.2244e-01,  ..., -3.5854e-02,
           1.5855e-02,  3.9948e-01],
         [-6.8961e-01, -3.4538e-01,  4.7767e-02,  ..., -8.8189e-02,
          -6.2900e-02,  2.8185e-01],
         [-4.3203e-01, -7.0142e-02, -1.9778e-01,  ...,  8.2808e-02,
           1.1623e-01,  3.6614e-01]],

        [[-3.5241e-01, -1.0204e-01, -1.2148e-01,  ..., -3.1171e-01,
           2.3960e-01,  5.0130e-01],
         [-1.9378e-01, -3.9851e-01,  1.7858e-01,  ...,  1.5763e-01,
      


i=380
Sanity Check!!

Start: 12160
End: 12192

Size of LastHidden Layer Size:torch.Size([32, 84, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.7490e-01,  9.3026e-02,  8.7999e-02,  ..., -1.9884e-01,
           1.4974e-01,  6.6295e-01],
         [-8.7457e-01,  1.2701e-01, -1.9938e-02,  ...,  1.7423e-01,
           1.4250e-01, -3.9643e-01],
         [-6.2101e-01, -3.8663e-01,  1.5335e-01,  ..., -5.3572e-02,
          -3.1947e-01,  3.6939e-01],
         ...,
         [-7.5123e-01, -3.8037e-02,  6.1378e-03,  ...,  1.4114e-01,
           2.9152e-01,  2.9977e-01],
         [-5.5455e-01,  1.0724e-01,  3.8258e-01,  ..., -1.0300e-01,
          -3.4989e-01,  3.4894e-01],
         [-7.2601e-01, -3.2045e-02,  9.5943e-02,  ...,  1.0484e-01,
           1.9446e-01,  3.4714e-01]],

        [[-8.1336e-01,  5.5805e-03, -2.1696e-01,  ..., -4.2471e-01,
           7.7279e-03,  7.9507e-01],
         [-3.5190e-01,  7.9109e-02, -5.4653e-01,  ...,  5.4823e-01,
      


i=410
Sanity Check!!

Start: 13120
End: 13152

Size of LastHidden Layer Size:torch.Size([32, 51, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.1004e-01, -2.3838e-01,  1.8480e-02,  ..., -6.9152e-01,
           3.7035e-01,  6.4048e-01],
         [-1.2951e-01, -8.4925e-02, -4.4197e-01,  ..., -1.4863e-01,
           4.2659e-01,  3.2371e-01],
         [-3.6287e-01, -6.3949e-01,  4.7558e-01,  ...,  2.7888e-02,
           6.7924e-02, -2.8631e-01],
         ...,
         [-9.4184e-02, -5.5279e-01,  3.8446e-01,  ..., -6.9522e-03,
           1.2555e-01,  2.2916e-01],
         [-2.2193e-02, -2.5196e-01,  2.6493e-01,  ...,  8.3343e-02,
           2.9426e-01,  2.4809e-01],
         [-2.6984e-01, -3.7147e-01, -1.9434e-04,  ...,  4.8089e-02,
           4.4263e-01,  3.0640e-01]],

        [[-1.1435e+00, -2.0285e-01, -1.9592e-01,  ..., -3.6590e-01,
           2.7447e-01,  9.7083e-01],
         [-7.7828e-01,  1.4346e-01,  5.6957e-02,  ..., -2.0743e-02,
      


i=430
Sanity Check!!

Start: 13760
End: 13792

Size of LastHidden Layer Size:torch.Size([32, 55, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.9116e-01, -8.3254e-02, -1.5907e-01,  ..., -4.1790e-01,
           2.1644e-01,  7.0909e-01],
         [-4.2181e-01,  5.4307e-02, -1.1267e-01,  ..., -8.0470e-02,
           5.6047e-01,  3.8433e-01],
         [-8.6728e-01, -3.9440e-01,  4.7616e-01,  ..., -6.8217e-03,
           9.3223e-02, -2.2557e-01],
         ...,
         [-4.7549e-01, -2.7204e-01, -9.0004e-02,  ..., -5.3651e-02,
           2.3935e-01,  4.8654e-01],
         [-5.4101e-01, -2.0349e-01, -1.6630e-01,  ...,  1.3313e-01,
           3.2549e-01,  3.5619e-01],
         [-4.5742e-01, -1.1362e-01,  8.9052e-02,  ..., -8.1360e-02,
           1.2071e-01,  5.3570e-01]],

        [[-6.7815e-01,  2.8150e-02, -1.9139e-01,  ..., -3.3927e-01,
           2.3021e-01,  7.2686e-01],
         [ 6.2822e-01,  5.7001e-01, -2.5505e-01,  ..., -1.1474e-02,
      


i=450
Sanity Check!!

Start: 14400
End: 14432

Size of LastHidden Layer Size:torch.Size([32, 52, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.2934, -0.4237,  0.0899,  ..., -0.5639,  0.2857,  0.8358],
         [ 0.1178,  0.0471, -0.1684,  ..., -0.0018,  0.5432,  0.0887],
         [ 0.5006, -0.8013,  0.8435,  ..., -0.2393,  0.5072,  0.1207],
         ...,
         [ 1.2750, -0.9866,  1.0765,  ..., -0.6156,  1.1242,  0.3498],
         [-0.4114, -0.0090,  0.5199,  ..., -0.7006,  0.2373, -0.2317],
         [ 0.6939,  0.1513, -0.1636,  ..., -0.0476, -0.6430, -0.1974]],

        [[-0.7598,  0.0644,  0.0405,  ..., -0.5643,  0.5305,  0.7499],
         [-0.5251,  0.0150, -0.1476,  ..., -0.1987,  0.9172,  0.2710],
         [-0.7352, -0.6019,  0.1267,  ..., -0.3661,  0.1133,  0.7675],
         ...,
         [-0.4344,  0.0500,  0.2333,  ...,  0.3111,  0.8358,  0.0574],
         [-0.6685, -0.1161,  0.1875,  ..., -0.0039,  0.5802,  0.3203],
         [-0.4


i=480
Sanity Check!!

Start: 15360
End: 15392

Size of LastHidden Layer Size:torch.Size([32, 57, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.7208, -0.0117, -0.2320,  ..., -0.2778, -0.0112,  0.7107],
         [-1.4350, -0.5163, -1.5149,  ..., -0.0649,  0.1336,  0.6820],
         [ 0.2364, -0.1422,  0.1291,  ...,  0.0500, -0.8533,  0.6045],
         ...,
         [-0.8141,  0.0056, -0.5310,  ..., -0.2093, -0.1436,  0.3807],
         [-0.7006,  0.1726, -0.1895,  ..., -0.2836, -0.1335,  0.4645],
         [-0.7662,  0.1630, -0.2515,  ..., -0.3533, -0.1792,  0.4732]],

        [[-0.6522,  0.1041,  0.0326,  ..., -0.4520,  0.2143,  0.7205],
         [-0.7343,  0.1931,  0.4312,  ..., -0.3786,  0.5287,  0.2138],
         [-0.6108,  0.5184, -0.0807,  ...,  0.0059,  0.3140,  0.4423],
         ...,
         [-0.6813, -0.0727, -0.0937,  ..., -0.0376,  0.5101,  0.4804],
         [-0.4068,  0.1707,  0.0695,  ..., -0.1342,  0.1399,  0.4522],
         [-0.4


i=510
Sanity Check!!

Start: 16320
End: 16352

Size of LastHidden Layer Size:torch.Size([32, 67, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-1.2689e+00,  1.5835e-01,  3.8957e-02,  ..., -1.8984e-01,
          -6.2165e-02,  5.8116e-01],
         [ 1.5065e-01,  2.1353e-01, -5.6814e-01,  ..., -2.3314e-01,
           5.1182e-01,  1.2819e-01],
         [-8.2052e-01, -1.4622e-01, -1.6462e-01,  ..., -1.4244e-02,
           1.7444e-01,  3.4858e-01],
         ...,
         [-3.3360e-01,  2.1487e-01,  1.0465e-01,  ...,  2.6260e-01,
           4.3676e-02,  7.9604e-02],
         [-3.8968e-01,  1.4511e-01,  1.3608e-01,  ...,  1.9547e-01,
          -8.3537e-03,  1.4921e-02],
         [-4.7328e-01,  3.3583e-01,  2.1328e-01,  ..., -7.3196e-02,
          -2.6742e-01, -3.5352e-02]],

        [[-5.9584e-01, -4.3120e-02, -1.5499e-02,  ..., -4.0456e-01,
           7.5800e-02,  7.7500e-01],
         [ 1.4020e-01,  4.8057e-01, -1.6450e-01,  ..., -4.0382e-01,
      


i=530
Sanity Check!!

Start: 16960
End: 16992

Size of LastHidden Layer Size:torch.Size([32, 92, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-4.5187e-01, -4.7868e-01, -8.1698e-03,  ..., -6.3223e-01,
           1.8331e-01,  7.3355e-01],
         [ 1.4521e-01, -3.1421e-02, -4.3061e-01,  ..., -5.0091e-02,
           3.4634e-01,  3.5879e-01],
         [-6.5119e-02, -8.1290e-01,  2.9917e-02,  ..., -2.0097e-01,
           7.3581e-01,  8.4453e-02],
         ...,
         [-2.1956e-01, -4.5792e-01,  6.7917e-02,  ..., -2.2767e-01,
           4.4429e-01,  1.5168e-01],
         [-3.4852e-01, -6.2772e-01,  3.2500e-02,  ..., -1.6199e-01,
           4.0549e-01,  1.4126e-01],
         [-6.2427e-02, -3.4626e-01,  4.1247e-01,  ..., -2.7393e-01,
           1.8245e-01,  1.3574e-01]],

        [[-7.8414e-01, -1.7414e-01, -1.2652e-01,  ..., -3.4939e-01,
          -4.8765e-02,  5.2948e-01],
         [-1.5965e-01,  3.9632e-02, -3.2969e-01,  ...,  1.3889e-01,
      


i=550
Sanity Check!!

Start: 17600
End: 17632

Size of LastHidden Layer Size:torch.Size([32, 63, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-1.2519, -0.2461, -0.7374,  ..., -0.5334,  0.1491,  0.6979],
         [-1.1263, -0.7646, -0.7510,  ..., -0.2056,  0.7825,  0.1118],
         [ 0.6509, -0.0640, -0.1624,  ..., -0.1239, -0.9303, -0.1234],
         ...,
         [-0.1727, -0.1833, -0.1239,  ...,  0.1863,  0.0947,  0.0108],
         [-0.4987, -0.4408, -0.6262,  ...,  0.0810,  0.1735,  0.1172],
         [-0.2208,  0.0281, -0.2776,  ...,  0.0390, -0.0073,  0.2537]],

        [[-0.9474, -0.0138, -0.1542,  ..., -0.3065,  0.2882,  0.6728],
         [-0.8817, -1.0922, -0.1372,  ...,  0.2770, -0.0605, -0.5506],
         [-0.9674, -1.0002,  0.1031,  ..., -0.2880,  0.0119,  0.4500],
         ...,
         [-0.5163,  0.0059,  0.4684,  ...,  0.1393, -0.0993, -0.0389],
         [-0.3314, -0.0111,  0.5664,  ...,  0.1767,  0.0760, -0.0469],
         [-0.6


i=580
Sanity Check!!

Start: 18560
End: 18592

Size of LastHidden Layer Size:torch.Size([32, 56, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.2182e-01,  1.3252e-01,  7.9171e-02,  ..., -4.5673e-01,
           3.6067e-01,  6.9661e-01],
         [-5.2764e-01, -3.9448e-01, -3.6188e-01,  ..., -1.8608e-02,
          -1.1335e-01, -4.1660e-01],
         [-2.1581e-01, -7.8538e-01, -1.6379e-01,  ..., -1.8591e-01,
          -5.3375e-01,  3.4372e-01],
         ...,
         [-3.3662e-01,  2.1938e-01, -1.4513e-02,  ..., -4.5299e-01,
           6.4824e-03,  5.0337e-01],
         [-2.8732e-01,  2.2311e-01,  1.9276e-01,  ..., -5.0168e-01,
           1.7748e-01,  1.5640e-01],
         [-4.4903e-01,  3.8597e-02, -2.5146e-01,  ..., -1.7940e-01,
           3.0083e-01,  4.0589e-01]],

        [[-7.1501e-01, -9.5493e-03,  4.9564e-03,  ..., -4.0974e-01,
           7.5957e-02,  8.8390e-01],
         [-7.0525e-01,  2.1025e-01, -1.2967e-01,  ...,  1.8587e-01,
      


i=610
Sanity Check!!

Start: 19520
End: 19552

Size of LastHidden Layer Size:torch.Size([32, 46, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-4.2763e-01,  9.7609e-02, -4.2317e-02,  ..., -3.6094e-01,
           1.9713e-01,  5.0451e-01],
         [-5.7598e-01,  2.1665e-01, -5.2619e-01,  ...,  1.1638e-01,
          -1.3582e-01, -5.9129e-01],
         [-5.0675e-01, -6.4629e-02, -4.6631e-01,  ..., -1.0678e-01,
          -1.3274e-01,  3.0048e-01],
         ...,
         [-7.6753e-01, -3.0906e-02, -2.2497e-01,  ..., -1.4940e-01,
           3.1892e-01,  4.2274e-01],
         [-8.1126e-01,  1.1688e-03, -2.0302e-01,  ..., -1.0779e-01,
           3.2383e-01,  4.2646e-01],
         [-7.4502e-01, -8.6422e-02, -2.3620e-01,  ..., -1.8611e-01,
           3.0534e-01,  4.6836e-01]],

        [[-8.7616e-01, -1.0216e-01, -2.0519e-01,  ..., -6.5229e-01,
           2.0573e-01,  9.7243e-01],
         [-1.8859e-01,  2.4751e-01, -5.3206e-01,  ...,  1.0851e-01,
      


i=630
Sanity Check!!

Start: 20160
End: 20192

Size of LastHidden Layer Size:torch.Size([32, 55, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.9284e-01, -1.7260e-01, -6.2130e-02,  ..., -7.2328e-01,
           1.7515e-01,  8.1155e-01],
         [-6.1198e-01,  3.1680e-02, -3.0264e-02,  ..., -1.4901e-02,
           2.9216e-01,  2.0513e-01],
         [-4.3745e-01, -5.1479e-03,  5.8426e-02,  ..., -4.6014e-01,
          -6.4342e-01,  3.8801e-01],
         ...,
         [-5.3527e-01,  1.0828e-01, -8.0396e-02,  ...,  1.0981e-01,
          -3.0975e-02,  3.4374e-01],
         [-4.7475e-01, -2.1264e-02, -2.6597e-01,  ...,  1.4951e-01,
           2.1173e-01,  3.7328e-01],
         [-4.9986e-01, -3.1714e-02, -2.7112e-01,  ...,  2.2055e-01,
           2.6655e-01,  4.0405e-01]],

        [[-4.9245e-01,  2.8129e-01, -2.0923e-01,  ..., -4.1040e-01,
          -2.6276e-02,  6.9327e-01],
         [-3.1913e-02,  3.9439e-01, -7.6065e-01,  ...,  4.8595e-01,
      


i=660
Sanity Check!!

Start: 21120
End: 21152

Size of LastHidden Layer Size:torch.Size([32, 70, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-5.0157e-01, -2.4420e-01, -8.0009e-02,  ..., -2.9284e-01,
           2.4821e-01,  6.7549e-01],
         [-9.6741e-02, -5.4171e-02, -3.1891e-01,  ..., -9.9838e-02,
           1.3756e-01,  1.2959e-01],
         [-4.3143e-01, -8.9267e-01,  3.7268e-01,  ..., -3.3054e-01,
          -1.4517e-01,  2.0780e-01],
         ...,
         [-5.3563e-01, -3.3017e-01, -5.4493e-01,  ..., -1.8576e-02,
           2.9100e-01,  4.0978e-01],
         [-5.1682e-01, -2.9475e-01, -5.2284e-01,  ..., -3.2301e-02,
           2.6933e-01,  4.1800e-01],
         [-6.1838e-01, -2.7843e-01, -5.5605e-01,  ..., -3.8772e-02,
           2.3184e-01,  3.9419e-01]],

        [[-4.5520e-01, -8.4083e-02,  5.5470e-02,  ..., -5.5541e-01,
           1.5400e-01,  7.0571e-01],
         [-3.4733e-01, -2.4224e-01, -9.8030e-02,  ...,  9.1836e-02,
      


i=690
Sanity Check!!

Start: 22080
End: 22112

Size of LastHidden Layer Size:torch.Size([32, 56, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.5544, -0.0862,  0.3035,  ..., -0.5572,  0.2782,  0.5866],
         [-0.0227, -0.1152, -0.2762,  ...,  0.4527,  0.4248, -0.0783],
         [ 0.0859, -1.1757,  0.6135,  ..., -0.1682,  0.6311,  0.5046],
         ...,
         [-0.1259,  0.0040,  0.4058,  ..., -0.1239,  0.2477,  0.1930],
         [-0.2208, -0.0702,  0.2952,  ..., -0.1382,  0.2254,  0.2475],
         [-0.1962, -0.5045,  0.0349,  ...,  0.1523,  0.4372, -0.0334]],

        [[-0.5521, -0.0022, -0.1304,  ..., -0.6259, -0.0921,  0.6032],
         [-1.2351,  0.1128, -0.2470,  ..., -0.0883,  0.5505, -0.0752],
         [-0.0691,  0.0080,  0.9783,  ..., -0.5221, -0.5862,  0.3752],
         ...,
         [-0.2616,  0.2798,  0.0680,  ..., -0.1643, -0.4003,  0.6029],
         [-0.4072,  0.1535,  0.1256,  ..., -0.1117, -0.3703,  0.4917],
         [-0.4


i=720
Sanity Check!!

Start: 23040
End: 23072

Size of LastHidden Layer Size:torch.Size([32, 63, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-7.4793e-01, -5.4592e-02, -1.8705e-01,  ..., -6.6345e-01,
           9.2141e-02,  3.0593e-01],
         [-3.2005e-01, -2.8747e-01, -9.6497e-02,  ..., -2.2126e-01,
          -2.0548e-01, -8.1939e-01],
         [-3.2362e-01, -4.3532e-01,  3.0008e-02,  ..., -6.2746e-01,
          -1.6421e-01, -4.4233e-01],
         ...,
         [-2.0181e-02, -2.7271e-01,  9.2865e-02,  ..., -2.5014e-01,
          -2.0712e-01, -7.5459e-02],
         [-7.4339e-02, -1.8348e-01, -2.2512e-02,  ..., -3.0033e-01,
          -2.0430e-01, -8.1034e-02],
         [-4.3949e-01, -7.5993e-02, -5.9639e-01,  ..., -9.8065e-02,
           1.5419e-01, -7.0344e-02]],

        [[-3.9597e-01, -1.8806e-01, -1.0178e-02,  ..., -2.0390e-01,
           3.6693e-01,  4.5783e-01],
         [-3.2628e-01, -2.6784e-01,  4.6617e-01,  ...,  1.5881e-01,
      


i=750
Sanity Check!!

Start: 24000
End: 24032

Size of LastHidden Layer Size:torch.Size([32, 67, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-3.7402e-01,  4.3308e-02, -6.6685e-02,  ..., -6.5389e-01,
           5.1593e-02,  6.8153e-01],
         [ 1.8149e-01,  1.6206e-01, -3.9024e-01,  ...,  1.0464e-01,
           4.1374e-01,  7.4728e-01],
         [-6.5649e-01, -6.8560e-02, -1.2311e-01,  ..., -6.0962e-02,
          -3.1717e-01, -1.1512e-01],
         ...,
         [-2.7148e-01, -1.5590e-01, -7.9930e-01,  ..., -3.4327e-01,
           1.0197e-01,  5.4398e-01],
         [-3.2222e-03, -1.1584e-01,  2.6238e-01,  ..., -4.3047e-01,
          -1.0137e-01,  4.8702e-01],
         [-2.5144e-02, -1.4785e-01,  3.0187e-01,  ..., -5.1759e-01,
          -1.1151e-01,  5.0865e-01]],

        [[-5.0102e-01, -4.4239e-02,  3.6887e-02,  ..., -5.7559e-01,
           3.6614e-01,  7.5838e-01],
         [-3.0216e-01, -1.6106e-01, -1.7722e-01,  ...,  6.4014e-02,
      


i=780
Sanity Check!!

Start: 24960
End: 24992

Size of LastHidden Layer Size:torch.Size([32, 67, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.6996,  0.2371,  0.0776,  ..., -0.5329,  0.3657,  0.7073],
         [-0.1888,  0.1941, -0.4671,  ...,  0.2279,  0.8240, -0.1400],
         [-0.4439, -0.1115,  0.0114,  ..., -0.4619, -0.0622,  0.8055],
         ...,
         [-0.5240, -0.2119, -0.0296,  ..., -0.1586,  0.5180,  0.3207],
         [-0.3518, -0.1135,  0.3366,  ..., -0.2022,  0.3211,  0.3271],
         [-0.3507, -0.0266,  0.2969,  ..., -0.2359,  0.3305,  0.3525]],

        [[-0.9942, -0.3386, -0.2105,  ..., -0.6238,  0.4393,  0.6749],
         [-0.0983, -0.1010, -0.5727,  ...,  0.4600,  0.3373, -0.1439],
         [ 0.1739, -1.0021, -0.7929,  ..., -0.0462, -0.3520,  1.1194],
         ...,
         [-0.3225, -0.3267,  0.1132,  ...,  0.0595,  0.1490,  0.5010],
         [-0.9101, -0.0221,  0.1321,  ..., -0.3652,  0.1430,  0.5137],
         [-0.3


i=810
Sanity Check!!

Start: 25920
End: 25952

Size of LastHidden Layer Size:torch.Size([32, 98, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.6753, -0.0316, -0.1101,  ..., -0.5460,  0.2093,  0.5896],
         [-0.4369, -0.1960, -0.3619,  ..., -0.0196,  0.3588, -0.4975],
         [-0.6205, -0.4559, -0.2417,  ..., -0.5645, -0.6316,  0.1413],
         ...,
         [-0.9499, -0.4859, -0.1570,  ..., -0.0503,  0.3984,  0.2075],
         [-0.8849, -0.3692,  0.1228,  ..., -0.1600,  0.3753,  0.3264],
         [-0.6170, -0.2830,  0.3279,  ..., -0.2348,  0.1247,  0.2210]],

        [[-0.8228,  0.1090, -0.2560,  ..., -0.4830,  0.0456,  0.7387],
         [-0.5810, -0.2467,  0.0514,  ..., -0.0437,  0.5543,  0.3042],
         [ 1.0057,  0.2365,  0.8537,  ..., -0.9338, -0.2186, -0.1554],
         ...,
         [-0.6539, -0.3964, -0.0341,  ...,  0.2380, -0.0187,  0.3090],
         [-0.5006, -0.1090,  0.2862,  ...,  0.0741, -0.4147,  0.4088],
         [-0.5


i=840
Sanity Check!!

Start: 26880
End: 26912

Size of LastHidden Layer Size:torch.Size([32, 60, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.6549, -0.0911, -0.0333,  ..., -0.6225,  0.1812,  0.8505],
         [-1.4364,  0.1012,  0.2203,  ..., -0.4098,  0.3273,  0.5678],
         [ 0.0284, -0.4819, -0.5480,  ..., -0.0800, -0.8731,  0.5752],
         ...,
         [-0.8966, -0.0466, -0.1139,  ..., -0.4049,  0.1083,  0.4612],
         [-0.4735, -0.1424, -0.3844,  ...,  0.2309,  0.3315,  0.4804],
         [-0.9110, -0.1361, -0.0666,  ..., -0.4527,  0.1485,  0.3071]],

        [[-0.6834, -0.2080, -0.1242,  ..., -0.6004,  0.1837,  0.7721],
         [-0.3887, -0.3368, -0.5011,  ...,  0.0032, -0.0233, -0.3083],
         [-0.4691, -1.2033, -0.7990,  ...,  0.5192, -0.5523,  0.7580],
         ...,
         [-0.5489, -0.2913, -0.1344,  ..., -0.1783,  0.1902,  0.4423],
         [-0.8319, -0.1275, -0.0862,  ..., -0.4158,  0.2393,  0.6567],
         [-0.4


i=870
Sanity Check!!

Start: 27840
End: 27872

Size of LastHidden Layer Size:torch.Size([32, 64, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.5510, -0.0735,  0.0200,  ..., -0.6219, -0.0414,  0.7024],
         [-0.3995,  0.1709, -0.3352,  ...,  0.2251,  0.2006, -0.5227],
         [-0.6704, -0.5289, -0.1030,  ..., -0.3984, -0.3925,  0.2137],
         ...,
         [-0.5209,  0.0744,  0.0405,  ..., -0.4775, -0.0514,  0.3705],
         [-0.5824, -0.0767, -0.2957,  ..., -0.1912,  0.2108,  0.2796],
         [-0.3703,  0.1963,  0.1349,  ..., -0.4434, -0.1651,  0.2807]],

        [[-0.3792,  0.1205,  0.0820,  ..., -0.3049,  0.1475,  0.5297],
         [ 0.0305, -0.1417, -0.0348,  ...,  0.3236,  0.0901, -0.4602],
         [-0.2770, -0.5386,  0.1332,  ..., -0.1336, -0.4657,  0.4168],
         ...,
         [-0.4264, -0.3156,  0.0743,  ..., -0.2030,  0.1924,  0.3405],
         [-0.4070, -0.2292, -0.2603,  ..., -0.0094,  0.2092,  0.2308],
         [-0.4


i=900
Sanity Check!!

Start: 28800
End: 28832

Size of LastHidden Layer Size:torch.Size([32, 121, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.5546, -0.0686, -0.0761,  ..., -0.6410, -0.0711,  0.6206],
         [-0.5235,  0.2351, -0.2528,  ...,  0.2732,  0.8567, -0.0634],
         [-0.2002,  0.2905, -0.1305,  ..., -0.3725,  0.1052,  0.5147],
         ...,
         [-0.3973, -0.0734, -0.1290,  ..., -0.1235, -0.2886,  0.4334],
         [-0.4093, -0.0735, -0.1004,  ..., -0.2032, -0.3269,  0.5302],
         [-0.3930, -0.0570, -0.1455,  ..., -0.0943, -0.2525,  0.4612]],

        [[-0.8712, -0.2206, -0.1919,  ..., -0.5202,  0.1806,  0.6663],
         [-1.3486,  0.3273, -0.2350,  ..., -0.6384,  0.4293,  0.4913],
         [-0.4411, -0.5780, -0.0215,  ..., -0.3075,  0.0890,  0.1518],
         ...,
         [-0.7479, -0.0179, -0.1745,  ..., -0.1626, -0.0592,  0.6146],
         [-0.4240,  0.0858, -0.0412,  ...,  0.0777, -0.0953,  0.4568],
         [-0.


i=930
Sanity Check!!

Start: 29760
End: 29792

Size of LastHidden Layer Size:torch.Size([32, 103, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.3422e-01, -6.1571e-02,  1.3881e-01,  ..., -5.9975e-01,
           2.1337e-01,  6.9611e-01],
         [-1.6799e-01, -1.6073e-01, -3.5239e-01,  ...,  2.7622e-01,
           1.8548e-01, -6.2681e-01],
         [-2.1982e-01, -1.0437e+00, -5.5631e-02,  ..., -1.3531e-02,
          -7.7794e-02,  7.8645e-01],
         ...,
         [-6.6869e-01, -4.2251e-01,  6.4977e-01,  ..., -2.5572e-01,
           2.9270e-01,  3.3959e-01],
         [-5.9237e-01, -3.8768e-01,  6.8801e-01,  ..., -3.1650e-01,
           1.3646e-01,  4.2197e-01],
         [-5.9169e-01, -3.9689e-01,  6.3499e-01,  ..., -2.6760e-01,
           2.6878e-01,  3.7454e-01]],

        [[-6.1267e-01,  9.8380e-02, -5.1189e-02,  ..., -2.7395e-01,
          -2.3371e-01,  2.7958e-01],
         [-3.2867e-02,  4.8202e-01, -1.0388e-01,  ..., -4.8361e-01,
     


i=950
Sanity Check!!

Start: 30400
End: 30432

Size of LastHidden Layer Size:torch.Size([32, 54, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-8.6350e-01, -1.0649e-01, -9.1566e-02,  ..., -3.0436e-01,
           2.8528e-01,  6.9944e-01],
         [-6.6115e-01, -1.3250e-01,  1.2076e-01,  ..., -8.7917e-02,
           1.1717e+00, -6.1525e-02],
         [-4.3092e-01, -2.6390e-01, -9.2921e-02,  ..., -7.1557e-01,
           8.6590e-02,  4.5634e-01],
         ...,
         [-6.3431e-01, -8.6238e-02,  1.9405e-01,  ..., -1.6403e-01,
           3.2501e-01,  3.0597e-01],
         [-8.5509e-01, -2.7676e-01,  1.1056e-02,  ..., -3.3138e-03,
           4.6937e-01,  1.4024e-01],
         [-7.3483e-01, -1.2878e-01,  1.6423e-01,  ..., -1.7160e-01,
           3.2976e-01,  2.5747e-01]],

        [[-7.5037e-01, -2.1419e-01, -8.7746e-02,  ..., -6.0839e-01,
           1.5990e-01,  8.2410e-01],
         [-3.3706e-01, -2.7237e-01, -5.0518e-01,  ...,  3.3242e-01,
      


i=980
Sanity Check!!

Start: 31360
End: 31392

Size of LastHidden Layer Size:torch.Size([32, 70, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.4733, -0.2363, -0.4666,  ..., -0.1925,  0.1640,  0.7539],
         [ 0.9300,  0.3350, -0.4082,  ...,  0.0535,  0.5373,  0.1835],
         [-0.4143, -0.2628, -0.4959,  ...,  0.1765,  0.0266,  0.5150],
         ...,
         [ 0.1305, -0.2322,  0.2333,  ..., -0.2571, -0.0317, -0.1426],
         [-0.2277, -0.4853,  0.0424,  ..., -0.0581,  0.0614, -0.3402],
         [-0.0060, -0.0338,  0.3887,  ..., -0.0411,  0.1503,  0.0231]],

        [[-0.7333, -0.3657,  0.0400,  ..., -0.5049, -0.0601,  0.6154],
         [-0.3084, -0.2976, -0.3099,  ...,  0.0491, -0.0493, -0.7028],
         [-0.9084, -0.1046,  0.2507,  ..., -1.2244, -0.4566,  0.1071],
         ...,
         [-0.6952, -0.0765, -0.2569,  ...,  0.0584, -0.1120,  0.2929],
         [-0.4941, -0.0279, -0.2598,  ..., -0.2103, -0.3296,  0.3499],
         [-0.5


i=1010
Sanity Check!!

Start: 32320
End: 32352

Size of LastHidden Layer Size:torch.Size([32, 46, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.0661,  0.3709, -0.3496,  ..., -0.2674,  0.4942,  0.6756],
         [-0.0607, -0.1605, -0.2078,  ...,  0.0601,  0.7931, -0.0491],
         [ 1.0879,  0.1972, -0.3481,  ..., -0.2121,  0.1214,  0.0914],
         ...,
         [ 0.0160,  0.4967, -0.1353,  ...,  0.0141,  0.2799,  0.2212],
         [-0.0914,  0.0746, -0.3434,  ..., -0.0330,  0.4028,  0.1104],
         [ 0.0649,  0.4614, -0.1300,  ..., -0.0548,  0.2700,  0.1690]],

        [[-0.6065,  0.0715, -0.2232,  ..., -0.3653,  0.1194,  0.7291],
         [-0.7233,  0.0098, -0.2632,  ..., -0.0414,  0.3244, -0.1837],
         [-1.3194, -0.2831,  0.0865,  ..., -0.1463,  0.5342, -0.1411],
         ...,
         [-0.5822,  0.0652, -0.2352,  ..., -0.3043,  0.0034,  0.5040],
         [-0.3366,  0.0496,  0.0442,  ..., -0.2906, -0.0862,  0.4667],
         [-0.


i=1040
Sanity Check!!

Start: 33280
End: 33312

Size of LastHidden Layer Size:torch.Size([32, 60, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-8.9056e-01, -2.4773e-01, -1.5268e-01,  ..., -2.4516e-01,
           4.5618e-01,  7.1414e-01],
         [-9.4613e-01, -2.4371e-01,  2.8584e-01,  ...,  2.7697e-02,
           4.9655e-01,  1.7265e-01],
         [-3.4171e-01,  2.6006e-01, -3.1270e-01,  ..., -6.9505e-01,
          -4.2627e-02,  4.5531e-01],
         ...,
         [-5.6813e-01, -2.2191e-01,  3.8553e-01,  ..., -2.2888e-01,
           3.5154e-02,  3.4145e-01],
         [-8.3204e-01, -3.3962e-01, -2.9289e-01,  ..., -4.2345e-02,
           1.9423e-01,  2.6765e-01],
         [-5.7426e-01, -2.5253e-01,  2.0428e-01,  ..., -1.7179e-01,
          -3.6972e-03,  2.0634e-01]],

        [[-1.1230e+00, -3.1389e-01, -2.5617e-01,  ..., -4.3964e-01,
           2.7945e-01,  9.4211e-01],
         [-3.1844e-01,  8.4205e-02, -1.1565e-01,  ...,  4.2429e-01,
     


i=1060
Sanity Check!!

Start: 33920
End: 33952

Size of LastHidden Layer Size:torch.Size([32, 74, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.7811, -0.1746, -0.0828,  ..., -0.1909,  0.4543,  0.5978],
         [-0.3392, -0.3710, -0.2171,  ...,  0.2427,  0.1501, -0.2993],
         [-0.7133, -0.9621, -0.2947,  ...,  0.1892, -0.4193,  0.4224],
         ...,
         [-0.4749, -0.2092,  0.2200,  ..., -0.3465, -0.1156,  0.2681],
         [-0.6577, -0.0860,  0.0948,  ..., -0.2047,  0.2455, -0.2062],
         [-0.6853,  0.0167,  0.2019,  ..., -0.2328,  0.1010,  0.1774]],

        [[-0.7716, -0.0492,  0.2389,  ..., -0.6263,  0.0793,  0.6792],
         [-0.7327,  0.5957,  0.0320,  ..., -0.3638,  0.1472,  0.2833],
         [-1.2207, -0.4054, -0.2337,  ..., -0.1281, -0.2610, -0.3671],
         ...,
         [-0.2331,  0.0892,  0.3759,  ..., -0.2457, -0.1630, -0.1680],
         [-0.2253, -0.1028,  0.4844,  ..., -0.1923, -0.0621, -0.2043],
         [-0.


i=1090
Sanity Check!!

Start: 34880
End: 34912

Size of LastHidden Layer Size:torch.Size([32, 110, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.5346, -0.0287,  0.0455,  ..., -0.3310,  0.1547,  0.6433],
         [-0.2356, -0.2806,  0.0981,  ...,  0.6524,  0.1309, -0.3512],
         [-0.5928, -0.7125,  0.3123,  ...,  0.1428, -0.5968,  0.5875],
         ...,
         [-0.4837, -0.3830,  0.4304,  ...,  0.1084,  0.3211,  0.0844],
         [-0.5217, -0.4277,  0.4004,  ...,  0.0512,  0.3410,  0.1286],
         [-0.3310, -0.4718,  0.4764,  ..., -0.1126,  0.1147,  0.0254]],

        [[-0.6101,  0.3387,  0.0294,  ..., -0.5164,  0.3316,  0.4851],
         [ 0.3006,  0.4952, -0.2298,  ..., -0.0530,  0.4976,  0.3256],
         [-1.2340,  0.2244,  0.4906,  ..., -0.4362, -0.4538, -0.1107],
         ...,
         [-0.4705,  0.0805,  0.1688,  ..., -0.0906,  0.1764,  0.3594],
         [-0.5660, -0.0728, -0.1001,  ...,  0.0698,  0.2469,  0.3293],
         [-0


i=1120
Sanity Check!!

Start: 35840
End: 35872

Size of LastHidden Layer Size:torch.Size([32, 50, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.7018,  0.0224, -0.2131,  ..., -0.2097,  0.0560,  0.7255],
         [-1.0541,  0.3161, -0.5020,  ...,  0.1372, -0.2823, -0.3396],
         [-0.6146,  0.3071, -0.2367,  ...,  0.1578, -0.6429,  0.4434],
         ...,
         [-0.5783,  0.1790,  0.0770,  ...,  0.1073, -0.0509,  0.1999],
         [-0.5806,  0.1259, -0.0626,  ...,  0.1364,  0.0458,  0.1618],
         [-0.3343,  0.1390, -0.0062,  ..., -0.1621, -0.1318,  0.4002]],

        [[-0.6607, -0.2863,  0.2746,  ..., -0.2786,  0.4615,  0.5041],
         [-0.3607, -0.0378, -0.1817,  ...,  0.0640,  0.6757, -0.2034],
         [-0.2646, -0.6142,  0.4455,  ..., -0.3579,  0.5913,  0.3836],
         ...,
         [-0.3859, -0.4000,  0.1571,  ...,  0.0585,  0.9155, -0.0489],
         [-0.6130, -0.7293, -0.1990,  ...,  0.2424,  0.9035, -0.1814],
         [-0.


i=1150
Sanity Check!!

Start: 36800
End: 36832

Size of LastHidden Layer Size:torch.Size([32, 66, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-9.7991e-01, -2.6461e-01,  5.2714e-01,  ..., -6.3770e-01,
           3.2011e-01,  4.0133e-01],
         [-1.0146e+00, -5.3041e-01,  6.5516e-01,  ..., -3.9999e-02,
           8.0878e-01, -2.6136e-01],
         [-3.5836e-02, -7.9684e-01, -1.3930e-02,  ..., -1.1992e-01,
          -8.6087e-02,  4.9555e-01],
         ...,
         [-4.8866e-01, -3.9162e-01,  1.3946e+00,  ...,  4.0874e-02,
           5.7251e-02, -8.1174e-01],
         [-2.3838e-01, -4.8952e-02,  8.0714e-01,  ..., -3.7226e-01,
          -1.9144e-01, -9.3012e-01],
         [ 6.8861e-01, -3.3536e-01, -8.6276e-02,  ...,  1.6297e-01,
          -8.5190e-01, -3.9093e-01]],

        [[-6.8040e-01, -1.5861e-01, -5.7293e-02,  ..., -5.0460e-01,
           1.5769e-01,  7.5312e-01],
         [-8.9516e-02, -5.0021e-02, -4.3693e-01,  ...,  5.2375e-02,
     


i=1180
Sanity Check!!

Start: 37760
End: 37792

Size of LastHidden Layer Size:torch.Size([32, 62, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-3.8957e-01, -1.1079e-02, -2.6238e-01,  ..., -4.3576e-01,
           1.2776e-01,  4.5516e-01],
         [-3.6270e-01, -4.9385e-01, -2.6048e-01,  ...,  3.6345e-01,
          -1.1859e-01, -4.0362e-01],
         [-1.9654e-01, -1.0224e+00, -9.1131e-02,  ..., -2.9252e-01,
          -2.0556e-01,  2.5045e-01],
         ...,
         [-2.1814e-01,  2.9772e-03,  2.3897e-01,  ..., -1.2083e-01,
           8.5316e-02, -2.4416e-01],
         [-2.3140e-01,  2.6695e-02,  2.7457e-01,  ..., -1.7601e-01,
           1.5180e-01, -1.5527e-01],
         [-2.7843e-01,  4.6965e-01,  6.6236e-01,  ...,  1.0585e-01,
           1.1752e-01,  9.5961e-01]],

        [[-5.4814e-01, -2.9283e-02, -7.4734e-02,  ..., -5.4170e-01,
          -1.7777e-01,  5.6601e-01],
         [-2.0291e-01,  1.1982e-01, -1.1110e-01,  ...,  4.9770e-01,
     


i=1210
Sanity Check!!

Start: 38720
End: 38752

Size of LastHidden Layer Size:torch.Size([32, 104, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-5.9672e-01, -9.6384e-02,  3.0960e-02,  ..., -3.3153e-01,
           1.7950e-01,  5.5102e-01],
         [-7.7510e-01, -1.2687e-01, -4.6250e-01,  ...,  7.1049e-02,
          -1.1950e-01, -6.9801e-01],
         [-1.0357e+00, -3.3568e-01, -3.8522e-01,  ...,  2.1100e-01,
          -6.6955e-01,  1.5271e-01],
         ...,
         [-7.1934e-01, -1.2917e-02, -1.6626e-01,  ..., -4.1359e-01,
           2.2745e-01,  2.9131e-01],
         [-7.7392e-01, -2.6172e-01, -5.2168e-01,  ..., -1.0007e-01,
           4.0436e-01,  4.0681e-01],
         [-5.6491e-01, -2.1241e-02, -1.1367e-01,  ..., -3.9772e-01,
           2.0837e-01,  2.4052e-01]],

        [[-6.8826e-01, -5.6386e-02, -1.4726e-01,  ..., -5.7291e-01,
           2.0407e-01,  5.6978e-01],
         [-6.7055e-01, -1.5583e-01, -2.8633e-01,  ..., -2.3679e-02,
    


i=1240
Sanity Check!!

Start: 39680
End: 39712

Size of LastHidden Layer Size:torch.Size([32, 75, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-8.1833e-01, -6.5693e-03, -9.6273e-02,  ..., -5.9616e-01,
           2.6384e-01,  5.7008e-01],
         [-1.0244e+00,  2.5134e-01, -4.8872e-01,  ..., -3.4336e-01,
           8.4424e-01,  2.6877e-01],
         [-4.5525e-01, -6.4733e-01, -2.3941e-01,  ..., -2.2253e-01,
           5.2076e-02,  1.5825e-01],
         ...,
         [-4.6633e-01,  2.7864e-01, -6.4257e-03,  ..., -3.2443e-01,
           3.0815e-01,  3.4871e-01],
         [-5.1736e-01,  6.0960e-01,  2.1568e-01,  ..., -3.4207e-01,
           4.4657e-01,  2.2499e-01],
         [-5.3730e-01,  1.5965e-01, -3.9313e-02,  ..., -3.8832e-01,
           4.2769e-01,  1.3002e-01]],

        [[-3.3439e-01, -6.8260e-02,  2.2243e-01,  ..., -1.3759e-01,
           3.3524e-01,  8.7434e-01],
         [ 4.4146e-02,  1.5315e-02, -3.3160e-01,  ...,  4.2670e-01,
     


i=1270
Sanity Check!!

Start: 40640
End: 40672

Size of LastHidden Layer Size:torch.Size([32, 56, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-1.0761e+00, -2.2096e-01, -2.2628e-01,  ..., -3.4045e-01,
           2.9129e-01,  8.1764e-01],
         [-1.6989e-02,  2.3019e-01, -4.7108e-01,  ...,  2.6307e-01,
           5.5510e-01,  2.1838e-01],
         [-6.4820e-01, -7.7813e-01, -2.9025e-01,  ..., -4.7809e-01,
          -1.9668e-01,  6.0799e-01],
         ...,
         [-9.7618e-01, -2.2849e-01, -2.1521e-02,  ..., -1.2122e-01,
           2.4973e-01,  3.8975e-01],
         [-9.7805e-01, -2.6074e-01, -9.6309e-02,  ..., -7.5972e-02,
           2.4273e-01,  4.0527e-01],
         [-8.0418e-01, -9.7077e-02,  5.5713e-02,  ..., -2.6904e-01,
           7.5156e-02,  4.0306e-01]],

        [[-5.0606e-01, -4.3157e-02, -5.0550e-03,  ..., -3.3520e-01,
           1.5792e-01,  5.8491e-01],
         [-4.1915e-01, -3.7846e-01,  3.1821e-03,  ...,  5.8046e-01,
     


i=1290
Sanity Check!!

Start: 41280
End: 41312

Size of LastHidden Layer Size:torch.Size([32, 71, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-7.6944e-01, -2.9009e-02, -2.8337e-01,  ..., -5.1143e-01,
           1.5050e-01,  6.0310e-01],
         [-5.0464e-01, -3.2198e-01, -2.5615e-01,  ..., -7.8674e-03,
          -1.1970e-01, -6.4515e-01],
         [-1.1811e+00, -1.1481e+00, -2.9844e-01,  ...,  4.7719e-01,
          -5.8136e-01,  1.6838e-01],
         ...,
         [-9.2278e-01, -5.4667e-01, -5.4345e-01,  ...,  1.7444e-01,
           3.0097e-01,  5.3920e-01],
         [-5.2231e-01, -1.9766e-01, -4.7799e-02,  ..., -2.4515e-01,
           8.7660e-02,  4.6572e-01],
         [-7.4922e-01, -4.0131e-01, -3.3336e-01,  ...,  1.1784e-01,
           2.7728e-01,  4.4553e-01]],

        [[-3.1525e-01,  3.2395e-02,  1.4782e-01,  ..., -1.8034e-01,
           1.7422e-01,  6.8808e-01],
         [-3.8154e-01, -9.6758e-02, -2.4818e-01,  ...,  2.1266e-01,
     


i=1320
Sanity Check!!

Start: 42240
End: 42272

Size of LastHidden Layer Size:torch.Size([32, 100, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.7191,  0.0447, -0.0604,  ..., -0.5074,  0.2318,  0.8542],
         [-0.4827, -0.4323, -0.0425,  ...,  0.1140,  0.3774, -0.1805],
         [-0.3889, -0.7376, -0.0084,  ..., -0.3323, -0.2551,  0.6064],
         ...,
         [-0.7111, -0.2596,  0.2206,  ..., -0.0768,  0.4631,  0.7050],
         [-0.5745, -0.1447,  0.3250,  ..., -0.2265,  0.1652,  0.6396],
         [-0.7649, -0.2068,  0.2286,  ..., -0.0812,  0.4320,  0.6892]],

        [[-0.8756, -0.1103,  0.0508,  ..., -0.3089,  0.4871,  0.6464],
         [-0.3440, -0.0275, -0.1925,  ...,  0.1305,  0.4683, -0.1438],
         [-0.4983, -0.4669, -0.3348,  ...,  0.0722, -0.4181,  0.7149],
         ...,
         [-0.8094, -0.2663, -0.2483,  ...,  0.1695,  0.5847,  0.2483],
         [-0.6446, -0.0217,  0.3770,  ..., -0.2346,  0.2190,  0.2467],
         [-0


i=1350
Sanity Check!!

Start: 43200
End: 43232

Size of LastHidden Layer Size:torch.Size([32, 51, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.1279,  0.0843,  0.0767,  ..., -0.3324,  0.1529,  0.5432],
         [ 0.3386,  0.1888,  0.1408,  ...,  0.1467,  1.0326,  0.1923],
         [-0.1351, -0.2231, -0.0135,  ..., -0.0375, -0.1797,  0.5736],
         ...,
         [-0.1549, -0.2134, -0.2191,  ..., -0.1233,  0.3011,  0.1477],
         [-0.1662,  0.1402,  0.2217,  ..., -0.1241,  0.2685, -0.1108],
         [-0.2232, -0.1534,  0.0375,  ..., -0.0511,  0.4632,  0.2461]],

        [[-0.8655, -0.0786, -0.1779,  ..., -0.6099,  0.1803,  0.8392],
         [-0.1410, -0.0315, -0.3360,  ...,  0.1464,  0.5014,  0.0198],
         [-0.1385, -0.8733, -0.4977,  ..., -0.3828, -0.7765,  0.9127],
         ...,
         [-0.6431,  0.0463,  0.2480,  ..., -0.1841,  0.1262,  0.4203],
         [-0.5767,  0.1027,  0.2052,  ..., -0.2389,  0.1663,  0.4567],
         [-0.


i=1380
Sanity Check!!

Start: 44160
End: 44192

Size of LastHidden Layer Size:torch.Size([32, 58, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-4.8234e-01, -7.1443e-04,  1.3396e-01,  ..., -6.5841e-01,
           1.0859e-01,  4.7485e-01],
         [-9.0338e-01,  1.4877e-01,  1.3340e-01,  ..., -7.6993e-01,
           4.6966e-01, -1.6064e-01],
         [-2.9121e-01, -9.3190e-01,  7.3921e-01,  ..., -6.6983e-01,
          -6.8631e-02, -7.3289e-01],
         ...,
         [-5.8987e-01, -3.8426e-01,  3.7528e-01,  ..., -3.9990e-01,
           6.9107e-02,  3.0319e-01],
         [-4.7416e-01, -9.4258e-02,  4.5738e-01,  ..., -4.8488e-01,
          -3.6460e-03,  2.3290e-01],
         [-4.1032e-01, -1.1918e-01,  4.6158e-01,  ..., -4.1204e-01,
          -1.5748e-02,  1.2822e-01]],

        [[-2.8812e-01,  8.7883e-02, -1.5393e-01,  ..., -5.6507e-01,
           5.4471e-01,  3.6270e-01],
         [ 1.2300e-01, -7.0247e-01, -1.4626e-01,  ...,  3.4969e-01,
     


i=1400
Sanity Check!!

Start: 44800
End: 44832

Size of LastHidden Layer Size:torch.Size([32, 49, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.6398,  0.1266,  0.2515,  ..., -0.6232, -0.0795,  0.6765],
         [-0.2187, -0.2616, -0.2510,  ...,  0.1645,  0.0411, -0.5107],
         [-0.3472, -0.5463, -0.3894,  ..., -0.4424, -0.5993,  0.1541],
         ...,
         [-0.3631, -0.4585,  0.1410,  ..., -0.4038, -0.2558,  0.4822],
         [-0.4346, -0.2071,  0.1216,  ..., -0.5589,  0.0216,  0.4251],
         [-0.2307, -0.3364,  0.2222,  ..., -0.3180, -0.2996,  0.3928]],

        [[-0.7528, -0.1274,  0.1637,  ..., -0.3352, -0.0060,  0.6301],
         [-0.1781,  0.1099, -0.2105,  ...,  0.2213,  0.1806, -0.1416],
         [-0.9587, -1.0314,  0.4606,  ...,  0.0407, -0.0961, -0.1961],
         ...,
         [ 0.7897,  0.0854, -0.1296,  ...,  0.1313, -0.8169, -0.3233],
         [-0.3569, -0.1287, -0.1734,  ..., -0.2598,  0.0880,  0.4759],
         [-0.


i=1430
Sanity Check!!

Start: 45760
End: 45792

Size of LastHidden Layer Size:torch.Size([32, 92, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-8.3053e-01, -3.8574e-01, -9.1089e-02,  ..., -4.6015e-01,
           1.0230e-01,  8.5229e-01],
         [-7.6038e-01,  1.0503e-02,  2.0329e-01,  ...,  1.8308e-01,
           2.7244e-01,  3.2038e-02],
         [-7.8100e-01, -6.1500e-01,  6.3108e-02,  ..., -2.9259e-01,
          -5.1301e-01,  2.4917e-01],
         ...,
         [-2.1522e-01, -1.6738e-01,  4.0801e-02,  ..., -1.6584e-01,
          -5.2119e-01,  3.4314e-01],
         [-4.1508e-01, -4.1343e-01, -2.1907e-01,  ..., -8.6683e-02,
          -1.5667e-01,  1.8290e-01],
         [-5.3129e-01,  7.8200e-02,  1.6427e-01,  ...,  1.1310e-01,
          -5.9707e-01,  5.4062e-01]],

        [[-9.1256e-02,  1.2465e-01,  1.1536e-03,  ..., -1.6702e-01,
           1.4748e-01,  7.1859e-01],
         [ 5.5565e-02,  1.3305e-01, -3.4385e-01,  ...,  1.3803e-02,
     


i=1460
Sanity Check!!

Start: 46720
End: 46752

Size of LastHidden Layer Size:torch.Size([32, 72, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-5.7956e-01, -2.5352e-02, -2.3352e-01,  ..., -3.2217e-01,
           4.8143e-01,  4.3101e-01],
         [-8.9917e-01,  3.5145e-01, -3.3309e-01,  ...,  1.5649e-01,
           1.0117e+00, -6.7885e-02],
         [-6.0139e-01, -5.5835e-03,  1.3494e-02,  ..., -4.9870e-01,
           1.2715e-01, -5.8347e-01],
         ...,
         [-5.6902e-01,  1.9674e-01, -2.2185e-02,  ...,  1.9598e-01,
           1.6839e-01,  2.1197e-02],
         [-1.6564e-01,  9.6809e-02,  9.9824e-02,  ...,  1.0567e-01,
           1.1265e-01,  2.0028e-01],
         [-1.5340e-02,  4.3002e-02,  1.9808e-01,  ...,  2.7864e-02,
           6.8000e-02,  3.2022e-01]],

        [[-4.5377e-01,  2.5766e-01, -2.9201e-01,  ..., -3.5785e-01,
           1.9100e-01,  7.5715e-01],
         [-3.4400e-01,  1.2431e-01, -3.0891e-01,  ...,  2.9804e-01,
     


i=1480
Sanity Check!!

Start: 47360
End: 47392

Size of LastHidden Layer Size:torch.Size([32, 102, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-4.5501e-01,  7.7588e-02,  5.1225e-02,  ..., -3.3230e-01,
           6.7065e-02,  6.0445e-01],
         [-2.5025e-02,  1.2213e-01,  6.5487e-03,  ..., -3.3382e-01,
           6.1467e-01,  4.0685e-01],
         [-1.0263e+00,  1.6866e-01,  2.9988e-01,  ..., -4.5915e-01,
           7.9558e-01, -1.9854e-01],
         ...,
         [-1.9654e-01, -1.8484e-01,  3.7896e-01,  ..., -2.3717e-02,
           2.4490e-01,  3.6392e-01],
         [-5.1619e-01, -3.0521e-01, -5.5961e-05,  ...,  8.7033e-02,
           5.2843e-01,  4.1074e-01],
         [-4.2174e-01, -2.3850e-01, -7.4062e-02,  ...,  8.5486e-02,
           5.2303e-01,  3.5411e-01]],

        [[-6.0603e-01, -1.3281e-01, -1.0819e-01,  ..., -4.0802e-01,
           2.0320e-01,  6.4036e-01],
         [ 6.0212e-02,  1.7285e-02, -4.4371e-01,  ...,  3.5749e-02,
    


i=1510
Sanity Check!!

Start: 48320
End: 48352

Size of LastHidden Layer Size:torch.Size([32, 93, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.7974,  0.0159, -0.1375,  ..., -0.2411,  0.1950,  0.6480],
         [-0.6984, -0.0983,  0.3681,  ..., -0.1204,  0.6509,  0.0813],
         [-1.3527,  0.0903,  0.0537,  ..., -0.8324, -0.3807,  0.4424],
         ...,
         [-0.8613, -0.3148, -0.1256,  ...,  0.0151,  0.5969,  0.3103],
         [-0.5640, -0.0653,  0.2158,  ..., -0.0497,  0.3570,  0.3002],
         [-0.8713, -0.3430, -0.4374,  ...,  0.2538,  0.3809,  0.5075]],

        [[-0.4512,  0.0344,  0.0059,  ..., -0.6786,  0.0738,  0.7177],
         [-1.2196, -0.4344, -0.5986,  ..., -0.6089,  0.2068,  0.6893],
         [-0.4137, -0.0482,  0.1648,  ..., -0.5006, -0.1223, -0.5408],
         ...,
         [-0.5269,  0.0715,  0.0603,  ..., -0.3261, -0.3808,  0.0890],
         [-0.4794,  0.0103,  0.1496,  ..., -0.3132,  0.0318,  0.1285],
         [-0.


i=1540
Sanity Check!!

Start: 49280
End: 49312

Size of LastHidden Layer Size:torch.Size([32, 62, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.3190,  0.1757,  0.1896,  ..., -0.2686,  0.1675,  0.6816],
         [-0.2615,  0.3141, -0.3414,  ...,  0.3395,  0.5541, -0.1766],
         [-0.4394, -0.2354, -0.4885,  ...,  0.2190,  0.0179,  0.5489],
         ...,
         [ 0.0728, -0.1139,  0.4968,  ..., -0.0593,  0.0987,  0.1801],
         [ 0.0286, -0.0271,  0.3678,  ..., -0.0816,  0.1032,  0.2268],
         [-0.0057,  0.1244,  0.2630,  ..., -0.1065,  0.1259,  0.1631]],

        [[-0.4181, -0.2917,  0.2976,  ..., -0.2125,  0.3785,  0.6532],
         [-0.0699, -0.4342,  0.0471,  ...,  0.2439,  0.2992, -0.1495],
         [-0.5512, -0.7803,  0.1459,  ..., -0.1884,  0.2235,  0.2346],
         ...,
         [ 0.0556, -0.4301,  0.5568,  ...,  0.0435, -0.0528, -0.0768],
         [ 0.0920, -0.0397,  0.5349,  ...,  0.2280,  0.1494, -0.0830],
         [-0.


i=1570
Sanity Check!!

Start: 50240
End: 50272

Size of LastHidden Layer Size:torch.Size([32, 67, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-3.1681e-01, -4.6182e-01,  5.9422e-02,  ..., -3.8880e-01,
           9.4569e-02,  5.1601e-01],
         [-7.1660e-01,  1.4279e-01, -2.6924e-01,  ..., -1.7123e-02,
           7.9799e-01,  5.6780e-02],
         [-1.2790e-01, -1.1909e+00,  1.7800e-01,  ..., -5.2491e-01,
           1.6898e-01, -5.5409e-01],
         ...,
         [-1.6252e-01, -4.4478e-01,  6.1244e-02,  ..., -9.0693e-02,
           4.5602e-01,  2.0640e-01],
         [-2.4827e-01, -5.5354e-01,  4.0187e-01,  ..., -8.8766e-02,
           2.0735e-01, -1.4757e-01],
         [ 2.3500e-01, -7.6196e-03,  2.2865e-01,  ...,  7.9277e-04,
          -5.5860e-02,  3.7836e-01]],

        [[-5.3244e-01, -4.6835e-02,  4.0743e-02,  ..., -4.1343e-01,
           2.1669e-02,  6.3845e-01],
         [ 4.7353e-01,  5.4547e-01, -4.6258e-01,  ..., -4.7148e-02,
     


i=1600
Sanity Check!!

Start: 51200
End: 51232

Size of LastHidden Layer Size:torch.Size([32, 66, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-4.1089e-01, -1.6821e-01,  2.0879e-01,  ..., -2.9169e-01,
           2.6159e-01,  8.2176e-01],
         [-2.2692e-01, -2.1332e-01,  6.2322e-03,  ...,  1.4835e-01,
           3.0534e-01,  5.5722e-02],
         [-4.0558e-01, -8.0826e-01,  1.7385e-01,  ..., -1.9845e-01,
          -3.5662e-01,  6.7372e-01],
         ...,
         [-2.5199e-01, -2.6697e-01,  6.1807e-01,  ..., -3.0749e-02,
           1.1068e-01,  5.2697e-01],
         [-1.7385e-01, -6.8262e-02,  5.9062e-01,  ..., -3.4655e-02,
           3.7276e-02,  5.0793e-01],
         [-3.0631e-01, -4.4486e-01,  4.6461e-01,  ...,  1.2827e-01,
           2.7509e-01,  5.6538e-01]],

        [[-1.0362e+00, -7.4121e-02, -1.0906e-01,  ..., -3.3829e-01,
           4.0497e-01,  8.3473e-01],
         [-3.4419e-01, -4.7054e-01, -5.2448e-01,  ...,  1.1484e-01,
     


i=1630
Sanity Check!!

Start: 52160
End: 52192

Size of LastHidden Layer Size:torch.Size([32, 61, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-6.0269e-01,  7.1275e-04,  4.6279e-02,  ..., -4.5669e-01,
           2.3427e-01,  6.5363e-01],
         [-5.0927e-01, -5.3110e-02, -1.3220e-01,  ..., -7.7408e-02,
           1.9144e-01, -4.2693e-01],
         [-5.1627e-01, -4.8831e-01,  1.3056e-01,  ..., -2.0895e-01,
          -5.4963e-01,  5.5507e-01],
         ...,
         [-3.9472e-01, -2.4220e-01,  4.0400e-01,  ..., -4.0355e-02,
          -6.7121e-02,  1.5441e-01],
         [-1.7485e-01,  2.4590e-03,  4.0965e-01,  ..., -4.1821e-02,
           9.4490e-02, -2.0762e-01],
         [-1.0930e-01, -1.2356e-02,  3.8546e-01,  ..., -3.1004e-02,
           1.2122e-01, -1.9480e-01]],

        [[-3.2275e-01, -5.1727e-01,  1.1121e-01,  ..., -5.9681e-01,
           3.2106e-01,  7.8569e-01],
         [ 2.2852e-01, -6.1362e-02, -9.7381e-02,  ..., -3.6339e-02,
     


i=1660
Sanity Check!!

Start: 53120
End: 53152

Size of LastHidden Layer Size:torch.Size([32, 85, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-4.3408e-01,  7.4592e-02, -6.4419e-02,  ..., -4.8164e-01,
           2.6781e-01,  5.5573e-01],
         [-3.9077e-01,  9.2923e-03, -1.6465e-01,  ..., -1.0893e-01,
           2.8301e-01, -5.8216e-01],
         [-1.9799e-01, -4.6346e-01,  1.5239e-01,  ..., -2.9148e-01,
          -4.7083e-01, -1.5661e-01],
         ...,
         [-8.2186e-01,  2.4676e-02, -2.9425e-01,  ..., -1.1999e-01,
           2.6301e-01,  3.9343e-01],
         [-8.6009e-01, -1.5682e-01, -3.4591e-01,  ..., -8.6983e-03,
           3.3206e-01,  3.1158e-01],
         [-8.9876e-01, -4.5643e-02, -4.0072e-01,  ..., -1.2384e-01,
           2.7741e-01,  3.0819e-01]],

        [[-8.7794e-01, -4.0558e-02, -7.0775e-02,  ..., -2.3561e-01,
           6.7390e-02,  7.3793e-01],
         [-1.0536e+00,  3.9628e-01,  5.4010e-02,  ...,  1.1054e-01,
     


i=1680
Sanity Check!!

Start: 53760
End: 53792

Size of LastHidden Layer Size:torch.Size([32, 56, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.6800,  0.1472, -0.0612,  ..., -0.0862,  0.0815,  0.5751],
         [-0.1573,  0.2973, -0.5203,  ...,  0.2956,  0.3587,  0.1714],
         [-0.7481, -0.0746,  0.3875,  ...,  0.3198, -0.0905, -0.3032],
         ...,
         [-0.7036,  0.0768, -0.4195,  ...,  0.0624,  0.0120,  0.4629],
         [-0.6132, -0.0554,  0.1923,  ...,  0.2306, -0.2806,  0.2096],
         [-0.7201,  0.1226, -0.3066,  ..., -0.0124, -0.0835,  0.4601]],

        [[-0.2427, -0.2362, -0.0333,  ..., -0.3565,  0.2436,  0.5166],
         [ 0.1475,  0.0067, -0.2569,  ...,  0.0457,  0.3437,  0.1695],
         [-0.1664, -0.6126,  0.0745,  ...,  0.0389,  0.1448, -0.3017],
         ...,
         [-0.0808, -0.2543,  0.2518,  ..., -0.2044,  0.1870,  0.0999],
         [-0.1664, -0.4297, -0.1914,  ..., -0.0775,  0.2719,  0.1007],
         [-0.


i=1710
Sanity Check!!

Start: 54720
End: 54752

Size of LastHidden Layer Size:torch.Size([32, 66, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-0.5811, -0.0481, -0.0222,  ..., -0.4722,  0.0084,  0.8056],
         [-0.7959, -0.3710, -0.2735,  ...,  0.0794,  0.3657, -0.7167],
         [-0.5715, -0.2303, -0.0551,  ..., -0.2161, -0.2024,  0.8684],
         ...,
         [-0.4412, -0.4625,  0.1441,  ...,  0.0023,  0.2790,  0.1590],
         [ 0.2225, -0.3506,  0.2633,  ..., -0.1192,  0.2365,  0.2484],
         [-0.3227, -0.6951,  0.0816,  ...,  0.0335,  0.3247, -0.0191]],

        [[-0.8617,  0.0148, -0.1581,  ..., -0.3013,  0.1721,  0.6167],
         [-0.5601,  0.0648, -0.2381,  ..., -0.0861, -0.0984, -0.5380],
         [-1.0396, -0.6927, -0.4200,  ...,  0.4761, -0.9501,  0.3220],
         ...,
         [-0.8117, -0.2397,  0.1281,  ..., -0.1736,  0.2829,  0.4800],
         [-0.7296, -0.1101, -0.0423,  ..., -0.0731,  0.2793,  0.4489],
         [-0.


i=1740
Sanity Check!!

Start: 55680
End: 55712

Size of LastHidden Layer Size:torch.Size([32, 50, 768])
Pooler output size:torch.Size([32, 768])
            LastHidden Layer: tensor([[[-1.0093, -0.1271, -0.2306,  ..., -0.5149,  0.1722,  1.0066],
         [-0.5753,  0.0229, -0.3885,  ...,  0.1471,  0.0945, -0.2974],
         [-0.4477, -0.3775, -0.6069,  ..., -0.3100, -0.5189,  0.6909],
         ...,
         [-1.0947, -0.1750,  0.0806,  ..., -0.2064,  0.2015,  0.8802],
         [-1.0987, -0.2349,  0.0762,  ..., -0.2285,  0.2484,  0.8604],
         [-0.9940, -0.1528,  0.0529,  ..., -0.3231,  0.0141,  0.7753]],

        [[-0.0446, -0.0382,  0.2009,  ..., -0.1000,  0.6883,  0.4686],
         [ 0.1332, -0.4041,  0.0233,  ...,  0.3974,  0.4271, -0.1816],
         [-0.0781, -0.6564,  0.8106,  ..., -0.3620, -0.2321,  0.0646],
         ...,
         [-0.1395, -0.1115,  0.1745,  ...,  0.4431,  0.4560, -0.4498],
         [-0.1269, -0.0154,  0.4235,  ...,  0.3803,  0.4507, -0.0520],
         [-0.