# 2nd try

In [17]:
import json
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import pandas as pd 

In [18]:
TRAIN_FILE_PATH = "data/train.json"

with open(TRAIN_FILE_PATH, "r", encoding="utf-8") as f:
    train_data = json.load(f)
# train_data

In [19]:
len(train_data)

12330

In [20]:
train_data[0].keys()

dict_keys(['ner_tags', 'ner_ids', 'tokens', 'space_after'])

In [22]:
TEST_FILE_PATH = "data/test.json"

with open(TEST_FILE_PATH, "r", encoding="utf-8") as f:
    test_data = json.load(f)
# test_data

In [23]:
len(test_data)

2421

In [24]:
TAG2ID_FILE_PATH = "data/tag_to_id.json"

with open(TAG2ID_FILE_PATH, "r", encoding="utf-8") as f:
    tag2id = json.load(f)
# tag2id

## Split Data

In [25]:
X = [{'tokens' : data['tokens'], 'space_after' : data['space_after']} for data in train_data]
y = [{'ner_ids' : data['ner_ids'], 'ner_tags' : data['ner_tags']} for data in train_data]

In [27]:
X_test = [{'tokens' : data['tokens'], 'space_after' : data['space_after']} for data in test_data]

In [28]:
X_train, X_validate, y_train, y_validate = train_test_split(X, y, test_size=0.2, random_state=35)

## Create DataLoaders

## Apply Bert

In [12]:
import torch # Neural Networks
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModel, AdamW # Transformers
from sklearn.manifold import TSNE # Data projection
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [36]:
MAX_TOKENS_PER_ENTRY = max([len(data['tokens']) for data in train_data])

In [79]:
class DatasetTransformer(Dataset):
    def __init__(self, X, y, tokenizer):
        self.tokenizer = tokenizer
        self.X = X
        self.y = y
            
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        tokens = self.X[idx]
        labels = self.y[idx]
        
        # Remove cedilla diacritics as suggested in
        # https://huggingface.co/dumitrescustefan/bert-base-romanian-uncased-v1
        tokens = [
            token.replace("ţ", "ț").replace("ş", "ș").replace("Ţ", "Ț").replace("Ş", "Ș") 
            for token in tokens
        ]
        print(tokens)
        text_tensor = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens))
        print(text_tensor)
        print('shape',text_tensor.shape)
        padder = torch.zeros(MAX_TOKENS_PER_ENTRY - len(tokens))
        text_tensor = torch.cat([text_tensor, padder])
        print(text_tensor.shape)
        
        return text_tensor, torch.tensor(labels)


In [40]:
tokenizer = AutoTokenizer.from_pretrained(
    "dumitrescustefan/bert-base-romanian-uncased-v1", do_lower_case=False)
# TODO INCEARCA SI CU True

In [69]:
X_train = [data['tokens'] for data in train_data]
# X_train

In [73]:
y_train = [data['ner_ids'] for data in train_data]
# y_train

In [83]:
ds_train = DatasetTransformer(X_train, y_train, tokenizer)
ds_val = DatasetTransformer(X_validate, y_validate, tokenizer)

BATCH_SIZE = 16

train_dataloader = DataLoader(
    ds_train, 
    sampler=RandomSampler(ds_train), 
    batch_size=BATCH_SIZE
)

val_dataloader = DataLoader(
    ds_val, 
    sampler=SequentialSampler(ds_val), 
    batch_size=BATCH_SIZE
)

# iter = next(iter(train_dataloader))
# print(iter)
iter = iter(train_dataloader)
print(iter.next())
# data, label = next(iter)
# print(data.shape)
# print(data)

# todo preda baga num_workers

TypeError: '_SingleProcessDataLoaderIter' object is not callable

In [48]:
class TransformerModel(nn.Module):
    def __init__(self, in_dim=768, no_classes=2):
        super(TransformerModel, self).__init__()
        # Get the romanian Transformer from the huggingface server
        self.transformer = AutoModel.from_pretrained("dumitrescustefan/bert-base-romanian-uncased-v1")
        # Add a linear layer for classification
        self.fc1 = nn.Linear(in_dim, no_classes)
        
    def forward(self, x):
        out = x.squeeze(1)
        # Get output from Transformer.
        # We want the special [CLS] representation ([:,0,:]) from the last layer ([0]) 
        out = self.transformer(out)[0][:,0,:]
        # We usually add dropout before the final classification layer when using a Transformer
        out = F.dropout(out, p=0.1)
        out = self.fc1(out)
        return out