In [1]:
import torch
import transformers 


# Contents
- Documented
    - Datasets
    - DataLoader and Multi-Processing in Pytorch
    - DataLoader and Memory-Pinning in PyTorch

- Todo
    - Torchscript

## Datasets
(Documentation from pytorch)

 torch.utils.data.Dataset is an abstract class representing a dataset. Your custom dataset should inherit Dataset and override the following methods:
 
`__len__` so that `len(dataset)` returns the size of the dataset.

`__getitem__` to support indexing such that `dataset[i]` can be used to get the i-th sample.
Also implement `__init__` to define how the data is read.

This is a **map**-style dataset. We can also have an iterable-style dataset, where we implement the `__iter__` protocol.


In [1]:
# we will be creating a dataset for the following sample file. 
! cat ../data/sample_text_label.csv

text,label
aaaa,0
bbbb,1
cccc,1
dddd,0
eeee,0
table,1

In [2]:
# create a sample dataset to read a file which has two cols : (text, label)
# we have to define two methods : __len__ and __getitem__
"""
 torch.utils.data.Dataset is an abstract class representing a dataset. Your custom dataset should inherit Dataset and override the following methods:
__len__ so that len(dataset) returns the size of the dataset.
__getitem__ to support the indexing such that dataset[i] can be used to get ith sample
"""
# we also define __init__ to read the input file and/or apply some text (or image) transformations
from torch.utils.data.dataset import Dataset
import pandas as pd

# Dataset??
class MyDataset(Dataset):
    """Map-style dataset over a CSV file with "text" and "label" columns.

    Each item is a ``(text, label)`` pair. When a tokenizer is supplied, every
    text is replaced by the result of ``tokenizer.encode`` at construction time.
    """

    def __init__(self, csv_file, tokenizer=None, max_length=128):
        """
        Args :
            csv_file (string) : Path to the csv file with data (anything ``pd.read_csv``
                accepts); it must have a header row naming the "text" and "label" columns
            tokenizer (optional) : Object exposing
                ``encode(text, add_special_tokens=..., max_length=...)``; when given,
                every text is encoded once, up front
            max_length (int) : Forwarded to ``tokenizer.encode`` as ``max_length``
        """
        df = pd.read_csv(csv_file)
        # Drop rows where either column is missing; build a new frame rather than mutating in place.
        df = df.dropna(subset=["text", "label"])

        text = df["text"]
        # Compare against None instead of relying on truthiness: an object that
        # defines __len__ would be treated as "no tokenizer" when its length is 0.
        if tokenizer is not None:
            text = text.apply(
                lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=max_length)
            )

        self.text = text.to_numpy()
        self.labels = df["label"].to_numpy()

    def __len__(self):
        """Return the number of samples (rows kept after dropping missing values)."""
        # Fixed: this used to return len(self.data), but no `data` attribute is ever set.
        return len(self.text)

    def __getitem__(self, index):
        """Return the ``(text, label)`` pair stored at ``index``."""
        return self.text[index], self.labels[index]
        
    

In [33]:
# implementation ->
# Huggingface : read a dataset using the sample file we have. Initialize a tokenizer from BERT (with vocab file)
# Import from the package root: the `transformers.tokenization_bert` module path
# was moved in transformers v4, while the root import works across versions.
from transformers import BertTokenizer

# Same sample file that is shown with `cat` earlier in the notebook; used for both examples
# so the two datasets are guaranteed to read identical data.
sample_csv = "../data/sample_text_label.csv"

#1 -> simple dataset with text and label (no tokenizing)
dataset = MyDataset(sample_csv)
print(dataset[-1])

#2 > huggingface -> read and apply tokenizer for model
pretrained_model_path =("eBERT-base-titles-uncased-v2/")
tok = BertTokenizer(pretrained_model_path + "vocab.txt")
dataset = MyDataset(sample_csv, tok)
# The printed list holds token ids (not embeddings): [CLS] id, id(s) for "table", [SEP] id.
print ("CLS, embedding for table, SEP")
print(dataset[-1])

('table', 1)
CLS, embedding for table, SEP
([0, 466, 1], 1)


In [38]:
## DataLoader
# - inputs are -> Dataset (from above) 
from torch.utils.data import DataLoader
# to see the signature : uncomment this line 
# DataLoader?

## DataLoader and Multi-Processing in Pytorch
- meaning of collate : collect and combine
- Collate function -> to collate samples into batches. Torch has customized collation possible (and is often used)
    - When automatic batching is disabled : fn called with individual data point, converts numpy -> pytorch
    - When automatic batching is enabled : list of data samples at a time, expected to combine input samples into a batch for yielding from data loader generator 
    - Always prepends a new dimension as the batch dimension
    - numpy arrays -> Pytorch tensors
    - Preserves the data structure ,but batched (if enabled)
- Multi-processing
    - The Python Global Interpreter Lock prevents fully parallelizing code across threads; PyTorch provides an easy switch to perform multi-process data loading by setting num_workers to a positive int
    - Single-process data loading is used by default -> data fetching is done in the same process in which the DataLoader is initialized. Preferred when the resources used for sharing data between processes are limited or the entire dataset can be loaded in memory
        - more readable, good for debugging
    - PyTorch dataloaders give much faster data access than the regular I/O performed on disk, because of multi-processing: num_workers is customizable
        - once num_workers > 0, dataset, collate, worker_init_fn are passed to each worker
        - torch.utils.data.get_worker_info returns useful info in a worker process
        - map style -> main process generates indices using sampler and send to workers. Shuffle/randomization done in the main process
        - recommended not to return CUDA tensors in multi-processing
    - 
- Should be used to handle large datasets
- Default batch size is 1 (`batch_size=1`)

## Converting pre-trained TF Bert model to Pytorch Model 

In [13]:
import torch
from transformers import *
import os

# (model class, tokenizer class, pretrained weights name) for the model to export.
MODELS = [(BertModel,       BertTokenizer,       'bert-base-uncased'),]
output_dir = "../models/pytorch_bert_uncased/"

# Fixed: `pretrained_weights` used to be undefined because the loop that bound it
# was commented out; unpack the single MODELS entry explicitly instead.
model_class, tokenizer_class, pretrained_weights = MODELS[0]

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

# Create the target directory first so save_pretrained does not depend on it already existing.
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)

# Show the files written by save_pretrained (last expression -> cell output).
os.listdir(output_dir)

['config.json', 'pytorch_model.bin']