<a href="https://colab.research.google.com/github/rahulsm27/ML/blob/main/Text_Generation_using_Fnet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Install libraries and import

In [43]:
!pip install datasets
!pip install torch[transformers]



In [2]:
import torch

## 2. Loading Data

In [3]:
from datasets import load_dataset

# Fetch the raw WikiText-2 corpus; the result is bound to `datasets` and is
# displayed in the next cell.
datasets = load_dataset(path='wikitext', name='wikitext-2-raw-v1')

Downloading builder script:   0%|          | 0.00/8.48k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/6.84k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.72M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [4]:
# Display the loaded object to see which splits exist and how many rows each has.
datasets

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

## 3. PREPROCESSING DATA

In [5]:
import re
def preprocess_text(sentence):
  """Normalise the ``'text'`` field of a single example.

  Steps, in order:
    1. lower-case the text;
    2. replace every character that is not ``a``-``z`` or one of ``? ! . ,``
       with a single space (this also turns digits, newlines and tabs into spaces);
    3. collapse every run of two or more whitespace characters into one space.

  Leading/trailing spaces are NOT stripped, so a result may begin or end
  with one space.

  Args:
    sentence: mapping with a string under the key ``'text'``.

  Returns:
    The same ``sentence`` object, with ``sentence['text']`` overwritten in place
    by the cleaned string.
  """
  text = sentence['text'].lower()
  # Raw strings keep the regex escapes intact; a plain '\s' is an invalid
  # string escape and triggers a warning on recent Python versions.
  text = re.sub(r'[^a-z?!.,]', ' ', text)
  text = re.sub(r'\s\s+', ' ', text)
  sentence['text'] = text
  return sentence

In [6]:
# Run the text clean-up over every split, replacing each split with its
# cleaned version (same order as before: train, test, validation).
for split_name in ('train', 'test', 'validation'):
  datasets[split_name] = datasets[split_name].map(preprocess_text)

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [7]:
# Keep only the rows whose cleaned text is longer than 20 characters,
# for each split in turn (train, test, validation).
for split_name in ('train', 'test', 'validation'):
  datasets[split_name] = datasets[split_name].filter(lambda row: len(row['text']) > 20)

Filter:   0%|          | 0/36718 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4358 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [8]:
# Display the splits again to see the row counts left after filtering.
datasets

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 2312
    })
    train: Dataset({
        features: ['text'],
        num_rows: 18794
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 1988
    })
})

## 4. TOKENIZATION

In [60]:
from transformers import AutoTokenizer

# Load the tokenizer published under the 'bert-base-uncased' checkpoint name;
# it is used below to turn the cleaned text into token ids.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [61]:
# Display the special tokens this tokenizer defines (unknown, separator, padding, ...).
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [75]:
# Number of distinct token ids the tokenizer can produce. `len(tokenizer)` gives
# the full vocabulary size (including any added tokens) without first building
# the whole vocab dictionary just to count it.
vocab_size = len(tokenizer)
# Echo the value so the cell actually displays it (an assignment alone shows nothing).
vocab_size

30522

In [91]:
def tokenize(sentence):
  """Tokenize the ``'text'`` field of ``sentence`` with the notebook's ``tokenizer``.

  Truncation is enabled; no explicit maximum length is passed here.

  Args:
    sentence: mapping holding the text to encode under the key ``'text'``.

  Returns:
    Whatever ``tokenizer(...)`` returns for that text.
  """
  return tokenizer(sentence['text'], truncation = True)

# Tokenize the training split. `batched=True` hands `tokenize` many rows at once
# (so `sentence['text']` is a list of strings), which lets the tokenizer encode
# them in one call instead of one call per row; the resulting columns are the same.
tokenized_inputs = datasets['train'].map(tokenize, batched=True)

Map:   0%|          | 0/18794 [00:00<?, ? examples/s]

In [93]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

batch = 16

# The padding collator needs each example as a dict-like feature containing
# 'input_ids' (it pads via the tokenizer); a list of (input_ids, attention_mask)
# tuples is rejected when the loader is iterated. Feed it the dataset itself,
# keeping only the two columns the original code selected and dropping the rest
# (e.g. the raw 'text' strings, which cannot be padded or converted to tensors).
model_columns = ['input_ids', 'attention_mask']
collate_ready = tokenized_inputs.remove_columns(
    [name for name in tokenized_inputs.column_names if name not in model_columns])

# Pads every example in a batch to the length of the longest one in that batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)
dataloader = DataLoader(collate_ready, batch_size=batch,
                        collate_fn=data_collator)

In [89]:
# NOTE(review): `x` is not assigned in any cell visible in this notebook, so this
# cell presumably relies on leftover kernel state and would raise NameError on a
# fresh "Restart & Run All" — TODO confirm where `x` comes from, or remove the cell.
x

{'input_ids': [101, 5925, 1040, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [92]:
# Display the tokenized training split to check which columns it now contains.
tokenized_inputs

Dataset({
    features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 18794
})