In [1]:
!pip install -qU 'transformers[torch]' datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m97.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m70.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from pprint import pprint

## **Load and preprocess the dataset.**

In [3]:
# 1. Load the dataset from the hub.

from datasets import load_dataset
sms = load_dataset('sms_spam')
sms

Downloading builder script:   0%|          | 0.00/3.21k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.69k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/4.87k [00:00<?, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/5574 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sms', 'label'],
        num_rows: 5574
    })
})

In [4]:
sms = sms['train'].train_test_split(train_size=0.85, seed=42, stratify_by_column='label')
sms

DatasetDict({
    train: Dataset({
        features: ['sms', 'label'],
        num_rows: 4737
    })
    test: Dataset({
        features: ['sms', 'label'],
        num_rows: 837
    })
})

In [5]:
pprint(sms['train'][0])

{'label': 0,
 'sms': "She said,'' do u mind if I go into the bedroom for a minute ? '' "
        "''OK'', I sed in a sexy mood. She came out 5 minuts latr wid a "
        'cake...n My Wife,\n'}


In [6]:
# 2. Tokenization using a tokenization_function and map method
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

def tokenization_function(inp):
    return tokenizer(inp['sms'], truncation=True)

tokenized_dataset = sms.map(tokenization_function, batched=True)
tokenized_dataset

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4737 [00:00<?, ? examples/s]

Map:   0%|          | 0/837 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sms', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4737
    })
    test: Dataset({
        features: ['sms', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 837
    })
})

In [7]:
# Remove some of the columns that the model doesn't require.
tokenized_dataset = tokenized_dataset.remove_columns(['sms'])
tokenized_dataset = tokenized_dataset.rename_column('label', 'labels')
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4737
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 837
    })
})

In [8]:
# 3. Dynamic padding using DataCollator.

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
# Login to the hub and paste your WRITE access token
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## **Training**

In [10]:
# 1. Define TrainingArguments
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='bert-base-uncased-finetuned-smsspam',
    evaluation_strategy='epoch',
    save_strategy="epoch",
    num_train_epochs=4,
    push_to_hub=True,
)

# 2. Define your Model
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# 3. Define the compute metric
import numpy as np
!pip install -qU evaluate
import evaluate

def compute_metrics(eval_preds):
    metrics = evaluate.combine([
        evaluate.load('accuracy'),
        evaluate.load('precision'),
        evaluate.load('recall'),
        evaluate.load('f1')
    ])
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metrics.compute(predictions=predictions, references=labels)

# 4. Define Trainer object
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset = tokenized_dataset['train'],
    eval_dataset = tokenized_dataset['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,

)
# 5. Train the transformer
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h

Cloning https://huggingface.co/shre-db/bert-base-uncased-finetuned-smsspam into local empty directory.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0828,0.053816,0.989247,0.972477,0.946429,0.959276
2,0.0269,0.179205,0.967742,0.824427,0.964286,0.888889
3,0.0229,0.062332,0.991637,0.981651,0.955357,0.968326
4,0.0043,0.063694,0.990442,0.981481,0.946429,0.963636


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

TrainOutput(global_step=2372, training_loss=0.028858492940877882, metrics={'train_runtime': 440.5687, 'train_samples_per_second': 43.008, 'train_steps_per_second': 5.384, 'total_flos': 526709277283440.0, 'train_loss': 0.028858492940877882, 'epoch': 4.0})

In [11]:
trainer.push_to_hub(commit_message="Training complete")

To https://huggingface.co/shre-db/bert-base-uncased-finetuned-smsspam
   d19530f..7fbe6ee  main -> main

   d19530f..7fbe6ee  main -> main

To https://huggingface.co/shre-db/bert-base-uncased-finetuned-smsspam
   7fbe6ee..4409b68  main -> main

   7fbe6ee..4409b68  main -> main



'https://huggingface.co/shre-db/bert-base-uncased-finetuned-smsspam/commit/7fbe6ee42d2136113ea3983d5c335bdb8c8ad7e9'

In [None]:
classifier = AutoModelForSequenceClassification.from_pretrained('test_trainer/checkpoint-1500')
classifier

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [12]:
from transformers import pipeline

In [13]:
classifier = pipeline("sentiment-analysis", model="shre-db/bert-base-uncased-finetuned-smsspam")

Downloading (…)lve/main/config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [20]:
model.config.id2label

{0: 'LABEL_0', 1: 'LABEL_1'}

LABEL_0: NOT SPAM<br>
LABEL_1: SPAM

In [14]:
sample = "Become a sub-broker today! Monthly payouts on a high revenue-sharing model with zero office expenses! txtby.me/tc/2GSjt9Fo Um9-ADMPL."

In [15]:
classifier(sample)

[{'label': 'LABEL_1', 'score': 0.9999496936798096}]

In [16]:
sample = 'Dear MyMochi Member, The most awaited #MochiEOSS is here! Get Flat 50% off on Mochi & upto 50% off on other brands from 14 - 16 July. Sale preview for you on 13th July.'

In [17]:
classifier(sample)

[{'label': 'LABEL_1', 'score': 0.9999488592147827}]

In [18]:
sample = "Hey Cleon! How's your masters going in Canada?"

In [19]:
classifier(sample)

[{'label': 'LABEL_0', 'score': 0.9998865127563477}]