In [3]:
# Attach Google Drive so that files under /content/drive are reachable from this runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments
from sklearn.metrics import confusion_matrix

In [7]:
# Read the tab-separated SMS spam file from Drive; the two column names are
# supplied explicitly through `names`.
# NOTE(review): this load yields 5,572 rows, while the dataset is usually
# described as 5,574 messages -- possibly stray quote characters merging lines.
# Confirm whether `quoting=csv.QUOTE_NONE` is needed.
DATA_PATH = '/content/drive/MyDrive/Meta-Learning LLMs with Rozhina/data/SMSSpamCollection'
messages = pd.read_csv(DATA_PATH, sep='\t', names=["label", "message"])
df = messages  # both names refer to the same DataFrame object
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Sanity check on the loaded frame: (number of rows, number of columns).
df.shape

(5572, 2)

In [9]:
# Pull the message texts and their labels out of the frame as plain Python lists.
X = df['message'].tolist()
y = df['label'].tolist()

In [10]:
# Encode the labels as integers: 1 for 'spam', 0 otherwise ('ham').
# The encoding is derived from df['label'] rather than from `y` itself so the
# cell is idempotent: re-running it cannot try to re-encode labels that are
# already 0/1. The explicit int cast matters because pd.get_dummies returns
# booleans by default in pandas >= 2.0, which are not valid integer class ids.
y = (df['label'] == 'spam').astype(int).tolist()

In [11]:
# Hold out 20% of the messages for evaluation; the fixed random_state keeps the
# split the same on every run.
# NOTE(review): the split is not stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [13]:
# DistilBERT is a smaller, faster Transformer obtained by distilling BERT base;
# load the fast tokenizer that belongs to its uncased checkpoint.
PRETRAINED_CHECKPOINT = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(PRETRAINED_CHECKPOINT)

In [15]:
# truncation=True cuts any message that exceeds the model's maximum input length.
# padding=True pads every message up to the longest one in the same call, so
# all sequences within a split end up with the same length.
tokenize_options = dict(truncation=True, padding=True)
train_encodings = tokenizer(X_train, **tokenize_options)
test_encodings = tokenizer(X_test, **tokenize_options)

In [16]:
# Wrap each split as a tf.data.Dataset whose elements are
# (dict of encoded features, label) pairs.
train_features = dict(train_encodings)
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, y_train))

test_features = dict(test_encodings)
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, y_test))

In [17]:
# Hyper-parameters for the fine-tuning run, collected in one mapping and then
# handed to TFTrainingArguments as keyword arguments.
training_config = {
    'output_dir': './results',           # output directory
    'num_train_epochs': 2,               # total number of training epochs
    'per_device_train_batch_size': 8,    # batch size per device during training
    'per_device_eval_batch_size': 16,    # batch size for evaluation
    'warmup_steps': 500,                 # warmup steps for the learning-rate scheduler
    'weight_decay': 0.01,                # strength of weight decay
    'logging_dir': './logs',             # directory for storing logs
    'logging_steps': 10,
    'eval_steps': 800,
}
training_args = TFTrainingArguments(**training_config)

In [18]:
# Instantiate the pretrained classifier inside the distribution-strategy scope
# exposed by the training arguments, then fine-tune it on the training split
# (the test split is passed as the evaluation dataset).
strategy = training_args.strategy
with strategy.scope():
    model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = TFTrainer(
    model=model, args=training_args, train_dataset=train_dataset, eval_dataset=test_dataset
)
trainer.train()

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [19]:
# Evaluate the fine-tuned model on the held-out test split (reports the eval loss).
trainer.evaluate(test_dataset)

{'eval_loss': 0.019006737640925815}

In [20]:
# Run the model over the whole test set. The result bundles three fields, in
# this order: predictions (one pair of per-class logits per message),
# label_ids (the labels supplied with the dataset) and metrics.
trainer.predict(test_dataset)

PredictionOutput(predictions=array([[ 3.0453787, -2.9654205],
       [-2.8517468,  2.9679985],
       [ 3.149217 , -3.0912495],
       ...,
       [ 3.0850563, -2.941075 ],
       [-2.8595295,  2.994571 ],
       [ 2.7175896, -2.6989331]], dtype=float32), label_ids=array([0, 1, 0, ..., 0, 1, 0], dtype=int32), metrics={'eval_loss': 0.019013995783669607})

In [21]:
# Element [1] of the prediction output is label_ids (the labels that came with
# the dataset, not the model's guesses); its length is the test-set size.
trainer.predict(test_dataset)[1].shape

(1115,)

In [22]:
# Predicted class id (0 = ham, 1 = spam) for every test message.
# trainer.predict returns (predictions, label_ids, metrics). Index [1] is
# label_ids, i.e. the ground-truth labels copied from the dataset, so using it
# as the model output would compare y_test with itself and always look perfect.
# Take the argmax over the per-class logits in `predictions` instead.
output = np.argmax(trainer.predict(test_dataset).predictions, axis=1)

In [23]:
# Confusion matrix of the test labels (rows) against the values in `output` (columns).
cm = confusion_matrix(y_true=y_test, y_pred=output)
cm

array([[955,   0],
       [  0, 160]])

In [24]:
# Persist the fine-tuned model to Drive.
# NOTE(review): the directory is called 'sentiment_model' although this model
# classifies spam vs. ham; the name is kept because the reload path must match.
SAVED_MODEL_DIR = '/content/drive/MyDrive/Meta-Learning LLMs with Rozhina/sentiment_model'
trainer.save_model(SAVED_MODEL_DIR)

In [25]:
# Reload the fine-tuned classifier from Drive and score a single example message.
model_path = '/content/drive/MyDrive/Meta-Learning LLMs with Rozhina/sentiment_model'
loaded_model = TFDistilBertForSequenceClassification.from_pretrained(model_path)

# Load the tokenizer from the base checkpoint (this notebook never saves a
# tokenizer of its own, so the pretrained one is the matching vocabulary).
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Define the input sentence
input_sentence = "be our lucky WINNER today.Dont miss this. call 786012964 "

# Tokenize the input sentence and keep the whole encoding. Passing only
# input_ids would drop the attention_mask, so any padded input (e.g. a batch of
# sentences with padding=True) would be scored with pad tokens attended to.
inputs = tokenizer(input_sentence, padding=True, truncation=True, return_tensors="tf")
input_ids = inputs['input_ids']  # retained so code that still refers to input_ids keeps working

# Make a prediction on the input sentence (input_ids and attention_mask together)
prediction = loaded_model.predict(dict(inputs))

# Convert the logits to probabilities; column 1 is the spam class, row 0 is the
# only sentence in the batch.
prob = tf.nn.softmax(prediction.logits, axis=-1)[0][1].numpy() # probability of being spam
print(f"Input: {input_sentence}")
print(f"Predicted probability of being ham: {1 - prob:.4f}")
print(f"Predicted probability of being spam: {prob:.4f}")

Some layers from the model checkpoint at /content/drive/MyDrive/Meta-Learning LLMs with Rozhina/sentiment_model were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/Meta-Learning LLMs with Rozhina/sentiment_model and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use 

Input: be our lucky WINNER today.Dont miss this. call 786012964 
Predicted probability of being ham: 0.0460
Predicted probability of being spam: 0.9540
