# Read the dataset

In [None]:
# Mount Google Drive, which holds the dataset, into the Colab runtime

from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import tensorflow as tf
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments
import warnings

warnings.filterwarnings('ignore')

In [None]:
# Load the labelled SMS corpus from the mounted Drive folder.
# The sheet's first column has no header (pandas would otherwise surface it as
# "Unnamed: 0") and appears to hold only the saved row number, so it is read
# as the DataFrame index instead of being kept as a data column.
DATASET_PATH = r"/content/drive/MyDrive/Colab Notebooks/Dataset/SMSSpamClass.xlsx"
df = pd.read_excel(DATASET_PATH, index_col=0)
df.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Class balance: number of messages carrying each label
label_counts = df["Label"].value_counts()
label_counts

ham     4825
spam     747
Name: Label, dtype: int64

In [None]:
# Encode the target as a binary flag: 1 where the label is 'spam', 0 otherwise
is_spam = df["Label"] == 'spam'
df["spam"] = is_spam.astype("int64")
df["spam"].value_counts()

0    4825
1     747
Name: spam, dtype: int64

# Dependent and Independent Variables

In [None]:
# Independent variable: the message text; dependent variable: the binary spam flag
X, y = df["Message"], df["spam"]

In [None]:
# Hold out 20% of the messages for evaluation. The labels are imbalanced
# (far more ham than spam), so the split is stratified on y to keep the same
# spam ratio in both partitions; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

# Tokenizing the train and test data

In [None]:
# Load the pretrained tokenizer for the DistilBERT checkpoint
checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
# Cast every message to str and turn each split into a plain Python list
train_set, test_set = (
    messages.astype(str).to_list() for messages in (X_train, X_test)
)

# Tokenize each split separately, with truncation and padding enabled
train_encoding, test_encoding = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (train_set, test_set)
)

In [None]:
# Pair each split's token encodings (converted to a plain dict) with its
# labels and slice them into tf.data datasets of (features, label) examples
train_features = dict(train_encoding)
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, y_train))

test_features = dict(test_encoding)
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, y_test))

In [None]:
# Settings for the fine-tuning run, collected first and then passed on
# to TFTrainingArguments in one go
training_config = dict(
    output_dir='./results',           # where results are written
    num_train_epochs=10,              # total passes over the training data
    per_device_train_batch_size=8,    # training batch size per device
    per_device_eval_batch_size=16,    # evaluation batch size per device
    warmup_steps=500,                 # warmup steps for the learning-rate scheduler
    weight_decay=0.01,                # weight-decay strength
    logging_dir='./logs',             # where logs are written
    logging_steps=10,                 # logging interval, in steps
    eval_steps=10,                    # evaluation interval, in steps
)

training_args = TFTrainingArguments(**training_config)

In [None]:
# Instantiate the pretrained DistilBERT classifier inside the scope of the
# distribution strategy chosen by the training arguments
strategy = training_args.strategy
with strategy.scope():
    model = TFDistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased'
    )


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [None]:
# Load the pretrained checkpoint again (so this cell always trains from
# freshly loaded weights), wrap it in a trainer with both datasets, and run
# the fine-tuning loop
strategy = training_args.strategy
with strategy.scope():
    model = TFDistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased'
    )

trainer = TFTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [None]:
# Compute the evaluation metrics on the held-out dataset
eval_metrics = trainer.evaluate(test_dataset)
eval_metrics

{'eval_loss': 0.034090311186654225}

In [None]:
# trainer.predict returns a (predictions, label_ids, metrics) named tuple.
# Index 1 is label_ids -- the ground-truth labels -- so using it as `pred`
# makes any later comparison against y_test trivially perfect.
# Take the model's per-class scores from `predictions` instead and pick the
# highest-scoring class for every message.
prediction_output = trainer.predict(test_dataset)
pred = prediction_output.predictions.argmax(axis=-1)
pred

array([0, 1, 0, ..., 0, 1, 0])

In [None]:
# Confusion matrix of true test labels against the predicted ones
# (rows: true class, columns: predicted class)
cm = confusion_matrix(y_true=y_test, y_pred=pred)
cm

array([[955,   0],
       [  0, 160]])