## Classification Problem: Spam Classification

In [1]:
import pandas as pd
import numpy as np
from transformers import pipeline, DistilBertTokenizerFast, TFDistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [2]:
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'vocab_layer_norm', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_19']
You should probably TRAIN this model on a down-stream task to be able to use i

In [3]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [4]:
mydata = pd.read_csv('spam.csv')[['v1','v2']]

In [5]:
mydata['target'] = np.where(mydata['v1']=='ham',0,1)

In [6]:
mydata.drop(columns=['v1'],inplace=True)

In [7]:
mydata.head()

Unnamed: 0,v2,target
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [8]:
trainX,testX, trainY, testY = train_test_split(mydata['v2'],mydata['target'],stratify=mydata['target'],test_size=.3)

In [9]:
trainX.reset_index(inplace=True,drop=True)
testX.reset_index(inplace=True,drop=True)
trainY.reset_index(inplace=True,drop=True)
testY.reset_index(inplace=True,drop=True)

In [38]:
trainX_encoded = tokenizer(trainX.to_list(),padding=True,truncation=True)
testX_encoded = tokenizer(testX.to_list(),padding=True,truncation=True)

In [24]:
trainX_encoded[0]

Encoding(num_tokens=186, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [11]:
trainY[trainY==1].head(1)

7    1
Name: target, dtype: int64

In [12]:
trainX_encoded['input_ids'][10]

[101,
 1045,
 14145,
 2080,
 26947,
 1012,
 1012,
 1012,
 2066,
 24654,
 5292,
 2546,
 1012,
 1012,
 1012,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [13]:
train_dataset = tf.data.Dataset.from_tensor_slices((dict(trainX_encoded),trainY))

In [14]:
## as_numpy_iterator = > Returns an iterator which converts all elements of the dataset to numpy.

list(train_dataset.as_numpy_iterator())[10:11]

[({'input_ids': array([  101,  1045, 14145,  2080, 26947,  1012,  1012,  1012,  2066,
          24654,  5292,  2546,  1012,  1012,  1012,   102,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0, 

In [15]:
test_dataset = tf.data.Dataset.from_tensor_slices((dict(testX_encoded),testY))

In [21]:
val_dataset = tf.data.Dataset.from_tensor_slices((dict(testX_encoded)))

In [18]:
count = 0
for element in train_dataset.shuffle(1000).batch(10):
    count = count+1

In [19]:
count

390

In [20]:
trainX.shape

(3900,)

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, 
              loss=model.compute_loss) # can also use any keras loss function


In [23]:
## When you use batch with the dataset, you don't need to define the batch_size in the fit method
## source: https://stackoverflow.com/questions/62670041/batch-size-in-tf-model-fit-vs-batch-size-in-tf-data-dataset

model.fit(train_dataset.shuffle(1000).batch(64), epochs=1)



<keras.callbacks.History at 0x7ff7294be590>

In [93]:
model.layers

[<transformers.models.distilbert.modeling_tf_distilbert.TFDistilBertMainLayer at 0x7ff738aafe50>,
 <keras.layers.core.Dense at 0x7ff738f3cf90>,
 <keras.layers.core.Dense at 0x7ff738f642d0>,
 <keras.layers.core.Dropout at 0x7ff738f64690>]

In [74]:

raw_pred = model.predict(testX_encoded['input_ids'])

In [87]:
pred_proba = tf.math.softmax(raw_pred[0], axis=-1).numpy()

In [88]:
list(pred_proba[0:10][:,1])

[0.79919356,
 0.6529633,
 0.6586566,
 0.87410295,
 0.6491943,
 0.65674955,
 0.64292157,
 0.6555667,
 0.6694563,
 0.8525124]

In [77]:
testY[0:10]

0    1
1    0
2    0
3    1
4    0
5    0
6    1
7    0
8    0
9    1
Name: target, dtype: int64

In [83]:
from sklearn.metrics import roc_auc_score

In [89]:
roc_auc_score(testY,pred_proba[:,1])

0.9899121941594318

### Next Tasks
* class weight
* add final layer to get the probabilities
* change metrics to auc
* freeze few layers and train the other