## Загрузка датасета

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('spam.csv', encoding='latin-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
messages = list(data['v2'])
labels = list(data['v1'])

In [28]:
y = list(pd.get_dummies(labels, drop_first=True)['spam'])

In [29]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(messages, y, test_size=0.2, random_state=0)

## Подготовка трансформеров

In [30]:
import transformers

In [31]:
import tensorflow as tf

In [32]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [33]:
train_encodings = tokenizer(x_train,
                            truncation=True,
                            padding=True)

val_encodings = tokenizer(x_test,
                            truncation=True,
                            padding=True)

In [34]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
))
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    y_test
))

## Загрузка модели

In [35]:
from transformers import TFDistilBertForSequenceClassification

model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

## Обучение модели

In [37]:
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=5e-5)

model.compile(optimizer=optimizer, loss=model.hf_compute_loss, metrics=['accuracy'])

model.fit(train_dataset.shuffle(100).batch(16),
          epochs=3,
          validation_data=val_dataset.shuffle(100).batch(16))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x174da0a10>

## Оценка результатов

In [38]:
model.evaluate(val_dataset.shuffle(100).batch(16))



[0.042157430201768875, 0.9928250908851624]

In [39]:
y_pred = []
for text in x_test:
    predict_input = tokenizer.encode(text,
                                 truncation=True,
                                 padding=True,
                                 return_tensors="tf")
    
    output = model.predict(predict_input)[0]
    predictions = tf.nn.softmax(output, axis=1).numpy()
    pred = np.argmax(predictions, axis=1)
    y_pred.append(pred)

















In [40]:
from sklearn.metrics import accuracy_score, classification_report

print(accuracy_score(y_pred, y_test))
print(classification_report(y_pred, y_test))

0.9928251121076234
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       953
           1       0.96      0.99      0.98       162

    accuracy                           0.99      1115
   macro avg       0.98      0.99      0.99      1115
weighted avg       0.99      0.99      0.99      1115

