In [35]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GlobalMaxPooling1D
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score


In [15]:
# Download latest version
path = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\HONOR\.cache\kagglehub\datasets\uciml\sms-spam-collection-dataset\versions\1


In [16]:

data = pd.read_csv('spam.csv', encoding='ISO-8859-1')
data = data[['v1', 'v2']] # Оставляем только два столбца: 'v1' (метка класса) и 'v2' (сообщение)
print(data.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None


In [17]:
#Беру из датасета первые 5560 строк, чтобы обучить модель (данные будут использоваться для обучения 
# и тестирования 75% на 25%), остальные строки датасета оставлю для наглядного тестирования

# Выделяем первые 5560 строк для обучения и тестирования модели
train_test_data = data.iloc[:5560]

In [20]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)

# Определяем признаки (X) и метки классов (y)
X = train_test_data['v2'].values.reshape(-1, 1) # Преобразуем столбец сообщений в форму массива с одним признаком
y = train_test_data['v1']      # Метки классов (spam или ham)                   

# Применяем oversampling, чтобы сбалансировать классы
X_resampled, y_resampled = ros.fit_resample(X, y)

# Создаем новый DataFrame с сбалансированными данными
balanced_data = pd.DataFrame(X_resampled, columns=['v2'])
balanced_data['v1'] = y_resampled

print("Class counts after oversampling:\n", balanced_data['v1'].value_counts())

Class counts after oversampling:
 v1
ham     4815
spam    4815
Name: count, dtype: int64


In [21]:
# Преобразуем столбец меток классов ('v1') в строковый формат, чтобы избежать ошибок при дальнейших преобразованиях
train_test_data['v1'] = train_test_data['v1'].astype(str)
# Заменяем текстовые метки классов: 'ham' (не спам) преобразуется в 0, а 'spam' в 1
train_test_data['v1'] = train_test_data['v1'].map({'ham': 0, 'spam': 1})
# Удаляем строки, где в столбцах 'v1' или 'v2' имеются пропущенные значения
train_test_data = train_test_data.dropna(subset=['v1', 'v2'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_test_data['v1'] = train_test_data['v1'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_test_data['v1'] = train_test_data['v1'].map({'ham': 0, 'spam': 1})


In [36]:
# Преобразуем текстовые данные ('v2') в список строк для дальнейшей обработки
X_text = train_test_data['v2'].values.tolist()
y = train_test_data['v1'].values # Преобразуем метки классов ('v1') в массив

# Разделяем данные на обучающую и тестовую выборки 
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.25, random_state=42
)
# Создаем объект Tokenizer для преобразования текстов в числовые последовательности
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_text)# Строим словарь токенов на обучающей выборке

# Преобразуем тексты в числовые последовательности
X_train_seq = tokenizer.texts_to_sequences(X_train_text)
X_test_seq = tokenizer.texts_to_sequences(X_test_text)

max_length = 200
# Применяем дополнение (padding) последовательностей до заданной длины
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

# Создаем модель нейронной сети
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_length))

model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(LSTM(32, return_sequences=True))
model.add(Dropout(0.5))
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Обучаем модель на обучающей выборке с разделением на валидационную выборку 
history = model.fit(
    X_train_padded,
    y_train,
    validation_split=0.25,
    epochs=10, 
    batch_size=512
)

# Получаем предсказания на тестовой выборке
predictions = model.predict(X_test_padded)

# Преобразуем вероятности в бинарные метки (0 или 1) с порогом 0.5
predicted_labels = (predictions > 0.5).astype(int)

print("Accuracy:", accuracy_score(y_test, predicted_labels))
print("F1-Score:", f1_score(y_test, predicted_labels))
print("ROC-AUC:", roc_auc_score(y_test, predictions))



Epoch 1/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 3s/step - accuracy: 0.5984 - loss: 0.5930 - val_accuracy: 0.8686 - val_loss: 0.4346
Epoch 2/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2s/step - accuracy: 0.8727 - loss: 0.3762 - val_accuracy: 0.8686 - val_loss: 0.4806
Epoch 3/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3s/step - accuracy: 0.8691 - loss: 0.3454 - val_accuracy: 0.8686 - val_loss: 0.4132
Epoch 4/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3s/step - accuracy: 0.8650 - loss: 0.2727 - val_accuracy: 0.9645 - val_loss: 0.3420
Epoch 5/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 2s/step - accuracy: 0.9360 - loss: 0.1885 - val_accuracy: 0.9760 - val_loss: 0.2789
Epoch 6/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2s/step - accuracy: 0.9830 - loss: 0.1158 - val_accuracy: 0.9808 - val_loss: 0.2305
Epoch 7/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

### Чтобы протестировать модель возьму столбцы датасета, которые не были использованы в обучении модели. Выделенные строки относятся к SPAM.

1. No. I meant the calculation is the same. That  &lt;#&gt; units at  &lt;#&gt; . This school is really expensive. Have you started practicing your accent.
2. Because its important. And have you decided if you are doing 4years of dental school or if you'll just do the nmde exam.
3. "Sorry, I'll call later"
4. if you aren't here in the next  &lt;#&gt;  hours imma flip my shit
5. Anything lor. Juz both of us lor.
6. Get me out of this dump heap. My mom decided to come to lowes. BORING.
7. Ok lor... Sony ericsson salesman... I ask shuhui then she say quite gd 2 use so i considering...
8. Ard 6 like dat lor.
9. Why don't you wait 'til at least wednesday to see if you get your .
10. Huh y lei...

**11. "REMINDER FROM O2: To get 2.50 pounds free call credit and details of great offers pls reply 2 this text with your valid name, house no and postcode"**


**12. "This is the 2nd time we have tried 2 contact u. U have won the �750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-national-rate."**

13. Will �_ b going to esplanade fr home?
14. "Pity, * was in mood for that. So...any other suggestions?"
15. The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free
16. Rofl. Its true to its name




In [31]:

# Пример новых данных
new_data = [
    "No. I meant the calculation is the same. That <#> units at <#>. This school is really expensive. Have you started practicing your accent. Because it's important. And have you decided if you are doing 4 years of dental school or if you'll just do the nmde exam.",
    "Because its important. And have you decided if you are doing 4years of dental school or if you'll just do the nmde exam.",
    "Sorry, I'll call later",
    "if you aren't here in the next <#> hours imma flip my shit",
    "Anything lor. Juz both of us lor.",
    "Get me out of this dump heap. My mom decided to come to lowes. BORING.",
    "Ok lor... Sony ericsson salesman... I ask shuhui then she say quite gd 2 use so i considering...",
    "Ard 6 like dat lor.",
    "Why don't you wait 'til at least wednesday to see if you get your .",
    "Huh y lei...",
    "REMINDER FROM O2: To get 2.50 pounds free call credit and details of great offers pls reply 2 this text with your valid name, house no and postcode",
    "This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. To claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-national-rate.",
    "Will �_ b going to esplanade fr home?",
    "'Pity, * was in mood for that. So...any other suggestions?'",
    "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free",
    "Rofl. Its true to its name"
]

# Токенизируем новые данные (используем обученный токенайзер)
new_sequences = tokenizer.texts_to_sequences(new_data)

# Дополняем последовательности до max_length
new_padded = pad_sequences(new_sequences, maxlen=max_length, padding='post')

# Подаем данные в модель для предсказания
predictions = model.predict(new_padded)

# Интерпретируем результаты: если вероятность > 0.5, классифицируем как spam (1), иначе как ham (0)
predicted_labels = (predictions > 0.5).astype(int)

# Объединяем текст с предсказаниями
results = pd.DataFrame({
    "Текст": new_data,
    "Предсказание (Spam=1, Ham=0)": predicted_labels.flatten(),
    "Вероятность (Spam)": predictions.flatten()
})



# Выводим результаты
# Печать всей таблицы как строки

results

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 288ms/step


Unnamed: 0,Текст,"Предсказание (Spam=1, Ham=0)",Вероятность (Spam)
0,No. I meant the calculation is the same. That ...,0,0.141712
1,Because its important. And have you decided if...,0,0.141731
2,"Sorry, I'll call later",0,0.121487
3,if you aren't here in the next <#> hours imma ...,0,0.126106
4,Anything lor. Juz both of us lor.,0,0.119861
5,Get me out of this dump heap. My mom decided t...,0,0.132995
6,Ok lor... Sony ericsson salesman... I ask shuh...,0,0.129522
7,Ard 6 like dat lor.,0,0.122643
8,Why don't you wait 'til at least wednesday to ...,0,0.13986
9,Huh y lei...,0,0.117833


### Из тестирования видно, что модель определила все сообщения, которые пренадлежат к spam и ham