In [27]:
from src.data.consts import FILE_PATH
import pandas as pd
import zipfile


# Read the tab-separated corpus straight out of the archive, without extracting it to disk.
with zipfile.ZipFile("../data/raw/filtered_paranmt.zip", "r") as archive, \
        archive.open("filtered.tsv") as tsv_file:
    df = pd.read_csv(tsv_file, sep='\t')

In [28]:
# Work on an independent copy holding only the sentence and its toxicity score,
# truncated to the first 20000 rows.
columns_of_interest = ['reference', 'ref_tox']
tox_df = df[columns_of_interest].copy()[:20000]

In [29]:
# Display the first rows of the truncated frame as a quick sanity check.
tox_df.head()

Unnamed: 0,reference,ref_tox
0,"If Alkar is flooding her with psychic waste, t...",0.014195
1,Now you're getting nasty.,0.065473
2,"Well, we could spare your life, for one.",0.213313
3,"Ah! Monkey, you've got to snap out of it.",0.053362
4,I've got orders to put her down.,0.009402


In [30]:
from sklearn.model_selection import train_test_split

# X holds the sentences (texts), y the matching toxicity levels.
X, y = tox_df['reference'], tox_df['ref_tox']

# Hold out a quarter of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


In [31]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stemmer = PorterStemmer()

# Load the stop-word list once and keep it as a set: calling stopwords.words()
# for every token repeats the lookup and then does a linear scan of the list.
_ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))


def preprocess_text(text):
    """Normalise a sentence before vectorisation.

    The text is lower-cased, tokenised with NLTK's ``word_tokenize``, stripped
    of English stop words, and each remaining token is Porter-stemmed.

    Args:
        text: The raw sentence as a ``str``.

    Returns:
        The stemmed, non-stop-word tokens joined by single spaces.
    """
    tokens = word_tokenize(text.lower())
    # Filter and stem in one pass; stop words are matched before stemming.
    return " ".join(
        stemmer.stem(token) for token in tokens if token not in _ENGLISH_STOPWORDS
    )

In [32]:
# Run the same text normalisation over both splits, keeping the results as plain lists.
X_train_preprocessed = list(map(preprocess_text, X_train))
X_test_preprocessed = list(map(preprocess_text, X_test))


In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features capped at 5000 vocabulary terms. The vocabulary and IDF weights
# are fitted on the training split only and then reused for the test split.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_preprocessed).astype('float32')
# Cast the test matrix as well, so both splits share one dtype and the dense
# copies made later are half the size of float64.
X_test_tfidf = tfidf_vectorizer.transform(X_test_preprocessed).astype('float32')



In [34]:
import tensorflow as tf
# The width of the input layer equals the number of TF-IDF columns.
n_features = X_train_tfidf.shape[1]

# Feed-forward network: two ReLU hidden layers (128 and 64 units) followed by a
# single sigmoid unit, so the output lies in (0, 1).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(128, activation='relu', input_shape=(n_features,)))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))


In [35]:
# The targets are continuous toxicity scores, not 0/1 labels. Binary cross-entropy
# still works as a loss with such soft targets, but Keras' 'accuracy' compares
# y_true for exact equality with the thresholded prediction, which is meaningless
# here. Track mean absolute error instead.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['mae'])


In [36]:
def _as_dataset(sparse_features, targets):
    """Densify a sparse feature matrix and pair each row with its target in a tf.data.Dataset."""
    return tf.data.Dataset.from_tensor_slices((sparse_features.toarray(), targets))


train_dataset = _as_dataset(X_train_tfidf, y_train)
test_dataset = _as_dataset(X_test_tfidf, y_test)


In [37]:
batch_size = 32
epochs = 20

# Model.fit ignores its `shuffle` argument for tf.data inputs, so without an
# explicit shuffle every epoch would see the same batches in the same order.
# A buffer as large as the training set gives a full reshuffle each epoch.
shuffled_train = train_dataset.shuffle(buffer_size=len(y_train), seed=42)

# NOTE(review): the test split is also used as validation data here, so the
# later evaluation is not on data that was unseen during training monitoring.
# Keep the History object so the per-epoch metrics can be inspected afterwards.
history = model.fit(
    shuffled_train.batch(batch_size),
    epochs=epochs,
    validation_data=test_dataset.batch(batch_size),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x21ad0662760>

In [38]:
# Score the held-out sentences; the sparse TF-IDF matrix is densified before being passed in.
y_pred = model.predict(x=X_test_tfidf.toarray())



In [39]:
from sklearn.metrics import accuracy_score

# Turn the true test scores and the predicted scores into binary labels using
# the same cut-off, so they can be compared as classes.
TOXICITY_THRESHOLD = 0.5
y_test_binary = (y_test > TOXICITY_THRESHOLD).astype(int)
y_pred_binary = (y_pred > TOXICITY_THRESHOLD).astype(int)

# Evaluate the accuracy of the thresholded predictions.
accuracy = accuracy_score(y_true=y_test_binary, y_pred=y_pred_binary)
print(f'Accuracy: {accuracy:.2f}')


Accuracy: 0.70


In [40]:
# Export the trained model to a file in the current working directory.
# NOTE(review): the cell output shows a message raised from saving_api.save_model,
# presumably the legacy-HDF5 format warning; consider the native `.keras`
# format. TODO confirm.
# NOTE(review): no visible cell persists the fitted tfidf_vectorizer, which is
# needed to build inputs for this model. Verify that it is saved elsewhere.
model.save('model.h5')



  saving_api.save_model(
