---
title: ""
author: "Yurkov Sergey"
date: "2024-01-01"
categories: [python]
format: html
draft: true
---


In [1]:
%%capture

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

print(tf.__version__)


In [2]:
# !unzip archive.zip -n

# import zipfile

# with zipfile.ZipFile("archive.zip", mode="r") as archive:
#     for file in archive.namelist():
#         archive.extractall("archive/")

In [3]:
df = pd.read_csv("archive/labeled.csv", dtype={"toxic": np.int8})

df


Unnamed: 0,comment,toxic
0,"Верблюдов-то за что? Дебилы, бл...\n",1
1,"Хохлы, это отдушина затюканого россиянина, мол...",1
2,Собаке - собачья смерть\n,1
3,"Страницу обнови, дебил. Это тоже не оскорблени...",1
4,"тебя не убедил 6-страничный пдф в том, что Скр...",1
...,...,...
14407,Вонючий совковый скот прибежал и ноет. А вот и...,1
14408,А кого любить? Гоблина тупорылого что-ли? Или ...,1
14409,"Посмотрел Утомленных солнцем 2. И оказалось, ч...",0
14410,КРЫМОТРЕД НАРУШАЕТ ПРАВИЛА РАЗДЕЛА Т.К В НЕМ Н...,1


In [4]:
# Create data sets for defaults and non-defaults
nondefaults = df[df["toxic"] == 0]
defaults = df[df["toxic"] == 1]

# Undersample the non-defaults
nondefaults_under = nondefaults.sample(len(defaults))

# Concatenate the undersampled nondefaults with defaults
df_balanced = pd.concat(
    [nondefaults_under.reset_index(drop=True), defaults.reset_index(drop=True)], axis=0
)

# Print the value counts for loan status
print(df_balanced["toxic"].value_counts())


toxic
0    4826
1    4826
Name: count, dtype: int64


In [5]:
df_balanced.sample(5)

Unnamed: 0,comment,toxic
4317,Какие интересные крайности\n,0
3333,А-А-А-А-А-А--А-А!!!!!!!\n,1
3336,"Стекло разбить. Зная наших мусоров, они даже н...",1
2571,Это неизлечимо. К старости плотность нейронов ...,1
2581,"Все конечно правильно, но... За 2 минуты люди ...",0


In [6]:
from sklearn.model_selection import train_test_split

X = df_balanced["comment"]
y = df_balanced["toxic"]

y

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False, random_state=0 # shuffle=True
)

y_train.value_counts()


toxic
0    4826
1    2895
Name: count, dtype: int64

In [7]:
from keras_preprocessing import text
from keras_preprocessing import sequence

In [8]:
import string

# from keras.preprocessing.sequence import pad_sequences
from natasha import Doc, MorphVocab, NewsEmbedding, NewsMorphTagger, Segmenter

segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)

print(morph_vocab)

max_features = 20_000


class RuTokenizer(text.Tokenizer):
    def __init__(self):
        super().__init__(self)

        self.num_words = max_features

    def tokenize(self, text):
        doc = Doc(text)
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)
        for token in doc.tokens:
            token.lemmatize(morph_vocab)
        # tokens = [_.lemma for _ in doc.tokens if _.text not in string.punctuation]
        tokens = [
            _.lemma.lower()
            for _ in doc.tokens
            if _.text not in string.punctuation
            and len(_.text) > 1
            and not _.text.isnumeric()
        ]

        return tokens


tokenizer = RuTokenizer()
# df_balanced["tokens"] = df_balanced["comment"].apply(
#     lambda x: repr(tokenizer.tokenize(x))
# )
# df_balanced["tokens"]
maxlen = 300

tokenizer.fit_on_texts(X_train)
sequences = tokenizer.texts_to_sequences(X_train)
X_train = sequence.pad_sequences(sequences, maxlen=maxlen)

sequences = tokenizer.texts_to_sequences(X_test)
X_test = sequence.pad_sequences(sequences, maxlen=maxlen)

model = keras.models.Sequential(
    [
        keras.layers.Embedding(max_features + 1, 128, input_length=maxlen),
        keras.layers.Bidirectional(
            keras.layers.LSTM(
                64, dropout=0.2, recurrent_dropout=0.2, return_sequences=True
            )
        ),
        keras.layers.Bidirectional(
            keras.layers.LSTM(
                64,
                dropout=0.2,
                recurrent_dropout=0.2,
            )
        ),
        keras.layers.Dense(1, activation="sigmoid"),
    ]
)


model.summary()


MorphVocab()
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 128)          2560128   
                                                                 
 bidirectional (Bidirectiona  (None, 300, 128)         98816     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 2,757,889
Trainable params: 2,757,889
Non-trainable params: 0
_________________________________________________________________


In [9]:
# import keras_tuner

In [10]:
callback = keras.callbacks.EarlyStopping()

In [11]:
model.compile(keras.optimizers.Adam(0.01), "binary_crossentropy", metrics=["accuracy"])
history = model.fit(X_train, y_train, batch_size=32, epochs=2, validation_data=(X_test, y_test), callbacks=[callback])

len(history.history['loss'])

Epoch 1/2

KeyboardInterrupt: 

In [None]:
sequences = tokenizer.texts_to_sequences(["Я хочу есть", "Я люблю тебя", "на работе был полный пиддес :| и так каждое за..."])
test = sequence.pad_sequences(sequences, maxlen=maxlen)

output = model.predict(test)

print(output.flatten())


[0.5422895  0.98179567 0.01973787]


In [None]:
print(sequences)

In [None]:
model.save("model.keras")


In [None]:
loaded_model = keras.models.load_model("model.keras")

In [None]:
sequences = tokenizer.texts_to_sequences(["Я хочу есть", "Я люблю тебя", "на работе был полный пиддес :| и так каждое за..."])
test = sequence.pad_sequences(sequences, maxlen=maxlen)

output = loaded_model.predict(test)

print(output.flatten())


[0.5422895  0.98179567 0.01973787]
