In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense,Dropout,Flatten
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import model_selection
import nltk
from nltk.stem.snowball import SnowballStemmer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [None]:
# Load the labelled comments CSV and preview its first ten rows.
data = pd.read_csv(
    filepath_or_buffer='../input/russian-language-toxic-comments/labeled.csv',
)
data.head(n=10)

In [None]:
# NLTK resources used by cleanText: the Russian POS tagger model and the
# stop-word corpus. Without the 'stopwords' download, nltk.corpus.stopwords
# raises LookupError on a machine that does not already have the corpus.
nltk.download('averaged_perceptron_tagger_ru')
nltk.download('stopwords')
# Raw comment strings and the `toxic` column cast to integer labels.
text = np.array(data.comment.values)
target = data.toxic.astype(int).values
def upperCaseRate(string):
    """Return the fraction (0.0-1.0) of characters in `string` that are uppercase.

    Every character counts towards the denominator, including digits,
    whitespace and punctuation. An empty string yields 0.0 rather than NaN.
    """
    if not string:
        # Mean of an empty sequence is undefined; treat "no characters" as "no uppercase".
        return 0.0
    return sum(map(str.isupper, string)) / len(string)
# Uppercase share of every raw comment, in dataset order.
upcaseRate = [upperCaseRate(comment) for comment in data.comment.values]
# Lazily-built resources shared by every cleanText call (stop-word set, stemmer).
_CLEAN_TEXT_RESOURCES = {}

def _cleanTextResources():
    """Return (stopwords, stemmer), building them on first use only."""
    if not _CLEAN_TEXT_RESOURCES:
        # A frozenset gives O(1) membership tests instead of scanning a list per word.
        _CLEAN_TEXT_RESOURCES['stopwords'] = frozenset(nltk.corpus.stopwords.words('russian'))
        _CLEAN_TEXT_RESOURCES['stemmer'] = SnowballStemmer('russian')
    return _CLEAN_TEXT_RESOURCES['stopwords'], _CLEAN_TEXT_RESOURCES['stemmer']

def cleanText(string):
    """Normalise one comment into a space-separated string of word stems.

    Steps: lower-case, strip URLs, fold 'ё' into 'е', keep only runs of
    Cyrillic / basic Latin letters, drop Russian stop words, drop words the
    POS tagger marks as conjunctions or particles, then stem what is left.

    Parameters
    ----------
    string : str
        Raw comment text.

    Returns
    -------
    str
        Stems joined by single spaces; an empty string if no word survives.
    """
    # Common cleaning
    string = string.lower()
    string = re.sub(r"http\S+", "", string)
    # 'ё' lies outside the а-я range matched below, so fold it into 'е' first.
    # (The text is already lower-cased, so only the lower-case form can occur.)
    string = string.replace('ё', 'е')
    words = re.findall('[А-Яа-яA-Za-z]+', string)

    # Word Cleaning
    stopwords, stemmer = _cleanTextResources()
    ## Stop Words
    words = [w for w in words if w not in stopwords]
    ## Cleaning functional POS (Parts of Speech)
    # NLTK's Russian tagger labels particles 'PART'; 'PRCL' is kept so that any
    # tagger emitting that label is still filtered as before.
    functionalPos = {'CONJ', 'PART', 'PRCL'}
    words = [w for w, pos in nltk.pos_tag(words, lang='rus') if pos not in functionalPos]
    ## Stemming
    return ' '.join(stemmer.stem(w) for w in words)

In [None]:
from collections import Counter
# Clean from the raw comments rather than from `text` itself, so that re-running
# this cell never feeds already-stemmed text back through cleanText.
text = [cleanText(comment) for comment in data.comment.values]
def counter_word(t):
    """Count occurrences of each whitespace-separated token across all strings in `t`.

    Returns a Counter mapping token -> number of occurrences.
    """
    return Counter(token for document in t for token in document.split())
# Number of distinct stems across the whole cleaned corpus.
len(counter_word(text))

In [None]:
# Stratified 80/20 split of the cleaned comments; the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    text,
    target,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=target,
)

In [None]:
# Cap the tokenizer vocabulary at the number of distinct stems in the whole corpus.
dictionary_size = len(counter_word(text))
tokenizer = Tokenizer(num_words=dictionary_size)

# Build the word index from the training split only, then encode both splits with it.
tokenizer.fit_on_texts(X_train)
X_train_tokenized_lst, X_test_tokenized_lst = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

In [None]:
# Bring every encoded comment to the same fixed length so each split becomes one 2-D array.
max_comment_length = 300
X_trained, X_tested = (
    pad_sequences(encoded, maxlen=max_comment_length)
    for encoded in (X_train_tokenized_lst, X_test_tokenized_lst)
)

In [None]:
# The embedding table has to cover every index the tokenizer can emit, and the
# input length has to equal the padded comment length. Both are therefore taken
# from the values set in the tokenizing and padding cells rather than being
# recomputed or typed in again here, so the three cells cannot drift apart.
max_features = dictionary_size
embedding_dim = 16
sequence_length = max_comment_length

model = Sequential()
# embedding_dim sets the size of each learned word vector.
model.add(Embedding(max_features, embedding_dim, input_length=sequence_length,
                    embeddings_regularizer=l2(0.005)))
model.add(Dropout(0.4))
# return_sequences=True keeps the LSTM output for every timestep; Flatten then
# collapses those into a single vector per comment for the dense layers.
model.add(LSTM(embedding_dim, dropout=0.2, recurrent_dropout=0.2, return_sequences=True,
               kernel_regularizer=l2(0.005),
               bias_regularizer=l2(0.005)))
model.add(Flatten())
model.add(Dense(512, activation='relu',
                kernel_regularizer=l2(0.001),
                bias_regularizer=l2(0.001)))
model.add(Dropout(0.4))

model.add(Dense(8, activation='relu',
                kernel_regularizer=l2(0.001),
                bias_regularizer=l2(0.001)))
model.add(Dropout(0.4))

# Single sigmoid unit, paired with the binary cross-entropy loss below.
model.add(Dense(1, activation='sigmoid'))

model.summary()
model.compile(loss='binary_crossentropy', optimizer=Adam(1e-3), metrics=['accuracy'])

In [None]:
# NOTE(review): the held-out test split is also passed as validation data, so the
# per-epoch validation metrics are not an independent estimate — confirm this is intended.
epochs = 100

history = model.fit(
    X_trained,
    y_train,
    batch_size=2048,
    epochs=epochs,
    validation_data=(X_tested, y_test),
)

In [None]:
# Model outputs for the test split; show the first ten for a quick sanity check.
predictions = model.predict(x=X_tested, verbose=1)
print(predictions[0:10])

In [None]:
# True labels for the same first ten test rows, for comparison with the predictions above.
y_test[:10]

In [None]:
# Accuracy is the share of test comments whose thresholded prediction matches the
# true label. Dividing the count of predicted positives by the count of actual
# positives is not accuracy and can exceed 100.
predicted_labels = (predictions.flatten() > 0.5).astype(int)
accuracy_percent = np.mean(predicted_labels == y_test) * 100
print(f'Total accuracy  {accuracy_percent}')