In [5]:
import pandas as pd
from collections import Counter
import unidecode
from nltk.corpus import stopwords
stop_words = set(stopwords.words('russian'))
import string
import re
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from icecream import ic


In [6]:
unidecoded_nospam_words = []
unidecoded_spam_words = []
def unidecoder(phrase):
    unidecoded_phrase = unidecode.unidecode(phrase)
    return unidecoded_phrase 

In [7]:
unidecoded_stop_words = []

for word in stop_words:
    decoded_word = unidecoder(word)
    unidecoded_stop_words.append(decoded_word)

In [8]:
spam_df = pd.read_excel('—Å–ø–∞–º.xlsx', names=['text'])
spam_df.head()
spam_df.drop_duplicates(subset=['text'], inplace=True)
spam_df.shape

(31282, 1)

In [9]:
no_spam_df = pd.read_excel('–Ω–µ_—Å–ø–∞–º.xlsx', names=['id', 'text']).head(40000)
no_spam_df.head()
no_spam_df.drop(no_spam_df.index[0], inplace=True)
no_spam_df.shape


(39999, 2)

In [10]:
def preprocess_text(text):
    # Check if the input is a string, otherwise return an empty list or handle the non-string case
    if not isinstance(text, str):
        return []
    
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)  # Remove punctuation
    
    text = text.lower()  # Convert to lowercase
    words = text.split()  # Split into words
    tokens = [word for word in words if word not in unidecoded_stop_words]  # Remove stopwords
    
    return tokens

In [11]:
def delete_stopwords(text):

    tokens = [word for word in text if word not in unidecoded_stop_words]  # Remove stopwords
    
    return tokens

In [12]:
no_spam_words = no_spam_df['text'].apply(preprocess_text).sum()


In [13]:
spam_words = spam_df['text'].apply(preprocess_text).sum()



In [14]:
for word in no_spam_words:
    decoded_word = unidecoder(word)
    unidecoded_nospam_words.append(decoded_word)
    
for word in spam_words:
    decoded_word = unidecoder(word)
    unidecoded_spam_words.append(decoded_word)

In [15]:
cleaned_nospam = delete_stopwords(unidecoded_nospam_words)
cleaned_spam = delete_stopwords(unidecoded_spam_words)

In [16]:
unidecoded_nospam_words = [word for word in cleaned_nospam if word not in ['', '-', 'A', 'V', '!!!', '{"type":', 'Vy', 'I', '.',
                                                                                    'g', 'let', 'd', 'p', 'rf', 'te', 'n', 't',  'text', 'type',
                                                                                    
                                                                                    'S', 'Da', '2.', '3.', '4.', '5.', 'eto', 'ul']]

unidecoded_spam_words = [word for word in cleaned_spam if word not in ['', 'you', 'your', 'a', '-', '--' 'A', 'for', 'c', 'to','is',
'and',  'ot', 'k', 'Rs', 'of', 'Withdrawal', 'the',   'withdrawal',    'day', 'in', 'me', 'get', 'this', 'it', 'have', 'can', 'if','he', 'want' , '!!', 'that', 'who', "ect'", 'always'
'ne','trx', 'b', 't', 'i', 'A', 'za', 'all', 'us', '+', 'do', 'p', 'y', 'u', 'C','[A][A][A][A]', '|#|#|#|#|#|#||#||#|#|#|', '|RECHARgE'
'r', 'R', 'V', '!!!', '{"type":', 'Vy', 'I', '.', 'S', 'Da', '2.', '3.', '4.', '5.', 'eto', 'ul', 'na', 'text', 'type']]

In [17]:
unidecodedf_nospam_df = pd.DataFrame(unidecoded_nospam_words, columns=['unidecoded_text'])
unidecodedf_nospam_df = unidecodedf_nospam_df[unidecodedf_nospam_df['unidecoded_text'].apply(lambda x: len(x) >= 2)]
unidecodedf_nospam_df['label'] = 0

In [18]:

unidecodedf_spam_df = pd.DataFrame(unidecoded_spam_words, columns=['unidecoded_text'])
unidecodedf_spam_df = unidecodedf_spam_df[unidecodedf_spam_df['unidecoded_text'].apply(lambda x: len(x) >= 2)]
unidecodedf_spam_df['label'] = 1

In [35]:
unidecodedf_spam_df

Unnamed: 0,unidecoded_text,label
0,mshllh,1
1,ylts,1
2,Ht~,1
3,msHbt,1
4,atom,1
...,...,...
1061162,sergeycrytit,1
1061163,nuzhny,1
1061164,liudi,1
1061165,podrabotku,1


In [36]:
unidecodedf_nospam_df

Unnamed: 0,unidecoded_text,label
0,dobro,0
1,pozhalovat',0
2,dobryi,0
3,put',0
4,udachi,0
...,...,...
986537,vydaiut,0
986538,nikomu,0
986539,osobenno,0
986540,tekh,0


In [20]:
df = pd.concat([unidecodedf_nospam_df, unidecodedf_spam_df])

In [21]:
df.head()

Unnamed: 0,unidecoded_text,label
0,dobro,0
1,pozhalovat',0
2,dobryi,0
3,put',0
4,udachi,0


In [22]:

vectorizer = TfidfVectorizer(use_idf = True, ngram_range = (1, 3))
X = vectorizer.fit_transform(df['unidecoded_text'])
y = df['label']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

In [34]:
y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)*100
ic(accuracy)

ic| accuracy: 87.40375609996549


87.40375609996549

In [48]:
my_phrase = """
–°–ø—Ä–æ—Å–∏ –£ –ö–∞—Ç—é–∫–æ–≤–∞ –µ—Å—Ç—å –ª–∏ —É –Ω–µ–≥–æ –≤–æ–¥–∞üòÇ–∫–æ–Ω–µ—á–Ω–æ –µ—Å—Ç—åüòÇ
 
 """
# 1. –ü—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫–∞ —Ñ—Ä–∞–∑—ã
my_phrase_processed = preprocess_text(my_phrase)


# 2. –ü—Ä–µ–æ–±—Ä–∞–∑—É–µ–º —Å–ø–∏—Å–æ–∫ —Å–ª–æ–≤ –æ–±—Ä–∞—Ç–Ω–æ –≤ —Å—Ç—Ä–æ–∫—É
my_phrase_joined = ' '.join(my_phrase_processed)

# 3. –ü—Ä–∏–º–µ–Ω—è–µ–º unidecoder
my_phrase_unidecoded = unidecoder(my_phrase_joined)
ic(my_phrase_unidecoded)

# 4. –í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è —Ñ—Ä–∞–∑—ã
my_phrase_vect = vectorizer.transform([my_phrase_unidecoded])
ic(my_phrase_vect)

# 5. –ü—Ä–æ–≥–Ω–æ–∑–∏—Ä–æ–≤–∞–Ω–∏–µ
prediction = nb_classifier.predict(my_phrase_vect)
ic(prediction)

# 6. –†–µ–∑—É–ª—å—Ç–∞—Ç
if prediction[0] == 1:
    print("–°–ø–∞–º")
else:
    print("–ù–µ —Å–ø–∞–º")

ic| my_phrase_unidecoded: "sprosi u katiukova est' li u nego vodakonechno est'"
ic| my_phrase_vect: <1x198832 sparse matrix of type '<class 'numpy.float64'>'
                    	with 5 stored elements in Compressed Sparse Row format>
ic| prediction: array([1])


–°–ø–∞–º


In [27]:
spam_analyse = unidecodedf_spam_df.groupby('unidecoded_text').agg({'label': 'count'}).sort_values(by='label', ascending=False)

In [28]:
len(spam_analyse)

135036

In [4]:
spam_analyse2 = unidecodedf_nospam_df.groupby('unidecoded_text').agg({'label': 'count'}).sort_values(by='label', ascending=False)

NameError: name 'unidecodedf_nospam_df' is not defined

In [66]:
len(spam_analyse2)

91817

In [26]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import LabelBinarizer

# –ü—Ä–µ–æ–±—Ä–∞–∑—É–µ–º –º–µ—Ç–∫–∏ –≤ –±–∏–Ω–∞—Ä–Ω—ã–π —Ñ–æ—Ä–º–∞—Ç, –µ—Å–ª–∏ –Ω—É–∂–Ω–æ
encoder = LabelBinarizer()
y_train_nn = encoder.fit_transform(y_train)
y_test_nn = encoder.transform(y_test)

# –ü–æ—Å—Ç—Ä–æ–∏–º –ø—Ä–æ—Å—Ç—É—é –Ω–µ–π—Ä–æ—Å–µ—Ç—å
model = Sequential()
model.add(Dense(512, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# –ö–æ–º–ø–∏–ª—è—Ü–∏—è –º–æ–¥–µ–ª–∏
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# –û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏
model.fit(X_train.toarray(), y_train_nn, epochs=10, batch_size=32, validation_data=(X_test.toarray(), y_test_nn))

# –û—Ü–µ–Ω–∫–∞ –º–æ–¥–µ–ª–∏
score = model.evaluate(X_test.toarray(), y_test_nn)
print(f'Accuracy: {score[1] * 100:.2f}%')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


: 