In [1]:
import pandas as pd
from collections import Counter
import unidecode
from nltk.corpus import stopwords
stop_words = set(stopwords.words('russian'))
import string
import re
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from icecream import ic


In [2]:
spam_df = pd.read_excel('спам.xlsx', names=['text'])
spam_df.head()
spam_df.drop_duplicates(subset=['text'], inplace=True)
spam_df.shape

(11573, 1)

In [3]:
no_spam_df = pd.read_excel('не_спам.xlsx', names=['id', 'text'])
no_spam_df = no_spam_df.head(30000)
no_spam_df.drop(no_spam_df.index[0], inplace=True)

In [4]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    text = re.sub(r'\d+', '', text)
    words = text.split()
    tokens = [word for word in words if word not in stop_words]
    
    
    return tokens

no_spam_words = no_spam_df['text'].apply(preprocess_text).sum()
spam_words = spam_df['text'].apply(preprocess_text).sum()

In [5]:
unidecoded_nospam_words = []
unidecoded_spam_words = []
def unidecoder(phrase):
    unidecoded_phrase = unidecode.unidecode(phrase)
    return unidecoded_phrase 

In [6]:
for word in no_spam_words:
    decoded_word = unidecoder(word)
    unidecoded_nospam_words.append(decoded_word)
    
for word in spam_words:
    decoded_word = unidecoder(word)
    unidecoded_spam_words.append(decoded_word)

In [7]:
unidecoded_nospam_words = [word for word in unidecoded_nospam_words if word not in ['', '-', 'A', 'V', '!!!', '{"type":', 'Vy', 'I', '.',
                                                                                    'g', 'let', 'd', 'p', 'rf', 'te', 'n', 't',  'text', 'type',
                                                                                    
                                                                                    'S', 'Da', '2.', '3.', '4.', '5.', 'eto', 'ul']]

unidecoded_spam_words = [word for word in unidecoded_spam_words if word not in ['', 'you', 'your', 'a', '-', '--' 'A', 'for', 'c', 'to','is',
'and',  'ot', 'k', 'Rs', 'of', 'Withdrawal', 'the',   'withdrawal',    'day',
'ne','trx', 'b', 't', 'i', 'A', 'za', 'all', 'us', '+', 'do', 'p', 'y', 'u', 'C','[A][A][A][A]',
'r', 'R', 'V', '!!!', '{"type":', 'Vy', 'I', '.', 'S', 'Da', '2.', '3.', '4.', '5.', 'eto', 'ul', 'na', 'text', 'type']]

In [8]:
unidecodedf_nospam_df = pd.DataFrame(unidecoded_nospam_words, columns=['unidecoded_text'])
unidecodedf_nospam_df = unidecodedf_nospam_df[unidecodedf_nospam_df['unidecoded_text'].apply(lambda x: len(x) >= 2)]
unidecodedf_nospam_df['label'] = 0

In [10]:

unidecodedf_spam_df = pd.DataFrame(unidecoded_spam_words, columns=['unidecoded_text'])
unidecodedf_spam_df = unidecodedf_spam_df[unidecodedf_spam_df['unidecoded_text'].apply(lambda x: len(x) >= 2)]
unidecodedf_spam_df['label'] = 1

In [11]:
df = pd.concat([unidecodedf_nospam_df, unidecodedf_spam_df])

In [12]:
df.head()

Unnamed: 0,unidecoded_text,label
0,dobro,0
1,pozhalovat',0
2,dobryi,0
3,put',0
4,udachi,0


In [13]:

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['unidecoded_text'])
ic(X)
y = df['label']
ic(y)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

In [16]:
y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)*100
ic(accuracy)

ic| accuracy: 90.70363293734943


90.70363293734943

In [47]:
my_phrase = """
красноярские региональные новости

"""
# 1. Предобработка фразы
my_phrase_processed = preprocess_text(my_phrase)


# 2. Преобразуем список слов обратно в строку
my_phrase_joined = ' '.join(my_phrase_processed)

# 3. Применяем unidecoder
my_phrase_unidecoded = unidecoder(my_phrase_joined)
ic(my_phrase_unidecoded)

# 4. Векторизация фразы
my_phrase_vect = vectorizer.transform([my_phrase_unidecoded])
ic(my_phrase_vect)

# 5. Прогнозирование
prediction = nb_classifier.predict(my_phrase_vect)
ic(prediction)

# 6. Результат
if prediction[0] == 1:
    print("Спам")
else:
    print("Не спам")

ic| my_phrase_unidecoded: "krasnoiarskie regional'nye novosti"
ic| my_phrase_vect: <Compressed Sparse Row sparse matrix of dtype 'float64'
                    	with 4 stored elements and shape (1, 109934)>
ic| prediction: array([0])


Не спам


In [48]:
spam_analyse = unidecodedf_spam_df.groupby('unidecoded_text').agg({'label': 'count'}).sort_values(by='label', ascending=False)

In [None]:
spam_analyse

In [49]:
spam_analyse2 = unidecodedf_nospam_df.groupby('unidecoded_text').agg({'label': 'count'}).sort_values(by='label', ascending=False)

In [None]:
spam_analyse2