In [1]:
# Импорт необходимых модулей 
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Настройки для визуализации
# Если используется темная тема - лучше текст сделать белым
TEXT_COLOR = 'black'

matplotlib.rcParams['figure.figsize'] = (15, 10)
matplotlib.rcParams['text.color'] = 'black'
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['axes.labelcolor'] = TEXT_COLOR
matplotlib.rcParams['xtick.color'] = TEXT_COLOR
matplotlib.rcParams['ytick.color'] = TEXT_COLOR

# Зафиксируем состояние случайных чисел
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [2]:
from sklearn.datasets import fetch_20newsgroups

# В функции загрузки уже есть разделение на обучение/тест
#   воспользуемся этим на момент подготовки модели
# Для анализа лучше посмотреть на все данные
newsgroups_data = fetch_20newsgroups(subset='all', random_state=RANDOM_STATE)

In [3]:
print(newsgroups_data.keys())

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])


In [8]:
# Посмотрим, какой у данных тип
data = newsgroups_data['data']
targets = newsgroups_data['target']
target_names = newsgroups_data['target_names']

print(f"Data type:\t{type(data)}\n")
print(f"Target names:\n{target_names}\n")
print(f"Target data:\n{targets[:10]}")

Data type:	<class 'list'>

Target names:
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

Target data:
[10  3 17  3  4 12  4 10 10 19]


In [9]:
# Здесь специально сделана некоторая предобработка, 
#   которая обычно делается в рамках подготовки
texts_dataset = [
    "пирожок это лишь пирожок",
    "пирог не кушать пирожок можно",
    "сегодня ходил кино поел пирог"
]

corpus = set()
# Для начала составим словарь
for text in texts_dataset:
    tokens = text.split(' ')
    corpus.update(tokens)

corpus = list(corpus)
print(f'Corpus: {corpus}')

Corpus: ['пирожок', 'пирог', 'сегодня', 'можно', 'кушать', 'ходил', 'поел', 'кино', 'лишь', 'не', 'это']


In [10]:
# После составления корпуса мы можем составить матрицу попаданий
samples_count = len(texts_dataset)
corpus_len = len(corpus)
X_data = np.zeros((samples_count, corpus_len), dtype=int)

for i_sample, text in enumerate(texts_dataset):
    tokens = text.split(' ')
    for token in tokens:
        token_index = corpus.index(token)
        X_data[i_sample, token_index] += 1

# Для лучшего представления составим DataFrame
X_df = pd.DataFrame(X_data, columns=corpus)
X_df['_texts'] = texts_dataset

X_df

Unnamed: 0,пирожок,пирог,сегодня,можно,кушать,ходил,поел,кино,лишь,не,это,_texts
0,2,0,0,0,0,0,0,0,1,0,1,пирожок это лишь пирожок
1,1,1,0,1,1,0,0,0,0,1,0,пирог не кушать пирожок можно
2,0,1,1,0,0,1,1,1,0,0,0,сегодня ходил кино поел пирог


In [11]:
row_sums = X_data.sum(axis=1)
X_data_norm = X_data/row_sums[:,None]

X_df = pd.DataFrame(X_data_norm, columns=corpus)
X_df['_texts'] = texts_dataset

X_df

Unnamed: 0,пирожок,пирог,сегодня,можно,кушать,ходил,поел,кино,лишь,не,это,_texts
0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.25,пирожок это лишь пирожок
1,0.2,0.2,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.2,0.0,пирог не кушать пирожок можно
2,0.0,0.2,0.2,0.0,0.0,0.2,0.2,0.2,0.0,0.0,0.0,сегодня ходил кино поел пирог


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    # Ограничение максимального кол-ва признаков (размера выходного вектора)
    #   None -> не ограничено, вычисляется из данных
    max_features=None,
)

X_data = [
    "Пирожок - это лишь пирожок!",
    "Пирог не кушать, пирожок - можно.",
    "Я сегодня ходил в кино и поел пирог!"
]

X_data_vec = vectorizer.fit_transform(X_data)
# Отобразим векторизированное представление (кол-во данных, кол-во фич)
print(X_data_vec.shape)

(3, 11)


In [13]:
corpus = vectorizer.get_feature_names()
corpus

['кино',
 'кушать',
 'лишь',
 'можно',
 'не',
 'пирог',
 'пирожок',
 'поел',
 'сегодня',
 'ходил',
 'это']

In [14]:
dict(zip(corpus, vectorizer.idf_))

{'кино': 1.6931471805599454,
 'кушать': 1.6931471805599454,
 'лишь': 1.6931471805599454,
 'можно': 1.6931471805599454,
 'не': 1.6931471805599454,
 'пирог': 1.2876820724517808,
 'пирожок': 1.2876820724517808,
 'поел': 1.6931471805599454,
 'сегодня': 1.6931471805599454,
 'ходил': 1.6931471805599454,
 'это': 1.6931471805599454}

In [15]:
df = pd.DataFrame(X_data_vec.todense(), columns = corpus)
df['_texts'] = X_data

df

Unnamed: 0,кино,кушать,лишь,можно,не,пирог,пирожок,поел,сегодня,ходил,это,_texts
0,0.0,0.0,0.481482,0.0,0.0,0.0,0.732359,0.0,0.0,0.0,0.481482,Пирожок - это лишь пирожок!
1,0.0,0.490479,0.0,0.490479,0.490479,0.373022,0.373022,0.0,0.0,0.0,0.0,"Пирог не кушать, пирожок - можно."
2,0.467351,0.0,0.0,0.0,0.0,0.355432,0.0,0.467351,0.467351,0.467351,0.0,Я сегодня ходил в кино и поел пирог!


In [16]:
df = pd.DataFrame({
    'x1': [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2],
    'x2': ['A', 'B', 'B', 'B', 'C', 'A', 'A', 'C', 'A', 'A', 'C', 'B', 'B', 'C', 'C'],
    'y': [0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0]
})

df

Unnamed: 0,x1,x2,y
0,0,A,0
1,0,B,0
2,0,B,1
3,0,B,1
4,0,C,0
5,1,A,0
6,1,A,1
7,1,C,1
8,1,A,1
9,1,A,0


In [17]:
# Подгрузим данные
newsgroups_train = fetch_20newsgroups(
    subset='train', random_state=RANDOM_STATE)
newsgroups_test = fetch_20newsgroups(
    subset='test', random_state=RANDOM_STATE)

X_train = newsgroups_train['data']
y_train = newsgroups_train['target']

X_test = newsgroups_test['data']
y_test = newsgroups_test['target']

In [18]:
from sklearn.naive_bayes import MultinomialNB

# Без всяких предобработок кидаем, что есть в трансформацию
vectorizer = TfidfVectorizer()

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

nb_clf = MultinomialNB()
nb_clf.fit(X_train_vec, y_train)

print(f'Train accuracy: {nb_clf.score(X_train_vec, y_train)}')
print(f'Test accuracy: {nb_clf.score(X_test_vec, y_test)}')

Train accuracy: 0.9326498143892522
Test accuracy: 0.7738980350504514


In [21]:
import nltk
# Скачиваем необходимые модули фреймворка nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from string import punctuation

[nltk_data] Downloading package stopwords to /home/lena/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/lena/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/lena/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [22]:
# Выберем текст для примера и посмотрим
sample_text = X_train[1]

print(sample_text)

From: guykuo@carson.u.washington.edu (Guy Kuo)
Subject: SI Clock Poll - Final Call
Summary: Final call for SI clock reports
Keywords: SI,acceleration,clock,upgrade
Article-I.D.: shelley.1qvfo9INNc3s
Organization: University of Washington
Lines: 11
NNTP-Posting-Host: carson.u.washington.edu

A fair number of brave souls who upgraded their SI clock oscillator have
shared their experiences for this poll. Please send a brief message detailing
your experiences with the procedure. Top speed attained, CPU rated speed,
add on cards and adapters, heat sinks, hour of usage per day, floppy disk
functionality with 800 and 1.4 m floppies are especially requested.

I will be summarizing in the next two days, so please add to the network
knowledge base if you have done the clock upgrade and haven't answered this
poll. Thanks.

Guy Kuo <guykuo@u.washington.edu>



In [23]:
# Проверим работу приведения к нижнему регистру
# Этот подход позволяет исключить различия слов Hello и hello,
#   так как по сути это одно и то же слово
sample_text = sample_text.lower()
print(sample_text)

from: guykuo@carson.u.washington.edu (guy kuo)
subject: si clock poll - final call
summary: final call for si clock reports
keywords: si,acceleration,clock,upgrade
article-i.d.: shelley.1qvfo9innc3s
organization: university of washington
lines: 11
nntp-posting-host: carson.u.washington.edu

a fair number of brave souls who upgraded their si clock oscillator have
shared their experiences for this poll. please send a brief message detailing
your experiences with the procedure. top speed attained, cpu rated speed,
add on cards and adapters, heat sinks, hour of usage per day, floppy disk
functionality with 800 and 1.4 m floppies are especially requested.

i will be summarizing in the next two days, so please add to the network
knowledge base if you have done the clock upgrade and haven't answered this
poll. thanks.

guy kuo <guykuo@u.washington.edu>



In [24]:
# Удаление пунктуации
# Более сложный анализ может учитывать пунктуацию, но для простых
#   случаев пунктуация исключается, чтобы оставить лишь слова
#   как основную информацию
punct_transl = str.maketrans('', '', punctuation)
sample_text = sample_text.translate(punct_transl)
print(sample_text)

from guykuocarsonuwashingtonedu guy kuo
subject si clock poll  final call
summary final call for si clock reports
keywords siaccelerationclockupgrade
articleid shelley1qvfo9innc3s
organization university of washington
lines 11
nntppostinghost carsonuwashingtonedu

a fair number of brave souls who upgraded their si clock oscillator have
shared their experiences for this poll please send a brief message detailing
your experiences with the procedure top speed attained cpu rated speed
add on cards and adapters heat sinks hour of usage per day floppy disk
functionality with 800 and 14 m floppies are especially requested

i will be summarizing in the next two days so please add to the network
knowledge base if you have done the clock upgrade and havent answered this
poll thanks

guy kuo guykuouwashingtonedu



In [25]:
# Удаление чисел
# Числа как правило редко повторяются, для простого подхода
#   достаточно удалить числа, так как это неповторяющаяся информация
sample_text = re.sub(r'\d+', '', sample_text)
print(sample_text)

from guykuocarsonuwashingtonedu guy kuo
subject si clock poll  final call
summary final call for si clock reports
keywords siaccelerationclockupgrade
articleid shelleyqvfoinncs
organization university of washington
lines 
nntppostinghost carsonuwashingtonedu

a fair number of brave souls who upgraded their si clock oscillator have
shared their experiences for this poll please send a brief message detailing
your experiences with the procedure top speed attained cpu rated speed
add on cards and adapters heat sinks hour of usage per day floppy disk
functionality with  and  m floppies are especially requested

i will be summarizing in the next two days so please add to the network
knowledge base if you have done the clock upgrade and havent answered this
poll thanks

guy kuo guykuouwashingtonedu



In [26]:
# Удаление повторяющихся пробелов
# Часто в текста делают кучу пробелов и отступов - они не несут информации
sample_text = re.sub(r'\s+', ' ', sample_text)
print(sample_text)

from guykuocarsonuwashingtonedu guy kuo subject si clock poll final call summary final call for si clock reports keywords siaccelerationclockupgrade articleid shelleyqvfoinncs organization university of washington lines nntppostinghost carsonuwashingtonedu a fair number of brave souls who upgraded their si clock oscillator have shared their experiences for this poll please send a brief message detailing your experiences with the procedure top speed attained cpu rated speed add on cards and adapters heat sinks hour of usage per day floppy disk functionality with and m floppies are especially requested i will be summarizing in the next two days so please add to the network knowledge base if you have done the clock upgrade and havent answered this poll thanks guy kuo guykuouwashingtonedu 


In [27]:
# Токенизация - превращаем одну большую строку в массив токенов (слов)
# Под токенами могут пониматься не только слова, но и комбинации слов,
#   хотя для простого анализа - достаточно токенизировать до слов
word_tokens = nltk.word_tokenize(sample_text)
print(word_tokens)

['from', 'guykuocarsonuwashingtonedu', 'guy', 'kuo', 'subject', 'si', 'clock', 'poll', 'final', 'call', 'summary', 'final', 'call', 'for', 'si', 'clock', 'reports', 'keywords', 'siaccelerationclockupgrade', 'articleid', 'shelleyqvfoinncs', 'organization', 'university', 'of', 'washington', 'lines', 'nntppostinghost', 'carsonuwashingtonedu', 'a', 'fair', 'number', 'of', 'brave', 'souls', 'who', 'upgraded', 'their', 'si', 'clock', 'oscillator', 'have', 'shared', 'their', 'experiences', 'for', 'this', 'poll', 'please', 'send', 'a', 'brief', 'message', 'detailing', 'your', 'experiences', 'with', 'the', 'procedure', 'top', 'speed', 'attained', 'cpu', 'rated', 'speed', 'add', 'on', 'cards', 'and', 'adapters', 'heat', 'sinks', 'hour', 'of', 'usage', 'per', 'day', 'floppy', 'disk', 'functionality', 'with', 'and', 'm', 'floppies', 'are', 'especially', 'requested', 'i', 'will', 'be', 'summarizing', 'in', 'the', 'next', 'two', 'days', 'so', 'please', 'add', 'to', 'the', 'network', 'knowledge', 'ba

In [28]:
# Удаляем стоп-слова, для начала посмотрим, что это за слова
stop_words = set(stopwords.words('english'))
print(stop_words)

{'i', 'those', 'too', 'couldn', 've', 'how', "you'd", 'y', 'or', 'don', 'was', 'you', 'me', 'be', 'at', 'through', 'wouldn', 'o', 'do', 'after', "mustn't", 'from', "doesn't", 'same', 'wasn', 'then', 'being', 'she', 'ma', 'ourselves', 'herself', "it's", 'about', "haven't", 'having', 'it', 'only', 'by', "won't", 'below', "you've", 'yours', 'before', 'where', 'doing', 'than', 'your', 'the', 'other', 'few', 'most', 'yourselves', 'as', "shouldn't", 'no', 'they', 'themselves', 'didn', "weren't", 'but', 'itself', "wouldn't", "you'll", 'm', 'why', 'here', 'theirs', 'between', 'for', 'over', 'him', "she's", 'd', 't', 'these', 'aren', 're', 'this', 'on', 'to', "hadn't", 'and', 'such', 'above', 'of', 'during', 'just', "needn't", 'does', 'did', 'yourself', 'can', 'some', "wasn't", 'their', 'against', 'ours', 'because', 'both', "mightn't", 'hadn', 'out', 'until', 'under', "you're", 'myself', 'in', 'an', 'should', 'not', 'had', 'each', 'shan', 'which', 'my', 'mightn', 'hasn', 'down', 'so', "should'v

In [29]:
# Теперь фильтруем стоп-слова из наших токенов
word_tokens = [word for word in word_tokens if word not in stop_words]
print(word_tokens)

['guykuocarsonuwashingtonedu', 'guy', 'kuo', 'subject', 'si', 'clock', 'poll', 'final', 'call', 'summary', 'final', 'call', 'si', 'clock', 'reports', 'keywords', 'siaccelerationclockupgrade', 'articleid', 'shelleyqvfoinncs', 'organization', 'university', 'washington', 'lines', 'nntppostinghost', 'carsonuwashingtonedu', 'fair', 'number', 'brave', 'souls', 'upgraded', 'si', 'clock', 'oscillator', 'shared', 'experiences', 'poll', 'please', 'send', 'brief', 'message', 'detailing', 'experiences', 'procedure', 'top', 'speed', 'attained', 'cpu', 'rated', 'speed', 'add', 'cards', 'adapters', 'heat', 'sinks', 'hour', 'usage', 'per', 'day', 'floppy', 'disk', 'functionality', 'floppies', 'especially', 'requested', 'summarizing', 'next', 'two', 'days', 'please', 'add', 'network', 'knowledge', 'base', 'done', 'clock', 'upgrade', 'havent', 'answered', 'poll', 'thanks', 'guy', 'kuo', 'guykuouwashingtonedu']


In [30]:
# Проводим лемматизацию - приводим к нормальной форме
wordnet_lemmatizer = WordNetLemmatizer()

word_tokens = [wordnet_lemmatizer.lemmatize(word) for word in word_tokens]
print(word_tokens)

['guykuocarsonuwashingtonedu', 'guy', 'kuo', 'subject', 'si', 'clock', 'poll', 'final', 'call', 'summary', 'final', 'call', 'si', 'clock', 'report', 'keywords', 'siaccelerationclockupgrade', 'articleid', 'shelleyqvfoinncs', 'organization', 'university', 'washington', 'line', 'nntppostinghost', 'carsonuwashingtonedu', 'fair', 'number', 'brave', 'soul', 'upgraded', 'si', 'clock', 'oscillator', 'shared', 'experience', 'poll', 'please', 'send', 'brief', 'message', 'detailing', 'experience', 'procedure', 'top', 'speed', 'attained', 'cpu', 'rated', 'speed', 'add', 'card', 'adapter', 'heat', 'sink', 'hour', 'usage', 'per', 'day', 'floppy', 'disk', 'functionality', 'floppy', 'especially', 'requested', 'summarizing', 'next', 'two', 'day', 'please', 'add', 'network', 'knowledge', 'base', 'done', 'clock', 'upgrade', 'havent', 'answered', 'poll', 'thanks', 'guy', 'kuo', 'guykuouwashingtonedu']


In [31]:
# Чтобы лучше понять, как он работает - рассмотрим примеры:
print(wordnet_lemmatizer.lemmatize('bats'))
print(wordnet_lemmatizer.lemmatize('are'))  # Ууупс, тут не преобразовалось в be =(
print(wordnet_lemmatizer.lemmatize('feet'))

bat
are
foot


In [32]:
# После этого нам нужно объединить токены обратно в единую строку 
#   для будущего кодирования
processed_text = ' '.join(word_tokens)
print(processed_text)

guykuocarsonuwashingtonedu guy kuo subject si clock poll final call summary final call si clock report keywords siaccelerationclockupgrade articleid shelleyqvfoinncs organization university washington line nntppostinghost carsonuwashingtonedu fair number brave soul upgraded si clock oscillator shared experience poll please send brief message detailing experience procedure top speed attained cpu rated speed add card adapter heat sink hour usage per day floppy disk functionality floppy especially requested summarizing next two day please add network knowledge base done clock upgrade havent answered poll thanks guy kuo guykuouwashingtonedu
