In [41]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

3. Загрузка обучающей и тестовой выборки

In [42]:
from sklearn.datasets import fetch_20newsgroups

remove = ('headers', 'footers', 'quotes')


def get_train_data(categories):
    if type(categories) is not list:
        categories = [categories]
    return fetch_20newsgroups(subset='train', shuffle=True, categories=categories, random_state=42, remove=remove)


all_categories = ['comp.graphics', 'sci.crypt', 'sci.electronics']
train_bunch = get_train_data(all_categories)
test_bunch = fetch_20newsgroups(subset='test', shuffle=True, random_state=42, categories=all_categories, remove=remove)


def get_sample(bunch, category_idx):
    for idx, target in enumerate(bunch.target):
        if target == category_idx:
            return bunch.data[idx]

4. Вывод по одному документа каждого из классов

In [43]:
get_sample(train_bunch, all_categories.index('comp.graphics'))

"Hello, I realize that this might be a FAQ but I have to ask since I don't get a\nchange to read this newsgroup very often.  Anyways for my senior project I need\nto convert an AutoCad file to a TIFF file.  Please I don't need anyone telling\nme that the AutoCAD file is a vector file and the TIFF is a bit map since I\nhave heard that about 100 times already I would just like to know if anyone\nknows how to do this or at least point me to the right direction."

In [44]:
get_sample(train_bunch, all_categories.index('sci.crypt'))

'Looking for PostScript or Tex version of a paper called:\n\t"PUBLIC-KEY CRYPTOGRAPHY"\n\nWritten by:\n\tJames Nechvatal\n\tSecurity Technology Group\n\tNational Computer Systems Laboratory\n\tNational Institute of Standards and Technology\n\tGaithersburg, MD 20899\n\n\tDecember 1990\n\nThe version I obtained is plain text and all symbolic character\nformatting has been lost.\n'

In [45]:
get_sample(train_bunch, all_categories.index('sci.electronics'))

'Just a thought........Maybe it possibly has to do with the fact that it\nIS an Emerson.  I\'ve got an Emerson VCR which is #6 in the series.  Returned\nit six times for various and never the same problems.  Got tired of taking it \nback and fixed it myself.  The Hi-Fi "window" was a bit off.  Something like\nthe Hi-Fi audio fine-tuning.  When I was a Wal-Mart "associate" in \'88-\'89,\nwe had AT LEAST one returned as defective EVERY SINGLE DAY.  How\'s that for\nreliability?  Face it--Emerson can make audio stuff (albeit not of premium\nquality), but they CAN\'T make anything as complex as video equipment with \nreliability IMHO.  Please, no flames.  Just *had* to share my Emerson disaster\nin the light of this exploding tv.  \nJC\n\n\n'

5. Выполнение процедуры стемминга

In [46]:
import nltk
from nltk.stem import *
from nltk import word_tokenize

nltk.download('punkt')


def stemminize(documents: list[str]) -> list[str]:
    porter_stemmer = PorterStemmer()
    stem_train = []
    for document in documents:
        nltk_tokens = word_tokenize(document)
        line = ''
        for word in nltk_tokens:
            line += ' ' + porter_stemmer.stem(word)
        stem_train.append(line)
    return stem_train


train_tokenized = stemminize(train_bunch.data)
test_tokenized = stemminize(test_bunch.data)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ruslan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [47]:
# вывод 3 первых документов обучающих данных
train_tokenized[:3]

[" hello , i realiz that thi might be a faq but i have to ask sinc i do n't get a chang to read thi newsgroup veri often . anyway for my senior project i need to convert an autocad file to a tiff file . pleas i do n't need anyon tell me that the autocad file is a vector file and the tiff is a bit map sinc i have heard that about 100 time alreadi i would just like to know if anyon know how to do thi or at least point me to the right direct .",
 " just a thought ........ mayb it possibl ha to do with the fact that it is an emerson . i 've got an emerson vcr which is # 6 in the seri . return it six time for variou and never the same problem . got tire of take it back and fix it myself . the hi-fi `` window '' wa a bit off . someth like the hi-fi audio fine-tun . when i wa a wal-mart `` associ '' in '88-'89 , we had at least one return as defect everi singl day . how 's that for reliabl ? face it -- emerson can make audio stuff ( albeit not of premium qualiti ) , but they ca n't make anyth

In [48]:
# вывод 3 первых документов тестовых данных
test_tokenized[:3]

[' well , i am place a file at my ftp today that contain sever polygon descript of a head , face , skull , vase , etc . the format of the file is a list of vertic , normal , and triangl . there are variou resolut and the name of the data file includ the number of polygon , eg . phred.1.3k.vbl contain 1300 polygon . in order to get the data via ftp do the follow : 1 ) ftp taurus.cs.nps.navy.mil 2 ) login as anonym , guest as the password 3 ) cd pub/dabro 4 ) binari 5 ) get cyber.tar.z onc you get the data onto your workstat : 1 ) uncompress data.tar.z 2 ) tar xvof data.tar if you have ani question , pleas let me know . georg dabro dabro @ taurus.cs.nps.navy.mil -- georg dabrowski cyberwar lab',
 " tri search for dmorf , i think it 's locat on wuarchive.wustl.edu in a mirror directori ... i 've use it befor , & it wa pretti good !",
 ' not realli . i think it is less than 10 % .']

In [49]:
columns = pd.MultiIndex.from_product([['Count', 'TF', 'TF-IDF'], ['Без стоп-слов', 'С стоп-словами']])
df_train = pd.DataFrame(columns=columns)
df_test = pd.DataFrame(columns=columns)

df_train_stem = pd.DataFrame(columns=columns)
df_test_stem = pd.DataFrame(columns=columns)

6 Векторизация и вывод 20 наиболее частых слов для всей тренировочной выборки без стоп-слов

In [50]:
vect = CountVectorizer(max_features=10000)
train_data = vect.fit_transform(train_bunch.data)


def get_20_freq_words(vect, data):
    words = list(zip(vect.get_feature_names_out(), np.ravel(data.sum(axis=0))))
    words.sort(key=lambda x: x[1], reverse=True)
    return words[:20]


count_column = get_20_freq_words(vect, train_data)
df_train['Count', 'Без стоп-слов'] = count_column
count_column

[('the', 16689),
 ('to', 8883),
 ('of', 7021),
 ('and', 6843),
 ('is', 5467),
 ('in', 4416),
 ('it', 3900),
 ('that', 3682),
 ('for', 3677),
 ('you', 2852),
 ('be', 2788),
 ('this', 2585),
 ('on', 2451),
 ('are', 2155),
 ('with', 2111),
 ('or', 2090),
 ('have', 1879),
 ('as', 1784),
 ('can', 1704),
 ('if', 1702)]

6 Векторизация и вывод 20 наиболее частых слов для всей тренировочной выборки со стоп-словами

In [51]:
vect = CountVectorizer(max_features=10000, stop_words='english')
dtm = vect.fit_transform(train_bunch.data)

count_column_stop = get_20_freq_words(vect, dtm)
df_train['Count', 'С стоп-словами'] = count_column_stop
count_column_stop

[('key', 937),
 ('use', 932),
 ('like', 642),
 ('don', 592),
 ('db', 562),
 ('edu', 553),
 ('encryption', 552),
 ('data', 547),
 ('know', 542),
 ('just', 533),
 ('chip', 521),
 ('does', 501),
 ('used', 498),
 ('information', 497),
 ('image', 492),
 ('people', 483),
 ('time', 447),
 ('bit', 437),
 ('file', 427),
 ('graphics', 423)]

6. Векторизация и вывод 20 наиболее важных слов для всей обучающей выборки с помощью TfidfTransformer без стоп-слов TF

In [52]:
def get_20_freq_words_idf(feature_names, tfidf_values):
    result = []
    word_weights = dict(zip(feature_names, tfidf_values))
    sorted_words = sorted(word_weights.items(), key=lambda x: x[1], reverse=True)
    for word, weight in sorted_words[:20]:
        result.append((word, weight))
    return result


vectorizer = CountVectorizer(max_features=10000)
dtm = vectorizer.fit_transform(train_bunch.data)
tfidf = TfidfTransformer(use_idf=False).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_column = get_20_freq_words_idf(feature_names, tfidf_values)
df_train['TF', 'Без стоп-слов'] = tf_column
tf_column

[('the', 573.5878746017531),
 ('to', 323.08392380711285),
 ('of', 242.72988399270668),
 ('and', 217.11039200783313),
 ('is', 186.06314755308978),
 ('it', 169.68759255688352),
 ('in', 162.3235233209576),
 ('for', 148.54566211510095),
 ('that', 146.62595086524686),
 ('you', 120.10726765931362),
 ('this', 105.17732299778746),
 ('be', 98.40643227700879),
 ('on', 96.18830762929639),
 ('have', 86.45612956176703),
 ('with', 76.85987967685325),
 ('if', 75.80342481606581),
 ('or', 75.78439948121508),
 ('are', 74.94851602350174),
 ('can', 68.77692347543262),
 ('not', 63.1163398649122)]

6. Векторизация и вывод 20 наиболее важных слов для всей обучающей выборки с помощью TfidfTransformer с стоп-словами TF

In [53]:
vectorizer = CountVectorizer(max_features=10000, stop_words='english')
dtm = vectorizer.fit_transform(train_bunch.data)
tfidf = TfidfTransformer(use_idf=False).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_column_stop = get_20_freq_words_idf(feature_names, tfidf_values)
df_train['TF', 'С стоп-словами'] = tf_column_stop
tf_column_stop

[('use', 58.433522994302784),
 ('know', 57.307906073228544),
 ('like', 56.64353296005652),
 ('don', 49.839861337882176),
 ('just', 49.002983276208454),
 ('does', 48.9585366753105),
 ('key', 48.142224601803),
 ('thanks', 39.15072837325299),
 ('chip', 35.95980854400468),
 ('good', 35.08471270781613),
 ('need', 32.70371344296186),
 ('used', 31.85354729678519),
 ('think', 31.408220640281492),
 ('ve', 31.265005741849148),
 ('time', 30.354985689209872),
 ('people', 30.323719447911316),
 ('encryption', 28.319287928894738),
 ('using', 27.62980799728084),
 ('graphics', 27.205639758499185),
 ('clipper', 26.571725044843287)]

6. Векторизация и вывод 20 наиболее важных слов для всей обучающей выборки с помощью TfidfTransformer без стоп-слов TF-IDF

In [54]:
vectorizer = CountVectorizer(max_features=10000)
dtm = vectorizer.fit_transform(train_bunch.data)
tfidf = TfidfTransformer(use_idf=True).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_idf = get_20_freq_words_idf(feature_names, tfidf_values)
df_train['TF-IDF', 'Без стоп-слов'] = tf_idf
tf_idf

[('the', 214.27923011983643),
 ('to', 128.50658234239415),
 ('of', 101.86203264775209),
 ('and', 91.733961272254),
 ('is', 83.97727336110019),
 ('it', 79.47554322700573),
 ('in', 72.70953833589493),
 ('that', 72.51290798766651),
 ('for', 67.66475343892618),
 ('you', 66.41180952122423),
 ('be', 55.65313856163747),
 ('this', 55.13173979874598),
 ('on', 50.56554310512375),
 ('have', 48.11707358740201),
 ('are', 44.12242787539058),
 ('with', 43.883460758799835),
 ('if', 43.62474130570933),
 ('or', 43.39770046917348),
 ('can', 40.89942210826312),
 ('as', 40.345366794020045)]

6. Векторизация и вывод 20 наиболее важных слов для всей обучающей выборки с помощью TfidfTransformer со стоп-словами TF-IDF

In [55]:
vectorizer = CountVectorizer(max_features=10000, stop_words='english')
dtm = vectorizer.fit_transform(train_bunch.data)
tfidf = TfidfTransformer(use_idf=True).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_idf_stop = get_20_freq_words_idf(feature_names, tfidf_values)
df_train['TF-IDF', 'С стоп-словами'] = tf_idf_stop
tf_idf_stop

[('key', 31.639250020331772),
 ('know', 29.275985507770525),
 ('use', 28.5909199903978),
 ('like', 27.36974112627459),
 ('does', 27.32020401912984),
 ('don', 25.935560274661245),
 ('just', 25.519485103509275),
 ('chip', 24.453605471872685),
 ('thanks', 24.11088770306822),
 ('encryption', 20.612197002323146),
 ('good', 20.345796034175873),
 ('need', 19.637524126929815),
 ('ve', 19.573641488242785),
 ('graphics', 19.457067951393185),
 ('clipper', 19.278143372470023),
 ('people', 18.825771055423083),
 ('think', 18.438571829249657),
 ('used', 18.158442956203725),
 ('government', 17.929344754138906),
 ('time', 17.5444284746565)]

### Обработка тестовых данных

6 Векторизация и вывод 20 наиболее частых слов для всей тестовой выборки без стоп-слов

In [56]:
vect = CountVectorizer(max_features=10000)
dtm = vect.fit_transform(test_bunch.data)
count_test_column = get_20_freq_words(vect, dtm)
df_test['Count', 'Без стоп-слов'] = count_test_column
count_test_column

[('the', 9066),
 ('to', 5360),
 ('of', 4137),
 ('and', 4073),
 ('is', 3074),
 ('in', 2610),
 ('it', 2402),
 ('for', 2362),
 ('that', 2228),
 ('you', 2086),
 ('be', 1535),
 ('this', 1472),
 ('on', 1462),
 ('or', 1295),
 ('with', 1258),
 ('have', 1215),
 ('are', 1186),
 ('if', 1154),
 ('can', 1101),
 ('as', 1026)]

6 Векторизация и вывод 20 наиболее частых слов для всей тестовой выборки со стоп-словами

In [57]:
vect = CountVectorizer(max_features=10000, stop_words='english')
dtm = vect.fit_transform(test_bunch.data)
count_column_stop = get_20_freq_words(vect, dtm)
df_test['Count', 'С стоп-словами'] = count_column_stop
count_column_stop

[('image', 666),
 ('jpeg', 526),
 ('use', 516),
 ('edu', 468),
 ('graphics', 462),
 ('like', 408),
 ('file', 389),
 ('don', 378),
 ('data', 368),
 ('know', 355),
 ('just', 339),
 ('bit', 337),
 ('available', 325),
 ('software', 324),
 ('images', 307),
 ('program', 298),
 ('does', 291),
 ('time', 282),
 ('used', 272),
 ('ftp', 271)]

6. Векторизация и вывод 20 наиболее важных слов для тестовой выборки с помощью TfidfTransformer без стоп-слов TF

In [58]:
vectorizer = CountVectorizer(max_features=10000)
dtm = vectorizer.fit_transform(test_bunch.data)
tfidf = TfidfTransformer(use_idf=False).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_column_test = get_20_freq_words_idf(feature_names, tfidf_values)
df_test['TF', 'Без стоп-слов'] = tf_column_test
tf_column_test

[('the', 351.8833889205307),
 ('to', 215.21048224463186),
 ('of', 153.66848738794562),
 ('and', 140.95239513687198),
 ('is', 120.607868681005),
 ('it', 110.88904071825),
 ('in', 104.33536769410595),
 ('that', 98.57686193779719),
 ('for', 92.40323586413749),
 ('you', 81.3739413442512),
 ('be', 65.317483332802),
 ('on', 62.06126483104049),
 ('this', 61.59725664266683),
 ('have', 57.61931147617616),
 ('or', 51.777520899187046),
 ('if', 50.32542218561264),
 ('can', 49.21418012871439),
 ('with', 48.18761054093589),
 ('are', 45.11043625995436),
 ('not', 43.53696710250099)]

6. Векторизация и вывод 20 наиболее важных слов для тестовой выборки с помощью TfidfTransformer со стоп-словами TF

In [59]:
vectorizer = CountVectorizer(max_features=10000, stop_words='english')
dtm = vectorizer.fit_transform(test_bunch.data)
tfidf = TfidfTransformer(use_idf=False).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_column_stop_test = get_20_freq_words_idf(feature_names, tfidf_values)
df_test['TF', 'С стоп-словами'] = tf_column_stop_test
tf_column_stop_test

[('know', 42.06416144310403),
 ('like', 36.822127481402006),
 ('use', 36.792352685034714),
 ('just', 32.40544450511485),
 ('don', 30.469003399281732),
 ('does', 29.885181249520144),
 ('thanks', 27.29259689277586),
 ('think', 24.82104074312471),
 ('used', 21.459669060506144),
 ('need', 20.686196868337543),
 ('graphics', 20.679160827517396),
 ('time', 20.129142247758516),
 ('program', 19.584807797820034),
 ('people', 19.102579785729354),
 ('chip', 18.54157887888643),
 ('edu', 18.28349107306669),
 ('ve', 18.270863639958304),
 ('government', 18.089022419581863),
 ('good', 17.967957804096248),
 ('bit', 17.440629791897027)]

6. Векторизация и вывод 20 наиболее важных слов для тестовой выборки с помощью TfidfTransformer без стоп-слов TF-IDF

In [60]:
vectorizer = CountVectorizer(max_features=10000)
dtm = vectorizer.fit_transform(test_bunch.data)
tfidf = TfidfTransformer(use_idf=True).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_idf_test = get_20_freq_words_idf(feature_names, tfidf_values)
df_test['TF-IDF', 'Без стоп-слов'] = tf_idf_test
tf_idf_test

[('the', 132.73987348944183),
 ('to', 85.24139589140572),
 ('of', 64.8113206497389),
 ('and', 59.482599607653924),
 ('is', 54.52831275498656),
 ('it', 53.75496178917013),
 ('that', 47.770429081592496),
 ('in', 47.06683267580104),
 ('you', 45.42774190604445),
 ('for', 43.184971775965785),
 ('be', 35.43112523777322),
 ('this', 34.08603226381701),
 ('on', 32.90027716297289),
 ('have', 32.025491313098804),
 ('if', 29.12956299199104),
 ('or', 29.011357783500337),
 ('can', 28.82297538311695),
 ('are', 27.38799920531715),
 ('with', 27.03257285835745),
 ('not', 26.9403506133788)]

6. Векторизация и вывод 20 наиболее важных слов для тестовой выборки с помощью TfidfTransformer со стоп-словами TF-IDF

In [61]:
vectorizer = CountVectorizer(max_features=10000, stop_words='english')
dtm = vectorizer.fit_transform(test_bunch.data)
tfidf = TfidfTransformer(use_idf=True).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_idf_stop_test = get_20_freq_words_idf(feature_names, tfidf_values)
df_test['TF-IDF', 'С стоп-словами'] = tf_idf_stop_test
tf_idf_stop_test

[('know', 21.435150574064025),
 ('like', 18.950713675342715),
 ('use', 18.420212557191185),
 ('thanks', 17.098531494337998),
 ('does', 16.98361584501106),
 ('just', 16.684961307647363),
 ('don', 16.428887334276713),
 ('think', 14.557804515797576),
 ('graphics', 14.252209873941238),
 ('program', 13.64987318610155),
 ('government', 12.964530900368489),
 ('chip', 12.860296724296857),
 ('used', 12.439336643011975),
 ('people', 12.333813168768687),
 ('need', 12.240898928564446),
 ('bit', 12.015183440146776),
 ('edu', 11.906194792288247),
 ('ve', 11.870735103204861),
 ('time', 11.745018935759806),
 ('key', 11.670618169470233)]

## Стемминг

### Тренировочные данные

6. Векторизация и вывод 20 наиболее важных слов для всей обучающей выборки с помощью CountVectorizer без стоп-слов c приминением стемминга

In [62]:
vect = CountVectorizer(max_features=10000)
dtm = vect.fit_transform(train_tokenized)
count_column_stem = get_20_freq_words(vect, dtm)
df_train_stem['Count', 'Без стоп-слов'] = count_column_stem
count_column_stem

[('the', 16688),
 ('to', 8883),
 ('of', 7021),
 ('and', 6843),
 ('is', 5549),
 ('in', 4419),
 ('it', 4191),
 ('that', 3692),
 ('for', 3677),
 ('be', 2998),
 ('you', 2852),
 ('thi', 2585),
 ('on', 2459),
 ('are', 2195),
 ('with', 2111),
 ('or', 2090),
 ('use', 2014),
 ('have', 1997),
 ('as', 1784),
 ('not', 1740)]

6. Векторизация и вывод 20 наиболее важных слов для всей обучающей выборки с помощью CountVectorizer с стоп-словами c приминением стемминга

In [63]:
vect = CountVectorizer(max_features=10000, stop_words='english')
dtm = vect.fit_transform(train_tokenized)
count_column_stem_stop = get_20_freq_words(vect, dtm)
df_train_stem['Count', 'С стоп-словами'] = count_column_stem_stop
count_column_stem_stop

[('thi', 2585),
 ('use', 2014),
 ('key', 1283),
 ('ha', 887),
 ('ani', 866),
 ('wa', 783),
 ('encrypt', 774),
 ('imag', 737),
 ('file', 730),
 ('like', 711),
 ('chip', 672),
 ('doe', 671),
 ('know', 622),
 ('bit', 621),
 ('program', 569),
 ('db', 562),
 ('onli', 560),
 ('edu', 553),
 ('data', 548),
 ('secur', 534)]

6. Векторизация и вывод 20 наиболее важных слов для всей обучающей выборки с помощью TfidfTransformer без стоп-слов c приминением стемминга TF

In [64]:
vectorizer = CountVectorizer(max_features=10000)
dtm = vectorizer.fit_transform(train_tokenized)
tfidf = TfidfTransformer(use_idf=False).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_column_stem = get_20_freq_words_idf(feature_names, tfidf_values)
df_train_stem['TF', 'Без стоп-слов'] = tf_column_stem
tf_column_stem

[('the', 561.4880559369901),
 ('to', 316.3652287700625),
 ('of', 237.41198848231448),
 ('and', 212.34202585341927),
 ('is', 185.57446689326932),
 ('it', 175.55160439683934),
 ('in', 159.25693721881674),
 ('for', 145.53831889691298),
 ('that', 144.01545732916136),
 ('you', 117.57659979606518),
 ('be', 104.11856816919685),
 ('thi', 103.06110204410142),
 ('on', 94.5230013722625),
 ('have', 89.88433983404117),
 ('with', 75.18149596453108),
 ('are', 74.93802609781713),
 ('if', 74.24248878944846),
 ('or', 74.23062669897583),
 ('do', 71.9015042136159),
 ('use', 71.4761519919797)]

6. Векторизация и вывод 20 наиболее важных слов для всей обучающей выборки с помощью TfidfTransformer c стоп-словами c приминением стемминга TF

In [65]:
vectorizer = CountVectorizer(max_features=10000, stop_words='english')
dtm = vectorizer.fit_transform(train_tokenized)
tfidf = TfidfTransformer(use_idf=False).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_column_stem_stop = get_20_freq_words_idf(feature_names, tfidf_values)
df_train_stem['TF', 'С стоп-словами'] = tf_column_stem_stop
tf_column_stem_stop

[('thi', 169.76884542624322),
 ('use', 116.9587739384261),
 ('ani', 75.52328947198923),
 ('key', 61.252598858687456),
 ('wa', 61.24004770417701),
 ('know', 58.981341558737796),
 ('doe', 57.240041179177425),
 ('like', 57.137968393386735),
 ('ha', 56.59160730937827),
 ('chip', 44.84136837996277),
 ('just', 44.83029998661235),
 ('thank', 41.73801066350643),
 ('work', 41.61901207046525),
 ('anyon', 41.18249764862851),
 ('look', 40.281971512326024),
 ('file', 38.83230683543829),
 ('need', 38.34165975253376),
 ('encrypt', 36.15778681744015),
 ('onli', 34.91770247819603),
 ('program', 33.24366815109684)]

6. Векторизация и вывод 20 наиболее важных слов для всей обучающей выборки с помощью TfidfTransformer без стоп-слов c приминением стемминга

In [66]:
vectorizer = CountVectorizer(max_features=10000)
dtm = vectorizer.fit_transform(train_tokenized)
tfidf = TfidfTransformer(use_idf=True).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_idf_column = get_20_freq_words_idf(feature_names, tfidf_values)
df_train_stem['TF-IDF', 'Без стоп-слов'] = tf_idf_column
tf_idf_column

[('the', 214.47110783764484),
 ('to', 128.91799924792693),
 ('of', 101.9397995673158),
 ('and', 91.85258731878157),
 ('is', 84.95276457618122),
 ('it', 82.95304153471035),
 ('in', 73.08110129504044),
 ('that', 73.01505717147099),
 ('for', 68.08421571530532),
 ('you', 66.66827307818957),
 ('be', 59.109398696493265),
 ('thi', 55.45567314375222),
 ('on', 50.853304634787605),
 ('have', 50.30994107364992),
 ('are', 45.04857754230604),
 ('with', 44.02554259499939),
 ('if', 43.91731908930183),
 ('or', 43.60061441479198),
 ('do', 43.39462930056318),
 ('use', 42.979583779815044)]

train tf-idf stemming

In [67]:
vectorizer = CountVectorizer(max_features=10000)
dtm = vectorizer.fit_transform(train_tokenized)
tfidf = TfidfTransformer(use_idf=True).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_idf_column_stop = get_20_freq_words_idf(feature_names, tfidf_values)
df_train_stem['TF-IDF', 'С стоп-словами'] = tf_idf_column_stop
tf_idf_column_stop

[('the', 214.47110783764484),
 ('to', 128.91799924792693),
 ('of', 101.9397995673158),
 ('and', 91.85258731878157),
 ('is', 84.95276457618122),
 ('it', 82.95304153471035),
 ('in', 73.08110129504044),
 ('that', 73.01505717147099),
 ('for', 68.08421571530532),
 ('you', 66.66827307818957),
 ('be', 59.109398696493265),
 ('thi', 55.45567314375222),
 ('on', 50.853304634787605),
 ('have', 50.30994107364992),
 ('are', 45.04857754230604),
 ('with', 44.02554259499939),
 ('if', 43.91731908930183),
 ('or', 43.60061441479198),
 ('do', 43.39462930056318),
 ('use', 42.979583779815044)]

test count steming

In [68]:
vect = CountVectorizer(max_features=10000)
dtm = vect.fit_transform(test_tokenized)
count_column_stem_test = get_20_freq_words(vect, dtm)
df_test_stem['Count', 'Без стоп-слов'] = count_column_stem_test
count_column_stem_test

[('the', 9063),
 ('to', 5360),
 ('of', 4137),
 ('and', 4073),
 ('is', 3139),
 ('in', 2612),
 ('it', 2562),
 ('for', 2362),
 ('that', 2237),
 ('you', 2086),
 ('be', 1647),
 ('thi', 1472),
 ('on', 1469),
 ('have', 1298),
 ('or', 1295),
 ('with', 1260),
 ('are', 1212),
 ('if', 1154),
 ('use', 1097),
 ('not', 1077)]

test count steming stop

In [69]:
vect = CountVectorizer(max_features=10000, stop_words='english')
dtm = vect.fit_transform(test_tokenized)
count_column_stem_stop_test = get_20_freq_words(vect, dtm)
df_test_stem['Count', 'С стоп-словами'] = count_column_stem_stop_test
count_column_stem_stop_test

[('thi', 1472),
 ('use', 1097),
 ('imag', 998),
 ('file', 615),
 ('jpeg', 531),
 ('wa', 510),
 ('ani', 505),
 ('program', 497),
 ('ha', 479),
 ('edu', 468),
 ('like', 457),
 ('bit', 451),
 ('format', 411),
 ('know', 401),
 ('doe', 386),
 ('data', 369),
 ('onli', 344),
 ('work', 344),
 ('make', 341),
 ('just', 339)]

test tf steming

In [70]:
vectorizer = CountVectorizer(max_features=10000)
dtm = vectorizer.fit_transform(test_tokenized)
tfidf = TfidfTransformer(use_idf=False).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_column_stem_test = get_20_freq_words_idf(feature_names, tfidf_values)
df_test_stem['TF', 'Без стоп-слов'] = tf_column_stem_test
tf_column_stem_test

[('the', 344.5586615001451),
 ('to', 211.00830323462205),
 ('of', 150.4948643044924),
 ('and', 137.86770734884337),
 ('is', 121.90454220930677),
 ('it', 114.00279976905662),
 ('in', 102.13673946564619),
 ('that', 96.8675995748726),
 ('for', 90.52244749715135),
 ('you', 79.67798552554372),
 ('be', 68.85460938269497),
 ('on', 61.11286152167876),
 ('have', 60.90208541002496),
 ('thi', 60.34198459030236),
 ('or', 50.6519636067793),
 ('if', 49.388084620385364),
 ('with', 47.226534170832274),
 ('can', 46.08463643662672),
 ('are', 45.258192235058026),
 ('do', 44.57070053438678)]

test tf steming stop

In [71]:
vectorizer = CountVectorizer(max_features=10000, stop_words='english')
dtm = vectorizer.fit_transform(test_tokenized)
tfidf = TfidfTransformer(use_idf=False).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_column_stem_stop_test = get_20_freq_words_idf(feature_names, tfidf_values)
df_test_stem['TF', 'С стоп-словами'] = tf_column_stem_stop_test
tf_column_stem_stop_test

[('thi', 97.49594840488756),
 ('use', 71.55533662918268),
 ('ani', 44.25521347634618),
 ('know', 43.06179116896538),
 ('wa', 39.53046546637826),
 ('like', 36.7672996472524),
 ('ha', 34.4237897180173),
 ('doe', 34.19333412527219),
 ('just', 29.513176624612083),
 ('thank', 28.549402258522097),
 ('anyon', 28.298548977191338),
 ('work', 26.494787158165767),
 ('think', 24.640467411932118),
 ('need', 24.56412549445321),
 ('program', 24.56261574830257),
 ('look', 24.441912389427163),
 ('make', 24.08146082002033),
 ('key', 23.553337023165067),
 ('pleas', 23.374159220189213),
 ('onli', 22.125484501080784)]

test tf-idf steming

In [72]:
vectorizer = CountVectorizer(max_features=10000)
dtm = vectorizer.fit_transform(test_tokenized)
tfidf = TfidfTransformer(use_idf=True).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_idf_column_stem_test = get_20_freq_words_idf(feature_names, tfidf_values)
df_test_stem['TF-IDF', 'Без стоп-слов'] = tf_idf_column_stem_test
tf_idf_column_stem_test

[('the', 133.4538752492904),
 ('to', 85.99632390188842),
 ('of', 65.18554426844001),
 ('and', 59.63534071325143),
 ('it', 55.94816487314925),
 ('is', 55.94487331494873),
 ('that', 48.31727363290116),
 ('in', 47.30170083345097),
 ('you', 45.68450447600251),
 ('for', 43.47570804261264),
 ('be', 37.344622742852174),
 ('thi', 34.3491285954964),
 ('have', 34.04975283966364),
 ('on', 33.209071780256735),
 ('if', 29.438597446898267),
 ('or', 29.11817727731375),
 ('can', 28.41866932660334),
 ('are', 28.194428691654128),
 ('do', 27.903939642519983),
 ('not', 27.900212314318484)]

test tf-idf steming stop

In [73]:
vectorizer = CountVectorizer(max_features=10000, stop_words='english')
dtm = vectorizer.fit_transform(test_tokenized)
tfidf = TfidfTransformer(use_idf=False).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

tf_idf_column_stem_stop_test = get_20_freq_words_idf(feature_names, tfidf_values)
df_test_stem['TF-IDF', 'С стоп-словами'] = tf_idf_column_stem_stop_test
tf_idf_column_stem_stop_test

[('thi', 97.49594840488756),
 ('use', 71.55533662918268),
 ('ani', 44.25521347634618),
 ('know', 43.06179116896538),
 ('wa', 39.53046546637826),
 ('like', 36.7672996472524),
 ('ha', 34.4237897180173),
 ('doe', 34.19333412527219),
 ('just', 29.513176624612083),
 ('thank', 28.549402258522097),
 ('anyon', 28.298548977191338),
 ('work', 26.494787158165767),
 ('think', 24.640467411932118),
 ('need', 24.56412549445321),
 ('program', 24.56261574830257),
 ('look', 24.441912389427163),
 ('make', 24.08146082002033),
 ('key', 23.553337023165067),
 ('pleas', 23.374159220189213),
 ('onli', 22.125484501080784)]

In [74]:
df_train

Unnamed: 0_level_0,Count,Count,TF,TF,TF-IDF,TF-IDF
Unnamed: 0_level_1,Без стоп-слов,С стоп-словами,Без стоп-слов,С стоп-словами,Без стоп-слов,С стоп-словами
0,"(the, 16689)","(key, 937)","(the, 573.5878746017531)","(use, 58.433522994302784)","(the, 214.27923011983643)","(key, 31.639250020331772)"
1,"(to, 8883)","(use, 932)","(to, 323.08392380711285)","(know, 57.307906073228544)","(to, 128.50658234239415)","(know, 29.275985507770525)"
2,"(of, 7021)","(like, 642)","(of, 242.72988399270668)","(like, 56.64353296005652)","(of, 101.86203264775209)","(use, 28.5909199903978)"
3,"(and, 6843)","(don, 592)","(and, 217.11039200783313)","(don, 49.839861337882176)","(and, 91.733961272254)","(like, 27.36974112627459)"
4,"(is, 5467)","(db, 562)","(is, 186.06314755308978)","(just, 49.002983276208454)","(is, 83.97727336110019)","(does, 27.32020401912984)"
5,"(in, 4416)","(edu, 553)","(it, 169.68759255688352)","(does, 48.9585366753105)","(it, 79.47554322700573)","(don, 25.935560274661245)"
6,"(it, 3900)","(encryption, 552)","(in, 162.3235233209576)","(key, 48.142224601803)","(in, 72.70953833589493)","(just, 25.519485103509275)"
7,"(that, 3682)","(data, 547)","(for, 148.54566211510095)","(thanks, 39.15072837325299)","(that, 72.51290798766651)","(chip, 24.453605471872685)"
8,"(for, 3677)","(know, 542)","(that, 146.62595086524686)","(chip, 35.95980854400468)","(for, 67.66475343892618)","(thanks, 24.11088770306822)"
9,"(you, 2852)","(just, 533)","(you, 120.10726765931362)","(good, 35.08471270781613)","(you, 66.41180952122423)","(encryption, 20.612197002323146)"


In [75]:
df_test

Unnamed: 0_level_0,Count,Count,TF,TF,TF-IDF,TF-IDF
Unnamed: 0_level_1,Без стоп-слов,С стоп-словами,Без стоп-слов,С стоп-словами,Без стоп-слов,С стоп-словами
0,"(the, 9066)","(image, 666)","(the, 351.8833889205307)","(know, 42.06416144310403)","(the, 132.73987348944183)","(know, 21.435150574064025)"
1,"(to, 5360)","(jpeg, 526)","(to, 215.21048224463186)","(like, 36.822127481402006)","(to, 85.24139589140572)","(like, 18.950713675342715)"
2,"(of, 4137)","(use, 516)","(of, 153.66848738794562)","(use, 36.792352685034714)","(of, 64.8113206497389)","(use, 18.420212557191185)"
3,"(and, 4073)","(edu, 468)","(and, 140.95239513687198)","(just, 32.40544450511485)","(and, 59.482599607653924)","(thanks, 17.098531494337998)"
4,"(is, 3074)","(graphics, 462)","(is, 120.607868681005)","(don, 30.469003399281732)","(is, 54.52831275498656)","(does, 16.98361584501106)"
5,"(in, 2610)","(like, 408)","(it, 110.88904071825)","(does, 29.885181249520144)","(it, 53.75496178917013)","(just, 16.684961307647363)"
6,"(it, 2402)","(file, 389)","(in, 104.33536769410595)","(thanks, 27.29259689277586)","(that, 47.770429081592496)","(don, 16.428887334276713)"
7,"(for, 2362)","(don, 378)","(that, 98.57686193779719)","(think, 24.82104074312471)","(in, 47.06683267580104)","(think, 14.557804515797576)"
8,"(that, 2228)","(data, 368)","(for, 92.40323586413749)","(used, 21.459669060506144)","(you, 45.42774190604445)","(graphics, 14.252209873941238)"
9,"(you, 2086)","(know, 355)","(you, 81.3739413442512)","(need, 20.686196868337543)","(for, 43.184971775965785)","(program, 13.64987318610155)"


In [76]:
df_train_stem

Unnamed: 0_level_0,Count,Count,TF,TF,TF-IDF,TF-IDF
Unnamed: 0_level_1,Без стоп-слов,С стоп-словами,Без стоп-слов,С стоп-словами,Без стоп-слов,С стоп-словами
0,"(the, 16688)","(thi, 2585)","(the, 561.4880559369901)","(thi, 169.76884542624322)","(the, 214.47110783764484)","(the, 214.47110783764484)"
1,"(to, 8883)","(use, 2014)","(to, 316.3652287700625)","(use, 116.9587739384261)","(to, 128.91799924792693)","(to, 128.91799924792693)"
2,"(of, 7021)","(key, 1283)","(of, 237.41198848231448)","(ani, 75.52328947198923)","(of, 101.9397995673158)","(of, 101.9397995673158)"
3,"(and, 6843)","(ha, 887)","(and, 212.34202585341927)","(key, 61.252598858687456)","(and, 91.85258731878157)","(and, 91.85258731878157)"
4,"(is, 5549)","(ani, 866)","(is, 185.57446689326932)","(wa, 61.24004770417701)","(is, 84.95276457618122)","(is, 84.95276457618122)"
5,"(in, 4419)","(wa, 783)","(it, 175.55160439683934)","(know, 58.981341558737796)","(it, 82.95304153471035)","(it, 82.95304153471035)"
6,"(it, 4191)","(encrypt, 774)","(in, 159.25693721881674)","(doe, 57.240041179177425)","(in, 73.08110129504044)","(in, 73.08110129504044)"
7,"(that, 3692)","(imag, 737)","(for, 145.53831889691298)","(like, 57.137968393386735)","(that, 73.01505717147099)","(that, 73.01505717147099)"
8,"(for, 3677)","(file, 730)","(that, 144.01545732916136)","(ha, 56.59160730937827)","(for, 68.08421571530532)","(for, 68.08421571530532)"
9,"(be, 2998)","(like, 711)","(you, 117.57659979606518)","(chip, 44.84136837996277)","(you, 66.66827307818957)","(you, 66.66827307818957)"


In [77]:
df_test_stem

Unnamed: 0_level_0,Count,Count,TF,TF,TF-IDF,TF-IDF
Unnamed: 0_level_1,Без стоп-слов,С стоп-словами,Без стоп-слов,С стоп-словами,Без стоп-слов,С стоп-словами
0,"(the, 9063)","(thi, 1472)","(the, 344.5586615001451)","(thi, 97.49594840488756)","(the, 133.4538752492904)","(thi, 97.49594840488756)"
1,"(to, 5360)","(use, 1097)","(to, 211.00830323462205)","(use, 71.55533662918268)","(to, 85.99632390188842)","(use, 71.55533662918268)"
2,"(of, 4137)","(imag, 998)","(of, 150.4948643044924)","(ani, 44.25521347634618)","(of, 65.18554426844001)","(ani, 44.25521347634618)"
3,"(and, 4073)","(file, 615)","(and, 137.86770734884337)","(know, 43.06179116896538)","(and, 59.63534071325143)","(know, 43.06179116896538)"
4,"(is, 3139)","(jpeg, 531)","(is, 121.90454220930677)","(wa, 39.53046546637826)","(it, 55.94816487314925)","(wa, 39.53046546637826)"
5,"(in, 2612)","(wa, 510)","(it, 114.00279976905662)","(like, 36.7672996472524)","(is, 55.94487331494873)","(like, 36.7672996472524)"
6,"(it, 2562)","(ani, 505)","(in, 102.13673946564619)","(ha, 34.4237897180173)","(that, 48.31727363290116)","(ha, 34.4237897180173)"
7,"(for, 2362)","(program, 497)","(that, 96.8675995748726)","(doe, 34.19333412527219)","(in, 47.30170083345097)","(doe, 34.19333412527219)"
8,"(that, 2237)","(ha, 479)","(for, 90.52244749715135)","(just, 29.513176624612083)","(you, 45.68450447600251)","(just, 29.513176624612083)"
9,"(you, 2086)","(edu, 468)","(you, 79.67798552554372)","(thank, 28.549402258522097)","(for, 43.47570804261264)","(thank, 28.549402258522097)"


In [78]:
writer = pd.ExcelWriter('./../output/2lab_result.xlsx', engine='openpyxl')

df_train.to_excel(writer, sheet_name='Train, wo stem')
df_test.to_excel(writer, sheet_name='Test, wo stem')
df_train_stem.to_excel(writer, sheet_name='Train, with stem')
df_test_stem.to_excel(writer, sheet_name='Test, with stem')

writer.close()

8. Конвеер

In [79]:
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

parameters = {
    'vect__max_features': (500, 1000, 2500, 5000, 10000, None),
    'vect__stop_words': ('english', None),
    'tfidf__use_idf': (True, False),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

grid_search.fit(train_bunch.data, train_bunch.target)

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
grid_search.best_params_

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best score: 0.893
Best parameters set:


{'tfidf__use_idf': True,
 'vect__max_features': 5000,
 'vect__stop_words': 'english'}

In [80]:
print(classification_report(grid_search.predict(train_bunch.data), train_bunch.target))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95       593
           1       0.96      0.92      0.94       621
           2       0.92      0.98      0.95       556

    accuracy                           0.95      1770
   macro avg       0.95      0.95      0.95      1770
weighted avg       0.95      0.95      0.95      1770


In [81]:
print(classification_report(grid_search.predict(test_bunch.data), test_bunch.target))

              precision    recall  f1-score   support

           0       0.89      0.82      0.85       424
           1       0.88      0.79      0.83       441
           2       0.70      0.88      0.78       313

    accuracy                           0.82      1178
   macro avg       0.82      0.83      0.82      1178
weighted avg       0.84      0.82      0.82      1178


Пайплайн для стемминга

In [82]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

parameters = {
    'vect__max_features': (500, 1000, 2500, 5000, 10000, None),
    'vect__stop_words': ('english', None),
    'tfidf__use_idf': (True, False),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

grid_search.fit(train_tokenized, train_bunch.target)

print("Best score: %0.3f" % grid_search.best_score_)
grid_search.best_params_

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best score: 0.886


{'tfidf__use_idf': True,
 'vect__max_features': 5000,
 'vect__stop_words': 'english'}

In [83]:
print(classification_report(grid_search.predict(train_tokenized), train_bunch.target))

              precision    recall  f1-score   support

           0       0.96      0.95      0.95       585
           1       0.97      0.92      0.94       629
           2       0.92      0.98      0.95       556

    accuracy                           0.95      1770
   macro avg       0.95      0.95      0.95      1770
weighted avg       0.95      0.95      0.95      1770


In [84]:
print(classification_report(grid_search.predict(test_tokenized), test_bunch.target))

              precision    recall  f1-score   support

           0       0.89      0.82      0.85       421
           1       0.90      0.78      0.84       455
           2       0.69      0.90      0.78       302

    accuracy                           0.83      1178
   macro avg       0.83      0.83      0.82      1178
weighted avg       0.84      0.83      0.83      1178
