In [107]:
import numpy as np

# 3. Загрузка обучающей и тестовой выборки

In [108]:
from sklearn.datasets import fetch_20newsgroups

remove = ('headers', 'footers', 'quotes')


def get_train_data(categories):
    if type(categories) is not list:
        categories = [categories]
    return fetch_20newsgroups(subset='train', shuffle=True, categories=categories, random_state=42, remove=remove)


all_categories = ['comp.graphics', 'sci.crypt', 'sci.electronics']
train_bunch = get_train_data(all_categories)
test_bunch = fetch_20newsgroups(subset='test', shuffle=True, random_state=42, categories=all_categories, remove=remove)


def get_sample(bunch, category_idx):
    for idx, target in enumerate(bunch.target):
        if target == category_idx:
            return bunch.data[idx]

# 4. Вывод по одному документа каждого из классов

In [109]:
get_sample(train_bunch, all_categories.index('comp.graphics'))

"Hello, I realize that this might be a FAQ but I have to ask since I don't get a\nchange to read this newsgroup very often.  Anyways for my senior project I need\nto convert an AutoCad file to a TIFF file.  Please I don't need anyone telling\nme that the AutoCAD file is a vector file and the TIFF is a bit map since I\nhave heard that about 100 times already I would just like to know if anyone\nknows how to do this or at least point me to the right direction."

In [110]:
get_sample(train_bunch, all_categories.index('sci.crypt'))

'Looking for PostScript or Tex version of a paper called:\n\t"PUBLIC-KEY CRYPTOGRAPHY"\n\nWritten by:\n\tJames Nechvatal\n\tSecurity Technology Group\n\tNational Computer Systems Laboratory\n\tNational Institute of Standards and Technology\n\tGaithersburg, MD 20899\n\n\tDecember 1990\n\nThe version I obtained is plain text and all symbolic character\nformatting has been lost.\n'

In [111]:
get_sample(train_bunch, all_categories.index('sci.electronics'))

'Just a thought........Maybe it possibly has to do with the fact that it\nIS an Emerson.  I\'ve got an Emerson VCR which is #6 in the series.  Returned\nit six times for various and never the same problems.  Got tired of taking it \nback and fixed it myself.  The Hi-Fi "window" was a bit off.  Something like\nthe Hi-Fi audio fine-tuning.  When I was a Wal-Mart "associate" in \'88-\'89,\nwe had AT LEAST one returned as defective EVERY SINGLE DAY.  How\'s that for\nreliability?  Face it--Emerson can make audio stuff (albeit not of premium\nquality), but they CAN\'T make anything as complex as video equipment with \nreliability IMHO.  Please, no flames.  Just *had* to share my Emerson disaster\nin the light of this exploding tv.  \nJC\n\n\n'

# 5. Выполнение процедуры стемминга

In [112]:
import nltk
from nltk.stem import *
from nltk import word_tokenize

nltk.download('punkt')


def stemminize(documents: list[str]) -> list[str]:
    porter_stemmer = PorterStemmer()
    stem_train = []
    for document in documents:
        nltk_tokens = word_tokenize(document)
        line = ''
        for word in nltk_tokens:
            line += ' ' + porter_stemmer.stem(word)
        stem_train.append(line)
    return stem_train


train_tokenized = stemminize(train_bunch.data)
test_tokenized = stemminize(test_bunch.data)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ruslan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [113]:
# вывод 3 первых документов обучающих данных
train_tokenized[:3]

[" hello , i realiz that thi might be a faq but i have to ask sinc i do n't get a chang to read thi newsgroup veri often . anyway for my senior project i need to convert an autocad file to a tiff file . pleas i do n't need anyon tell me that the autocad file is a vector file and the tiff is a bit map sinc i have heard that about 100 time alreadi i would just like to know if anyon know how to do thi or at least point me to the right direct .",
 " just a thought ........ mayb it possibl ha to do with the fact that it is an emerson . i 've got an emerson vcr which is # 6 in the seri . return it six time for variou and never the same problem . got tire of take it back and fix it myself . the hi-fi `` window '' wa a bit off . someth like the hi-fi audio fine-tun . when i wa a wal-mart `` associ '' in '88-'89 , we had at least one return as defect everi singl day . how 's that for reliabl ? face it -- emerson can make audio stuff ( albeit not of premium qualiti ) , but they ca n't make anyth

In [114]:
# вывод 3 первых документов тестовых данных
test_tokenized[:3]

[' well , i am place a file at my ftp today that contain sever polygon descript of a head , face , skull , vase , etc . the format of the file is a list of vertic , normal , and triangl . there are variou resolut and the name of the data file includ the number of polygon , eg . phred.1.3k.vbl contain 1300 polygon . in order to get the data via ftp do the follow : 1 ) ftp taurus.cs.nps.navy.mil 2 ) login as anonym , guest as the password 3 ) cd pub/dabro 4 ) binari 5 ) get cyber.tar.z onc you get the data onto your workstat : 1 ) uncompress data.tar.z 2 ) tar xvof data.tar if you have ani question , pleas let me know . georg dabro dabro @ taurus.cs.nps.navy.mil -- georg dabrowski cyberwar lab',
 " tri search for dmorf , i think it 's locat on wuarchive.wustl.edu in a mirror directori ... i 've use it befor , & it wa pretti good !",
 ' not realli . i think it is less than 10 % .']

# 6 Векторизация и вывод 20 наиболее частых слов для всей тренировочной выборки без стоп-слов

In [115]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(max_features=10000)
train_data = vect.fit_transform(train_bunch.data)


def get_20_freq_words(vect, data):
    words = list(zip(vect.get_feature_names_out(), np.ravel(data.sum(axis=0))))
    words.sort(key=lambda x: x[1], reverse=True)
    return words[:20]


get_20_freq_words(vect, train_data)

[('the', 16689),
 ('to', 8883),
 ('of', 7021),
 ('and', 6843),
 ('is', 5467),
 ('in', 4416),
 ('it', 3900),
 ('that', 3682),
 ('for', 3677),
 ('you', 2852),
 ('be', 2788),
 ('this', 2585),
 ('on', 2451),
 ('are', 2155),
 ('with', 2111),
 ('or', 2090),
 ('have', 1879),
 ('as', 1784),
 ('can', 1704),
 ('if', 1702)]

# 6 Векторизация и вывод 20 наиболее частых слов для каждого класса тренировочной выборки по отдельности без стоп-слов

In [116]:
for category in all_categories:
    print(f"category={category}")
    bunch = get_train_data(category)
    vect = CountVectorizer(max_features=10000)
    # dtm - Document Term Matrix
    dtm = vect.fit_transform(bunch.data)
    print(get_20_freq_words(vect, dtm))

category=comp.graphics
[('the', 3652), ('to', 2146), ('and', 1961), ('of', 1745), ('is', 1407), ('for', 1259), ('in', 1144), ('it', 1113), ('you', 859), ('that', 771), ('on', 728), ('this', 667), ('or', 601), ('with', 579), ('be', 568), ('can', 525), ('are', 514), ('have', 512), ('if', 498), ('from', 496)]
category=sci.crypt
[('the', 8980), ('to', 4739), ('of', 3888), ('and', 3506), ('is', 2797), ('in', 2232), ('that', 2108), ('it', 1865), ('be', 1655), ('for', 1565), ('this', 1365), ('on', 1150), ('are', 1090), ('you', 1085), ('with', 1010), ('as', 968), ('or', 955), ('not', 918), ('key', 906), ('have', 868)]
category=sci.electronics
[('the', 4057), ('to', 1998), ('of', 1388), ('and', 1376), ('is', 1263), ('in', 1040), ('it', 922), ('you', 908), ('for', 853), ('that', 803), ('on', 573), ('be', 565), ('this', 553), ('are', 551), ('or', 534), ('with', 522), ('have', 499), ('if', 477), ('as', 374), ('not', 371)]


# 6 Векторизация и вывод 20 наиболее частых слов для всей тренировочной выборки со стоп-словами

In [117]:
vect = CountVectorizer(max_features=10000, stop_words='english')
dtm = vect.fit_transform(train_bunch.data)

get_20_freq_words(vect, dtm)

[('key', 937),
 ('use', 932),
 ('like', 642),
 ('don', 592),
 ('db', 562),
 ('edu', 553),
 ('encryption', 552),
 ('data', 547),
 ('know', 542),
 ('just', 533),
 ('chip', 521),
 ('does', 501),
 ('used', 498),
 ('information', 497),
 ('image', 492),
 ('people', 483),
 ('time', 447),
 ('bit', 437),
 ('file', 427),
 ('graphics', 423)]

# 6 Векторизация и вывод 20 наиболее частых слов для каждого класса тренировочной со стоп-словами

In [118]:
for category in all_categories:
    print(f"category={category}")
    bunch = get_train_data(category)
    vect = CountVectorizer(max_features=10000, stop_words='english')
    dtm = vect.fit_transform(bunch.data)
    print(get_20_freq_words(vect, dtm))

category=comp.graphics
[('image', 484), ('graphics', 410), ('edu', 297), ('jpeg', 267), ('file', 265), ('use', 225), ('data', 219), ('files', 217), ('images', 212), ('software', 212), ('program', 199), ('ftp', 189), ('available', 185), ('format', 178), ('color', 174), ('like', 167), ('know', 165), ('pub', 161), ('gif', 160), ('does', 157)]
category=sci.crypt
[('key', 906), ('encryption', 551), ('db', 549), ('use', 448), ('chip', 438), ('government', 404), ('clipper', 387), ('people', 376), ('privacy', 349), ('keys', 340), ('security', 331), ('public', 313), ('information', 303), ('like', 285), ('just', 279), ('don', 271), ('law', 268), ('anonymous', 250), ('data', 246), ('used', 241)]
category=sci.electronics
[('use', 259), ('like', 190), ('power', 168), ('don', 166), ('wire', 163), ('ground', 161), ('used', 160), ('know', 148), ('does', 144), ('good', 142), ('circuit', 139), ('just', 136), ('current', 130), ('need', 120), ('wiring', 116), ('work', 115), ('time', 112), ('ve', 111), ('w

# 6 Векторизация и вывод 20 наиболее частых слов для всей тестовой выборки без стоп-слов

In [119]:
vect = CountVectorizer(max_features=10000)
dtm = vect.fit_transform(test_bunch.data)

get_20_freq_words(vect, dtm)

[('the', 9066),
 ('to', 5360),
 ('of', 4137),
 ('and', 4073),
 ('is', 3074),
 ('in', 2610),
 ('it', 2402),
 ('for', 2362),
 ('that', 2228),
 ('you', 2086),
 ('be', 1535),
 ('this', 1472),
 ('on', 1462),
 ('or', 1295),
 ('with', 1258),
 ('have', 1215),
 ('are', 1186),
 ('if', 1154),
 ('can', 1101),
 ('as', 1026)]

# 6 Векторизация и вывод 20 наиболее частых слов для каждого класса тестовой выборки без стоп-слов

In [120]:
def get_test_data(categories):
    if type(categories) is not list:
        categories = [categories]
    return fetch_20newsgroups(subset='test', shuffle=True, random_state=42, categories=categories, remove=remove)


for category in all_categories:
    print(f"category={category}")
    bunch = get_test_data(category)
    vect = CountVectorizer(max_features=10000)
    dtm = vect.fit_transform(bunch.data)
    print(get_20_freq_words(vect, dtm))

category=comp.graphics
[('the', 3694), ('to', 2376), ('and', 2208), ('of', 1945), ('is', 1505), ('for', 1351), ('in', 1275), ('you', 1053), ('it', 1045), ('that', 770), ('on', 734), ('this', 707), ('or', 681), ('image', 655), ('with', 655), ('be', 642), ('are', 580), ('can', 558), ('from', 547), ('jpeg', 526)]
category=sci.crypt
[('the', 3251), ('to', 1859), ('of', 1402), ('and', 1116), ('that', 915), ('is', 891), ('in', 801), ('it', 801), ('be', 577), ('for', 563), ('you', 553), ('this', 477), ('not', 423), ('on', 420), ('have', 418), ('if', 380), ('or', 359), ('are', 358), ('they', 355), ('with', 338)]
category=sci.electronics
[('the', 2121), ('to', 1125), ('of', 790), ('and', 749), ('is', 678), ('it', 556), ('that', 543), ('in', 534), ('you', 480), ('for', 448), ('be', 316), ('on', 308), ('have', 301), ('this', 288), ('with', 265), ('if', 262), ('or', 255), ('are', 248), ('can', 239), ('but', 236)]


# 6 Векторизация и вывод 20 наиболее частых слов для всей тестовой выборки со стоп-словами

In [121]:
vect = CountVectorizer(max_features=10000, stop_words='english')
dtm = vect.fit_transform(test_bunch.data)

get_20_freq_words(vect, dtm)

[('image', 666),
 ('jpeg', 526),
 ('use', 516),
 ('edu', 468),
 ('graphics', 462),
 ('like', 408),
 ('file', 389),
 ('don', 378),
 ('data', 368),
 ('know', 355),
 ('just', 339),
 ('bit', 337),
 ('available', 325),
 ('software', 324),
 ('images', 307),
 ('program', 298),
 ('does', 291),
 ('time', 282),
 ('used', 272),
 ('ftp', 271)]

# 6 Векторизация и вывод 20 наиболее частых слов для каждого класса тестовой выборки со стоп-словами

In [122]:
for category in all_categories:
    print(f"category={category}")
    bunch = get_test_data(category)
    vect = CountVectorizer(max_features=10000, stop_words='english')
    dtm = vect.fit_transform(bunch.data)
    print(get_20_freq_words(vect, dtm))

category=comp.graphics
[('image', 655), ('jpeg', 526), ('graphics', 456), ('edu', 404), ('file', 366), ('images', 302), ('available', 269), ('format', 262), ('gif', 253), ('data', 249), ('ftp', 248), ('bit', 245), ('software', 245), ('color', 221), ('use', 218), ('files', 215), ('pub', 205), ('program', 197), ('version', 193), ('like', 188)]
category=sci.crypt
[('government', 214), ('key', 176), ('use', 176), ('clipper', 165), ('chip', 151), ('don', 141), ('people', 141), ('encryption', 134), ('like', 127), ('just', 121), ('time', 116), ('know', 113), ('phone', 111), ('think', 111), ('message', 108), ('keys', 100), ('algorithm', 97), ('law', 94), ('security', 93), ('used', 92)]
category=sci.electronics
[('use', 122), ('just', 106), ('know', 100), ('used', 95), ('like', 93), ('don', 89), ('battery', 84), ('does', 74), ('copy', 69), ('time', 69), ('think', 68), ('program', 65), ('need', 63), ('make', 60), ('ve', 60), ('sure', 55), ('power', 53), ('want', 53), ('software', 52), ('radio', 

# 6.d Векторизация и вывод 20 наиболее частых слов для всей тренировочной выборки без стоп-слов с применением стемминга

In [123]:
vect = CountVectorizer(max_features=10000)
dtm = vect.fit_transform(train_tokenized)
print(get_20_freq_words(vect, dtm))

[('the', 16688), ('to', 8883), ('of', 7021), ('and', 6843), ('is', 5549), ('in', 4419), ('it', 4191), ('that', 3692), ('for', 3677), ('be', 2998), ('you', 2852), ('thi', 2585), ('on', 2459), ('are', 2195), ('with', 2111), ('or', 2090), ('use', 2014), ('have', 1997), ('as', 1784), ('not', 1740)]


# 6.d Векторизация и вывод 20 наиболее частых слов для каждого класса тренировочной выборки без стоп-слов с применением стемминга

In [124]:
for category in all_categories:
    print(f"category={category}")
    bunch = get_train_data(category)
    stemminized = stemminize(bunch.data)
    vect = CountVectorizer(max_features=10000)
    dtm = vect.fit_transform(stemminized)
    print(get_20_freq_words(vect, dtm))

category=comp.graphics
[('the', 3651), ('to', 2146), ('and', 1961), ('of', 1745), ('is', 1419), ('for', 1259), ('it', 1200), ('in', 1146), ('you', 859), ('that', 771), ('on', 730), ('imag', 717), ('thi', 668), ('be', 604), ('or', 601), ('with', 579), ('have', 545), ('are', 522), ('use', 509), ('if', 498)]
category=sci.crypt
[('the', 8980), ('to', 4739), ('of', 3888), ('and', 3506), ('is', 2854), ('in', 2232), ('that', 2115), ('it', 2028), ('be', 1786), ('for', 1565), ('thi', 1364), ('key', 1249), ('on', 1154), ('are', 1118), ('you', 1085), ('with', 1010), ('not', 970), ('as', 968), ('use', 958), ('or', 955)]
category=sci.electronics
[('the', 4057), ('to', 1998), ('of', 1388), ('and', 1376), ('is', 1276), ('in', 1041), ('it', 963), ('you', 908), ('for', 853), ('that', 806), ('be', 608), ('on', 575), ('are', 555), ('thi', 553), ('use', 547), ('or', 534), ('have', 531), ('with', 522), ('if', 477), ('do', 401)]


# 6.d Векторизация и вывод 20 наиболее частых слов для всей тестовой выборки со стоп-словами с применением стемминга

In [125]:
vect = CountVectorizer(max_features=10000, stop_words='english')
dtm = vect.fit_transform(test_tokenized)
print(get_20_freq_words(vect, dtm))

[('thi', 1472), ('use', 1097), ('imag', 998), ('file', 615), ('jpeg', 531), ('wa', 510), ('ani', 505), ('program', 497), ('ha', 479), ('edu', 468), ('like', 457), ('bit', 451), ('format', 411), ('know', 401), ('doe', 386), ('data', 369), ('onli', 344), ('work', 344), ('make', 341), ('just', 339)]


# 6.d Векторизация и вывод 20 наиболее частых слов для каждого класса тренировочной выборки cо стоп-словами с применением стемминга

In [126]:
for category in all_categories:
    print(f"category={category}")
    bunch = get_test_data(category)
    stemminized = stemminize(bunch.data)
    vect = CountVectorizer(max_features=10000, stop_words='english')
    dtm = vect.fit_transform(stemminized)
    print(get_20_freq_words(vect, dtm))

category=comp.graphics
[('imag', 979), ('thi', 707), ('file', 580), ('jpeg', 531), ('use', 469), ('edu', 404), ('format', 395), ('program', 366), ('graphic', 321), ('bit', 309), ('color', 287), ('gif', 284), ('avail', 280), ('data', 250), ('ftp', 248), ('ani', 245), ('softwar', 234), ('display', 231), ('comput', 224), ('version', 224)]
category=sci.crypt
[('thi', 477), ('use', 354), ('key', 276), ('govern', 229), ('encrypt', 211), ('wa', 207), ('chip', 194), ('ha', 160), ('clipper', 155), ('ani', 152), ('like', 152), ('phone', 151), ('secur', 145), ('peopl', 140), ('know', 138), ('onli', 136), ('make', 126), ('just', 121), ('law', 121), ('think', 121)]
category=sci.electronics
[('thi', 288), ('use', 274), ('wa', 124), ('batteri', 113), ('ani', 108), ('just', 106), ('know', 106), ('ha', 104), ('like', 103), ('work', 100), ('doe', 99), ('make', 96), ('copi', 94), ('need', 85), ('program', 81), ('time', 75), ('anyon', 73), ('onli', 71), ('think', 71), ('board', 70)]


# 6. Векторизация и вывод 20 наиболее важных слов для всей обучающей выборки с помощью TfidfTransformer без стоп-слов

In [127]:
from sklearn.feature_extraction.text import TfidfTransformer

vectorizer = CountVectorizer(max_features=10000)
dtm = vectorizer.fit_transform(train_bunch.data)
tfidf = TfidfTransformer(use_idf=True).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

def get_20_freq_words_idf(feature_names, tfidf_values):
    word_weights = dict(zip(feature_names, tfidf_values))
    sorted_words = sorted(word_weights.items(), key=lambda x: x[1], reverse=True)
    for word, weight in sorted_words[:20]:
        print(f"{word}: {weight}")
        
get_20_freq_words_idf(feature_names, tfidf_values)

the: 214.27923011983643
to: 128.50658234239415
of: 101.86203264775209
and: 91.733961272254
is: 83.97727336110019
it: 79.47554322700573
in: 72.70953833589493
that: 72.51290798766651
for: 67.66475343892618
you: 66.41180952122423
be: 55.65313856163747
this: 55.13173979874598
on: 50.56554310512375
have: 48.11707358740201
are: 44.12242787539058
with: 43.883460758799835
if: 43.62474130570933
or: 43.39770046917348
can: 40.89942210826312
as: 40.345366794020045


# 6 Векторизация и вывод 20 наиболее важных слов для каждого класса обучающей выборки с помощью TfidfTransformer без стоп-слов

In [128]:
for category in all_categories:
    print(f"category={category}")
    bunch = get_train_data(category)
    vectorizer = CountVectorizer(max_features=10000)
    dtm = vectorizer.fit_transform(bunch.data)
    tfidf = TfidfTransformer(use_idf=True).fit_transform(dtm)

    feature_names = vectorizer.get_feature_names_out()
    tfidf_values = tfidf.toarray().sum(axis=0)
    print(get_20_freq_words_idf(feature_names, tfidf_values))

category=comp.graphics
the: 56.81603927198729
to: 37.5504252863982
of: 29.95573947448096
and: 29.460733805639126
is: 26.83975596124121
it: 26.306797654818542
for: 25.149437433894523
in: 23.2357918510436
you: 20.673734461421777
that: 20.115182794971254
on: 17.71246888444271
this: 16.797040846507482
have: 16.169172457895662
be: 15.316347479606655
or: 14.892040185365866
can: 14.412387652558904
if: 14.147115541575188
with: 14.130815774612628
any: 13.617128936651824
but: 13.159431390155026
None
category=sci.crypt
the: 99.93764847107565
to: 57.66221081150154
of: 46.1899156380585
and: 38.59505532861134
is: 34.65420251878092
that: 33.583875491055075
it: 31.869516519073688
in: 29.482313408273818
be: 26.4689599195368
you: 24.889063304511872
this: 24.376291130996346
for: 23.58073764592061
they: 20.05091740337681
key: 19.51308036713041
on: 18.340987575708446
not: 18.18277256091116
have: 17.90227365267764
as: 17.80312531534647
are: 17.670592540470366
with: 16.98221081383814
None
category=sci.electr

# 6. Векторизация и вывод 20 наиболее важных слов для всей обучающей выборки с помощью TfidfTransformer со стоп-словами

In [129]:
vectorizer = CountVectorizer(max_features=10000, stop_words='english')
dtm = vectorizer.fit_transform(train_bunch.data)
tfidf = TfidfTransformer(use_idf=True).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

get_20_freq_words_idf(feature_names, tfidf_values)

key: 31.639250020331772
know: 29.275985507770525
use: 28.5909199903978
like: 27.36974112627459
does: 27.32020401912984
don: 25.935560274661245
just: 25.519485103509275
chip: 24.453605471872685
thanks: 24.11088770306822
encryption: 20.612197002323146
good: 20.345796034175873
need: 19.637524126929815
ve: 19.573641488242785
graphics: 19.457067951393185
clipper: 19.278143372470023
people: 18.825771055423083
think: 18.438571829249657
used: 18.158442956203725
government: 17.929344754138906
time: 17.5444284746565


# 6 Векторизация и вывод 20 наиболее важных слов для каждого класса обучающей выборки с помощью TfidfTransformer со стоп-словами

In [130]:
for category in all_categories:
    print(f"category={category}")
    bunch = get_train_data(category)
    vectorizer = CountVectorizer(max_features=10000, stop_words='english')
    dtm = vectorizer.fit_transform(bunch.data)
    tfidf = TfidfTransformer(use_idf=True).fit_transform(dtm)

    feature_names = vectorizer.get_feature_names_out()
    tfidf_values = tfidf.toarray().sum(axis=0)
    print(get_20_freq_words_idf(feature_names, tfidf_values))

category=comp.graphics
graphics: 13.831226949576145
thanks: 12.288044819620865
know: 11.28612771693418
files: 10.102138516852015
image: 10.059196059407899
does: 10.004388990058809
file: 9.554483758125274
program: 9.124242438805213
like: 8.74043723054119
need: 8.575542354404014
use: 8.551942822506788
looking: 8.172372633378805
don: 7.948725929628865
help: 7.715196424156215
windows: 7.512770455768797
just: 7.431313142451506
software: 7.336423321915559
ve: 7.321258597928946
format: 7.273424388627451
hi: 7.226118442773505
None
category=sci.crypt
key: 23.517928447854555
chip: 16.042979038078702
encryption: 16.009767020451324
clipper: 14.966225678343589
government: 13.886159272154273
keys: 12.348914523675896
people: 12.146083918114993
use: 12.100090892961198
just: 11.5722368986071
don: 10.825121722527149
nsa: 9.972883347244524
know: 9.714097923177743
like: 9.674787433122667
does: 9.163514002454846
public: 8.731548348649389
escrow: 8.701773036575155
security: 8.476496973427668
think: 8.448380

# 6. Векторизация и вывод 20 наиболее важных слов для всей обучающей выборки с помощью TfidfTransformer без стоп-слов c приминением стемминга

In [132]:
vectorizer = CountVectorizer(max_features=10000)
dtm = vectorizer.fit_transform(train_tokenized)
tfidf = TfidfTransformer(use_idf=True).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

get_20_freq_words_idf(feature_names, tfidf_values)

the: 214.47110783764484
to: 128.91799924792693
of: 101.9397995673158
and: 91.85258731878157
is: 84.95276457618122
it: 82.95304153471035
in: 73.08110129504044
that: 73.01505717147099
for: 68.08421571530532
you: 66.66827307818957
be: 59.109398696493265
thi: 55.45567314375222
on: 50.853304634787605
have: 50.30994107364992
are: 45.04857754230604
with: 44.02554259499939
if: 43.91731908930183
or: 43.60061441479198
do: 43.39462930056318
use: 42.979583779815044


# 6. Векторизация и вывод 20 наиболее важных слов для каждого класса обучающей выборки с помощью TfidfTransformer без стоп-слов c приминением стемминга

In [133]:
for category in all_categories:
    print(f"category={category}")
    bunch = get_train_data(category)
    stemminized = stemminize(bunch.data)
    vectorizer = CountVectorizer(max_features=10000)
    dtm = vectorizer.fit_transform(stemminized)
    tfidf = TfidfTransformer(use_idf=True).fit_transform(dtm)

    feature_names = vectorizer.get_feature_names_out()
    tfidf_values = tfidf.toarray().sum(axis=0)
    print(get_20_freq_words_idf(feature_names, tfidf_values))

category=comp.graphics
the: 57.84204157650781
to: 38.340151343439004
of: 30.585510691137237
and: 30.045737608648373
it: 28.000454544661952
is: 27.41668278223464
for: 25.715714722335886
in: 23.779693685432704
you: 21.137700831988884
that: 20.533837250563355
on: 18.1092841570932
thi: 17.188796226050815
have: 17.093136948410628
be: 16.302077761169727
do: 15.401428977570154
or: 15.258422708993308
file: 14.653988465757077
if: 14.485327297878731
with: 14.414977019404219
ani: 14.03922847248981
None
category=sci.crypt
the: 100.89753418910733
to: 58.37532802365063
of: 46.615240463294775
and: 38.904579893384145
is: 35.750875349520236
that: 34.1864895509709
it: 33.93805555595851
in: 29.71597401845652
be: 28.398975469667054
key: 26.23362687401613
you: 25.243952438346575
thi: 24.69987466763479
for: 23.866638227781138
they: 20.298227761944975
have: 18.960418087473922
not: 18.883439176419014
are: 18.5760104596733
on: 18.540370636697027
as: 17.944114308415344
do: 17.490319647410548
None
category=sci.e

# 6. Векторизация и вывод 20 наиболее важных слов для всей обучающей выборки с помощью TfidfTransformer со стоп-словами c приминением стемминга

In [134]:
vectorizer = CountVectorizer(max_features=10000, stop_words='english')
dtm = vectorizer.fit_transform(train_bunch.data)
tfidf = TfidfTransformer(use_idf=True).fit_transform(dtm)

feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf.toarray().sum(axis=0)

get_20_freq_words_idf(feature_names, tfidf_values)

key: 31.639250020331772
know: 29.275985507770525
use: 28.5909199903978
like: 27.36974112627459
does: 27.32020401912984
don: 25.935560274661245
just: 25.519485103509275
chip: 24.453605471872685
thanks: 24.11088770306822
encryption: 20.612197002323146
good: 20.345796034175873
need: 19.637524126929815
ve: 19.573641488242785
graphics: 19.457067951393185
clipper: 19.278143372470023
people: 18.825771055423083
think: 18.438571829249657
used: 18.158442956203725
government: 17.929344754138906
time: 17.5444284746565


# 6. Векторизация и вывод 20 наиболее важных слов для каждого класса обучающей выборки с помощью TfidfTransformer со стоп-словами c приминением стемминга

In [135]:
for category in all_categories:
    print(f"category={category}")
    bunch = get_train_data(category)
    stemminized = stemminize(bunch.data)
    vectorizer = CountVectorizer(max_features=10000, stop_words='english')
    dtm = vectorizer.fit_transform(stemminized)
    tfidf = TfidfTransformer(use_idf=True).fit_transform(dtm)

    feature_names = vectorizer.get_feature_names_out()
    tfidf_values = tfidf.toarray().sum(axis=0)
    print(get_20_freq_words_idf(feature_names, tfidf_values))

category=comp.graphics
thi: 19.621420625347906
file: 16.61883229657882
ani: 15.857764195559623
use: 15.719093075820448
thank: 13.746459600916015
imag: 13.744728479290567
know: 12.652182674316517
program: 12.624755324724674
graphic: 12.110970401672128
look: 11.313449104830836
doe: 11.094976683451524
anyon: 10.551388274388568
wa: 10.481184104117553
need: 9.814535800027842
help: 9.561833576753715
format: 9.455752939643002
pleas: 9.026265141642927
point: 9.025544542011133
ha: 8.966125867615792
like: 8.901510963441291
None
category=sci.crypt
key: 31.228292988781
thi: 29.491860750296926
encrypt: 20.8031575119436
use: 20.343046535829462
chip: 19.417298714056788
clipper: 14.752755580075826
govern: 14.217654249151735
secur: 13.447096869841445
ha: 13.34610828917708
wa: 12.629522591032194
peopl: 12.087664658455159
doe: 11.902258153879002
phone: 11.854376646657045
bit: 11.802744216419127
just: 11.549004341432651
ani: 11.497761624127971
like: 11.30424933095508
escrow: 10.750788646668207
know: 10.71