In [513]:
import numpy as np
import sklearn
import pandas as pd
import matplotlib.pyplot as plt

# 3. Загрузка обучающей и тестовой выборки

In [514]:
from sklearn.datasets import fetch_20newsgroups

remove = ('headers', 'footers', 'quotes')


def get_train_data(categories):
    if type(categories) is not list:
        categories = [categories]
    return fetch_20newsgroups(subset='train', shuffle=True, random_state=42, categories=categories, remove=remove)


all_categories = ['comp.graphics', 'sci.crypt', 'sci.electronics']
train_bunch = get_train_data(all_categories)
test_bunch = fetch_20newsgroups(subset='test', shuffle=True, random_state=42, categories=all_categories, remove=remove)


def get_sample(bunch, category_idx):
    for idx, target in enumerate(bunch.target):
        if target == category_idx:
            return bunch.data[idx]

# 4. Вывод по одному документа каждого из классов

In [515]:
get_sample(train_bunch, all_categories.index('comp.graphics'))

"Hello, I realize that this might be a FAQ but I have to ask since I don't get a\nchange to read this newsgroup very often.  Anyways for my senior project I need\nto convert an AutoCad file to a TIFF file.  Please I don't need anyone telling\nme that the AutoCAD file is a vector file and the TIFF is a bit map since I\nhave heard that about 100 times already I would just like to know if anyone\nknows how to do this or at least point me to the right direction."

In [516]:
get_sample(train_bunch, all_categories.index('sci.crypt'))

'Looking for PostScript or Tex version of a paper called:\n\t"PUBLIC-KEY CRYPTOGRAPHY"\n\nWritten by:\n\tJames Nechvatal\n\tSecurity Technology Group\n\tNational Computer Systems Laboratory\n\tNational Institute of Standards and Technology\n\tGaithersburg, MD 20899\n\n\tDecember 1990\n\nThe version I obtained is plain text and all symbolic character\nformatting has been lost.\n'

In [517]:
get_sample(train_bunch, all_categories.index('sci.electronics'))

'Just a thought........Maybe it possibly has to do with the fact that it\nIS an Emerson.  I\'ve got an Emerson VCR which is #6 in the series.  Returned\nit six times for various and never the same problems.  Got tired of taking it \nback and fixed it myself.  The Hi-Fi "window" was a bit off.  Something like\nthe Hi-Fi audio fine-tuning.  When I was a Wal-Mart "associate" in \'88-\'89,\nwe had AT LEAST one returned as defective EVERY SINGLE DAY.  How\'s that for\nreliability?  Face it--Emerson can make audio stuff (albeit not of premium\nquality), but they CAN\'T make anything as complex as video equipment with \nreliability IMHO.  Please, no flames.  Just *had* to share my Emerson disaster\nin the light of this exploding tv.  \nJC\n\n\n'

# 5. Выполнение процедуры стемминга

In [518]:
import nltk
from nltk.stem import *
from nltk import word_tokenize

nltk.download('punkt')


def stemminize(documents: list[str]) -> list[str]:
    porter_stemmer = PorterStemmer()
    stem_train = []
    for document in documents:
        nltk_tokens = word_tokenize(document)
        line = ''
        for word in nltk_tokens:
            line += ' ' + porter_stemmer.stem(word)
        stem_train.append(line)
    return stem_train


train_tokenized = stemminize(train_bunch.data)
test_tokenized = stemminize(test_bunch.data)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ruslan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [519]:
train_tokenized

[" hello , i realiz that thi might be a faq but i have to ask sinc i do n't get a chang to read thi newsgroup veri often . anyway for my senior project i need to convert an autocad file to a tiff file . pleas i do n't need anyon tell me that the autocad file is a vector file and the tiff is a bit map sinc i have heard that about 100 time alreadi i would just like to know if anyon know how to do thi or at least point me to the right direct .",
 " just a thought ........ mayb it possibl ha to do with the fact that it is an emerson . i 've got an emerson vcr which is # 6 in the seri . return it six time for variou and never the same problem . got tire of take it back and fix it myself . the hi-fi `` window '' wa a bit off . someth like the hi-fi audio fine-tun . when i wa a wal-mart `` associ '' in '88-'89 , we had at least one return as defect everi singl day . how 's that for reliabl ? face it -- emerson can make audio stuff ( albeit not of premium qualiti ) , but they ca n't make anyth

In [520]:
test_tokenized

[' well , i am place a file at my ftp today that contain sever polygon descript of a head , face , skull , vase , etc . the format of the file is a list of vertic , normal , and triangl . there are variou resolut and the name of the data file includ the number of polygon , eg . phred.1.3k.vbl contain 1300 polygon . in order to get the data via ftp do the follow : 1 ) ftp taurus.cs.nps.navy.mil 2 ) login as anonym , guest as the password 3 ) cd pub/dabro 4 ) binari 5 ) get cyber.tar.z onc you get the data onto your workstat : 1 ) uncompress data.tar.z 2 ) tar xvof data.tar if you have ani question , pleas let me know . georg dabro dabro @ taurus.cs.nps.navy.mil -- georg dabrowski cyberwar lab',
 " tri search for dmorf , i think it 's locat on wuarchive.wustl.edu in a mirror directori ... i 've use it befor , & it wa pretti good !",
 ' not realli . i think it is less than 10 % .',
 " he 's mistaken . they exist , the semiconductor is silicon carbid , and they are ineffici and expens . th

# 6 Векторизация и вывод 20 наиболее частых слов для всей тренировочной выборки без стоп-слов

In [521]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(max_features=10000)
train_data = vect.fit_transform(train_bunch.data)


def get_20_freq_words(vect, data):
    words = list(zip(vect.get_feature_names_out(), np.ravel(data.sum(axis=0))))
    words.sort(key=lambda x: x[1], reverse=True)
    return words[:20]


get_20_freq_words(vect, train_data)

[('the', 16689),
 ('to', 8883),
 ('of', 7021),
 ('and', 6843),
 ('is', 5467),
 ('in', 4416),
 ('it', 3900),
 ('that', 3682),
 ('for', 3677),
 ('you', 2852),
 ('be', 2788),
 ('this', 2585),
 ('on', 2451),
 ('are', 2155),
 ('with', 2111),
 ('or', 2090),
 ('have', 1879),
 ('as', 1784),
 ('can', 1704),
 ('if', 1702)]

# 6 Векторизация и вывод 20 наиболее частых слов для каждого класса тренировочной выборки по отдельности без стоп-слов

In [522]:
for category in all_categories:
    print(f"category={category}")
    bunch = get_train_data(category)
    vect = CountVectorizer(max_features=10000)
    # dtm - Document Term Matrix
    dtm = vect.fit_transform(bunch.data)
    print(get_20_freq_words(vect, dtm))

category=comp.graphics
[('the', 3652), ('to', 2146), ('and', 1961), ('of', 1745), ('is', 1407), ('for', 1259), ('in', 1144), ('it', 1113), ('you', 859), ('that', 771), ('on', 728), ('this', 667), ('or', 601), ('with', 579), ('be', 568), ('can', 525), ('are', 514), ('have', 512), ('if', 498), ('from', 496)]
category=sci.crypt
[('the', 8980), ('to', 4739), ('of', 3888), ('and', 3506), ('is', 2797), ('in', 2232), ('that', 2108), ('it', 1865), ('be', 1655), ('for', 1565), ('this', 1365), ('on', 1150), ('are', 1090), ('you', 1085), ('with', 1010), ('as', 968), ('or', 955), ('not', 918), ('key', 906), ('have', 868)]
category=sci.electronics
[('the', 4057), ('to', 1998), ('of', 1388), ('and', 1376), ('is', 1263), ('in', 1040), ('it', 922), ('you', 908), ('for', 853), ('that', 803), ('on', 573), ('be', 565), ('this', 553), ('are', 551), ('or', 534), ('with', 522), ('have', 499), ('if', 477), ('as', 374), ('not', 371)]


# 6 Векторизация и вывод 20 наиболее частых слов для всей тренировочной выборки со стоп-словами

In [523]:
vect = CountVectorizer(max_features=10000, stop_words='english')
dtm = vect.fit_transform(train_bunch.data)

get_20_freq_words(vect, dtm)

[('key', 937),
 ('use', 932),
 ('like', 642),
 ('don', 592),
 ('db', 562),
 ('edu', 553),
 ('encryption', 552),
 ('data', 547),
 ('know', 542),
 ('just', 533),
 ('chip', 521),
 ('does', 501),
 ('used', 498),
 ('information', 497),
 ('image', 492),
 ('people', 483),
 ('time', 447),
 ('bit', 437),
 ('file', 427),
 ('graphics', 423)]

# 6 Векторизация и вывод 20 наиболее частых слов для каждого класса тренировочной со стоп-словами

In [524]:
for category in all_categories:
    print(f"category={category}")
    bunch = get_train_data(category)
    vect = CountVectorizer(max_features=10000, stop_words='english')
    dtm = vect.fit_transform(bunch.data)
    print(get_20_freq_words(vect, dtm))

category=comp.graphics
[('image', 484), ('graphics', 410), ('edu', 297), ('jpeg', 267), ('file', 265), ('use', 225), ('data', 219), ('files', 217), ('images', 212), ('software', 212), ('program', 199), ('ftp', 189), ('available', 185), ('format', 178), ('color', 174), ('like', 167), ('know', 165), ('pub', 161), ('gif', 160), ('does', 157)]
category=sci.crypt
[('key', 906), ('encryption', 551), ('db', 549), ('use', 448), ('chip', 438), ('government', 404), ('clipper', 387), ('people', 376), ('privacy', 349), ('keys', 340), ('security', 331), ('public', 313), ('information', 303), ('like', 285), ('just', 279), ('don', 271), ('law', 268), ('anonymous', 250), ('data', 246), ('used', 241)]
category=sci.electronics
[('use', 259), ('like', 190), ('power', 168), ('don', 166), ('wire', 163), ('ground', 161), ('used', 160), ('know', 148), ('does', 144), ('good', 142), ('circuit', 139), ('just', 136), ('current', 130), ('need', 120), ('wiring', 116), ('work', 115), ('time', 112), ('ve', 111), ('w

# 6 Векторизация и вывод 20 наиболее частых слов для всей тестовой выборки без стоп-слов

In [525]:
vect = CountVectorizer(max_features=10000)
dtm = vect.fit_transform(test_bunch.data)

get_20_freq_words(vect, dtm)

[('the', 9066),
 ('to', 5360),
 ('of', 4137),
 ('and', 4073),
 ('is', 3074),
 ('in', 2610),
 ('it', 2402),
 ('for', 2362),
 ('that', 2228),
 ('you', 2086),
 ('be', 1535),
 ('this', 1472),
 ('on', 1462),
 ('or', 1295),
 ('with', 1258),
 ('have', 1215),
 ('are', 1186),
 ('if', 1154),
 ('can', 1101),
 ('as', 1026)]

# 6 Векторизация и вывод 20 наиболее частых слов для каждого класса тестовой выборки без стоп-слов

In [526]:
def get_test_data(categories):
    if type(categories) is not list:
        categories = [categories]
    return fetch_20newsgroups(subset='test', shuffle=True, random_state=42, categories=categories, remove=remove)


for category in all_categories:
    print(f"category={category}")
    bunch = get_test_data(category)
    vect = CountVectorizer(max_features=10000)
    dtm = vect.fit_transform(bunch.data)
    print(get_20_freq_words(vect, dtm))

category=comp.graphics
[('the', 3694), ('to', 2376), ('and', 2208), ('of', 1945), ('is', 1505), ('for', 1351), ('in', 1275), ('you', 1053), ('it', 1045), ('that', 770), ('on', 734), ('this', 707), ('or', 681), ('image', 655), ('with', 655), ('be', 642), ('are', 580), ('can', 558), ('from', 547), ('jpeg', 526)]
category=sci.crypt
[('the', 3251), ('to', 1859), ('of', 1402), ('and', 1116), ('that', 915), ('is', 891), ('in', 801), ('it', 801), ('be', 577), ('for', 563), ('you', 553), ('this', 477), ('not', 423), ('on', 420), ('have', 418), ('if', 380), ('or', 359), ('are', 358), ('they', 355), ('with', 338)]
category=sci.electronics
[('the', 2121), ('to', 1125), ('of', 790), ('and', 749), ('is', 678), ('it', 556), ('that', 543), ('in', 534), ('you', 480), ('for', 448), ('be', 316), ('on', 308), ('have', 301), ('this', 288), ('with', 265), ('if', 262), ('or', 255), ('are', 248), ('can', 239), ('but', 236)]


# 6 Векторизация и вывод 20 наиболее частых слов для всей тестовой выборки со стоп-словами

In [527]:
vect = CountVectorizer(max_features=10000, stop_words='english')
dtm = vect.fit_transform(test_bunch.data)

get_20_freq_words(vect, dtm)

[('image', 666),
 ('jpeg', 526),
 ('use', 516),
 ('edu', 468),
 ('graphics', 462),
 ('like', 408),
 ('file', 389),
 ('don', 378),
 ('data', 368),
 ('know', 355),
 ('just', 339),
 ('bit', 337),
 ('available', 325),
 ('software', 324),
 ('images', 307),
 ('program', 298),
 ('does', 291),
 ('time', 282),
 ('used', 272),
 ('ftp', 271)]

# 6 Векторизация и вывод 20 наиболее частых слов для каждого класса тестовой выборки со стоп-словами

In [528]:
for category in all_categories:
    print(f"category={category}")
    bunch = get_test_data(category)
    vect = CountVectorizer(max_features=10000, stop_words='english')
    dtm = vect.fit_transform(bunch.data)
    print(get_20_freq_words(vect, dtm))

category=comp.graphics
[('image', 655), ('jpeg', 526), ('graphics', 456), ('edu', 404), ('file', 366), ('images', 302), ('available', 269), ('format', 262), ('gif', 253), ('data', 249), ('ftp', 248), ('bit', 245), ('software', 245), ('color', 221), ('use', 218), ('files', 215), ('pub', 205), ('program', 197), ('version', 193), ('like', 188)]
category=sci.crypt
[('government', 214), ('key', 176), ('use', 176), ('clipper', 165), ('chip', 151), ('don', 141), ('people', 141), ('encryption', 134), ('like', 127), ('just', 121), ('time', 116), ('know', 113), ('phone', 111), ('think', 111), ('message', 108), ('keys', 100), ('algorithm', 97), ('law', 94), ('security', 93), ('used', 92)]
category=sci.electronics
[('use', 122), ('just', 106), ('know', 100), ('used', 95), ('like', 93), ('don', 89), ('battery', 84), ('does', 74), ('copy', 69), ('time', 69), ('think', 68), ('program', 65), ('need', 63), ('make', 60), ('ve', 60), ('sure', 55), ('power', 53), ('want', 53), ('software', 52), ('radio', 

# 6.d Векторизация и вывод 20 наиболее частых слов для всей тренировочной выборки без стоп-слов с применением стемминга

In [529]:
vect = CountVectorizer(max_features=10000)
dtm = vect.fit_transform(train_tokenized)
print(get_20_freq_words(vect, dtm))

[('the', 16688), ('to', 8883), ('of', 7021), ('and', 6843), ('is', 5549), ('in', 4419), ('it', 4191), ('that', 3692), ('for', 3677), ('be', 2998), ('you', 2852), ('thi', 2585), ('on', 2459), ('are', 2195), ('with', 2111), ('or', 2090), ('use', 2014), ('have', 1997), ('as', 1784), ('not', 1740)]


# 6.d Векторизация и вывод 20 наиболее частых слов для каждого класса тренировочной выборки без стоп-слов с применением стемминга

In [530]:
for category in all_categories:
    print(f"category={category}")
    bunch = get_train_data(category)
    stemminized = stemminize(bunch.data)
    vect = CountVectorizer(max_features=10000)
    dtm = vect.fit_transform(stemminized)
    print(get_20_freq_words(vect, dtm))

category=comp.graphics
[('the', 3651), ('to', 2146), ('and', 1961), ('of', 1745), ('is', 1419), ('for', 1259), ('it', 1200), ('in', 1146), ('you', 859), ('that', 771), ('on', 730), ('imag', 717), ('thi', 668), ('be', 604), ('or', 601), ('with', 579), ('have', 545), ('are', 522), ('use', 509), ('if', 498)]
category=sci.crypt
[('the', 8980), ('to', 4739), ('of', 3888), ('and', 3506), ('is', 2854), ('in', 2232), ('that', 2115), ('it', 2028), ('be', 1786), ('for', 1565), ('thi', 1364), ('key', 1249), ('on', 1154), ('are', 1118), ('you', 1085), ('with', 1010), ('not', 970), ('as', 968), ('use', 958), ('or', 955)]
category=sci.electronics
[('the', 4057), ('to', 1998), ('of', 1388), ('and', 1376), ('is', 1276), ('in', 1041), ('it', 963), ('you', 908), ('for', 853), ('that', 806), ('be', 608), ('on', 575), ('are', 555), ('thi', 553), ('use', 547), ('or', 534), ('have', 531), ('with', 522), ('if', 477), ('do', 401)]


# 6.d Векторизация и вывод 20 наиболее частых слов для всей тестовой выборки со стоп-словами с применением стемминга

In [531]:
vect = CountVectorizer(max_features=10000, stop_words='english')
dtm = vect.fit_transform(test_tokenized)
print(get_20_freq_words(vect, dtm))

[('thi', 1472), ('use', 1097), ('imag', 998), ('file', 615), ('jpeg', 531), ('wa', 510), ('ani', 505), ('program', 497), ('ha', 479), ('edu', 468), ('like', 457), ('bit', 451), ('format', 411), ('know', 401), ('doe', 386), ('data', 369), ('onli', 344), ('work', 344), ('make', 341), ('just', 339)]


# 6.d Векторизация и вывод 20 наиболее частых слов для каждого класса тренировочной выборки cо стоп-словами с применением стемминга

In [532]:
for category in all_categories:
    print(f"category={category}")
    bunch = get_test_data(category)
    stemminized = stemminize(bunch.data)
    vect = CountVectorizer(max_features=10000, stop_words='english')
    dtm = vect.fit_transform(stemminized)
    print(get_20_freq_words(vect, dtm))

category=comp.graphics
[('imag', 979), ('thi', 707), ('file', 580), ('jpeg', 531), ('use', 469), ('edu', 404), ('format', 395), ('program', 366), ('graphic', 321), ('bit', 309), ('color', 287), ('gif', 284), ('avail', 280), ('data', 250), ('ftp', 248), ('ani', 245), ('softwar', 234), ('display', 231), ('comput', 224), ('version', 224)]
category=sci.crypt
[('thi', 477), ('use', 354), ('key', 276), ('govern', 229), ('encrypt', 211), ('wa', 207), ('chip', 194), ('ha', 160), ('clipper', 155), ('ani', 152), ('like', 152), ('phone', 151), ('secur', 145), ('peopl', 140), ('know', 138), ('onli', 136), ('make', 126), ('just', 121), ('law', 121), ('think', 121)]
category=sci.electronics
[('thi', 288), ('use', 274), ('wa', 124), ('batteri', 113), ('ani', 108), ('just', 106), ('know', 106), ('ha', 104), ('like', 103), ('work', 100), ('doe', 99), ('make', 96), ('copi', 94), ('need', 85), ('program', 81), ('time', 75), ('anyon', 73), ('onli', 71), ('think', 71), ('board', 70)]
