## 테스트 데이터 사용

In [45]:
import sklearn.datasets
import scipy as sp

# Download the complete 20 Newsgroups corpus (subset="all").
all_data = sklearn.datasets.fetch_20newsgroups(subset="all")
print("Number of total posts: %i" % len(all_data.filenames))
# Number of total posts: 18846

# Build the training set from a restricted list of technical newsgroups.
groups = [
    'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
train_data = sklearn.datasets.fetch_20newsgroups(subset="train",
                                                 categories=groups)
# Use %-formatting instead of passing two arguments to print: under the
# Python 2 print statement, print(a, b) prints the tuple (a, b), which does
# not match the expected output noted below.
print("Number of training posts in tech groups: %i"
      % len(train_data.filenames))
# Number of training posts in tech groups: 3529

# Number of clusters handed to KMeans further down.
num_clusters = 50  # alternative: sp.unique(labels).shape[0]

Number of total posts: 18846
('Number of training posts in tech groups:', 3529)


In [46]:
import nltk.stem
from sklearn.feature_extraction.text import TfidfVectorizer

# Module-level English Snowball stemmer used by StemmedTfidfVectorizer.
english_stemmer = nltk.stem.SnowballStemmer('english')


class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer variant that stems every token its analyzer emits."""

    def build_analyzer(self):
        """Return a callable that maps a document to its stemmed tokens.

        The parent analyzer is built first; the returned callable runs it on
        the document and lazily yields ``english_stemmer.stem(token)`` for
        each token it produced.
        """
        # super() must name *this* class. Naming TfidfVectorizer starts the
        # MRO lookup after TfidfVectorizer and would silently bypass any
        # build_analyzer defined there.
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

In [47]:
# Stemming TF-IDF vectorizer with document-frequency bounds, English stop
# words, and undecodable bytes ignored.
vectorizer = StemmedTfidfVectorizer(
    min_df=10,
    max_df=0.5,
    stop_words='english',
    decode_error='ignore',
)

# Fit the vectorizer on the training posts and vectorize them in one step.
vectorized = vectorizer.fit_transform(train_data.data)
num_samples, num_features = vectorized.shape
# The shape is a (rows, columns) pair, so it fills both %d slots directly.
print("#samples: %d, #features: %d" % vectorized.shape)
# samples: 3529, #features: 4712

#samples: 3529, #features: 4712


In [48]:
from sklearn.cluster import KMeans

# Cluster the vectorized training data into num_clusters groups.
km = KMeans(
    n_clusters=num_clusters,
    n_init=1,
    verbose=1,
    random_state=3,
)
clustered = km.fit(vectorized)

# Wrap the array in a 1-tuple so it is formatted as a single %s value.
print("km.labels_=%s" % (km.labels_,))
# km.labels_=[ 6 34 22 ...,  2 21 26]

# labels_.shape is a 1-tuple, so its only element fills the %s slot.
print("km.labels_.shape=%s" % km.labels_.shape)
# km.labels_.shape=3529

Initialization complete
Iteration  0, inertia 5686.053
Iteration  1, inertia 3164.888
Iteration  2, inertia 3132.208
Iteration  3, inertia 3111.713
Iteration  4, inertia 3098.584
Iteration  5, inertia 3092.191
Iteration  6, inertia 3087.277
Iteration  7, inertia 3084.100
Iteration  8, inertia 3082.800
Iteration  9, inertia 3082.234
Iteration 10, inertia 3081.949
Iteration 11, inertia 3081.843
Iteration 12, inertia 3081.791
Iteration 13, inertia 3081.752
Iteration 14, inertia 3081.660
Iteration 15, inertia 3081.617
Iteration 16, inertia 3081.589
Iteration 17, inertia 3081.571
Converged at iteration 17
km.labels_=[48 23 31 ...,  6  2 22]
km.labels_.shape=3529


In [50]:
# Compare the cluster labels assigned by KMeans with the labels of the
# training data.
from sklearn import metrics

# Ground-truth newsgroup index of every training post.
labels = train_data.target

# Agreement between the ground truth (labels) and the KMeans assignment
# (km.labels_).
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand Index: %0.3f" %
      metrics.adjusted_rand_score(labels, km.labels_))
print("Adjusted Mutual Information: %0.3f" %
      metrics.adjusted_mutual_info_score(labels, km.labels_))
# The silhouette coefficient rates a labelling against the feature vectors
# alone, so it must be given the KMeans assignment; passing the ground-truth
# labels would score the newsgroup split instead of the clustering.
# It is computed on a 1000-post sample (sample_size=1000).
print("Silhouette Coefficient: %0.3f" %
      metrics.silhouette_score(vectorized, km.labels_, sample_size=1000))

Homogeneity: 0.445
Completeness: 0.231
V-measure: 0.304
Adjusted Rand Index: 0.094
Adjusted Mutual Information: 0.223
Silhouette Coefficient: 0.006


In [51]:
# Register a new post to look up.
new_post = """Disk drive problems. Hi, I have a problem with my hard disk.
After 1 year it is working only sporadically now.
I tried to format it, but now it doesn't boot any more.
Any ideas? Thanks.
"""

# Vectorize the new post with the already-fitted vectorizer; transform takes
# a list of documents, hence the one-element list.
new_post_vec = vectorizer.transform([new_post])

# Ask KMeans for the cluster of the new post; the single input row yields a
# single prediction, taken with [0].
new_post_label = km.predict(new_post_vec)[0]

print(new_post_vec)
print(new_post_label)

  (0, 4678)	0.234582932587
  (0, 4596)	0.168434406118
  (0, 4246)	0.189649264834
  (0, 4152)	0.155217902356
  (0, 3329)	0.350698953458
  (0, 2210)	0.248333220084
  (0, 2136)	0.233185464275
  (0, 2088)	0.253142237739
  (0, 1864)	0.276864427192
  (0, 1540)	0.228592867093
  (0, 1508)	0.245927256346
  (0, 1481)	0.503689553724
  (0, 878)	0.3321651222
9


In [52]:
# Extract the indices of the training documents that fall in the same
# cluster as the new post.
#   km.labels_      : cluster label of every training document
#   new_post_label  : cluster label predicted for the new post
# ndarray.nonzero() returns a tuple of index arrays for the True entries;
# element [0] is the index array along the first axis.
same_cluster_mask = km.labels_ == new_post_label
similar_indices = same_cluster_mask.nonzero()[0]

In [53]:
# Show the training-set indices that share the new post's cluster, followed
# by how many there are.
print(similar_indices)
print(len(similar_indices))

[  69  152  157  167  201  225  228  233  359  463  479  520  552  580  622
  676  779  882  884  917  939 1114 1253 1286 1486 1531 1752 1806 1809 1986
 2061 2249 2351 2412 2447 2493 2499 2510 2512 2600 2730 2800 2889 3080 3111
 3145 3146 3199 3202 3278 3285 3297 3310 3350 3437 3458]
56


In [54]:
# Import the submodule explicitly: a plain `import scipy` is not guaranteed
# to expose `scipy.linalg`, so relying on `sp.linalg` only works when some
# other package happens to have imported it already.
import scipy.linalg

# Measure the Euclidean distance between the new post and every document in
# the same cluster, then order the (distance, text) pairs from nearest to
# farthest. Each difference is a single sparse row, so densifying it with
# toarray() is cheap.
similar = sorted(
    (scipy.linalg.norm((new_post_vec - vectorized[i]).toarray()),
     train_data.data[i])
    for i in similar_indices
)
print("Count similar: %i" % len(similar))

Count similar: 56


In [55]:
# The most similar document: first entry of the distance-sorted list.
show_at_1 = similar[0]
closest_dist, closest_text = show_at_1

print("=== #1 ===")
print("dist: %f" % closest_dist)
print(closest_text)

=== #1 ===
dist: 1.037844
From: Thomas Dachsel <GERTHD@mvs.sas.com>
Subject: BOOT PROBLEM with IDE controller
Nntp-Posting-Host: sdcmvs.mvs.sas.com
Organization: SAS Institute Inc.
Lines: 25

Hi,
I've got a Multi I/O card (IDE controller + serial/parallel
interface) and two floppy drives (5 1/4, 3 1/2) and a
Quantum ProDrive 80AT connected to it.
I was able to format the hard disk, but I could not boot from
it. I can boot from drive A: (which disk drive does not matter)
but if I remove the disk from drive A and press the reset switch,
the LED of drive A: continues to glow, and the hard disk is
not accessed at all.
I guess this must be a problem of either the Multi I/o card
or floppy disk drive settings (jumper configuration?)
Does someone have any hint what could be the reason for it.
Please reply by email to GERTHD@MVS.SAS.COM
Thanks,
Thomas
+-------------------------------------------------------------------+
| Thomas Dachsel                                                    |
| Int

In [56]:
# The document in the middle of the distance-sorted list.
middle_index = len(similar) // 2
show_at_2 = similar[middle_index]
middle_dist, middle_text = show_at_2

print("=== #2 ===")
print("dist: %f" % middle_dist)
print(middle_text)

=== #2 ===
dist: 1.279396
From: vg@volkmar.Stollmann.DE (Volkmar Grote)
Subject: IBM PS/1 vs TEAC FD
Distribution: world
Organization: Me? Organized?
Lines: 21

Hello,

I already tried our national news group without success.

I tried to replace a friend's original IBM floppy disk in his PS/1-PC
with a normal TEAC drive.
I already identified the power supply on pins 3 (5V) and 6 (12V), shorted
pin 6 (5.25"/3.5" switch) and inserted pullup resistors (2K2) on pins
8, 26, 28, 30, and 34.
The computer doesn't complain about a missing FD, but the FD's light
stays on all the time. The drive spins up o.k. when I insert a disk,
but I can't access it.
The TEAC works fine in a normal PC.

Are there any points I missed?

Thank you.
	Volkmar

---
Volkmar.Grote@Stollmann.DE



In [57]:
# The last document: the farthest entry of the distance-sorted list.
show_at_3 = similar[-1]
farthest_dist, farthest_text = show_at_3

print("=== #3 ===")
print("dist: %f" % farthest_dist)
print(farthest_text)

=== #3 ===
dist: 1.368634
From: bss_brucep@vd.seqeb.gov.au (Bruce Powell)
Subject: Re: ESDI with IDE??? -- Yes it should be Possible,
Organization: South East Queensland Electricity Board
Lines: 27

In article <1993Apr16.033802.6605@monu6.cc.monash.edu.au>, els390r@fawlty1.eng.monash.edu.au (G Chow) writes:
> In article <1qegfd$dqi@wsinis03.info.win.tue.nl> monty@wsinis03.info.win.tue.nl (Guido Leenders) writes:
>>Hi,
>>
>>Is it possible to use an ESDI-controller with HDD together with an
>>IDE-harddisk + controller in one ISA-system?
>>
>>I've read stuff about secondary controllers. Does this trick work?
>>
>>Thanx in advance,
>>
>>Guido
>>monty@win.tue.nl
> 
> I have the same question as Guido. It is possible to use the ESDI drive 
> as a master and the IDE drive as the slave ? 

I can definitily say that you can use an RLL as Master and IDE as slave, as
I have just upgraded my machine with a 200Mb IDE ( And custom Controller
Mdl CI-1010 Extended IDE Controller ) While maintaining my

## 노이즈
### 군집화는 되었으나, 게시물의 카테고리는?

In [58]:
# Print a few posts from the training data, keeping only those in the
# comp.graphics category.

post_group = zip(train_data.data, train_data.target)
# Build (length, text, group name) tuples so that sorting them orders the
# posts by length.
# NOTE: `all` shadows the builtin of the same name; the name is kept because
# the following cell sorts it.
all = [(len(text), text, train_data.target_names[target])
       for text, target in post_group]
graphics = sorted(entry for entry in all if entry[2] == 'comp.graphics')

print(graphics[1:5])

[(162, u'Subject: E-mail of Michael Abrash?\nFrom: gmontem@eis.calstate.edu (George A. Montemayor)\nOrganization: Calif State Univ/Electronic Information Services\nLines: 0\n\n', 'comp.graphics'), (217, u'From:  Valentin E. Vulihman <vulih@ipmce.su>\nSubject: Attractive drawing on the sphere\nLines: 2\nReply-To: vulih@ipmce.su\nOrganization: Inst. of Prec. Mech. & Comp. Equip., Moscow, Russia\n\nsubscribe comp.graphics\nquit\n', 'comp.graphics'), (217, u'From: hwstock@snll-arpagw.llnl.gov (stockman harlan w)\nSubject: hp2xx for DOS\nOrganization: Sandia National Laboratories\nLines: 3\n\n\nIs there a precompiled version of hp2xx for DOS out there - prefereably\nfor 386/486?\n', 'comp.graphics'), (238, u'From: news@magnus.acs.ohio-state.edu\nSubject: Package for Fashion Designer?\nNntp-Posting-Host: bottom.magnus.acs.ohio-state.edu\nOrganization: The Ohio State University\nLines: 1\n\nThis article was probably generated by a buggy news reader.\n', 'comp.graphics')]


In [59]:
# Sort every (length, text, group name) tuple; tuples compare element by
# element, so the posts end up ordered by length first.
# NOTE: `all` here is the list built in the previous cell, not the builtin.
z = sorted(all)
# Show the entries at positions 5 and 6 of that ordering.
print(z[5:7])

[(160, u'From: passman@world.std.com (Shirley L Passman)\nSubject: help with no docs for motherboard\nOrganization: The World Public Access UNIX, Brookline, MA\nLines: 1\n\n\n', 'comp.sys.ibm.pc.hardware'), (162, u'Subject: E-mail of Michael Abrash?\nFrom: gmontem@eis.calstate.edu (George A. Montemayor)\nOrganization: Calif State Univ/Electronic Information Services\nLines: 0\n\n', 'comp.graphics')]


In [60]:
# Take the text of the entry at position 5 of the length-sorted posts and
# list the tokens the vectorizer's analyzer extracts from it.
noise_post = z[5][1]

analyzer = vectorizer.build_analyzer()
print([token for token in analyzer(noise_post)])

[u'passman', u'world', u'std', u'com', u'shirley', u'passman', u'subject', u'help', u'doc', u'motherboard', u'organ', u'world', u'public', u'access', u'unix', u'brooklin', u'ma', u'line']


In [61]:
# Keep only the tokens of the noise post that made it into the fitted
# vocabulary. vocabulary_ maps each retained term to its feature index, so
# its keys are exactly the feature names; using it avoids
# get_feature_names(), which newer scikit-learn releases no longer provide,
# and gives constant-time membership checks.
useful = set(analyzer(noise_post)).intersection(vectorizer.vocabulary_)
print(sorted(useful))

[u'access', u'brooklin', u'com', u'doc', u'help', u'ma', u'motherboard', u'public', u'std', u'unix', u'world']


In [62]:
# Print the IDF weight the fitted vectorizer holds for each retained term.
# vocabulary_[term] is the term's feature index; idf_ is the public
# accessor for the IDF vector, used here instead of reaching into the
# private _tfidf transformer.
for term in sorted(useful):
    term_index = vectorizer.vocabulary_[term]
    print('IDF(%s)=%.2f' % (term, vectorizer.idf_[term_index]))

IDF(access)=3.18
IDF(brooklin)=6.03
IDF(com)=2.03
IDF(doc)=5.22
IDF(help)=2.54
IDF(ma)=4.44
IDF(motherboard)=4.42
IDF(public)=3.93
IDF(std)=5.26
IDF(unix)=3.73
IDF(world)=2.85
