In [29]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import scipy as sp
import nltk
import nltk.stem
import numpy as np

In [2]:
english_stemmer = nltk.stem.SnowballStemmer('english')
# class StemmedCountVectorizer(CountVectorizer):
#     def build_analyzer(self):
#         analyzer = super(StemmedCountVectorizer, self).build_analyzer()
#         return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
# vectorizer1 = StemmedCountVectorizer(min_df=1, stop_words='english')
# vectorizer1 = CountVectorizer(min_df=1, stop_words='english', preprocessor=english_stemmer.stem)

In [3]:
class StemmedTfidfVectorizer(TfidfVectorizer):
    def bulid_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).bulid_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

vectorizer1 = StemmedTfidfVectorizer(min_df=1, stop_words='english', decode_error='ignore')
# vectorizer1 = TfidfVectorizer(min_df=1, stop_words='english', preprocessor=english_stemmer.stem) 

In [4]:
# DIR = r'D:\Machine_Learning_kuni\BuildingMachineLearningSystemsWithPython-master\ch03\data\toy'
DIR = r'C:\Users\kuni\Desktop\titanic\BuildingMachineLearningSystemsWithPython-master\ch03\data\toy'
files = os.listdir(DIR)
contents = []
for file in files:
    with open(os.path.join(DIR, file), 'r') as f:
        content = f.read()
        contents.append(content)
X_train = vectorizer1.fit_transform(contents)
vectorizer1.get_feature_names()

['actually',
 'capabilities',
 'contains',
 'data',
 'databases',
 'images',
 'imaging',
 'interesting',
 'learning',
 'machine',
 'permanently',
 'post',
 'provide',
 'save',
 'storage',
 'store',
 'stuff',
 'toy']

In [5]:
new_post = 'imaging databases'
new_post_vec = vectorizer1.transform([new_post])

In [6]:
def dist_raw(v1, v2):
    v1_normalized = v1/sp.linalg.norm(v1.toarray())
    v2_normolized = v2/sp.linalg.norm(v2.toarray())
    delta = v1_normalized-v2_normolized
    return sp.linalg.norm(delta.toarray())

In [7]:
import sys
best_doc = None
best_dist = sys.maxsize
best_i = None
for i, post in enumerate(contents):
    if post == new_post:
        continue
    post_vec = X_train.getrow(i)
    d = dist_raw(post_vec, new_post_vec)
    print('===Post %i with dist=%.2f: %s' %(i, d, post))
    if d<best_dist:
        best_dist = d
        best_i = i
        
print("Best post is %i with dist %.2f" %(best_i, best_dist))

===Post 0 with dist=1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
===Post 1 with dist=1.08: Imaging databases provide storage capabilities.
===Post 2 with dist=1.08: Most imaging databases save images permanently.

===Post 3 with dist=0.92: Imaging databases store data.
===Post 4 with dist=0.92: Imaging databases store data. Imaging databases store data. Imaging databases store data.
Best post is 3 with dist 0.92


In [9]:
from sklearn import datasets

In [10]:
all_data = datasets.fetch_20newsgroups(subset='all')

In [20]:
groups = ['comp.graphics', 'comp.os.ms-windows.misc','comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
train_data = datasets.fetch_20newsgroups(subset='train', categories=groups)
test_data = datasets.fetch_20newsgroups(subset='test', categories=groups)
print(len(train_data.filenames), len(test_data.filenames))

3529 2349


In [40]:
vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5, stop_words='english', decode_error='ignore')
# vectorizer = TfidfVectorizer(min_df=1, max_df=0.5, stop_words='english', decode_error='ignore', preprocessor=english_stemmer.stem)
vectorized = vectorizer.fit_transform(train_data.data)
vectorized.shape

(3529, 5651)

In [23]:
num_clusters = 50
from sklearn.cluster import KMeans
km = KMeans(n_clusters=num_clusters, init='random', n_init=1, verbose=1, random_state=3)
km.fit(vectorized)

Initialization complete
Iteration  0, inertia 5975.258
Iteration  1, inertia 3239.926
Iteration  2, inertia 3205.974
Iteration  3, inertia 3188.750
Iteration  4, inertia 3179.113
Iteration  5, inertia 3173.326
Iteration  6, inertia 3169.006
Iteration  7, inertia 3165.893
Iteration  8, inertia 3163.532
Iteration  9, inertia 3161.955
Iteration 10, inertia 3160.855
Iteration 11, inertia 3160.066
Iteration 12, inertia 3159.503
Iteration 13, inertia 3158.896
Iteration 14, inertia 3158.242
Iteration 15, inertia 3157.406
Iteration 16, inertia 3156.724
Iteration 17, inertia 3156.069
Iteration 18, inertia 3154.935
Iteration 19, inertia 3154.300
Iteration 20, inertia 3154.224
Iteration 21, inertia 3154.154
Iteration 22, inertia 3154.136
Iteration 23, inertia 3154.122
Iteration 24, inertia 3154.111
Converged at iteration 24: center shift 0.000000e+00 within tolerance 1.731605e-08


KMeans(algorithm='auto', copy_x=True, init='random', max_iter=300,
    n_clusters=50, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=3, tol=0.0001, verbose=1)

In [43]:
new_post1 = "Disk drive problems. Hi, I have a problem with my hard disk. After 1 year it is working only sporadically now. I tried to format it, but now it doesn't boot any more. Any ideas? Thanks."
new_post_vec = vectorizer.transform([new_post1])
new_post_label = km.predict(new_post_vec)
new_post_label

array([7])

In [51]:
similar_indices = (km.labels_==new_post_label).nonzero()[0]
len(similar_indices)

149

In [57]:
similar = []
for i in similar_indices:
    dist = sp.linalg.norm((new_post_vec - vectorized[i]).toarray())
    similar.append((dist, train_data.data[i]))
similar = sorted(similar)


In [59]:
show_at_1 = similar[0]
show_at_2 = similar[int(len(similar)/10)]
show_at_3 = similar[int(len(similar)/2)]

In [62]:
print(show_at_1, '\n\n\n', show_at_2, '\n\n\n',show_at_3)

(1.0807851743953758, "From: Thomas Dachsel <GERTHD@mvs.sas.com>\nSubject: BOOT PROBLEM with IDE controller\nNntp-Posting-Host: sdcmvs.mvs.sas.com\nOrganization: SAS Institute Inc.\nLines: 25\n\nHi,\nI've got a Multi I/O card (IDE controller + serial/parallel\ninterface) and two floppy drives (5 1/4, 3 1/2) and a\nQuantum ProDrive 80AT connected to it.\nI was able to format the hard disk, but I could not boot from\nit. I can boot from drive A: (which disk drive does not matter)\nbut if I remove the disk from drive A and press the reset switch,\nthe LED of drive A: continues to glow, and the hard disk is\nnot accessed at all.\nI guess this must be a problem of either the Multi I/o card\nor floppy disk drive settings (jumper configuration?)\nDoes someone have any hint what could be the reason for it.\nPlease reply by email to GERTHD@MVS.SAS.COM\nThanks,\nThomas\n+-------------------------------------------------------------------+\n| Thomas Dachsel                                         