In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from pprint import pprint

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans

import tensorflow as tf
from models.DenoisingAutoencoder import MaskingNoiseAutoencoder

# 0. Data Loading and processing
앞선 실험에서 TfidfVectorizer를 사용하는 것이 근소하게 성능이 더 좋았기 때문에, 여기서는 TfidfVectorizer만 사용하였음

In [2]:
# Load training set and test set
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'),
                                      categories=categories)
X = newsgroups.data
Y = newsgroups.target

In [3]:
# Declare two vectorizers
# count_vectorizer = CountVectorizer(min_df=40)
tfidf_vectorizer = TfidfVectorizer(min_df=40)

In [4]:
# Fitting vectorizers to the training set
# count_vectorizer = count_vectorizer.fit(X)
tfidf_vectorizer = tfidf_vectorizer.fit(X)

In [5]:
# Transform X_train and X_test using 2 vectorizers
# X_count = count_vectorizer.transform(X)
X_tfidf = tfidf_vectorizer.transform(X)

In [6]:
# Convert sparse matrix into dense matrix
X = X_tfidf.toarray()

In [7]:
n_samples = Y.shape[0]
print("Number of training points: ", n_samples)

Number of training points:  3387


In [8]:
dim_X = X.shape[1]
print("Dimension of X: %d" % dim_X)

Dimension of X: 1315


In [9]:
labels = np.unique(Y)
print("Labels: ", labels)

Labels:  [0 1 2 3]


# 1. k-means clustering with TF-IDF values

In [10]:
n_clusters_set = [3, 4, 5, 6]
names = []
models = []
results = []
silhouette_scores = []
mutual_scores = []
for n_clusters in n_clusters_set:
    # Add model name
    names.append('KMeans_k=%d' % n_clusters)
    # Call model
    model = KMeans(n_clusters=n_clusters, n_init=1, max_iter=100, verbose=1)
    # Fit the model
    model.fit(X)
    # Get cluster IDs
    result = model.predict(X)
    # Save model and result
    models.append(model)
    results.append(result)
    # Calculate silhouette score
    silhouette_scores.append(metrics.silhouette_score(X, result, metric = 'euclidean'))
    # Calculate mutual_information
    mutual_scores.append(metrics.adjusted_mutual_info_score(Y, result))

Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 2842.41331031
start iteration
done sorting
end inner loop
Iteration 1, inertia 2793.4469902
start iteration
done sorting
end inner loop
Iteration 2, inertia 2779.92266914
start iteration
done sorting
end inner loop
Iteration 3, inertia 2776.57731093
start iteration
done sorting
end inner loop
Iteration 4, inertia 2775.48429994
start iteration
done sorting
end inner loop
Iteration 5, inertia 2775.07344243
start iteration
done sorting
end inner loop
Iteration 6, inertia 2774.9071878
start iteration
done sorting
end inner loop
Iteration 7, inertia 2774.79891655
start iteration
done sorting
end inner loop
Iteration 8, inertia 2774.75866898
start iteration
done sorting
end inner loop
Iteration 9, inertia 2774.73974053
start iteration
done sorting
end inner loop
Iteration 10, inertia 2774.72690887
start iteration
done sorting
end inner loop
Iteration 11, inertia 2774.71435538
start iteration
done sorting

In [11]:
silhouette_scores = pd.Series(silhouette_scores, index = names)
mutual_scores = pd.Series(mutual_scores, index = names)

In [12]:
print(silhouette_scores)

KMeans_k=3    0.005900
KMeans_k=4    0.007762
KMeans_k=5    0.009138
KMeans_k=6   -0.004803
dtype: float64


In [13]:
print(mutual_scores)

KMeans_k=3    0.069342
KMeans_k=4    0.145373
KMeans_k=5    0.139332
KMeans_k=6    0.114837
dtype: float64


In [14]:
cm = metrics.confusion_matrix(Y, results[1])
print(cm)

[[271 361   8 159]
 [284  83 469 137]
 [374 139  51 423]
 [239 217   6 166]]


# 2. k-means clustering with embedded vectors by denoising autoencoder

In [15]:
# Parameters
transfer_function = tf.nn.sigmoid # tf.nn.relu, tf.nn.softplus, tf.nn.sigmoid, tf.nn.tanh
dropout_probability = 0.9
training_epochs = 100
batch_size = 32
display_step = 1

In [16]:
autoencoder = MaskingNoiseAutoencoder(n_input=dim_X,
                                      n_hidden=200,
                                      transfer_function=transfer_function,
                                      optimizer=tf.train.AdamOptimizer(learning_rate = 0.001),
                                      dropout_probability=dropout_probability,
                                      tied_weights=True)

In [17]:
def get_random_block_from_data(data, batch_size):
    start_index = np.random.randint(0, len(data) - batch_size)
    return data[start_index:(start_index + batch_size)]

In [18]:
for epoch in range(training_epochs):
    avg_cost = 0.
    total_batch = int(n_samples / batch_size)
    for i in range(total_batch):
        batch_xs = get_random_block_from_data(X, batch_size)
        
        # Fit training using batch data
        cost = autoencoder.partial_fit(batch_xs)
        # Compute average loss
        avg_cost += cost / n_samples * batch_size

    # Display logs per epoch step
    if epoch % display_step == 0:
        print("Epoch:", '%04d' % (epoch + 1), "\ttraining_cost=", "{:.9f}".format(avg_cost))

Epoch: 0001 	training_cost= 520.134605423
Epoch: 0002 	training_cost= 36.746153380
Epoch: 0003 	training_cost= 24.014486272
Epoch: 0004 	training_cost= 19.450886125
Epoch: 0005 	training_cost= 17.167007820
Epoch: 0006 	training_cost= 16.048736644
Epoch: 0007 	training_cost= 15.316067241
Epoch: 0008 	training_cost= 14.680691751
Epoch: 0009 	training_cost= 14.474720300
Epoch: 0010 	training_cost= 14.256787922
Epoch: 0011 	training_cost= 13.961486609
Epoch: 0012 	training_cost= 13.920381675
Epoch: 0013 	training_cost= 13.798801230
Epoch: 0014 	training_cost= 13.716955527
Epoch: 0015 	training_cost= 13.585108254
Epoch: 0016 	training_cost= 13.518325071
Epoch: 0017 	training_cost= 13.463409000
Epoch: 0018 	training_cost= 13.472438068
Epoch: 0019 	training_cost= 13.394249681
Epoch: 0020 	training_cost= 13.316564478
Epoch: 0021 	training_cost= 13.352491948
Epoch: 0022 	training_cost= 13.290844431
Epoch: 0023 	training_cost= 13.237605619
Epoch: 0024 	training_cost= 13.354030964
Epoch: 0025 	tr

NameError: name 'X_test' is not defined

In [20]:
X_embedded = autoencoder.transform(X)

In [22]:
X_embedded.shape

(3387, 200)

In [23]:
n_clusters_set = [3, 4, 5, 6]
names = []
models = []
results = []
silhouette_scores = []
mutual_scores = []
for n_clusters in n_clusters_set:
    # Add model name
    names.append('KMeans_k=%d' % n_clusters)
    # Call model
    model = KMeans(n_clusters=n_clusters, n_init=1, max_iter=100, verbose=1)
    # Fit the model
    model.fit(X_embedded)
    # Get cluster IDs
    result = model.predict(X_embedded)
    # Save model and result
    models.append(model)
    results.append(result)
    # Calculate silhouette score
    silhouette_scores.append(metrics.silhouette_score(X, result, metric = 'euclidean'))
    # Calculate mutual_information
    mutual_scores.append(metrics.adjusted_mutual_info_score(Y, result))

Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 1270.73
start iteration
done sorting
end inner loop
Iteration 1, inertia 1251.9
start iteration
done sorting
end inner loop
Iteration 2, inertia 1247.92
start iteration
done sorting
end inner loop
Iteration 3, inertia 1246.55
start iteration
done sorting
end inner loop
Iteration 4, inertia 1246.22
start iteration
done sorting
end inner loop
Iteration 5, inertia 1246.14
start iteration
done sorting
end inner loop
Iteration 6, inertia 1246.1
start iteration
done sorting
end inner loop
Iteration 7, inertia 1246.1
start iteration
done sorting
end inner loop
Iteration 8, inertia 1246.1
start iteration
done sorting
end inner loop
Iteration 9, inertia 1246.1
start iteration
done sorting
end inner loop
Iteration 10, inertia 1246.09
start iteration
done sorting
end inner loop
Iteration 11, inertia 1246.09
start iteration
done sorting
end inner loop
Iteration 12, inertia 1246.09
start iteration
done sorting


In [24]:
silhouette_scores = pd.Series(silhouette_scores, index = names)
mutual_scores = pd.Series(mutual_scores, index = names)

In [26]:
print(silhouette_scores)

KMeans_k=3    0.016752
KMeans_k=4    0.000819
KMeans_k=5   -0.002337
KMeans_k=6   -0.019446
dtype: float64


In [27]:
print(mutual_scores)

KMeans_k=3    0.135494
KMeans_k=4    0.137630
KMeans_k=5    0.153293
KMeans_k=6    0.146659
dtype: float64


In [29]:
cm = metrics.confusion_matrix(Y, results[2])
print(cm)

[[329   5 176 200  89]
 [ 30 472 155  89 227]
 [ 96  56 193 204 438]
 [211   6 157 159  95]
 [  0   0   0   0   0]]
