In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


from pprint import pprint
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans

In [2]:
class AdditiveGaussianNoiseAutoencoder(object):
    def __init__(self, n_input, n_hidden, transfer_function=tf.nn.softplus, optimizer=tf.train.AdamOptimizer(),
                 scale=0.1):
        self.n_input = n_input
        self.n_hidden = n_hidden
        self.transfer = transfer_function
        self.scale = tf.placeholder(tf.float32)
        self.training_scale = scale

        network_weights = self._initialize_weights()
        self.weights = network_weights

        # model
        self.x = tf.placeholder(tf.float32, [None, self.n_input])
        self.corrupted = self.x + scale * tf.random_normal((n_input,))
        self.hidden = self.transfer(tf.add(tf.matmul(self.corrupted, self.weights['w1']), self.weights['b1']))
        self.reconstruction = tf.add(tf.matmul(self.hidden, self.weights['w2']), self.weights['b2'])

        # cost
        self.cost = 0.5 * tf.reduce_sum(tf.pow(tf.subtract(self.reconstruction, self.x), 2.0))
        self.optimizer = optimizer.minimize(self.cost)

        init = tf.global_variables_initializer()
        self.sess = tf.Session()
        self.sess.run(init)

    def _initialize_weights(self):
        all_weights = dict()
        all_weights['w1'] = tf.get_variable("w1", shape=[self.n_input, self.n_hidden],
            initializer=tf.contrib.layers.xavier_initializer())
        all_weights['b1'] = tf.Variable(tf.zeros([self.n_hidden], dtype = tf.float32))
        all_weights['w2'] = tf.Variable(tf.zeros([self.n_hidden, self.n_input], dtype = tf.float32))
        all_weights['b2'] = tf.Variable(tf.zeros([self.n_input], dtype = tf.float32))
        return all_weights

    def partial_fit(self, X):
        cost, opt = self.sess.run((self.cost, self.optimizer), feed_dict = {self.x: X,
                                                                            self.scale: self.training_scale
                                                                            })
        return cost

    def calc_total_cost(self, X):
        return self.sess.run(self.cost, feed_dict = {self.x: X,
                                                     self.scale: self.training_scale
                                                     })

    def transform(self, X):
        return self.sess.run(self.hidden, feed_dict = {self.x: X,
                                                       self.scale: self.training_scale
                                                       })

    def generate(self, hidden=None):
        if hidden is None:
            hidden = self.sess.run(tf.random_normal([1, self.n_hidden]))
        return self.sess.run(self.reconstruction, feed_dict = {self.hidden: hidden})

    def reconstruct(self, X):
        return self.sess.run(self.reconstruction, feed_dict = {self.x: X,
                                                               self.scale: self.training_scale
                                                               })
    def corrupt(self, X):
        return self.sess.run(self.corrupted, fedd_dict={self.x:X, self.scale: self.training_scale})
    
    def getWeights(self):
        return self.sess.run(self.weights['w1'])

    def getBiases(self):
        return self.sess.run(self.weights['b1'])


class MaskingNoiseAutoencoder(object):
    def __init__(self, n_input, n_hidden, transfer_function = tf.nn.softplus, optimizer = tf.train.AdamOptimizer(),
                 dropout_probability = 0.95, tied_weights=False):
        self.n_input = n_input
        self.n_hidden = n_hidden
        self.transfer = transfer_function
        self.dropout_probability = dropout_probability
        self.keep_prob = tf.placeholder(tf.float32)
        self.tied_weights = tied_weights

        network_weights = self._initialize_weights(tied=self.tied_weights)
        self.weights = network_weights

        # model
        self.x = tf.placeholder(tf.float32, [None, self.n_input])
        self.corrupted = tf.nn.dropout(self.x, self.keep_prob)
        self.hidden = self.transfer(tf.add(tf.matmul(self.corrupted, self.weights['w1']), self.weights['b1']))
        self.reconstruction = self.transfer(tf.add(tf.matmul(self.hidden, self.weights['w2']), self.weights['b2']))

        # cost
        self.cost = 0.5 * tf.reduce_sum(tf.pow(tf.subtract(self.reconstruction, self.x), 2.0))
        self.optimizer = optimizer.minimize(self.cost)

        init = tf.global_variables_initializer()
        self.sess = tf.Session()
        self.sess.run(init)

    # def _initialize_weights(self, tied):
    #     all_weights = dict()
    #     all_weights['w1'] = tf.get_variable("w1", shape=[self.n_input, self.n_hidden],
    #         initializer=tf.contrib.layers.xavier_initializer())
    #     all_weights['b1'] = tf.Variable(tf.zeros([self.n_hidden], dtype = tf.float32))
    #     if tied == True:
    #         all_weights['w2'] = tf.transpose(all_weights['w1'])
    #     else:
    #         all_weights['w2'] = = tf.Variable(tf.zeros([self.n_hidden, self.n_input], dtype = tf.float32))
    #     all_weights['b2'] = tf.Variable(tf.zeros([self.n_input], dtype = tf.float32))
    #     return all_weights

    def _initialize_weights(self, tied):
        all_weights = dict()
        all_weights['w1'] = tf.get_variable('w1', shape=[self.n_input, self.n_hidden], initializer=tf.contrib.layers.xavier_initializer())
        all_weights['b1'] = tf.get_variable('b1', shape=[self.n_hidden], initializer=tf.constant_initializer(0.0))
        if tied == True:
            all_weights['w2'] = tf.transpose(all_weights['w1'], name='w2')
        else:
            all_weights['w2'] = tf.get_variable('w2', shape=[self.n_hidden, self.n_input], initializer=tf.contrib.layers.xavier_initializer())
        all_weights['b2'] = tf.get_variable('b2', shape=[self.n_input], initializer=tf.constant_initializer(0.0))
        return all_weights

    def partial_fit(self, X):
        cost, opt = self.sess.run((self.cost, self.optimizer),
                                  feed_dict = {self.x: X, self.keep_prob: self.dropout_probability})
        return cost

    def calc_total_cost(self, X):
        return self.sess.run(self.cost, feed_dict = {self.x: X, self.keep_prob: 1.0})

    def transform(self, X):
        return self.sess.run(self.hidden, feed_dict = {self.x: X, self.keep_prob: 1.0})

    def generate(self, hidden=None):
        if hidden is None:
            hidden = self.sess.run(tf.random_normal([1, self.n_hidden]))
        return self.sess.run(self.reconstruction, feed_dict = {self.hidden: hidden})

    def reconstruct(self, X):
        return self.sess.run(self.reconstruction, feed_dict = {self.x: X, self.keep_prob: 1.0})

    def corrupt(self, X):
        return self.sess.run(self.corrupted, feed_dict={self.x: X, self.keep_prob: self.dropout_probability})

    def getWeights(self):
        return self.sess.run(self.weights['w1'])

    def getBiases(self):
        return self.sess.run(self.weights['b1'])

# 0. Data Loading and processing

In [3]:
# Load training set and test set
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'),
                                      categories=categories)
X = newsgroups.data
Y = newsgroups.target

In [4]:
print(X[0:2])
print(Y[0:2])

["My point is that you set up your views as the only way to believe.  Saying \nthat all eveil in this world is caused by atheism is ridiculous and \ncounterproductive to dialogue in this newsgroups.  I see in your posts a \nspirit of condemnation of the atheists in this newsgroup bacause they don'\nt believe exactly as you do.  If you're here to try to convert the atheists \nhere, you're failing miserably.  Who wants to be in position of constantly \ndefending themselves agaist insulting attacks, like you seem to like to do?!\nI'm sorry you're so blind that you didn't get the messgae in the quote, \neveryone else has seemed to.", "\nBy '8 grey level images' you mean 8 items of 1bit images?\nIt does work(!), but it doesn't work if you have more than 1bit\nin your screen and if the screen intensity is non-linear.\n\nWith 2 bit per pixel; there could be 1*c_1 + 4*c_2 timing,\nthis gives 16 levels, but they are linear if screen intensity is\nlinear.\nWith 1*c_1 + 2*c_2 it works, but we hav

In [5]:
# Declare two vectorizers
tfidf_vectorizer = TfidfVectorizer(min_df=40)

In [6]:
# Fitting vectorizers to the training set
tfidf_vectorizer = tfidf_vectorizer.fit(X)

In [7]:
X_tfidf = tfidf_vectorizer.transform(X)

In [8]:
# Convert sparse matrix into dense matrix
X = X_tfidf.toarray()

In [9]:
n_samples = Y.shape[0]
print("Number of training points: ", n_samples)

Number of training points:  3387


In [10]:
dim_X = X.shape[1]
print("Dimension of X: %d" % dim_X)

Dimension of X: 1315


In [11]:
labels = np.unique(Y)
print("Labels: ", labels)

Labels:  [0 1 2 3]


# 1. k-means clustering with TF-IDF values

In [12]:
n_clusters_set = [3, 4, 5, 6]
names = []
models = []
results = []
silhouette_scores = []
mutual_scores = []
for n_clusters in n_clusters_set:
    # Add model name
    names.append('KMeans_k=%d' % n_clusters)
    # Call model
    model = KMeans(n_clusters=n_clusters, n_init=1, max_iter=20, verbose=1)
    # Fit the model
    model.fit(X)
    # Get cluster IDs
    result = model.predict(X)
    # Save model and result
    models.append(model)
    results.append(result)
    # Calculate silhouette score
    silhouette_scores.append(metrics.silhouette_score(X, result, metric = 'euclidean'))
    # Calculate mutual_information
    mutual_scores.append(metrics.adjusted_mutual_info_score(Y, result))

Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 2850.04896662
start iteration
done sorting
end inner loop
Iteration 1, inertia 2796.13340262
start iteration
done sorting
end inner loop
Iteration 2, inertia 2783.29582061
start iteration
done sorting
end inner loop
Iteration 3, inertia 2780.18840161
start iteration
done sorting
end inner loop
Iteration 4, inertia 2778.91232123
start iteration
done sorting
end inner loop
Iteration 5, inertia 2778.44027739
start iteration
done sorting
end inner loop
Iteration 6, inertia 2778.2270779
start iteration
done sorting
end inner loop
Iteration 7, inertia 2778.10271669
start iteration
done sorting
end inner loop
Iteration 8, inertia 2778.01092997
start iteration
done sorting
end inner loop
Iteration 9, inertia 2777.95144568
start iteration
done sorting
end inner loop
Iteration 10, inertia 2777.91952795
start iteration
done sorting
end inner loop
Iteration 11, inertia 2777.90395194
start iteration
done sortin

## valuate

In [13]:
silhouette_scores = pd.Series(silhouette_scores, index = names)
mutual_scores = pd.Series(mutual_scores, index = names)

In [14]:
print(silhouette_scores)

KMeans_k=3    0.021667
KMeans_k=4    0.004008
KMeans_k=5    0.000668
KMeans_k=6   -0.006071
dtype: float64


In [15]:
print(mutual_scores)

KMeans_k=3    0.147743
KMeans_k=4    0.001344
KMeans_k=5    0.152440
KMeans_k=6    0.080803
dtype: float64


In [16]:
cm = metrics.confusion_matrix(Y, results[1])
print(cm)

[[  0 799   0   0]
 [  0 971   2   0]
 [  1 980   0   6]
 [  0 628   0   0]]


# 2. k-means clustering with embedded vectors by denoising autoencoder

In [17]:
# Parameters
transfer_function = tf.nn.sigmoid # tf.nn.relu, tf.nn.softplus, tf.nn.sigmoid, tf.nn.tanh
dropout_probability = 0.9
training_epochs = 100
batch_size = 32
display_step = 1

In [18]:
autoencoder = MaskingNoiseAutoencoder(n_input=dim_X,
                                      n_hidden=200,
                                      transfer_function=transfer_function,
                                      optimizer=tf.train.AdamOptimizer(learning_rate = 0.001),
                                      dropout_probability=dropout_probability,
                                      tied_weights=True)

In [19]:
def get_random_block_from_data(data, batch_size):
    start_index = np.random.randint(0, len(data) - batch_size)
    return data[start_index:(start_index + batch_size)]

In [20]:
for epoch in range(training_epochs):
    avg_cost = 0.
    total_batch = int(n_samples / batch_size)
    for i in range(total_batch):
        batch_xs = get_random_block_from_data(X, batch_size)
        
        # Fit training using batch data
        cost = autoencoder.partial_fit(batch_xs)
        # Compute average loss
        avg_cost += cost / n_samples * batch_size

    # Display logs per epoch step
    if epoch % display_step == 0:
        print("Epoch:", '%04d' % (epoch + 1), "\ttraining_cost=", "{:.9f}".format(avg_cost))

Epoch: 0001 	training_cost= 520.729054904
Epoch: 0002 	training_cost= 36.889715257
Epoch: 0003 	training_cost= 24.029514526
Epoch: 0004 	training_cost= 19.470330553
Epoch: 0005 	training_cost= 17.388771023
Epoch: 0006 	training_cost= 16.012469309
Epoch: 0007 	training_cost= 15.232570589
Epoch: 0008 	training_cost= 14.729335518
Epoch: 0009 	training_cost= 14.487824892
Epoch: 0010 	training_cost= 14.110947101
Epoch: 0011 	training_cost= 13.885247031
Epoch: 0012 	training_cost= 13.883394181
Epoch: 0013 	training_cost= 13.668537096
Epoch: 0014 	training_cost= 13.661440836
Epoch: 0015 	training_cost= 13.603842434
Epoch: 0016 	training_cost= 13.438031909
Epoch: 0017 	training_cost= 13.475882076
Epoch: 0018 	training_cost= 13.477259647
Epoch: 0019 	training_cost= 13.359601499
Epoch: 0020 	training_cost= 13.422468728
Epoch: 0021 	training_cost= 13.296928476
Epoch: 0022 	training_cost= 13.284622792
Epoch: 0023 	training_cost= 13.268628274
Epoch: 0024 	training_cost= 13.235987602
Epoch: 0025 	tr

In [21]:
X_embedded = autoencoder.transform(X)

In [22]:
X_embedded.shape

(3387, 200)

In [23]:
n_clusters_set = [3, 4, 5, 6]
names = []
models = []
results = []
silhouette_scores = []
mutual_scores = []
for n_clusters in n_clusters_set:
    # Add model name
    names.append('KMeans_k=%d' % n_clusters)
    # Call model
    model = KMeans(n_clusters=n_clusters, n_init=1, max_iter=100, verbose=1)
    # Fit the model
    model.fit(X_embedded)
    # Get cluster IDs
    result = model.predict(X_embedded)
    # Save model and result
    models.append(model)
    results.append(result)
    # Calculate silhouette score
    silhouette_scores.append(metrics.silhouette_score(X, result, metric = 'euclidean'))
    # Calculate mutual_information
    mutual_scores.append(metrics.adjusted_mutual_info_score(Y, result))

Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 1402.47
start iteration
done sorting
end inner loop
Iteration 1, inertia 1289.32
start iteration
done sorting
end inner loop
Iteration 2, inertia 1252.49
start iteration
done sorting
end inner loop
Iteration 3, inertia 1245.06
start iteration
done sorting
end inner loop
Iteration 4, inertia 1242.96
start iteration
done sorting
end inner loop
Iteration 5, inertia 1241.65
start iteration
done sorting
end inner loop
Iteration 6, inertia 1241.16
start iteration
done sorting
end inner loop
Iteration 7, inertia 1241.0
start iteration
done sorting
end inner loop
Iteration 8, inertia 1240.95
start iteration
done sorting
end inner loop
Iteration 9, inertia 1240.92
start iteration
done sorting
end inner loop
Iteration 10, inertia 1240.92
start iteration
done sorting
end inner loop
Iteration 11, inertia 1240.92
center shift 0.000000e+00 within tolerance 2.750360e-07
Initialization complete
start iteration
don

## valuate

In [24]:
silhouette_scores = pd.Series(silhouette_scores, index = names)
mutual_scores = pd.Series(mutual_scores, index = names)

In [25]:
print(silhouette_scores)

KMeans_k=3    0.016379
KMeans_k=4    0.003104
KMeans_k=5   -0.005006
KMeans_k=6   -0.036257
dtype: float64


In [26]:
print(mutual_scores)

KMeans_k=3    0.134068
KMeans_k=4    0.200064
KMeans_k=5    0.163654
KMeans_k=6    0.146883
dtype: float64


In [27]:
cm = metrics.confusion_matrix(Y, results[2])
print(cm)

[[ 11 165   5 340 278]
 [142 143 485 181  22]
 [194 184  66 448  95]
 [  9 145   8 257 209]
 [  0   0   0   0   0]]
