In [None]:
import tensorflow as tf
import numpy as np
import math
%matplotlib inline
from PIL import Image
import cv2
import matplotlib.pyplot as plt
import os
from sklearn.cross_validation import train_test_split
import datetime

# Load dataset
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

# Single seed shared by the dataset shuffle and the train/test split so that
# re-running the notebook reproduces the same partition.
SEED = 42

# Passed to fetch_20newsgroups as `remove`.
remove = ('headers', 'footers', 'quotes')

# Newsgroups are merged into four coarse groups; a document's label is the
# index of its group inside `cats` (comp=0, rec=1, sci=2, talk=3).
comp = ['comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x']

rec = ['rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey']

sci = ['sci.electronics',
 'sci.med',
 'sci.space']

talk = ['talk.politics.guns',
 'talk.politics.mideast']

cats = [comp, rec, sci, talk]

all_data = []
all_labels = []

for idx, cat in enumerate(cats):
    data = fetch_20newsgroups(subset='train', shuffle=True, random_state=SEED,
                              remove=remove, categories=cat)
    # extend() grows the lists in place instead of re-copying them each pass.
    all_data.extend(data.data)
    # Every document fetched for this group gets the group's index as label.
    all_labels.extend([idx] * len(data.target))

# TF-IDF features restricted to the 5000 highest-ranked terms.
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X = vectorizer.fit_transform(all_data)
y = np.array(all_labels)

# Hold out 10% of the documents; seeded so the split is reproducible.
# (train_test_split is imported at the top of this cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=SEED)

print(X_train.shape)
print(X_test.shape)

In [None]:
# Sanity check: show the shape of each split, one per line, in the order
# X_train, y_train, X_test, y_test. Single-argument print(...) produces the
# same output under Python 2 and Python 3.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

In [None]:
# Network Parameters
# NOTE: the four sizes below are not referenced by `dims`, which is what
# actually determines the layer widths in this cell.
n_hidden_1 = 100
n_hidden_2 = 200
n_hidden_3 = 200
encoded_dim = 50

# Width of one input row = number of columns of the training matrix.
n_input = X_train.shape[1]

# Graph input: a float batch with any number of rows, n_input columns each.
# NOTE: this rebinds `X`, which held the TF-IDF matrix in the loading cell.
X = tf.placeholder("float", [None, n_input])

# Layer widths used by the encoder; the decoder walks the same list reversed.
dims = [n_input, 5000, 5000, 5000]

weights = {}
biases = {}

# Create "<prefix>_h<k>" weight matrices and "<prefix>_b<k>" bias vectors for
# each consecutive pair of widths: encoder first, then decoder, with k
# starting at 1. All variables start from random-normal values.
for prefix, widths in (("encoder", dims), ("decoder", dims[::-1])):
    for depth, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:]), 1):
        weights["%s_h%d" % (prefix, depth)] = tf.Variable(
            tf.random_normal([fan_in, fan_out]))
        biases["%s_b%d" % (prefix, depth)] = tf.Variable(
            tf.random_normal([fan_out]))

# Filled by encoder(): the output tensor of every encoder layer, in order.
encoder_ops = []

# Encoder
def encoder(x):
    """Build the encoder stack on top of tensor `x` and return its last layer.

    One sigmoid layer is created per consecutive pair in the module-level
    `dims`, using `weights["encoder_h<k>"]` and `biases["encoder_b<k>"]`
    (k starting at 1). Each layer's output tensor is also appended to the
    module-level `encoder_ops` list, so calling this more than once appends
    further entries to that list.
    """
    inputs = x
    for depth in range(1, len(dims)):
        kernel = weights["encoder_h%d" % depth]
        offset = biases["encoder_b%d" % depth]
        layer = tf.nn.sigmoid(tf.add(tf.matmul(inputs, kernel), offset))
        encoder_ops.append(layer)
        # The next layer consumes this layer's activations.
        inputs = layer
    return layer

# Decoder
def decoder(x):
    """Build the decoder stack on top of tensor `x` and return its last layer.

    One sigmoid layer is created per consecutive pair in the module-level
    `dims`, using `weights["decoder_h<k>"]` and `biases["decoder_b<k>"]`
    (k starting at 1). Unlike the encoder, intermediate layers are not
    recorded anywhere.
    """
    inputs = x
    for depth in range(1, len(dims)):
        kernel = weights["decoder_h%d" % depth]
        offset = biases["decoder_b%d" % depth]
        layer = tf.nn.sigmoid(tf.add(tf.matmul(inputs, kernel), offset))
        # The next layer consumes this layer's activations.
        inputs = layer
    return layer

# Parameters
learning_rate = 0.001
training_epochs = 50
batch_size = 50
display_step = 10

# Chain the two halves: placeholder -> encoder -> decoder.
encoder_op = encoder(X)
decoder_op = decoder(encoder_op)

# The target is the input itself; the prediction is the decoder's output.
y_true = X
y_pred = decoder_op

# Loss: mean of the element-wise squared difference between target and
# prediction, minimised with RMSProp.
squared_error = tf.pow(y_true - y_pred, 2)
cost = tf.reduce_mean(squared_error)
optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(cost)

In [None]:
# Initializing the variables
sess = tf.Session()
# Use the non-deprecated initializer when this TensorFlow build provides it,
# falling back to the older name otherwise.
init_op = getattr(tf, "global_variables_initializer",
                  tf.initialize_all_variables)()
sess.run(init_op)

n_train = X_train.shape[0]

# Training
for epoch_i in range(training_epochs):
    # Walk the training rows in mini-batches. Stepping by start offset (rather
    # than n_train // batch_size whole batches) also feeds the final, possibly
    # shorter, batch, so no training rows are silently skipped.
    for start in range(0, n_train, batch_size):
        # Only the current slice of the sparse matrix is densified.
        batch_xs = X_train[start:start + batch_size].toarray()
        sess.run(optimizer, feed_dict={X: batch_xs})
    if epoch_i % display_step == 0:
        # Progress report: cost evaluated on the last mini-batch of the epoch.
        last_batch_cost = sess.run(cost, feed_dict={X: batch_xs})
        print("%s epoch %d cost %s"
              % (datetime.datetime.now(), epoch_i, last_batch_cost))

print("Optimization Finished!")

In [None]:
batch_size = 50

# all_ravels[b][k] holds encoder layer k's activations for mini-batch b,
# flattened to one row per document.
all_ravels = []

# Stepping by start offset includes the final, possibly shorter, batch, so
# every training row gets a feature vector.
for start in range(0, X_train.shape[0], batch_size):
    batch_xs = X_train[start:start + batch_size].toarray()
    # Fetch all encoder layers in a single run so the forward pass is
    # evaluated once per batch instead of once per layer.
    layers = sess.run(encoder_ops, feed_dict={X: batch_xs})
    all_ravels.append([layer.reshape(len(layer), -1) for layer in layers])

# ravels[k]: layer k's activations for all processed rows, batches stacked in
# their original order.
ravels = [np.vstack(per_layer) for per_layer in zip(*all_ravels)]

# One feature row per document: the activations of every encoder layer
# placed side by side.
combined = np.hstack(ravels)
print(combined.shape)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 3-nearest-neighbour classifier on the stacked encoder activations.
# The labels are sliced to the number of rows in `combined` so both arguments
# have the same length.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(combined, y_train[:combined.shape[0]])
# Timestamp marking when fitting finished.
print(datetime.datetime.now())

In [None]:
# Mean accuracy on `combined` with the matching slice of y_train, i.e. the
# same features/labels passed to knn.fit in the preceding cell, so this is
# accuracy on the fitted rows rather than on held-out data.
print(knn.score(combined, y_train[:combined.shape[0]]))
# Timestamp marking when scoring finished.
print(datetime.datetime.now())

In [None]:
# Predicted labels for every row of `combined`.
# NOTE: this rebinds `y_pred`, which the graph-building cell bound to the
# decoder output tensor.
y_pred = knn.predict(combined)
# Timestamp marking when prediction finished.
print(datetime.datetime.now())

In [None]:
from sklearn.metrics import confusion_matrix
import itertools
import numpy as np

# Display names for the four label groups, in label-index order
# (comp=0, rec=1, sci=2, talk=3).
# No figure is opened here: the plot below is drawn on the figure created
# immediately before plot_confusion_matrix is called.
class_names = ["comp", "rec", "sci", "talk"]

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it on the current matplotlib figure.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix; rows are true labels, columns predicted labels.
    classes : sequence
        Tick labels, one per row/column of `cm`.
    normalize : bool, default False
        When True every row is divided by its sum before printing and
        plotting; rows that sum to zero are left as zeros. When False the
        values are printed and plotted exactly as given.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap handed to `plt.imshow`.
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A row with no samples would divide by zero and fill the plot with
        # NaN; dividing by 1 instead keeps such rows at zero.
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    # Cells above half the maximum sit on a dark background, so their text is
    # drawn in white for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        value = cm[i, j]
        # Two decimals for fractions; untouched text for raw values (slicing
        # str(value) would mangle values such as 1e-05 or 12345).
        label = ('%.2f' % value) if normalize else str(value)
        plt.text(j, i, label,
                 horizontalalignment="center",
                 color="white" if value > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Compare the predictions with the matching slice of the training labels
# (y_train is cut to the number of rows in `combined`).
n_embedded = combined.shape[0]
cnf_matrix = confusion_matrix(y_train[:n_embedded], y_pred)

# Show two decimals when numpy arrays are printed from here on.
np.set_printoptions(precision=2)

# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=class_names,
    normalize=True,
    title='Normalized confusion matrix for 20newsgroups',
)

plt.show()

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Neighbourhood sizes to compare.
ks = [5, 10, 20]
# Training labels cut to the number of rows in `combined`.
# NOTE: this rebinds `y`, which held the full label array in the loading cell.
y = y_train[:combined.shape[0]]

for k in ks:
    # `knn` is rebound on every pass; after the loop it is the k=20 model.
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(combined, y)
    # Scored on the same rows the model was fitted on. A single formatted
    # string prints identically under Python 2 and Python 3.
    print("KNN Score with k=%i: %s" % (k, knn.score(combined, y)))