In [2]:
import json
import os
import random

import ipyvolume as ipv
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import colors as mcolors
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize

from XMeans import XMeansTraining

%matplotlib inline

In [3]:
# Read data
#
# ./sampleData.json is read as JSON Lines: one JSON object per line, each
# carrying a "title" and a "docvec" entry. titleList[i] and DocVecs[i] always
# come from the same input line.

titleList = []
DocVecs = []

# JSON text is UTF-8 by specification, so do not rely on the platform default
# encoding when decoding it.
with open("./sampleData.json", 'r', encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing newline at end of file) is not a
            # JSON document; json.loads would raise on it.
            continue
        # Named `record` so the per-line dict does not share the name `data`
        # with the array built below.
        record = json.loads(line)
        titleList.append(record["title"])
        DocVecs.append(record["docvec"])

# Collect all document vectors into a single array; row i belongs to
# titleList[i]. Assumes every "docvec" has the same length -- TODO confirm.
data = np.array(DocVecs)

In [4]:
# Train XMeans
#
# XMeansTraining receives the document-vector array and the maxBranching
# setting and returns the centroids together with the per-document labels.

maxBranching = 5
Centroids, Labels = XMeansTraining(data, maxBranching)

numClusters = len(Centroids)
print("Number of clusters = {}".format(numClusters))

Number of clusters = 26


In [12]:
# Save Cluster titles at ./Clusters
#
# Writes one file per cluster, ./Clusters/<clusterID>.txt, containing the
# titles of the documents assigned to that cluster, one title per line.

clusterDir = "./Clusters"
# Create the output directory if needed so a fresh checkout can run this cell.
os.makedirs(clusterDir, exist_ok=True)

# np.asarray keeps the comparison below elementwise even if Labels is a plain
# Python list (list == int evaluates to a single False, matching nothing).
labelArray = np.asarray(Labels)

for clID in range(len(Centroids)):
    indices = np.where(labelArray == clID)[0].tolist()
    Titles = [titleList[i] for i in indices]
    # Open with 'w', not 'a': in append mode every re-run of this cell would
    # add the titles again on top of what an earlier run already wrote.
    # Explicit UTF-8 so non-ASCII titles are written the same on any platform.
    with open(os.path.join(clusterDir, str(clID) + ".txt"), 'w', encoding="utf-8") as file:
        file.writelines("%s\n" % title for title in Titles)

### Visualize cluster samples

In [5]:
# Dimensionality reduction using tSNE
#
# Projects the document vectors to 3 components using cosine distance, for
# the 3D scatter plot.

# t-SNE is stochastic; pinning random_state makes the embedding (and thus the
# plot) reproducible across kernel restarts.
tsneSeed = 0
tSNE = TSNE(n_components=3, metric='cosine', random_state=tsneSeed)
tsneModel = tSNE.fit(data)

In [6]:
# 3D scatter plot of samples from clusters
#
# For every cluster, draws up to `noSamples` randomly chosen documents at
# their 3D tSNE coordinates, one color per cluster.

noSamples = 200   # upper bound on the number of points drawn per cluster
size = 0.75       # marker size passed to ipyvolume's scatter
tSNEData = tsneModel.embedding_
colors = list(mcolors.CSS4_COLORS.keys())

# np.asarray keeps the comparison below elementwise even if Labels is a plain
# Python list.
labelArray = np.asarray(Labels)

# Private, seeded generator: the same points are drawn on every run and the
# global `random` state is left untouched.
sampler = random.Random(0)

# Start a fresh figure so re-running this cell does not pile new scatters on
# top of the ones from a previous execution.
ipv.pylab.figure()

for k in range(len(Centroids)):
    ind = np.where(labelArray == k)[0].tolist()
    if not ind:
        # Nothing to draw for a cluster without members.
        continue
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the sample size at the cluster size.
    indices = sampler.sample(ind, min(noSamples, len(ind)))
    # The palette is entered at offset 2, as before; the modulo wraps around
    # instead of raising IndexError when there are more clusters than colors.
    clusterColor = colors[(k + 2) % len(colors)]
    ipv.pylab.scatter(tSNEData[indices,0], tSNEData[indices,1], tSNEData[indices,2], size=size, marker="sphere",color=clusterColor)

ipv.pylab.show()
# Blank out the axis labels.
ipv.pylab.xlabel("")
ipv.pylab.ylabel("")
ipv.pylab.zlabel("")
# NOTE(review): xyzlim is called with a single negative value; confirm in the
# ipyvolume docs that this produces the intended axis range/orientation.
ipv.pylab.xyzlim(-20)