# Tutorial: Intrinsic dimension, Density estimation and Clustering

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from duly import data
from duly.plot import plot_SLAn, plot_MDS, plot_matrix


%load_ext autoreload
%autoreload 2
%matplotlib notebook
%matplotlib inline


In [None]:
ls

In [None]:

# import a test set: load the table with np.genfromtxt and keep at most
# the first 10000 rows
X = np.genfromtxt('datasets/Fig1.dat')[0:10000]

# Alternative test sets (uncomment one line to use it instead).
# NOTE(review): these paths lack the 'datasets/' prefix used by the active
# line above — confirm where the files actually live before enabling them.
# X = np.genfromtxt('Fig2.dat')

# X = np.genfromtxt('FigS1.dat')
# X = np.genfromtxt('FigS2.dat')

# X = np.genfromtxt('FigS3.dat')

# X = np.genfromtxt('FigS4.dat')


In [None]:
# Alternative input: 50000 random 2D points drawn uniformly from the unit square
# X = np.random.rand(50000, 2)

In [None]:
# Quick look at the raw points: first column on x, second column on y.
scatter_fig, scatter_ax = plt.subplots(figsize=(5, 5))
scatter_ax.set_title('2D scatter of the data')
scatter_ax.scatter(
    X[:, 0],
    X[:, 1],
    s=15.,
    alpha=1.0,
    c='black',
    linewidths=0.0,
)
plt.show()

In [None]:
# initialise the Data class with the loaded coordinates; every compute_* call
# and plot helper in the cells below operates on this object
_data = data.Data(X)


In [None]:
# compute distances up to the maxk NN (here the 1000 nearest neighbours)
# NOTE(review): njobs = 1 presumably means a single worker — confirm
_data.compute_distances(maxk = 1000, njobs = 1)

In [None]:
# display the maxk value stored on the Data object
# (presumably the value passed to compute_distances — confirm)
_data.maxk

In [None]:
# estimate the intrinsic dimension (ID) of the data set
_data.compute_id()

In [None]:
# estimate the ID with the Bayesian estimator
# NOTE(review): alpha and beta are presumably the parameters of the Gamma
# prior named in the method — confirm against the duly documentation
_data.compute_id_gammaprior(alpha = 2, beta = 2)

In [None]:
# estimate density with k-NN, using k = 10
_data.compute_density_kNN(k = 10)

In [None]:
# estimate density with PAk using cython implementation of Newton-Raphson minimisation
# NOTE(review): the plotting cell further down reads _data.Rho and labels it
# 'PAk log densities' — presumably this call is what fills it; confirm
_data.compute_density_PAk()

In [None]:
# NOTE(review): looks like a debugging leftover — it only displays the type of
# the compute_density_PAk attribute and nothing later depends on it
type(_data.compute_density_PAk)

In [None]:
# estimate density with PAk using scipy optimisation
# _data.compute_density_PAk(method='NM')

In [None]:
# Two side-by-side views of the values held in _data.Rho:
# a per-point scatter (left) and triangulation-based contours (right).
f, [ax1, ax2] = plt.subplots(
    1, 2,
    figsize=(16, 7),
    gridspec_kw={'hspace': 0.05, 'wspace': 0},
)

# Hide the tick marks on both panels and give each one its title.
for panel, heading in ((ax1, 'PAk log densities'),
                       (ax2, 'PAk log densities interpolated')):
    panel.yaxis.set_major_locator(plt.NullLocator())
    panel.xaxis.set_major_locator(plt.NullLocator())
    panel.set_title(heading)

xs, ys = X[:, 0], X[:, 1]

# Left panel: one marker per point, coloured by Rho.
ax1.scatter(xs, ys, s=15., alpha=0.9, c=_data.Rho, linewidths=0.0)

# Right panel: thin black iso-lines first, then the finely graded filled
# contour (alpha 0.9) added after them; the colorbar is built from the latter.
ax2.tricontour(xs, ys, _data.Rho, levels=10, linewidths=0.5, colors='k')
fig2 = ax2.tricontourf(xs, ys, _data.Rho, levels=250, alpha=0.9)

plt.colorbar(fig2)
plt.show()

In [None]:
# estimate clusters
# _data.compute_clustering(Z = 1, halo=False)

In [None]:
# estimate clusters with the optimised routine, called with Z = 2. and halo=False
_data.compute_clustering_optimised(Z = 2., halo=False)

In [None]:

# Scatter the points coloured by cluster label. Points whose label is -1 are
# drawn separately as smaller, semi-transparent black markers.
Nclus_m = len(_data.centers_m)
cmap = plt.get_cmap('gist_rainbow', Nclus_m)  # discrete map with Nclus_m colours

f, ax = plt.subplots(1, 1, figsize=(13, 10))
ax.yaxis.set_major_locator(plt.NullLocator())
ax.xaxis.set_major_locator(plt.NullLocator())
# NOTE(review): the clustering cell is run with halo=False, so presumably no
# point carries label -1 and the black layer is empty — confirm whether the
# "with halo" wording of the title is still intended.
ax.set_title('DPA assignation with halo')

# Split the points with a boolean mask instead of a per-point Python loop.
coords = np.asarray(_data.X)
labels = np.asarray(_data.labels)
assigned = labels != -1

clustered = ax.scatter(
    coords[assigned, 0],
    coords[assigned, 1],
    s=15.,
    alpha=1.0,
    c=labels[assigned],
    linewidths=0.0,
    cmap=cmap,
)
# Bind colorbar and colour limits to the cluster scatter explicitly rather
# than to pyplot's implicit "current image". The half-integer limits centre
# each integer label on its own colour band.
f.colorbar(clustered, ax=ax, ticks=range(Nclus_m))
clustered.set_clim(-0.5, Nclus_m - 0.5)

ax.scatter(
    coords[~assigned, 0],
    coords[~assigned, 1],
    s=10.,
    alpha=0.5,
    c='black',
    linewidths=0.0,
)
# Render the figure and suppress the repr of the last scatter call.
plt.show()

In [None]:
# plot the dendrogram from the Data object
plot_SLAn(_data)

In [None]:
# plot graph of clusters from the Data object
# (presumably a multidimensional-scaling layout, going by the helper's name — confirm)
plot_MDS(_data)

In [None]:
# plot the connectivity matrix from the Data object
plot_matrix(_data)