In [1]:
import matplotlib
matplotlib.use('PDF')
import numpy as np
import sklearn
import scipy.stats
import matplotlib.pyplot as plt
import timeit
import pandas as pd
import math
import itertools
import time

import sklearn.neighbors
from sklearn.neighbors import (
    KernelDensity,
    KDTree,
)

In [2]:
# Load the shuttle dataset from the sibling data directory.
# Later cells select the columns labelled "4" and "6" from this frame.
df = pd.read_csv(
    "../data/shuttle.csv",
)

In [3]:
def estimate_kde_bw(data):
    """Return a rule-of-thumb KDE bandwidth for each column of ``data``.

    The per-column standard deviation is multiplied by
    ``n ** (-1 / (d + 4))``, where ``n = data.shape[0]`` (number of rows)
    and ``d = data.shape[1]`` (number of columns).

    Parameters
    ----------
    data : array-like exposing ``.shape`` with at least two axes
        Sample matrix; rows are observations, columns are dimensions.

    Returns
    -------
    numpy.ndarray
        One bandwidth per column (the result of ``np.std(data, axis=0)``
        scaled by the sample-size factor).
    """
    n_samples = data.shape[0]
    n_dims = data.shape[1]
    # Shrinks as the sample grows; the exponent depends on dimensionality.
    size_factor = n_samples ** (-1.0 / (n_dims + 4))
    column_spread = np.std(data, axis=0)
    return column_spread * size_factor

In [48]:
# Histogram bin widths along the x and y axes.
stepx, stepy = 4, 2

# Plot/binning ranges as [low, high]. The upper bound is padded by one step
# so that a half-open np.arange(low, high, step) still yields 80 as an edge.
xrange = [-52, 80 + stepx]
yrange = [-4, 80 + stepy]

In [49]:
# Explicit bin edges for both axes (stop value is exclusive in np.arange).
xe = np.arange(xrange[0], xrange[1], stepx)
ye = np.arange(yrange[0], yrange[1], stepy)

# 2-D bin counts of column "4" (x) against column "6" (y).
# H has shape (len(xe) - 1, len(ye) - 1): first axis is x, second is y.
H, xedges, yedges = np.histogram2d(
    x=df["4"],
    y=df["6"],
    bins=[xe, ye],
    range=[xrange, yrange],
)

# Edge coordinates on a grid, for plotting H.T with pcolor.
X, Y = np.meshgrid(xedges, yedges)

In [50]:
# Draw the 2-D histogram as a pseudocolor mesh with log-scaled colors and
# save it to hist2.pdf.
fig = plt.figure(figsize=(3, 4), dpi=300)
ax = fig.add_subplot(111)
mesh = ax.pcolor(
    X, Y, H.T,  # H is indexed [x, y]; pcolor expects rows to follow y
    norm=matplotlib.colors.LogNorm(),
    edgecolors="none",
    cmap="Blues",
)
cb = fig.colorbar(mesh, ax=ax, orientation="horizontal")
cb.set_label("bin count")
ax.set_xlim(*xrange)
ax.set_ylim(*yrange)
ax.set_xlabel("A")
ax.set_ylabel("B")
fig.tight_layout()
fig.savefig("hist2.pdf")



In [51]:
# Draw the 2-D histogram as an image with log-scaled colors and save it to
# hist3.pdf.
fig = plt.figure(figsize=(3,4), dpi=300)
plt.imshow(
    H.T,  # H is indexed [x, y]; imshow expects rows to follow y
    origin="lower",  # row 0 (lowest y bin) at the bottom, replaces a manual flip
    aspect="auto",
    norm=matplotlib.colors.LogNorm(),
    cmap="Blues",
    interpolation="nearest",
    # The image must span exactly the binned region. xrange[1]/yrange[1] are
    # padded by one step and lie beyond the last bin edge, so using them here
    # would stretch every bin away from its true coordinates.
    extent=(xedges[0], xedges[-1], yedges[0], yedges[-1]),
)
cb = plt.colorbar(orientation="horizontal")
cb.set_label("bin count")
plt.xlabel("A")
plt.ylabel("B")
fig.tight_layout()
plt.savefig("hist3.pdf")



In [17]:
# Let matplotlib bin and draw the 2-D histogram directly, with log-scaled
# colors, and save it to shuttle_hist.pdf.
fig = plt.figure(figsize=(3, 4), dpi=300)

# Bin edges; the stop is pushed one extra step so the padded upper bound of
# each range is itself included as an edge.
x_bin_edges = np.arange(xrange[0], xrange[1] + stepx, stepx)
y_bin_edges = np.arange(yrange[0], yrange[1] + stepy, stepy)

plt.hist2d(
    df["4"],
    df["6"],
    bins=[x_bin_edges, y_bin_edges],
    range=[xrange, yrange],
    norm=matplotlib.colors.LogNorm(),
    cmap="Blues",
)
cb = plt.colorbar(orientation="horizontal")
cb.set_label("bin count")
plt.xlim(*xrange)
plt.ylim(*yrange)
plt.xlabel("A")
plt.ylabel("B")
fig.tight_layout()
plt.savefig("shuttle_hist.pdf")



In [115]:
# Hexagonal-bin density plot of column "4" against column "6" with
# log-scaled colors, saved to shuttle_hexbin.pdf.
fig = plt.figure(figsize=(3,4), dpi=300)
plt.hexbin(
    # Select by the string labels used by the other histogram cells and pass
    # 1-D Series; integer keys in a list (df[[4]]) are looked up as labels and
    # do not match string column names.
    df["4"], df["6"],
    extent=(-50,80,-5,80), gridsize=(int(30*.8),int(40*.8)),
    linewidths=(.2,), norm=matplotlib.colors.LogNorm(), cmap='Greys')
cb = plt.colorbar(orientation="horizontal")
cb.set_label("bin count")
plt.xlim(-50,80)
plt.ylim(-5,80)
plt.xlabel("A4")
plt.ylabel("A6")
fig.tight_layout()
plt.savefig("shuttle_hexbin.pdf")



In [7]:
# Fit a Gaussian KDE on columns "4" and "6" after rescaling each column by
# its rule-of-thumb bandwidth.
# Columns are selected by string label, matching the histogram cells; the
# integer keys used previously (df[[4,6]]) do not match string column names.
points = df[["4", "6"]].values
bw = estimate_kde_bw(points)
print(bw)

# Work in bandwidth-scaled coordinates; query points passed to `kde` must be
# divided by `bw` in the same way.
data = points / bw
tol = .1
kde = KernelDensity(
    bandwidth=.7,
    kernel='gaussian',
    algorithm='kd_tree',
    rtol=tol,  # relative tolerance on the density estimate
)
kde.fit(data)

[ 3.65966579  2.21493988]


KernelDensity(algorithm='kd_tree', atol=0, bandwidth=0.7, breadth_first=True,
       kernel='gaussian', leaf_size=40, metric='euclidean',
       metric_params=None, rtol=0.1)

In [8]:
# Evaluate the fitted KDE's log-density on a regular grid with 0.5 spacing.
x = np.arange(xrange[0],xrange[1],.5)
y = np.arange(yrange[0],yrange[1],.5)
# indexing="ij" makes X, Y (and therefore Z) have shape (len(x), len(y)).
X,Y = np.meshgrid(x,y,indexing="ij")

# Score every grid point in a single call instead of one score_samples call
# per point. Points are divided by `bw` to match the scaling of the training
# data. Row-major ravel followed by reshape keeps Z[i, j] aligned with
# (X[i, j], Y[i, j]).
grid_points = np.column_stack([X.ravel(), Y.ravel()]) / bw
Z = kde.score_samples(grid_points).reshape(X.shape)

In [9]:
# Filled-contour plot that splits the plane at a density threshold of
# exp(-8), saved to shuttle_classify.pdf. Z holds log-densities, so it is
# exponentiated before contouring.
fig = plt.figure(figsize=(3,4), dpi=300)
cs = plt.contourf(X,Y,np.exp(Z),levels=[0, np.exp(-8.0), 1], colors=['w','C0','k'])
cb = plt.colorbar(
    orientation="horizontal",
    format="%.1g"
)
cb.set_label("density")
# No plt.legend() here: the figure has no labelled artists, so the call only
# produced a "no handles" warning and drew nothing. The colorbar carries the
# key for the filled regions.
plt.xlim(*xrange)
plt.ylim(*yrange)
plt.xlabel("A")
plt.ylabel("B")
fig.tight_layout()
plt.savefig("shuttle_classify.pdf")

