In [1]:
using CSV, DataFrames, Clustering, DecisionTree, GLM, XGBoost
using StatsBase, Distributions, HypothesisTests, MultivariateStats
using Distances, MLLabelUtils, MLBase
using TSne, Gadfly
using ScikitLearn.CrossValidation: cross_val_score
using Random

In [34]:
# Raw signal strengths: columns renamed, values left unscaled.
# CSV.File parses the file (header=true: the first row holds the column names) and
# DataFrame materializes it. Current CSV.jl releases require an explicit sink for
# CSV.read, so the sink-less CSV.read(path) form is avoided here.
df = DataFrame(CSV.File("wifi_named.csv", header=true))

# Same signal features after standardization
df2 = DataFrame(CSV.File("wifi_standardized.csv", header=true))

# Preview the first three standardized rows
first(df2, 3)

Unnamed: 0_level_0,WiFi1,WiFi2,WiFi3,WiFi4,WiFi5,WiFi6,WiFi7,Room
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.478077,1.28936,0.782308,0.275256,-0.231795,-1.34731,-1.2459,1.0
2,0.207303,1.20495,0.84217,0.479389,-0.0647823,-1.33451,-1.33451,1.0
3,0.708444,0.98395,0.98395,0.341103,-0.485416,-1.31193,-1.2201,1.0


In [45]:
# Build both feature matrices from the first 7 columns, leaving out the Room column.
# X holds the standardized features, XX the unscaled ones.
# NOTE: XX is rebound further down to the transpose of X, so the unscaled matrix
# built here is not what the clustering calls receive.
# Columns are selected with explicit (rows, columns) indexing `[:, 1:7]`; the
# single-index form `df[1:7]` is deprecated/removed in DataFrames.jl.
X = Matrix(df2[:, 1:7])
XX = Matrix(df[:, 1:7])

# Show first 3 rows, all columns
X[1:3, :]

3×7 Array{Float64,2}:
 0.478077  1.28936  0.782308  0.275256  -0.231795   -1.34731  -1.2459
 0.207303  1.20495  0.84217   0.479389  -0.0647823  -1.33451  -1.33451
 0.708444  0.98395  0.98395   0.341103  -0.485416   -1.31193  -1.2201

## Working with Clustering

In [53]:
# Number of clusters requested from the k-means and fuzzy c-means runs
nc = 4

4

In [54]:
# n = number of rows of X (observations), m = number of columns (features)
n, m = size(X, 1), size(X, 2)

(2000, 7)

In [56]:
# The clustering routines take one observation per column, so the n×m matrix is
# flipped to m×n. For a real-valued matrix the adjoint (') equals the transpose,
# so neither an explicit loop nor collect(adjoint(X)) is needed; the result is a
# lazy Adjoint wrapper around X.
# NOTE: this rebinds XX, which previously held the unscaled feature matrix.
XX = adjoint(X)

7×2000 LinearAlgebra.Adjoint{Float64,Array{Float64,2}}:
  0.478077   0.207303    0.708444  …   0.147736    0.122404   0.162858
  1.28936    1.20495     0.98395       0.320095    0.350891   0.65143
  0.782308   0.84217     0.98395       1.06698     0.693621   0.92286
  0.275256   0.479389    0.341103     -0.0246227   0.179526   0.108572
 -0.231795  -0.0647823  -0.485416      1.12444     1.32196    0.92286
 -1.34731   -1.33451    -1.31193   …  -1.28859    -1.47701   -1.41143
 -1.2459    -1.33451    -1.2201       -1.34604    -1.1914    -1.35715

In [57]:
# K-means on the column-per-observation matrix with nc clusters, capped at
# 200 iterations and printing progress for every iteration
R = kmeans(XX, nc; display=:iter, maxiter=200)

  Iters               objv        objv-change | affected 
-------------------------------------------------------------
      0       9.291066e+02
      1       6.010033e+02      -3.281033e+02 |        4
      2       5.939210e+02      -7.082325e+00 |        4
      3       5.930049e+02      -9.160677e-01 |        3
      4       5.928254e+02      -1.795539e-01 |        2
      5       5.927953e+02      -3.008943e-02 |        2
      6       5.927892e+02      -6.089255e-03 |        2
      7       5.927874e+02      -1.802885e-03 |        2
      8       5.927826e+02      -4.767285e-03 |        3
      9       5.927781e+02      -4.525659e-03 |        0
     10       5.927781e+02       0.000000e+00 |        0
K-means converged with 10 iterations (objv = 592.7781033135057)


KmeansResult{Array{Float64,2},Float64,Int64}([0.2623841755338189 1.2844768060034522 0.8785360993987652 0.5483841750633662; 0.5655365389473564 0.08442323225684432 0.48321398171164226 1.0988254047396844; … ; -1.386495460284453 -0.9773038436018847 -1.3236721029448493 -1.2638089599242694; -1.3837043727428167 -1.025863510588361 -1.3825409023905642 -1.3627749945150314], [4, 4, 4, 3, 3, 4, 4, 4, 4, 4  …  1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0.088633862154607, 0.1601696612760808, 0.25203935016864243, 1.0171113631908497, 0.572203507658541, 0.2914318671535341, 0.24941442003220615, 0.6164728059882485, 0.4496610048292329, 0.379079850816467  …  0.19383882421588794, 0.07195790384627898, 0.07367341979481523, 0.09405132904213609, 0.09713508747414501, 0.22053401659064775, 0.03677470319603415, 0.2264529096588035, 0.3006991987272265, 0.03198291753014715], [501, 472, 520, 507], [501, 472, 520, 507], 592.7781033135057, 10, true)

In [60]:
# Sanity check: the k-means result must report exactly the number of clusters
# requested through the nc hyperparameter set earlier
@assert nclusters(R) == nc

In [61]:
# Cluster label assigned by k-means to each data point (one entry per point)
a = assignments(R)

2000-element Array{Int64,1}:
 4
 4
 4
 3
 3
 4
 4
 4
 4
 4
 4
 4
 4
 ⋮
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1

In [62]:
# Cluster sizes: number of points assigned to each of the nc clusters
c = counts(R)

4-element Array{Int64,1}:
 501
 472
 520
 507

In [63]:
# Cluster centers (centroids). R.centers keeps one center per column, so it is
# flipped to one center per row for easier viewing.
M = adjoint(R.centers)

4×7 LinearAlgebra.Adjoint{Float64,Array{Float64,2}}:
 0.262384  0.565537   0.840882   0.189405   0.911992   -1.3865    -1.3837
 1.28448   0.0844232  0.0933737  1.22007   -0.679172   -0.977304  -1.02586
 0.878536  0.483214   0.63147    0.804992  -0.0919998  -1.32367   -1.38254
 0.548384  1.09883    0.722645   0.396962  -0.140233   -1.26381   -1.36277

## Let's do fuzzy clustering with c-means!

In [65]:
# Fuzzy c-means on the same data with nc clusters; the third positional argument (2)
# is the fuzziness exponent. Capped at 200 iterations, progress printed per iteration.
R2 = fuzzy_cmeans(XX, nc, 2; display=:iter, maxiter=200)

  Iters      center-change
----------------------------
      1       2.208389e+00
      2       3.042300e-02
      3       5.726892e-02
      4       1.137675e-01
      5       2.025642e-01
      6       2.767112e-01
      7       2.668076e-01
      8       1.940725e-01
      9       1.456188e-01
     10       7.837594e-02
     11       3.934396e-02
     12       2.045023e-02
     13       1.062416e-02
     14       5.483257e-03
     15       2.825778e-03
     16       1.461861e-03
     17       7.622094e-04
Fuzzy C-means converged with 17 iterations (δ = 0.0007622094221868084)


FuzzyCMeansResult: 4 clusters for 2000 points in 7 dimensions (converged in 17 iterations)

In [66]:
# Fuzzy c-means centers, flipped to one center per row for easier viewing.
# Cluster numbering is arbitrary, so the row order need not match the k-means centers.
M2 = adjoint(R2.centers)

4×7 LinearAlgebra.Adjoint{Float64,Array{Float64,2}}:
 1.28727   0.0769225  0.0845643  1.22368   -0.675091  -0.974169  -1.02318
 0.860159  0.517178   0.616294   0.799762  -0.120674  -1.31007   -1.36265
 0.269127  0.5673     0.836387   0.19491    0.89731   -1.38372   -1.38131
 0.583803  1.02725    0.713262   0.439698  -0.122727  -1.26465   -1.37663

In [68]:
# Degree to which each data point belongs to each one
# of the (4) clusters according to fuzzy c-means:
# one row per data point, one column per cluster
memberships = R2.weights

2000×4 Array{Float64,2}:
 0.0301805   0.109671   0.0629223  0.797226
 0.035037    0.138037   0.102674   0.724252
 0.0569563   0.225919   0.0710357  0.646089
 0.20371     0.373993   0.0961724  0.326124
 0.131249    0.45686    0.0847659  0.327126
 0.06742     0.200014   0.0851284  0.647438
 0.039371    0.213743   0.148214   0.598672
 0.0873086   0.384703   0.112807   0.415181
 0.0824885   0.304049   0.0914725  0.52199
 0.091655    0.282813   0.0678429  0.557689
 0.0607345   0.223179   0.085712   0.630374
 0.0449492   0.179604   0.1019     0.673547
 0.017422    0.0795105  0.0273125  0.875755
 ⋮                                 
 0.00873625  0.0296571  0.929443   0.0321641
 0.0276743   0.0697524  0.824119   0.0784544
 0.0248041   0.0622503  0.839766   0.0731798
 0.0110874   0.0309913  0.920604   0.0373174
 0.0120287   0.0358285  0.911527   0.0406159
 0.0135937   0.0388798  0.896557   0.0509698
 0.0138901   0.0358571  0.90579    0.0444623
 0.028885    0.0813954  0.793842   0.0958777
 0.00635