In [1]:
using CSV, DataFrames, Clustering, DecisionTree, GLM, XGBoost
using StatsBase, Distributions, HypothesisTests, MultivariateStats
using Distances, MLLabelUtils, MLBase
using TSne, Gadfly
using ScikitLearn.CrossValidation: cross_val_score
using Random

In [2]:
# Columns renamed, values left unscaled
df = CSV.read("wifi_named.csv", header=true)

# Standardized signal features
df2 = CSV.read("wifi_standardized.csv", header=true)

# Preview the first three standardized rows
first(df2, 3)

Unnamed: 0_level_0,WiFi1,WiFi2,WiFi3,WiFi4,WiFi5,WiFi6,WiFi7,Room
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.478077,1.28936,0.782308,0.275256,-0.231795,-1.34731,-1.2459,1.0
2,0.207303,1.20495,0.84217,0.479389,-0.0647823,-1.33451,-1.33451,1.0
3,0.708444,0.98395,0.98395,0.341103,-0.485416,-1.31193,-1.2201,1.0


In [3]:
# Feature matrix: the seven standardized WiFi columns of df2 (the Room column is left out).
# Two-index form `df2[:, 1:7]` is used because single-index column access
# (`df2[1:7]`) is deprecated and was removed in later DataFrames releases.
X = Matrix(df2[:, 1:7])

# Show first 3 rows, all columns
X[1:3, :]

3×7 Array{Float64,2}:
 0.478077  1.28936  0.782308  0.275256  -0.231795   -1.34731  -1.2459
 0.207303  1.20495  0.84217   0.479389  -0.0647823  -1.33451  -1.33451
 0.708444  0.98395  0.98395   0.341103  -0.485416   -1.31193  -1.2201

## Working with Clustering

In [4]:
# Number of clusters, shared by the k-means and fuzzy c-means runs below
nc = 4

4

In [5]:
# n = number of samples (rows of X), m = number of features (columns of X)
n, m = size(X)

(2000, 7)

In [6]:
# author uses a for loop to transpose... then suggests using collect(adjoint(X))
# For a real-valued matrix the adjoint X' equals the transpose, so the lazy
# wrapper is enough: XX is features × samples (one data point per column).
XX = X'

7×2000 LinearAlgebra.Adjoint{Float64,Array{Float64,2}}:
  0.478077   0.207303    0.708444  …   0.147736    0.122404   0.162858
  1.28936    1.20495     0.98395       0.320095    0.350891   0.65143
  0.782308   0.84217     0.98395       1.06698     0.693621   0.92286
  0.275256   0.479389    0.341103     -0.0246227   0.179526   0.108572
 -0.231795  -0.0647823  -0.485416      1.12444     1.32196    0.92286
 -1.34731   -1.33451    -1.31193   …  -1.28859    -1.47701   -1.41143
 -1.2459    -1.33451    -1.2201       -1.34604    -1.1914    -1.35715

In [7]:
# K-means on the features × samples matrix.
# The initial centroids are chosen at random, so seed the global RNG first
# to get the same clustering on every Restart & Run All.
Random.seed!(42)
R = kmeans(XX, nc; maxiter=200, display=:iter)

  Iters               objv        objv-change | affected 
-------------------------------------------------------------
      0       1.033036e+03
      1       7.260650e+02      -3.069706e+02 |        4
      2       6.892326e+02      -3.683241e+01 |        4
      3       6.686576e+02      -2.057498e+01 |        4
      4       6.603258e+02      -8.331856e+00 |        3
      5       6.540551e+02      -6.270609e+00 |        4
      6       6.459585e+02      -8.096635e+00 |        4
      7       6.334727e+02      -1.248580e+01 |        3
      8       6.177383e+02      -1.573442e+01 |        4
      9       6.049416e+02      -1.279672e+01 |        3
     10       5.980162e+02      -6.925389e+00 |        3
     11       5.945986e+02      -3.417603e+00 |        3
     12       5.932443e+02      -1.354311e+00 |        3
     13       5.928592e+02      -3.850685e-01 |        2
     14       5.927953e+02      -6.390851e-02 |        2
     15       5.927892e+02      -6.089255e-03 |        

KmeansResult{Array{Float64,2},Float64,Int64}([0.8785360993987652 0.2623841755338189 0.5483841750633662 1.2844768060034522; 0.48321398171164226 0.5655365389473564 1.0988254047396844 0.08442323225684432; … ; -1.3236721029448493 -1.386495460284453 -1.2638089599242694 -0.9773038436018847; -1.3825409023905642 -1.3837043727428167 -1.3627749945150314 -1.025863510588361], [3, 3, 3, 1, 1, 3, 3, 3, 3, 3  …  2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [0.088633862154607, 0.1601696612760808, 0.25203935016864243, 1.0171113631908497, 0.572203507658541, 0.2914318671535341, 0.24941442003220615, 0.6164728059882485, 0.4496610048292329, 0.379079850816467  …  0.19383882421588616, 0.07195790384628076, 0.07367341979481523, 0.09405132904213431, 0.09713508747414501, 0.22053401659064775, 0.03677470319603415, 0.2264529096588035, 0.3006991987272265, 0.03198291753014715], [520, 501, 507, 472], [520, 501, 507, 472], 592.7781033135057, 19, true)

In [8]:
# Sanity check: the number of clusters in R matches the hyperparameter nc set earlier
@assert nclusters(R) == nc

In [9]:
# Cluster label assigned by k-means to each data point (one entry per column of XX)
a = assignments(R)

2000-element Array{Int64,1}:
 3
 3
 3
 1
 1
 3
 3
 3
 3
 3
 3
 3
 3
 ⋮
 2
 2
 2
 2
 2
 2
 2
 2
 2
 2
 2
 2

In [10]:
# Cluster sizes: number of points assigned to each of the nc clusters
c = counts(R)

4-element Array{Int64,1}:
 520
 501
 507
 472

In [11]:
# K-means centroids laid out one per row (clusters × features) for easier viewing
M = R.centers'

4×7 LinearAlgebra.Adjoint{Float64,Array{Float64,2}}:
 0.878536  0.483214   0.63147    0.804992  -0.0919998  -1.32367   -1.38254
 0.262384  0.565537   0.840882   0.189405   0.911992   -1.3865    -1.3837
 0.548384  1.09883    0.722645   0.396962  -0.140233   -1.26381   -1.36277
 1.28448   0.0844232  0.0933737  1.22007   -0.679172   -0.977304  -1.02586

## Let's do fuzzy clustering with c-means!

In [12]:
# Fuzzy c-means with the same cluster count as the k-means run.
# NOTE(review): the third positional argument (2) is presumably the fuzziness
# exponent of the Clustering.jl API — confirm against the package docs.
R2 = fuzzy_cmeans(XX, nc, 2, maxiter=200, display=:iter)

  Iters      center-change
----------------------------
      1       2.201946e+00
      2       2.561988e-02
      3       2.944813e-02
      4       5.908007e-02
      5       1.141509e-01
      6       1.949047e-01
      7       2.532990e-01
      8       2.784805e-01
      9       2.617060e-01
     10       1.631835e-01
     11       8.466587e-02
     12       5.444228e-02
     13       3.024034e-02
     14       1.559566e-02
     15       7.840185e-03
     16       3.938820e-03
     17       1.999974e-03
     18       1.031348e-03
     19       5.521637e-04
Fuzzy C-means converged with 19 iterations (δ = 0.0005521637326594053)


FuzzyCMeansResult: 4 clusters for 2000 points in 7 dimensions (converged in 19 iterations)

In [13]:
# Fuzzy c-means centroids, one row per cluster (clusters × features)
M2 = R2.centers'

4×7 LinearAlgebra.Adjoint{Float64,Array{Float64,2}}:
 1.28729   0.0769113  0.0845439  1.22369   -0.675118  -0.974145  -1.02317
 0.860147  0.517096   0.616297   0.799772  -0.12052   -1.31016   -1.36263
 0.269136  0.56732    0.836382   0.194916   0.897285  -1.38372   -1.38132
 0.583833  1.02733    0.713257   0.439721  -0.122957  -1.26457   -1.37661

In [14]:
# Degree to which each data point belongs to each one of the (4) clusters
# according to fuzzy c-means: one row per point, one column per cluster
memberships = R2.weights

2000×4 Array{Float64,2}:
 0.0301681   0.109615   0.0629017  0.797315
 0.0350393   0.138041   0.102691   0.724229
 0.0569345   0.225785   0.0710142  0.646266
 0.2037      0.373949   0.0961748  0.326176
 0.131256    0.456725   0.084777   0.327242
 0.0673973   0.19992    0.0851061  0.647577
 0.0393886   0.213873   0.148296   0.598443
 0.0873056   0.38468    0.112812   0.415202
 0.0824746   0.303952   0.0914642  0.522109
 0.0916261   0.282622   0.0678266  0.557925
 0.0607165   0.223058   0.0856931  0.630532
 0.0449483   0.179598   0.101907   0.673547
 0.0173965   0.0793787  0.027275   0.87595
 ⋮                                 
 0.00873561  0.0296625  0.929453   0.0321493
 0.0276753   0.0697687  0.824122   0.0784342
 0.0248054   0.062264   0.839767   0.0731635
 0.0110888   0.0310011  0.9206     0.0373106
 0.0120288   0.0358366  0.911533   0.0406016
 0.0135941   0.0388886  0.896563   0.0509545
 0.0138915   0.0358671  0.905788   0.0444535
 0.0288853   0.0814099  0.793854   0.0958507
 0.00635

## Clean the data a bit

In [15]:
# NOTE: this cell rebinds X and XX from the clustering section above.
# X becomes the feature table (DataFrame) and XX its dense Float64 matrix in
# samples × features layout, which is what the tree models below index by row.
# Two-index / property access is used throughout because single-index column
# access (`df2[1:7]`, `df[:Room]`, `df[end]`) is deprecated in DataFrames.
X = df2[:, 1:7]
XX = map(Float64, Matrix(X))

# Classification target: room id turned into a string label
y1 = map(string, df.Room)

# Regression target: last column of the unscaled table
y2 = df[:, ncol(df)]

2000-element Array{Float64,1}:
 -3830.076403892874
 -3965.2180884436843
 -3939.8869787581857
 -3875.0521950907514
 -3900.0130214272585
 -3960.0524501791515
 -3979.9618163176488
 -3909.877879990196
 -3885.0159706940613
 -4100.076892982874
 -4070.0383869957386
 -3905.0575928379626
 -3904.9761602550916
     ⋮
 -3814.9587355852414
 -3655.0249148548
 -3779.8978150118974
 -3720.048324591379
 -3969.8369667628403
 -4010.0494197155276
 -4000.183968429349
 -4125.085632528229
 -4019.946848662353
 -4024.9163175883937
 -3970.129805998784
 -3954.982839002469

In [16]:
"""
    partition(ind, r, shuffle=true)

Split the indices `ind` into a training part and a test part.

# Arguments
- `ind`: indices to split (a vector of `Int64` or a `UnitRange{Int64}`).
- `r`: fraction of `ind` assigned to the training part; the resulting size
  `round(Int64, length(ind) * r)` must lie within `0:length(ind)`.
- `shuffle`: when `true` (default) the indices are randomly permuted with
  `randperm` before splitting; when `false` the original order is kept.

# Returns
A tuple `(train, test)` of index vectors. Together they contain every element
of `ind` exactly once.

# Throws
- `ArgumentError` if `r` leads to a training size outside `0:length(ind)`.
"""
function partition(ind::Union{Array{Int64, 1}, UnitRange{Int64}}, r::Float64, shuffle::Bool = true)
    N = length(ind)            # total number of data points in the sample
    n = round(Int64, N * r)    # number of data points in the training part

    # Fail with a readable message instead of an out-of-bounds slice further down
    0 <= n <= N || throw(ArgumentError("r = $r gives a training size of $n, outside 0:$N"))

    # Materialise ranges so both outputs are plain vectors whatever the input type
    pool = collect(ind)
    if shuffle
        pool = pool[randperm(N)]
    end

    return pool[1:n], pool[(n+1):end]
end

partition (generic function with 2 methods)

In [17]:
# 70-30 train/test split of the row indices. Seed the global RNG first so the
# shuffled split, and every model fitted on it, is the same on each run.
Random.seed!(42)
train, test = partition(1:n, 0.7, true)

([280, 1437, 190, 537, 1830, 1950, 54, 191, 1690, 1650  …  34, 738, 1842, 1368, 321, 1464, 337, 1371, 1476, 970], [1730, 450, 1315, 1269, 291, 147, 137, 228, 400, 1522  …  1946, 536, 1019, 335, 1440, 1664, 1576, 1115, 1353, 362])

In [18]:
# Decision tree classifier, capped at depth 3 (still unfitted at this point)
tree = DecisionTreeClassifier(max_depth=3)

DecisionTreeClassifier
max_depth:                3
min_samples_leaf:         1
min_samples_split:        2
min_purity_increase:      0.0
pruning_purity_threshold: 1.0
n_subfeatures:            0
classes:                  nothing
root:                     nothing

In [19]:
# Fit the tree on the training rows only; the targets are the string room labels
DecisionTree.fit!(tree, XX[train,:], y1[train])

DecisionTreeClassifier
max_depth:                3
min_samples_leaf:         1
min_samples_split:        2
min_purity_increase:      0.0
pruning_purity_threshold: 1.0
n_subfeatures:            0
classes:                  ["1", "2", "3", "4"]
root:                     Decision Tree
Leaves: 7
Depth:  3

In [20]:
# Text dump of the fitted tree's splits and leaves
print_tree(tree)

Feature 5, Threshold 0.42245649843991573
L-> Feature 2, Threshold 0.7673383346503632
    L-> Feature 5, Threshold -0.31269060292564277
        L-> 2 : 332/369
        R-> 3 : 287/339
    R-> Feature 2, Threshold 0.9993691531743649
        L-> 1 : 88/110
        R-> 1 : 246/246
R-> Feature 5, Threshold 0.6042511496066855
    L-> Feature 2, Threshold 0.9421451740468422
        L-> 4 : 16/17
        R-> 1 : 2/2
    R-> 4 : 317/317


In [21]:
# Predicted room labels for the held-out test rows
yhat = DecisionTree.predict(tree, XX[test,:])

600-element Array{String,1}:
 "4"
 "1"
 "3"
 "3"
 "3"
 "1"
 "1"
 "1"
 "1"
 "4"
 "3"
 "4"
 "3"
 ⋮
 "4"
 "2"
 "4"
 "2"
 "1"
 "1"
 "2"
 "4"
 "4"
 "2"
 "3"
 "1"

In [22]:
# Test-set accuracy: share of held-out rows whose predicted label equals the true one
count(yhat .== y1[test]) / length(test)

0.9016666666666666

In [23]:
# 5-fold cross-validated score of the depth-3 tree on the full data set (all rows, not just train)
# NOTE: `accuracy` is reassigned by each of the later cross-validation cells
accuracy = cross_val_score(tree, XX, y1, cv=5)

5-element Array{Float64,1}:
 0.8075
 0.9075
 0.895
 0.89
 0.87

## Random Forest

In [24]:
# Random forest hyperparameters, passed positionally to build_forest below
n_subfeatures = 3 # size of the random feature subset (see DecisionTree.jl docs for exact semantics)
n_trees = 100 # number of trees in the ensemble
pst = 0.5 # portion of samples per tree
max_depth = 5 # depth limit for every tree

# Number of folds for the k-folds cross-validation later
n_folds = 5

5

In [25]:
# Classification forest fitted on the training rows with the hyperparameters defined above
forest1 = build_forest(y1[train], XX[train,:], n_subfeatures, n_trees, pst, max_depth) 

Ensemble of Decision Trees
Trees:      100
Avg Leaves: 19.11
Avg Depth:  5.0

In [26]:
# Hard label predictions of the forest on the test rows (overwrites the single-tree yhat)
yhat = apply_forest(forest1, XX[test,:])

600-element Array{String,1}:
 "4"
 "1"
 "3"
 "3"
 "3"
 "1"
 "1"
 "1"
 "1"
 "4"
 "3"
 "4"
 "3"
 ⋮
 "4"
 "2"
 "4"
 "2"
 "3"
 "1"
 "3"
 "4"
 "4"
 "3"
 "3"
 "1"

In [27]:
# Per-class vote shares of the forest for every test row.
# The label list is generated with string.(1:4), i.e. ["1", "2", "3", "4"],
# rather than typed out by hand, which is less error-prone.
scores = apply_forest_proba(forest1, XX[test,:], string.(1:4))

600×4 Array{Float64,2}:
 0.0   0.0   0.0   1.0
 1.0   0.0   0.0   0.0
 0.0   0.08  0.92  0.0
 0.0   0.0   1.0   0.0
 0.38  0.0   0.61  0.01
 0.95  0.0   0.05  0.0
 0.94  0.0   0.06  0.0
 1.0   0.0   0.0   0.0
 0.99  0.0   0.01  0.0
 0.0   0.0   0.0   1.0
 0.0   0.02  0.98  0.0
 0.0   0.0   0.0   1.0
 0.48  0.0   0.52  0.0
 ⋮                 
 0.0   0.0   0.0   1.0
 0.0   0.95  0.05  0.0
 0.0   0.0   0.0   1.0
 0.0   1.0   0.0   0.0
 0.28  0.24  0.48  0.0
 0.9   0.0   0.1   0.0
 0.02  0.03  0.95  0.0
 0.0   0.0   0.0   1.0
 0.0   0.0   0.0   1.0
 0.01  0.34  0.65  0.0
 0.0   0.0   1.0   0.0
 0.97  0.0   0.03  0.0

In [28]:
# 5-fold CV of a random forest classifier on the full data set.
# NOTE(review): only n_subfeatures is forwarded; the n_trees, pst and max_depth used for
# forest1 are not passed here, so the evaluated forest may be configured differently — confirm intent.
accuracy = nfoldCV_forest(y1, XX, n_folds, n_subfeatures)


Fold 1
Classes:  

4×4 Array{Int64,2}:
 98    0   6    3
  0  110   1    0
  3    6  72    0
  0    0   0  101

["1", "2", "3", "4"]
Matrix:   
Accuracy: 

4×4 Array{Int64,2}:
 98   0   3   0
  0  93   5   0
  7   3  94   0
  1   0   0  96

4×4 Array{Int64,2}:
 97   0   4   0
  0  99   5   0
  4   3  97   1
  1   0   0  89

4×4 Array{Int64,2}:
 93   0    5   0
  0  88    3   0
  3   6  103   0
  1   0    0  98

4×4 Array{Int64,2}:
 89   0   2    2
  0  92   4    0
  8   2  88    0
  1   0   0  112

0.9525
Kappa:    0.9363558711708845

Fold 2
Classes:  ["1", "2", "3", "4"]
Matrix:   
Accuracy: 0.9525
Kappa:    0.9366487171365219

Fold 3
Classes:  ["1", "2", "3", "4"]
Matrix:   
Accuracy: 0.955
Kappa:    0.939929918237944

Fold 4
Classes:  ["1", "2", "3", "4"]
Matrix:   
Accuracy: 0.955
Kappa:    0.9399028429294026

Fold 5
Classes:  ["1", "2", "3", "4"]
Matrix:   
Accuracy: 0.9525
Kappa:    0.9365439850377395

Mean Accuracy: 0.9535


5-element Array{Float64,1}:
 0.9525
 0.9525
 0.955
 0.955
 0.9525

In [29]:
# Regression forest on the numeric target y2.
# Positional arguments follow the forest1 call: 2 sub-features, 10 trees,
# 0.5 of the samples per tree, maximum depth 6. The trailing `;` hides the output.
forest2 = build_forest(y2[train], XX[train,:], 2, 10, 0.5, 6);

In [30]:
# Predicted y2 values for the test rows
apply_forest(forest2, XX[test,:])

600-element Array{Float64,1}:
 -3948.819493026562
 -3876.6288675525693
 -3583.6483643187325
 -3621.35618411009
 -3846.3458063156895
 -3868.703191237402
 -3801.9414883703976
 -3908.429371435269
 -3889.407077157227
 -3899.914885223649
 -3598.385699316858
 -4008.0506047809577
 -3832.6061529835206
     ⋮
 -3929.1303157389025
 -3263.4695959284445
 -3934.5917115481907
 -2873.1781737632637
 -3566.6734515157877
 -3761.027363102069
 -3579.6748735997426
 -3942.527884263847
 -3927.811619337516
 -3553.280286523631
 -3521.3816914589856
 -3906.643607454446

In [31]:
# n-fold CV of a regression forest on y2; despite its name, `accuracy` ends up holding
# the per-fold coefficient of determination.
# NOTE(review): forest2's settings (2, 10, 0.5, 6) are not forwarded here — confirm intent.
accuracy = nfoldCV_forest(y2, XX, n_folds, n_subfeatures)


Fold 1
Mean Squared Error:     17950.858988306354
Correlation Coeff:      0.923532000385329
Coeff of Determination: 0.8528094932150105

Fold 2
Mean Squared Error:     16790.317403176658
Correlation Coeff:      0.9337888874124171
Coeff of Determination: 0.8715635010502809

Fold 3
Mean Squared Error:     15693.581425879742
Correlation Coeff:      0.9350160892939363
Coeff of Determination: 0.8735000088272649

Fold 4
Mean Squared Error:     14657.578229120467
Correlation Coeff:      0.9417044760801345
Coeff of Determination: 0.8856121053806987

Fold 5
Mean Squared Error:     14988.991797622019
Correlation Coeff:      0.9416442487217922
Coeff of Determination: 0.8851928172659674

Mean Coeff of Determination: 0.8737355851478446


5-element Array{Float64,1}:
 0.8528094932150105
 0.8715635010502809
 0.8735000088272649
 0.8856121053806987
 0.8851928172659674

In [32]:
# Shared by build_adaboost_stumps and nfoldCV_stumps below
ni = 1000 # number of iterations for boosted stumps

1000

In [33]:
# AdaBoost over decision stumps on the training rows; returns the stump ensemble
# together with its coefficient vector
model, coeffs = build_adaboost_stumps(y1[train], XX[train,:], ni)

(Ensemble of Decision Trees
Trees:      1000
Avg Leaves: 2.0
Avg Depth:  1.0, [0.531371234336294, 0.282066042538007, 0.42263858633617607, 0.4335406914866351, 0.4194288606242588, 0.4510543906575151, 0.41410328002009905, 0.4796910690432544, 0.37606181270974615, 0.5423794601151405  …  0.6196390881284066, 0.47368700850728784, 0.6008230591901262, 0.567100338396701, 0.4989492554333649, 0.5642311483761737, 0.5891840426054491, 0.5378583948535957, 0.5313512384307626, 0.5899812297162252])

In [34]:
# Hard label predictions of the boosted stumps on the test rows
apply_adaboost_stumps(model, coeffs, XX[test,:])

600-element Array{String,1}:
 "4"
 "1"
 "3"
 "3"
 "3"
 "1"
 "1"
 "1"
 "1"
 "4"
 "3"
 "4"
 "1"
 ⋮
 "4"
 "2"
 "4"
 "2"
 "3"
 "1"
 "3"
 "4"
 "4"
 "3"
 "3"
 "1"

In [35]:
# Per-class scores of the boosted stumps for every test row;
# columns follow the label order "1", "2", "3", "4"
apply_adaboost_stumps_proba(model, coeffs, XX[test,:], string.(1:4))

600×4 Array{Float64,2}:
 0.259931  0.076194    0.252546  0.411329
 0.411337  0.17134     0.306905  0.110419
 0.148453  0.399431    0.415684  0.0364321
 0.291403  0.248754    0.423422  0.0364209
 0.3649    0.00304306  0.365213  0.266844
 0.419372  0.170514    0.356591  0.0535232
 0.410897  0.18735     0.351192  0.0505607
 0.426674  0.15522     0.337667  0.0804392
 0.417332  0.17227     0.278538  0.13186
 0.354702  0.00896967  0.259803  0.376526
 0.219777  0.303066    0.431685  0.045472
 0.348923  0.0         0.256075  0.395003
 0.394309  0.160249    0.343926  0.101517
 ⋮                               
 0.220583  0.102943    0.244437  0.432037
 0.154888  0.482961    0.329491  0.0326596
 0.258925  0.0740924   0.255925  0.411058
 0.159062  0.483986    0.324292  0.0326596
 0.332661  0.24475     0.415506  0.0070822
 0.412697  0.170514    0.385965  0.0308243
 0.305216  0.221501    0.415998  0.0572844
 0.227133  0.102943    0.241774  0.42815
 0.264108  0.0656333   0.259855  0.410404
 0.169183 

In [36]:
# 5-fold CV of AdaBoost stumps (ni boosting iterations) on the full data set
accuracy = nfoldCV_stumps(y1, XX, n_folds, ni)

4×4 Array{Int64,2}:
 103   0   8   1
   0  88   4   0
   7   6  82   0
   0   0   2  99


Fold 1
Classes:  ["1", "2", "3", "4"]
Matrix:   
Accuracy: 0.93
Kappa:    0.9065202150035054


4×4 Array{Int64,2}:
 83   0   3    1
  0  88   3    0
  6   5  99    0
  1   0   1  110


Fold 2
Classes:  ["1", "2", "3", "4"]
Matrix:   
Accuracy: 0.95
Kappa:    0.9331187560088617


4×4 Array{Int64,2}:
 87   0   6   1
  0  97   5   0
  5   4  95   1
  0   0   0  99


Fold 3
Classes:  ["1", "2", "3", "4"]
Matrix:   
Accuracy: 0.945
Kappa:    0.9266183570850808


4×4 Array{Int64,2}:
 106   0   7   1
   0  94   5   0
   6   8  77   0
   1   0   1  94


Fold 4
Classes:  ["1", "2", "3", "4"]
Matrix:   
Accuracy: 0.9275
Kappa:    0.9030991562943781


4×4 Array{Int64,2}:
 83    0   9   1
  0  111   5   0
  4    7  87   1
  1    0   0  91


Fold 5
Classes:  ["1", "2", "3", "4"]
Matrix:   
Accuracy: 0.93
Kappa:    0.9063333695733987

Mean Accuracy: 0.9364999999999999


5-element Array{Float64,1}:
 0.93
 0.95
 0.945
 0.9275
 0.93