# Chapter 5 of Julia for Machine Learning

In [5]:
using CSV, DataFrames, Clustering, DecisionTree, GLM, XGBoost
using StatsBase, Distributions, HypothesisTests, MultivariateStats
using Distances, MLLabelUtils, MLBase
using TSne, Gadfly
using ScikitLearn.CrossValidation: cross_val_score
using Random

In [20]:
# Dataset is WiFI data
df = CSV.read("localization.csv", header=false)

# Show 4 rows, all columns
df[1:4, 1:end]

Unnamed: 0_level_0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64
1,-64,-56,-61,-66,-71,-82,-81,1
2,-68,-57,-61,-65,-71,-85,-85,1
3,-63,-60,-60,-67,-76,-85,-84,1
4,-61,-60,-68,-62,-77,-90,-80,1


In [21]:
df[:RegressionTarget] = Matrix(df[[1, 4, 6, 7]])*[5, 10, 15, 20] + 0.1*randn(2000)

2000-element Array{Float64,1}:
 -3830.076403892874
 -3965.2180884436843
 -3939.8869787581857
 -3875.0521950907514
 -3900.0130214272585
 -3960.0524501791515
 -3979.9618163176488
 -3909.877879990196
 -3885.0159706940613
 -4100.076892982874
 -4070.0383869957386
 -3905.0575928379626
 -3904.9761602550916
     ⋮
 -3814.9587355852414
 -3655.0249148548
 -3779.8978150118974
 -3720.048324591379
 -3969.8369667628403
 -4010.0494197155276
 -4000.183968429349
 -4125.085632528229
 -4019.946848662353
 -4024.9163175883937
 -3970.129805998784
 -3954.982839002469

In [23]:
# Standardize/scale the dataset
X = StatsBase.standardize(ZScoreTransform,map(Float64, Matrix(df[1:7])), dims=2)

2000×7 Array{Float64,2}:
 0.478077   1.28936   0.782308   0.275256    -0.231795   -1.34731   -1.2459
 0.207303   1.20495   0.84217    0.479389    -0.0647823  -1.33451   -1.33451
 0.708444   0.98395   0.98395    0.341103    -0.485416   -1.31193   -1.2201
 0.884579   0.971791  0.274095   0.797367    -0.510813   -1.64457   -0.772449
 0.739762   0.551459  1.02222    0.739762    -0.578359   -0.954965  -1.51987
 0.563639   1.31916   0.647586   0.395747    -0.443716   -1.45107   -1.03134
 0.605834   1.00033   0.605834   0.408586     0.211337   -1.5639    -1.26802
 0.823905   0.643676  1.09425    0.373332    -0.347585   -1.51908   -1.0685
 0.465976   0.919009  1.00962    0.647189    -0.530695   -1.43676   -1.07433
 0.918708   1.08152   0.59309    0.430281    -0.546573   -1.035     -1.44202
 0.489995   1.01768   0.929733   0.489995    -0.389483   -0.917169  -1.62075
 0.50721    1.09895   0.901707   0.309962    -0.183159   -1.5639    -1.07078
 0.598345   1.14466   0.78045    0.41624     -0.31218

In [24]:
# Rename the columns
old_names = names(df)
new_names = map(Symbol, ["WiFi1", "WiFi2", "WiFi3", "WiFi4", "WiFi5", "WiFi6", "WiFi7", "Room"])
for i = 1:8
    rename!(df, old_names[i] => new_names[i])
end
names(df)

9-element Array{String,1}:
 "WiFi1"
 "WiFi2"
 "WiFi3"
 "WiFi4"
 "WiFi5"
 "WiFi6"
 "WiFi7"
 "Room"
 "RegressionTarget"

In [25]:
# first looks like .head
first(df, 5)

Unnamed: 0_level_0,WiFi1,WiFi2,WiFi3,WiFi4,WiFi5,WiFi6,WiFi7,Room,RegressionTarget
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Float64
1,-64,-56,-61,-66,-71,-82,-81,1,-3830.08
2,-68,-57,-61,-65,-71,-85,-85,1,-3965.22
3,-63,-60,-60,-67,-76,-85,-84,1,-3939.89
4,-61,-60,-68,-62,-77,-90,-80,1,-3875.05
5,-63,-65,-60,-63,-77,-81,-87,1,-3900.01


In [36]:
CSV.write("./wifi_named.csv", df)

"./wifi_named.csv"

In [30]:
# Standardize/scale the dataset
X = StatsBase.standardize(ZScoreTransform,map(Float64, Matrix(df[1:7])), dims=2)

2000×7 Array{Float64,2}:
 0.478077   1.28936   0.782308   0.275256    -0.231795   -1.34731   -1.2459
 0.207303   1.20495   0.84217    0.479389    -0.0647823  -1.33451   -1.33451
 0.708444   0.98395   0.98395    0.341103    -0.485416   -1.31193   -1.2201
 0.884579   0.971791  0.274095   0.797367    -0.510813   -1.64457   -0.772449
 0.739762   0.551459  1.02222    0.739762    -0.578359   -0.954965  -1.51987
 0.563639   1.31916   0.647586   0.395747    -0.443716   -1.45107   -1.03134
 0.605834   1.00033   0.605834   0.408586     0.211337   -1.5639    -1.26802
 0.823905   0.643676  1.09425    0.373332    -0.347585   -1.51908   -1.0685
 0.465976   0.919009  1.00962    0.647189    -0.530695   -1.43676   -1.07433
 0.918708   1.08152   0.59309    0.430281    -0.546573   -1.035     -1.44202
 0.489995   1.01768   0.929733   0.489995    -0.389483   -0.917169  -1.62075
 0.50721    1.09895   0.901707   0.309962    -0.183159   -1.5639    -1.07078
 0.598345   1.14466   0.78045    0.41624     -0.31218

In [35]:
X

2000×7 Array{Float64,2}:
 0.478077   1.28936   0.782308   0.275256    -0.231795   -1.34731   -1.2459
 0.207303   1.20495   0.84217    0.479389    -0.0647823  -1.33451   -1.33451
 0.708444   0.98395   0.98395    0.341103    -0.485416   -1.31193   -1.2201
 0.884579   0.971791  0.274095   0.797367    -0.510813   -1.64457   -0.772449
 0.739762   0.551459  1.02222    0.739762    -0.578359   -0.954965  -1.51987
 0.563639   1.31916   0.647586   0.395747    -0.443716   -1.45107   -1.03134
 0.605834   1.00033   0.605834   0.408586     0.211337   -1.5639    -1.26802
 0.823905   0.643676  1.09425    0.373332    -0.347585   -1.51908   -1.0685
 0.465976   0.919009  1.00962    0.647189    -0.530695   -1.43676   -1.07433
 0.918708   1.08152   0.59309    0.430281    -0.546573   -1.035     -1.44202
 0.489995   1.01768   0.929733   0.489995    -0.389483   -0.917169  -1.62075
 0.50721    1.09895   0.901707   0.309962    -0.183159   -1.5639    -1.07078
 0.598345   1.14466   0.78045    0.41624     -0.31218

In [37]:
df = CSV.read("wifi_named.csv", header=true)

first(df, 3)

Unnamed: 0_level_0,WiFi1,WiFi2,WiFi3,WiFi4,WiFi5,WiFi6,WiFi7,Room,RegressionTarget
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Float64
1,-64,-56,-61,-66,-71,-82,-81,1,-3830.08
2,-68,-57,-61,-65,-71,-85,-85,1,-3965.22
3,-63,-60,-60,-67,-76,-85,-84,1,-3939.89


In [38]:
# Standardize/scale the 7 signal features
X = StatsBase.standardize(ZScoreTransform,map(Float64, Matrix(df[1:7])), dims=2)

# Show first 5 rows of all columns of X
X[1:3, 1:end]

3×7 Array{Float64,2}:
 0.478077  1.28936  0.782308  0.275256  -0.231795   -1.34731  -1.2459
 0.207303  1.20495  0.84217   0.479389  -0.0647823  -1.33451  -1.33451
 0.708444  0.98395  0.98395   0.341103  -0.485416   -1.31193  -1.2201

In [39]:
XX = map(Float64, Matrix(df[1:7]))
XX[1:3, 1:end]

3×7 Array{Float64,2}:
 -64.0  -56.0  -61.0  -66.0  -71.0  -82.0  -81.0
 -68.0  -57.0  -61.0  -65.0  -71.0  -85.0  -85.0
 -63.0  -60.0  -60.0  -67.0  -76.0  -85.0  -84.0

In [40]:
X = StatsBase.standardize(ZScoreTransform, XX, dims=2)
X[1:3, 1:end]

3×7 Array{Float64,2}:
 0.478077  1.28936  0.782308  0.275256  -0.231795   -1.34731  -1.2459
 0.207303  1.20495  0.84217   0.479389  -0.0647823  -1.33451  -1.33451
 0.708444  0.98395  0.98395   0.341103  -0.485416   -1.31193  -1.2201

In [41]:
# So XX is the un-scaled data as floats
# And X is the standardized data

In [42]:
df2 = DataFrame(hcat(X, df[:Room]))
first(df2, 3)

Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.478077,1.28936,0.782308,0.275256,-0.231795,-1.34731,-1.2459,1.0
2,0.207303,1.20495,0.84217,0.479389,-0.0647823,-1.33451,-1.33451,1.0
3,0.708444,0.98395,0.98395,0.341103,-0.485416,-1.31193,-1.2201,1.0


In [43]:
# Rename the columns
old_names = names(df2)
new_names = map(Symbol, ["WiFi1", "WiFi2", "WiFi3", "WiFi4", "WiFi5", "WiFi6", "WiFi7", "Room"])
for i = 1:8
    rename!(df2, old_names[i] => new_names[i])
end
names(df2)

8-element Array{String,1}:
 "WiFi1"
 "WiFi2"
 "WiFi3"
 "WiFi4"
 "WiFi5"
 "WiFi6"
 "WiFi7"
 "Room"

In [None]:
CSV.write("./wifi_standardized.csv", df2)