In [39]:
println("Hello World!")

# Assign the current working directory path to a variable
current_directory = pwd()

# String concatenation is accomplished using commas
println("Currently, this notebook is running in the directory ", current_directory)

Hello World!
Currently, this notebook is running in the directory /Users/xronos/julia-for-data-science


In [40]:
# To install a package, run the following line in this notebook or from the Julia REPL.
# import Pkg; Pkg.add("CSV")

# If the CSV package is already installed, this is the only line necessary to use it in a program.
using CSV 

In [41]:
# Let's pull in some data for analysis
# This file is from https://archive.ics.uci.edu/ml/datasets/magic+gamma+telescope
data = CSV.File("./data/Magic/magic04.csv") 

print(data)

CSV.File("./data/Magic/magic04.csv"):
Size: 19019 x 11
Tables.Schema:
 Symbol("28.7967")  Float64
 Symbol("16.0021")  Float64
 Symbol("2.6449")   Float64
 Symbol("0.3918")   Float64
 Symbol("0.1982")   Float64
 Symbol("27.7004")  Float64
 Symbol("22.011")   Float64
 Symbol("-8.2027")  Float64
 Symbol("40.092")   Float64
 Symbol("81.8828")  Float64
 :g                 String 

We'll use the `magic` dataset and classify the observed radiation of the telescope as `gamma` or `hadron`.

## k-Nearest Neighbors Classification
The diagram of the kNN algorithm is below.

<img src="./kNN-diagram.jpeg" alt="kNN diagram">

In [42]:
# Distance should take in two dimensional vectors and return a single number (float).
# The commented-out implementation specifies input data types and the return data type
# function distance(x::Array{Float64, 1}, y::Array{Float64, 1})::Float64
#     distance = 0
#     for in in 1:length(x)
#         dist += (x[i] - y[i])^2
#     end
#     distance = sqrt(distance)
#     return distance
# end

function distance(x, y)
    distance = 0
    for i in 1:length(x)
        distance += (x[i] -y[i])^2
    end
    distance = sqrt(distance)
    return distance
end

distance (generic function with 3 methods)

In [43]:
# Define the implementation of the classify function
function classify(distances, labels)
    # Find all of the distinct classes
    class = unique(labels)

    # Determine the number of classees
    number_of_classes = length(class) 
    
    # Initialize the vector of indexes
    indexes = Array(Int, 0)
    
    # Specify the largest possible number that this vector can have
    
    M = maxtype(typeof(distances[1]))
    class_count = Array(Int, nc)
    for i in 1:k
        indexes[i] = inmin(distances)
        
        # make sure this element is not selected again
        distances[indexes[i]] = M
    end
    
    klabels = labe3ls[indexes]
    for i in 1:nc
        for j in 1:k
            if klabels[j] == class[i]
                class_count[i] += 1
                break
            end
        end
    end
    index = inmax(class_count)
    return class[index]
end


classify (generic function with 1 method)

In [50]:
# Another auxiliary function of the implementation of the kNN algorithm. This one performs classification of a point based on its distances from the known points of the dataset.
function apply_kNN(X, Y, k)

    N = size(X, 1) # Number of known datapoints
    n = size(Y, 1) # number of datapoints to classify
    D = Array(Float, N) # Initialize the distance vector
    z = Array(typeof(x[1]), n) # Initialize the labels vector (output)
    for i in 1:n
        for j in 1:N
            D[j] = distance(X[j,:], Y[i,:])
        end
        z[i] = classify(D, x, k)
    end
    return z
end
    

apply_kNN (generic function with 2 methods)

In [69]:
print(data)

CSV.File("./data/Magic/magic04.csv"):
Size: 19019 x 11
Tables.Schema:
 Symbol("28.7967")  Float64
 Symbol("16.0021")  Float64
 Symbol("2.6449")   Float64
 Symbol("0.3918")   Float64
 Symbol("0.1982")   Float64
 Symbol("27.7004")  Float64
 Symbol("22.011")   Float64
 Symbol("-8.2027")  Float64
 Symbol("40.092")   Float64
 Symbol("81.8828")  Float64
 :g                 String 

In [91]:
using DataFrames

# Create a dataframe out of the CSV data object
data = DataFrame(data)

Unnamed: 0_level_0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648
5,51.624,21.1502,2.9085,0.242,0.134,50.8761,43.1887,9.8145,3.613
6,48.2468,17.3565,3.0332,0.2529,0.1515,8.573,38.0957,10.5868,4.792
7,26.7897,13.7595,2.5521,0.4236,0.2174,29.6339,20.456,-2.9292,0.812
8,96.2327,46.5165,4.154,0.0779,0.039,110.355,85.0486,43.1844,4.854
9,46.7619,15.1993,2.5786,0.3377,0.1913,24.7548,43.8771,-6.6812,7.875
10,62.7766,29.9104,3.3331,0.2475,0.1261,-33.9065,57.5848,23.771,9.9144


In [92]:
# Set I to hold all of the numeric columns
# Al columns of the data matrix, apart from the last one and convert everything into a Float. Result = 10-dim Array of Float numbers
I = data[:, 1:(end-1)]

Unnamed: 0_level_0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648
5,51.624,21.1502,2.9085,0.242,0.134,50.8761,43.1887,9.8145,3.613
6,48.2468,17.3565,3.0332,0.2529,0.1515,8.573,38.0957,10.5868,4.792
7,26.7897,13.7595,2.5521,0.4236,0.2174,29.6339,20.456,-2.9292,0.812
8,96.2327,46.5165,4.154,0.0779,0.039,110.355,85.0486,43.1844,4.854
9,46.7619,15.1993,2.5786,0.3377,0.1913,24.7548,43.8771,-6.6812,7.875
10,62.7766,29.9104,3.3331,0.2475,0.1261,-33.9065,57.5848,23.771,9.9144


In [95]:
# Set O variable to hold the column of labels. "g" for "gamma" and "h" for "hadron"
O = data[:, end]

19019-element PooledArrays.PooledArray{String,UInt32,1,Array{UInt32,1}}:
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 ⋮  
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"