# Regressão k-NN

## 1. Sem condensation

In [4]:
"""
    predict(X_train, y_train, x_test, k)

Predict the target of a single query point by k-nearest-neighbour regression.

# Arguments
- `X_train`: training matrix with one observation per row.
- `y_train`: targets, indexed like the rows of `X_train`.
- `x_test`: query point; its entries are matched against the leading columns of
  `X_train`.
- `k`: number of neighbours to average over.

# Returns
The mean of `y_train` over the `k` training rows closest to `x_test` in Euclidean
distance. Equidistant rows are taken in training order.

# Throws
- `ArgumentError` if `k` is not between 1 and the number of training rows.
"""
function predict(X_train, y_train, x_test, k)
    n = size(X_train, 1)
    1 <= k <= n || throw(ArgumentError("k must be between 1 and $n, got $k"))

    # Euclidean distance from the query point to every training row.
    distances = [
        sqrt(sum((x_test[j] - X_train[i, j])^2 for j in 1:length(x_test))) for i in 1:n
    ]

    # `sortperm` is stable, so ties between equidistant rows keep training order.
    nearest = sortperm(distances)[1:k]

    # Mean target of the k closest rows.
    return sum(y_train[i] for i in nearest) / k
end

predict (generic function with 1 method)

In [5]:
"""
    kNearestNeighbor(X_train, y_train, X_test, k)

Run k-nearest-neighbour regression for every row of `X_test`.

# Arguments
- `X_train`: training matrix with one observation per row.
- `y_train`: targets, indexed like the rows of `X_train`.
- `X_test`: matrix of query points, one per row.
- `k`: number of neighbours used for each prediction.

# Returns
A `Vector{Any}` holding one prediction per row of `X_test`, or `nothing` (after
printing a message) when `k` exceeds the number of training rows.
"""
function kNearestNeighbor(X_train, y_train, X_test, k)
    # Compare k with the number of training rows. `length(X_train)` would count
    # every matrix entry (rows × columns) and let an oversized k slip through.
    if k > size(X_train, 1)
        println("O número de pontos deve ser maior que k")
        return nothing
    end

    # Element type is left as `Any` so the returned container type is unchanged.
    predictions = Any[]
    for i in 1:size(X_test, 1)
        push!(predictions, predict(X_train, y_train, X_test[i, :], k))
    end
    return predictions
end

kNearestNeighbor (generic function with 1 method)

In [6]:
# Training features: one observation per row, two columns.
X_train = [
    5.0   45.0
    5.11  26.0
    5.6   30.0
    5.9   34.0
    4.8   40.0
    5.8   36.0
    5.3   19.0
    5.8   28.0
    5.5   23.0
    5.6   32.0
]

10×2 Array{Float64,2}:
 5.0   45.0
 5.11  26.0
 5.6   30.0
 5.9   34.0
 4.8   40.0
 5.8   36.0
 5.3   19.0
 5.8   28.0
 5.5   23.0
 5.6   32.0

In [7]:
# Training targets: ten values, in the same order as the rows of X_train.
Y_train = [77, 47, 55, 59, 72, 60, 40, 60, 45, 58]

10-element Array{Int64,1}:
 77
 47
 55
 59
 72
 60
 40
 60
 45
 58

In [8]:
# A single query point, stored as a 1×2 row matrix.
X_test = [5.5 38.0]

1×2 Array{Float64,2}:
 5.5  38.0

In [9]:
# Predict the target of each row of X_test from its 3 nearest training rows.
kNearestNeighbor(X_train, Y_train, X_test, 3)

1-element Array{Any,1}:
 63.6667

## 2. Com condensation

## 3. K-means

In [10]:
"""
    kmeans(X, k, max_iter = 100, threshold = 0.001)

Cluster the columns of `X` into `k` groups by alternating assignment and
centroid-update steps (k-means).

# Arguments
- `X`: matrix whose columns are the points to cluster.
- `k`: number of clusters; the starting centroids are `k` distinct columns of `X`.
- `max_iter`: maximum number of assignment/update rounds.
- `threshold`: stop as soon as the summed squared centroid displacement of a
  round falls below this value.

# Returns
A tuple `(centroids, cluster_ids)`: `centroids` has one column per cluster, and
`cluster_ids[i]` is the `Int32` index of the centroid assigned to column `i` of `X`.
"""
function kmeans(X, k, max_iter = 100, threshold = 0.001)
    n_dims, n_points = size(X, 1), size(X, 2)

    # Pick k points (columns) of X without replacement as the starting centroids.
    # NOTE(review): `sample` is not defined in this section; presumably it is
    # StatsBase.sample brought into scope by an earlier cell — confirm.
    centroids = X[:, sample(1:n_points, k, replace = false)]
    n_clusters = size(centroids, 2)

    # Working copy, compared against `centroids` to measure movement per round.
    new_centroids = copy(centroids)

    # Cluster assignment for each point (column) of X.
    cluster_ids = zeros(Int32, n_points)

    for _ in 1:max_iter
        # Assignment step: attach every point to its closest centroid.
        for col_idx in 1:n_points
            p = X[:, col_idx]

            # Squared Euclidean distance from the point to each centroid, kept as
            # a plain vector so `findmin` yields an integer index.
            distances = [sum(abs2, centroids[:, c] - p) for c in 1:n_clusters]

            cluster_ids[col_idx] = findmin(distances)[2]
        end

        # Update step: move each centroid to the mean of its assigned points.
        for cluster_id in 1:n_clusters
            members = cluster_ids .== cluster_id
            n_members = sum(members)

            # An empty cluster keeps its previous centroid; averaging zero
            # points would otherwise turn it into NaN.
            if n_members > 0
                for row in 1:n_dims
                    new_centroids[row, cluster_id] = sum(X[row, members]) / n_members
                end
            end
        end

        # Total squared distance the centroids moved in this round.
        center_change = sum(abs2, new_centroids .- centroids)

        centroids = copy(new_centroids)

        # Stop once the centroids move negligibly.
        if center_change < threshold
            break
        end
    end

    return centroids, cluster_ids
end

kmeans (generic function with 3 methods)

In [11]:
"""
    RBFNetwork(X_train, Y_train, k, r)

Fit the output weights of a Gaussian radial-basis-function network by least squares.

# Arguments
- `X_train`: training matrix with one observation per row.
- `Y_train`: targets, one per row of `X_train`.
- `k`: number of RBF centres, obtained by running `kmeans` on the observations.
- `r`: width of the Gaussian basis functions.

# Returns
The weight vector `w` of length `k + 1`: the bias weight followed by one weight
per centre.
"""
function RBFNetwork(X_train, Y_train, k, r)
    n_obs, n_features = size(X_train, 1), size(X_train, 2)

    # `kmeans` clusters the *columns* of its input and returns the tuple
    # (centroids, cluster_ids). Pass the observations as columns and keep only
    # the centroids, which come back with one column per centre.
    centroids = kmeans(transpose(X_train), k)[1]

    # Design matrix: a constant bias column, then one Gaussian activation per centre.
    G = zeros(n_obs, k + 1)
    for i in 1:n_obs
        G[i, 1] = 1
        for j in 1:k
            # Squared Euclidean distance between observation i and centre j.
            sq_dist = sum((centroids[f, j] - X_train[i, f])^2 for f in 1:n_features)
            G[i, j + 1] = exp(-sq_dist / (2 * r^2))
        end
    end

    # Least-squares solve of G * w ≈ Y_train without explicitly inverting G'G.
    w = G \ Y_train
    return w
end

RBFNetwork (generic function with 1 method)

In [12]:
using RDatasets

In [20]:
# List the datasets of the "car" package, sort the listing in place by its
# `Columns` column, then show it.
carData = RDatasets.datasets("car");
sort!(carData, :Columns);
display(carData);

Unnamed: 0,Package,Dataset,Title,Rows,Columns
1,car,CanPop,Canadian Population Data,16,2
2,car,Sahlins,Agricultural Production in Mazulu Village,20,2
3,car,USPop,Population of the United States,22,2
4,car,Friendly,Format Effects on Recall,30,2
5,car,Mandel,Contrived Collinear Data,8,3
6,car,Guyer,Anonymity and Cooperation,20,3
7,car,Burt,Fraudulent Data on IQs of Twins Raised Apart,27,3
8,car,Adler,Experimenter Expectations,97,3
9,car,UN,GDP and Infant Mortality,207,3
10,car,Transact,Transaction data,261,3


In [24]:
# Load the "Leinhardt" dataset from the "car" package of RDatasets.
Leinhardt = RDatasets.dataset("car","Leinhardt")

Unnamed: 0,Nation,Income,Infant,Region,Oil
1,Australia,3426,26.7,Asia,no
2,Austria,3350,23.7,Europe,no
3,Belgium,3346,17.0,Europe,no
4,Canada,4751,16.8,Americas,no
5,Denmark,5029,13.5,Europe,no
6,Finland,3312,10.1,Europe,no
7,France,3403,12.9,Europe,no
8,West.Germany,5040,20.4,Europe,no
9,Ireland,2009,17.8,Europe,no
10,Italy,2298,25.7,Europe,no


In [27]:
# Select columns 2 and 3 of the table, for all rows.
Leinhardt[:, 2:3]

Unnamed: 0,Income,Infant
1,3426,26.7
2,3350,23.7
3,3346,17.0
4,4751,16.8
5,5029,13.5
6,3312,10.1
7,3403,12.9
8,5040,20.4
9,2009,17.8
10,2298,25.7
