## Importer les librairies

In [None]:
using CSV, DataFrames, Distributions, Gadfly, GLM
using Distances
import StatsBase       # Pour la standardisation des variables

## Traiter les données

In [6]:
# Load the training table; the last column is taken as the target.
data = CSV.read("../train.csv")
y_train = data[end]

temp_X_train = data[2:end-1] # drop the first (id) and last (diagnosis) columns
# Estimate the standardisation parameters (z-score) on the training features
tx = StatsBase.fit(StatsBase.ZScoreTransform, Matrix{Float64}(temp_X_train), dims=1)
# Standardise the training features with those parameters
X_train = StatsBase.transform(tx, Matrix{Float64}(temp_X_train));

# The test features are transformed with the transform fitted on the training
# set (`tx`); it is not re-fitted on the test data.
data_test = CSV.read("../test.csv")
X_test = StatsBase.transform(tx, Matrix{Float64}(data_test[2:end]));

# First column of the test table, reused as the `id` column of the output file
id_test = data_test[1]


# Print the dimensions of both standardised matrices
println(size(X_train))
println(size(X_test))


(455, 10)
(114, 10)


## Faire le modèle KNN

In [7]:
"""
    calculateDist(vec1, vec2)

Return the distance between `vec1` and `vec2`, as computed by `euclidean`.
"""
calculateDist(vec1, vec2) = euclidean(vec1, vec2)

"""
    findIndexesNBiggest(n, tab)

Return the indices, relative to `tab`, of its `n` smallest values, ordered from
the smallest value upward. Despite its name, the function selects the smallest
entries (the nearest neighbours when `tab` holds distances).

If `n` exceeds `length(tab)`, every index is returned; if `n <= 0` or `tab` is
empty, an empty vector is returned. `tab` is not modified.
"""
function findIndexesNBiggest(n, tab)
    wanted = min(n, length(tab))
    wanted <= 0 && return Int[]
    # Indices must refer to the original vector: removing elements from a
    # working copy would shift the positions of everything after them.
    return collect(partialsortperm(tab, 1:wanted))
end

"""
    predict(k, X_train, y_train, X_test)

Classify a single sample `X_test` by a vote among the `k` rows of `X_train`
selected by `findIndexesNBiggest` from the distances to `X_test`.

Returns `Int8(1)` when fewer than half of the `k` votes are for label `0`,
otherwise `Int8(0)`. Works best with an odd `k`, which avoids ties.
"""
function predict(k, X_train, y_train, X_test) #works best with odd k
    # Distance from the sample to every training row
    dists = [calculateDist(X_test, X_train[row, :]) for row in 1:size(X_train)[1]]

    neighbours = findIndexesNBiggest(k, dists)

    # Votes for class 0; all remaining votes (out of k) count for class 1
    zero_votes = count(i -> y_train[i] == 0, neighbours)

    return Int8(zero_votes < k - zero_votes)
end

"""
    knn(k, X_train, y_train, X_test)

Run `predict` on every row of `X_test` and return the predictions, in row
order, as a `Vector{Any}`.
"""
function knn(k, X_train, y_train, X_test)
    nb_samples = size(X_test)[1]
    return Any[predict(k, X_train, y_train, X_test[row, :]) for row in 1:nb_samples]
end




"KNN.csv"

## K-cross validation

In [None]:
# K-cross validation
"""
    findAllIndexes(length, nb_blocks)

Split the index range `1:length` into `nb_blocks` consecutive blocks and return
a vector of `(first, last)` tuples, one per block.

Block `i` spans `floor((i-1)*length/nb_blocks) + 1` to
`floor(i*length/nb_blocks)`, so block sizes differ by at most one and together
they cover `1:length` exactly. Bounds are returned as `Int`, so there is no
upper limit on `length` other than the range of `Int`.
"""
function findAllIndexes(length, nb_blocks)
    # `fld` is exact floor division on integers; `Int(...)` also accepts
    # whole-valued floating-point inputs.
    block_start(i) = Int(fld((i - 1) * length, nb_blocks)) + 1
    block_stop(i) = Int(fld(i * length, nb_blocks))
    return [(block_start(i), block_stop(i)) for i in 1:nb_blocks]
end

"""
    countTFPN(t_label, predictions)

Tally the confusion-matrix cells for binary labels, treating `1` as the
positive class and `0` as the negative class.

Returns the tuple `(TP, FP, FN, TN)`. Positions where either the true label or
the prediction is neither `0` nor `1` are not counted.
"""
function countTFPN(t_label, predictions)
    true_pos = false_pos = false_neg = true_neg = 0
    for idx in 1:size(t_label)[1]
        actual = t_label[idx]
        if actual == 1
            guess = predictions[idx]
            if guess == 1
                true_pos += 1
            elseif guess == 0
                false_neg += 1
            end
        elseif actual == 0
            guess = predictions[idx]
            if guess == 1
                false_pos += 1
            elseif guess == 0
                true_neg += 1
            end
        end
    end
    return true_pos, false_pos, false_neg, true_neg
end

"""
    computeMetrics(t_label, predictions)

Return `[precision, recall, accuracy]` computed from the counts given by
`countTFPN(t_label, predictions)`.

A ratio whose denominator is zero (for example precision when nothing was
predicted positive) evaluates to `NaN` rather than raising an error.
"""
function computeMetrics(t_label, predictions)
    tp, fp, fneg, tn = countTFPN(t_label, predictions)
    prec = tp / (tp + fp)
    rec = tp / (tp + fneg)
    acc = (tp + tn) / (tp + fp + fneg + tn)
    return [prec, rec, acc]
end

"""
    split_train_test(X, y, index)

Split `X` (rows are samples) and `y` into a training part and a held-out part.

`index` is a `(first, last)` pair of row numbers. Rows `first:last` form the
held-out set; all other rows, in their original order, form the training set.

Returns `(sub_X_train, sub_X_test, sub_y_train, true_y_test)`.
"""
function split_train_test(X, y, index)
    first_row, last_row = index[1], index[2]
    held_out = first_row:last_row

    # The training part resumes at `last_row + 1`, so that no held-out row is
    # also seen during training.
    sub_X_train = vcat(X[1:first_row-1, :], X[last_row+1:end, :])
    sub_y_train = vcat(y[1:first_row-1], y[last_row+1:end])

    sub_X_test = X[held_out, :]
    true_y_test = y[held_out]
    return sub_X_train, sub_X_test, sub_y_train, true_y_test
end


"""
    KCrossValidation(k, X, y, nb_blocks = 5)

Evaluate the `k`-nearest-neighbour classifier on `(X, y)` by cross-validation
over `nb_blocks` consecutive blocks of rows.

Each block of `X` is held out in turn, predicted with `knn` from the remaining
rows, and scored with `computeMetrics`. Returns the element-wise mean of the
per-block `[precision, recall, accuracy]` vectors.
"""
function KCrossValidation(k, X, y, nb_blocks = 5)
    # Fold boundaries come from the data actually being validated (`X`), not
    # from any global matrix.
    fold_bounds = findAllIndexes(size(X, 1), nb_blocks)
    fold_metrics = map(fold_bounds) do interval
        X_fit, X_val, y_fit, y_val = split_train_test(X, y, interval)
        y_pred = knn(k, X_fit, y_fit, X_val)
        return computeMetrics(y_val, y_pred)
    end
    return mean(fold_metrics)
end


## Trouver le K qui minimise l'erreur selon les métriques

In [None]:
# Candidate neighbourhood sizes (odd values only, to avoid tied votes)
gen_temp = 1:2:9
ks = collect(gen_temp)

y_titles = ["precision", "recall", "accuracy"]

# metrics[m][j] is the m-th cross-validated score (in the order of `y_titles`)
# obtained with ks[j] neighbours. Both `ks` and `metrics` are derived from
# `gen_temp`, so they always have matching lengths, whatever range is chosen.
metrics = let scores_per_k = [KCrossValidation(k, X_train, y_train) for k in gen_temp]
    [[scores[m] for scores in scores_per_k] for m in eachindex(first(scores_per_k))]
end


In [None]:
# Line plot of the first metric (precision) against the number of neighbours K
plot(x=ks, y=metrics[1], Geom.line, Guide.xlabel("K"), Guide.ylabel(y_titles[1]))

In [None]:
# Line plot of the second metric (recall) against the number of neighbours K
plot(x=ks, y=metrics[2], Geom.line, Guide.xlabel("K"), Guide.ylabel(y_titles[2]))

In [None]:
# Line plot of the third metric (accuracy) against the number of neighbours K
plot(x=ks, y=metrics[3], Geom.line, Guide.xlabel("K"), Guide.ylabel(y_titles[3]))

## Rouler le code avec ce K

In [None]:
# Number of neighbours used for the final predictions (hard-coded here;
# presumably chosen from the cross-validation plots — confirm).
k = 1
# Predict a label for every row of the standardised test matrix
y_test = knn(k, X_train, y_train, X_test)
# Pair each test id with its predicted diagnosis and write the table to disk
prediction = DataFrame(id = id_test, diagnosis = y_test)
CSV.write("KNN.csv",prediction)