## Importer les librairies

In [None]:
using CSV
using DataFrames
using Distributions
using Gadfly
using GLM
using Plots
using Distances
import StatsBase       # Pour la standardisation des variables
using LinearAlgebra
using Statistics
include("functions.jl")

## Traiter les données

In [None]:
# Load the labelled training set. The first column (id) and the last column
# (diagnosis) are not features.
data = CSV.read("../train.csv")
y_train = data[end]

X_train = data[2:end-1]  # omit id and diagnosis

# Load the test set; its first column is kept aside as the submission id.
data_test = CSV.read("../test.csv")
X_test = data_test[2:end]

id_test = data_test[1]

println(size(X_train))
println(size(X_test))

# Row counts of each set, needed to split the stacked matrix back apart later.
length_train = size(X_train)[1]
length_test = size(X_test)[1]  # was computed from X_train by mistake

# Stack train and test rows so that both go through the same preprocessing.
new_X = vcat(X_train, X_test)


In [None]:
# Convert the stacked feature table to a Float64 array, then standardize it.
X = convert(Array{Float64}, new_X)
Z = standardize(X)

# Singular value decomposition of the rectangular matrix Z.
F = svd(Z)

# Unpack the factors: U, V and the singular values γ.
U, V, γ = F.U, F.V, F.S

## Visualiser les données

<img id="myimage" src="pair.png" style=" height:500px; width:500px;">





## Version Agrandie par souci de lisibilité


<div class="img-zoom-container" style="overflow:auto; height:550px; width:5669px;">
  <img id="myimage" src="pair.png" style=" height:5669px; width:5669px;">
</div>


In [None]:
Gadfly.set_default_plot_size(150cm, 150cm)

# Feature columns: every column except the first and the last one.
column = names(data)[2:end-1]

# Pair-plot grid, sized from the number of feature columns rather than a
# hard-coded 10 so it cannot overflow or keep undefined cells.
matrix = Array{Plot}(undef, length(column), length(column))

# Histogram of the feature on the diagonal, scatter plot of the two features
# elsewhere. `enumerate` supplies the grid position, so no counter has to be
# mutated from inside the loops.
for (i, c1) in enumerate(column), (j, c2) in enumerate(column)
    if i == j
        matrix[i, j] = Gadfly.plot(
            data, x = c1, Geom.histogram(bincount = 30), color = :diagnosis
        )
    else
        matrix[i, j] = Gadfly.plot(data, x = c1, y = c2, color = :diagnosis)
    end
end

# Rendering the whole grid is so computationally heavy that it makes Jupyter
# misbehave, so the call stays disabled.
#grid = gridstack(matrix)

println("Graphing is done")

## Faire le modèle KNN

L'algorithme des k plus proches voisins (K-nearest neighbors) est un algorithme de classification assez simple. Il repose sur le principe suivant : si une nouvelle donnée, dont nous ne connaissons pas la classe, se trouve proche d'autres données dont nous connaissons la classe, alors elle devrait appartenir à la classe majoritaire parmi ces données voisines.

In [None]:
"""
    calculateDist(vec1, vec2)

Return the squared Euclidean distance between `vec1` and `vec2`, i.e. the sum
of the squared element-wise differences (no square root is taken).

Returns `0` when both inputs are empty.

# Throws
- `DimensionMismatch` if the two inputs do not have the same length. Without
  this check the extra elements of a longer `vec2` would be silently ignored.
"""
function calculateDist(vec1, vec2)
    if length(vec1) != length(vec2)
        throw(DimensionMismatch(
            "vectors must have the same length, got $(length(vec1)) and $(length(vec2))"
        ))
    end
    # An empty sum has no element to start from; keep the neutral result explicit.
    isempty(vec1) && return 0
    return sum((a - b)^2 for (a, b) in zip(vec1, vec2))
end

"""
    findMin(n, tab, used_indexes)

Return the index of the smallest element of `tab` whose index is not in
`used_indexes`. On ties the lowest index wins.

# Arguments
- `n`: unused; kept so existing call sites keep working.
- `tab`: indexable collection of comparable values.
- `used_indexes`: collection of indexes that must be skipped.

# Throws
- `ArgumentError` if every index of `tab` is already in `used_indexes`.
"""
function findMin(n, tab, used_indexes)
    # Start with no candidate at all: seeding the search with index 1 would
    # return index 1 again whenever it is already used and still the smallest.
    best = nothing
    for i in eachindex(tab)
        i in used_indexes && continue
        if best === nothing || tab[i] < tab[best]
            best = i
        end
    end
    if best === nothing
        throw(ArgumentError("no unused index left in a collection of length $(length(tab))"))
    end
    return best
end


"""
    findIndexesNSmallest(n, tab)

Return the indexes of the `n` smallest elements of `tab`, ordered from the
smallest value to the largest. Equal values are returned in increasing index
order, and every index appears at most once.

If `n` exceeds `length(tab)`, all indexes are returned; if `n <= 0`, the
result is empty.
"""
function findIndexesNSmallest(n, tab)
    # `sortperm` is stable by default, which keeps ties in index order and
    # guarantees distinct indexes.
    order = sortperm(tab)
    return order[1:min(n, length(order))]
end

"""
    predict(k, X_train, y_train, X_test)

Classify the single observation `X_test` by majority vote among its `k`
nearest rows of `X_train`, using `calculateDist` as the distance.

Returns `Int8(1)` when strictly fewer than half of the selected neighbours
carry the label `0`, and `Int8(0)` otherwise (ties go to `0`, hence the
advice to use an odd `k`).

# Arguments
- `k`: number of neighbours to consult.
- `X_train`: training observations, one per row.
- `y_train`: labels, indexed like the rows of `X_train`.
- `X_test`: the observation to classify.
"""
function predict(k, X_train, y_train, X_test) #works best with odd k
    # A comprehension yields a concretely typed vector instead of a Vector{Any}.
    distances = [calculateDist(X_test, X_train[row, :]) for row in 1:size(X_train, 1)]

    neighbours = findIndexesNSmallest(k, distances)

    nb_0 = count(idx -> y_train[idx] == 0, neighbours)
    # Compare against the number of neighbours actually returned, so the vote
    # stays a true majority even if that number differs from k.
    nb_other = length(neighbours) - nb_0

    return convert(Int8, nb_0 < nb_other)
end

"""
    knn(k, X_train, y_train, X_test)

Run `predict` on every row of `X_test` and return the predictions as a
vector, in row order.
"""
function knn(k, X_train, y_train, X_test)
    # The comprehension infers the element type from `predict`, rather than
    # filling an untyped `[]` that shadowed the REPL variable `ans`.
    return [
        predict(k, X_train, y_train, X_test[row, :]) for row in 1:size(X_test, 1)
    ]
end




## K-cross validation

In [None]:
# K-cross validation
"""
    findAllIndexes(length, nb_blocks)

Split the index range `1:length` into `nb_blocks` consecutive blocks and
return them as a vector of `(first, last)` tuples.

Bounds are `Int`, so collections with more than 32767 elements no longer
raise the `InexactError` that an `Int16` conversion would.
"""
function findAllIndexes(length, nb_blocks)
    return [
        (floor(Int, (i - 1) * length / nb_blocks) + 1, floor(Int, i * length / nb_blocks))
        for i in 1:nb_blocks
    ]
end

"""
    countTFPN(t_label, predictions)

Count the confusion-matrix cells for binary labels, treating `1` as the
positive class and `0` as the negative class.

Returns the tuple `(TP, FP, FN, TN)`. Positions where the true label or the
prediction is neither `0` nor `1` are not counted.
"""
function countTFPN(t_label, predictions)
    true_pos = false_pos = false_neg = true_neg = 0
    for idx in 1:size(t_label)[1]
        truth = t_label[idx]
        if truth == 1
            if predictions[idx] == 1
                true_pos += 1
            elseif predictions[idx] == 0
                false_neg += 1
            end
        elseif truth == 0
            if predictions[idx] == 1
                false_pos += 1
            elseif predictions[idx] == 0
                true_neg += 1
            end
        end
    end
    return true_pos, false_pos, false_neg, true_neg
end

"""
    computeMetrics(t_label, predictions)

Return `[precision, recall, accuracy]` computed from the counts given by
`countTFPN(t_label, predictions)`.

A ratio whose denominator is zero (for example precision when nothing was
predicted positive) comes out as `NaN`.
"""
function computeMetrics(t_label, predictions)
    TP, FP, FN, TN = countTFPN(t_label, predictions)
    prec = TP / (TP + FP)
    rec = TP / (TP + FN)
    acc = (TP + TN) / (TP + FP + FN + TN)
    return [prec, rec, acc]
end

"""
    split_train_test(X, y, index)

Split `X` (one observation per row) and `y` into a held-out fold and the
remaining training data.

# Arguments
- `X`: observations, one per row.
- `y`: labels, indexed like the rows of `X`.
- `index`: `(first, last)` inclusive row bounds of the held-out fold.

# Returns
`(sub_X_train, sub_X_test, sub_y_train, true_y_test)`, where the training
parts contain every row outside `index[1]:index[2]`.
"""
function split_train_test(X, y, index)
    first_row, last_row = index[1], index[2]
    # The rows after the fold start at last_row + 1; starting at last_row
    # would leak the last held-out row into the training set.
    sub_X_train = [X[1:first_row-1, :]; X[last_row+1:end, :]]
    sub_X_test = X[first_row:last_row, :]
    sub_y_train = [y[1:first_row-1]; y[last_row+1:end]]
    true_y_test = y[first_row:last_row]
    return sub_X_train, sub_X_test, sub_y_train, true_y_test
end


"""
    KCrossValidation(k, X, y, nb_blocks = 15)

Evaluate a `k`-nearest-neighbours classifier on `X`/`y` by cross-validation
over `nb_blocks` consecutive folds.

For each fold, the fold is held out, `knn` is run with the remaining rows as
training data, and `computeMetrics` scores the predictions. The result is the
element-wise mean of the per-fold metric vectors.
"""
function KCrossValidation(k, X, y, nb_blocks = 15)
    # Folds must cover the rows of the data passed in, not of a global test set.
    folds = findAllIndexes(size(X, 1), nb_blocks)
    scores = map(folds) do interval
        sub_X_train, sub_X_test, sub_y_train, true_y_test = split_train_test(X, y, interval)
        sub_y_test = knn(k, sub_X_train, sub_y_train, sub_X_test)
        computeMetrics(true_y_test, sub_y_test)
    end
    return mean(scores)
end


## Trouver le K qui minimise l'erreur selon les métriques

In [None]:
# Rank-1 reconstruction of the standardized data from the first singular triplet.
mat = γ[1] * U[:, 1] * V[:, 1]'

# Split the reconstruction back into its training rows and its test rows.
X_train = mat[1:length_train, :]
X_test = mat[length_train+1:end, :]

println(size(X_train))
println(size(X_test))
println(size(y_train))
# y_test is only assigned by the final prediction cell, so it may not exist yet
# when the notebook is run from top to bottom; print it only once it is defined.
if @isdefined(y_test)
    println(size(y_test))
end



In [None]:
# Grid search over the two hyperparameters: the number j of singular triplets
# kept in the reconstruction (columns) and the number of neighbours (rows).
# spy_matrix holds the F1 score of each combination and spy_matrix_text the
# matching text label; 10×10 matches the 10 values of j and the 10 odd values
# in 1:2:19.
spy_matrix = Array{AbstractFloat}(undef, 10, 10)
spy_matrix_text = Array{Plots.PlotText}(undef, 10, 10)
for j = 1:10
    println("Index : ", j)
    
    # Rebuild the data as the sum of the first j rank-1 terms of the SVD.
    mat = γ[1] *U[:, 1]*V[:,1]'
    for k = 2:j
        mat += γ[k] *U[:, k]*V[:,k]'
    end
    

    # The first length_train rows are the training observations...
    X_train = mat[1:length_train, :]

    # ...and the remaining rows are the test observations.
    X_test = mat[length_train+1:end, :]
    # metrics[m] collects metric m of KCrossValidation's result, one value per
    # neighbour count tried.
    metrics = []
    gen_temp = 1:2:19
    for i = 1:2:19
        temp = KCrossValidation(i, X_train, y_train)
        if (i ==  1)
            # First neighbour count: start one series per metric.
            metrics = [[elem] for elem = temp]
        else
            # Later neighbour counts: append to each existing series.
            taille = size(temp)[1]
            for index = 1:taille
                push!(metrics[index], temp[index])
            end
        end

    end
    # Neighbour counts that were evaluated, as a plain vector.
    ks = [i for i = gen_temp]

    
    # The first two series are used as precision and recall.
    precision = metrics[1]
    recall = metrics[2]
    for index = 1:length(precision)
        # F1 is the harmonic mean of precision and recall; index*2 - 1 maps the
        # position in the series back to the odd neighbour count.
        f1 = 2*(precision[index] * recall[index])/(precision[index] + recall[index])
        println("F1 score ",index*2 - 1,  "NN : ", f1)
        spy_matrix[index,j] = f1
        spy_matrix_text[index,j] = Plots.text(round(f1, digits=4),9)
    end
    
end


In [None]:
# NOTE(review): this sets Gadfly's default plot size, while `heatmap` and
# `annotate!` below are presumably the Plots functions — confirm it has any
# effect on this figure.
Gadfly.set_default_plot_size(12cm, 12cm)
# Tick labels: odd neighbour counts on the y axis, number of kept components
# on the x axis.
Axis_k = ["1","3","5","7","9","11","13","15","17","19"]
Axis_PCA = ["1","2","3","4","5","6","7","8","9","10"]
# Heatmap of the F1 scores stored in spy_matrix.
p = heatmap(Axis_PCA,Axis_k,spy_matrix,
    ylabel ="Value of K in the KNN",
    xlabel = "Number of parameters conserved by the PCA",
    title = "F1 score in function of the hyperparameters",
    c=:Greens
)
# Broadcast a row of x positions against a column of y positions (each shifted
# by -0.5) and pair every position with its text label from spy_matrix_text,
# then flatten the result into a list of (x, y, text) annotations.
annotate!( vec(tuple.((1:length(Axis_PCA))'.-0.5, (1:length(Axis_k)).-0.5, spy_matrix_text)) )


In [None]:
# Line plot of metrics[1] (precision, in computeMetrics' return order) against K.
# `plot` is qualified because this notebook loads both Gadfly and Plots.
# NOTE(review): `y_titles` is not defined in the visible cells — confirm its source.
Gadfly.plot(x=ks, y=metrics[1], Geom.line, Guide.xlabel("K"), Guide.ylabel(y_titles[1]))

In [None]:
# Line plot of metrics[2] (recall, in computeMetrics' return order) against K.
# `plot` is qualified because this notebook loads both Gadfly and Plots.
# NOTE(review): `y_titles` is not defined in the visible cells — confirm its source.
Gadfly.plot(x=ks, y=metrics[2], Geom.line, Guide.xlabel("K"), Guide.ylabel(y_titles[2]))

In [None]:
# Line plot of metrics[3] (accuracy, in computeMetrics' return order) against K.
# `plot` is qualified because this notebook loads both Gadfly and Plots.
# NOTE(review): `y_titles` is not defined in the visible cells — confirm its source.
Gadfly.plot(x=ks, y=metrics[3], Geom.line, Guide.xlabel("K"), Guide.ylabel(y_titles[3]))

## Rouler le code avec ce K

In [None]:
# Number of neighbours used for the final model.
k = 9

# Reconstruct the data from its first 8 singular triplets, summed in order.
mat = sum(γ[i] * U[:, i] * V[:, i]' for i in 1:8)

# Split the reconstruction back into its training rows and its test rows.
X_train = mat[1:length_train, :]
X_test = mat[length_train+1:end, :]

# Classify every test observation and write the result as (id, diagnosis).
y_test = knn(k, X_train, y_train, X_test)
prediction = DataFrame(id = id_test, diagnosis = y_test)
CSV.write("KNN.csv", prediction)