In [1]:
;;; Let's do some Knn! That is, K nearest neighbour! 

In [1]:
(ns knn
  (:require
            [clojure.data.fressian :as fress]
            [clojure.java.io :as io]
            [clojure.set :as set]
            [clojure.string :as str]))

In [3]:
;;; Knn is an algorithm based on similarity score. So, let us use cosine similarity as our measure of similarity. 
;;; To use cosine similarity, we need to have a function to calculate dot product and magnitude between two vectors.

In [4]:
(reduce + (map * [1 2 3] [4 5 6])) ; To find the dot product between two vectors.

32

In [5]:
(defn dot [v1 v2]
(reduce + (map * v1 v2)))


#'knn/dot

In [6]:
(reduce + (map * [1 2 3] [1 2 3])) ;To find the magnitude of a vector.
(Math/sqrt 14)

3.7416573867739413

In [7]:
(defn magnitude [v]
(Math/sqrt (reduce + (map * v v))))


#'knn/magnitude

In [8]:
(defn cosine [v1 v2]
(/ (dot v1 v2) (* (magnitude v1) (magnitude v2))))

#'knn/cosine

In [9]:
(prn (cosine [1 2 3] [4 5 6])) ;;; To check for cosine scores between two vectors. 
(prn (cosine [1 2 3] [1 2 3])) ;;; To check that cosine score is 1 for same vector.

0.9746318461970762
1.0


In [10]:
;;; For our problem, we will use the Iris dataset and find the K nearest neighbour for a test datapoint.
;;; Since we want to predict a class for our test dataset, our knn for Iris is a Classification Problem.

In [2]:
;;; Functions to read csv file in clojure. 

(defn line->stype

  [line]
  (let [[a b c d e] (str/split line #"[,]")]
    {:sepal-length a
     :sepal-width b
     :petal-length c
     :petal-width d
     :iris e
     }))


(defn load-iris

  []
  (if (.exists (io/file "resources/Iris.csv"))
    (->> (io/resource "Iris.csv")
         (io/reader)
         (line-seq)
         (map #(line->stype %)))
    nil))


#'knn/load-iris

In [12]:
;;; Next few steps will be data preparation steps before we jump into the algorithms. 

In [3]:
;# Data Loading #;

(def iris-data (drop 1 (load-iris)))

#'knn/iris-data

In [4]:
(take 5 iris-data)

({:sepal-length "5.1", :sepal-width "3.5", :petal-length "1.4", :petal-width "0.2", :iris "Iris-setosa"} {:sepal-length "4.9", :sepal-width "3", :petal-length "1.4", :petal-width "0.2", :iris "Iris-setosa"} {:sepal-length "4.7", :sepal-width "3.2", :petal-length "1.3", :petal-width "0.2", :iris "Iris-setosa"} {:sepal-length "4.6", :sepal-width "3.1", :petal-length "1.5", :petal-width "0.2", :iris "Iris-setosa"} {:sepal-length "5", :sepal-width "3.6", :petal-length "1.4", :petal-width "0.2", :iris "Iris-setosa"})

In [15]:
;;; Now let's reshape the data in such a way that we have all the values together as a data point 
;;;(that is list in our case).

In [16]:
;;; Like {:name x :value [1 2 3 4 5]}

In [17]:
(def reshaped-iris-data (reduce (fn [v m] (conj v {:name (:iris m) :value  (mapv read-string [(:sepal-length m)(:sepal-width m) (:sepal-width m)
(:petal-length m)])}))
         [] iris-data ))

#'knn/reshaped-iris-data

In [18]:
(take 3 reshaped-iris-data)

({:name "Iris-setosa", :value [5.1 3.5 3.5 1.4]} {:name "Iris-setosa", :value [4.9 3 3 1.4]} {:name "Iris-setosa", :value [4.7 3.2 3.2 1.3]})

In [19]:
;;; Now, let's store only the value data points together. 

In [20]:
(def only-value (reduce (fn [v m] (conj v (:value m))) [] reshaped-iris-data))

#'knn/only-value

In [21]:
(prn (count only-value))
(prn (take 5 only-value))

150
([5.1 3.5 3.5 1.4] [4.9 3 3 1.4] [4.7 3.2 3.2 1.3] [4.6 3.1 3.1 1.5] [5 3.6 3.6 1.4])


In [22]:
;;; Now, let's take a test data point to find its closest nearest neighbour in our iris data set.
;;; Let the dummy test data be [1 2 3 4].
;;; We will store its cosine similarity values then.

In [23]:
(def cosine-values (mapv #(cosine [5.4 3 3 4.5] %) only-value))

#'knn/cosine-values

In [24]:
;;; Now, we will look into the highest cosine scores values and also store its indices.

In [25]:
(def max-values-with-index  (sort-by second > (map-indexed vector cosine-values)))
(take 3 max-values-with-index)

([84 1.0] [70 0.9999290061909883] [66 0.9998369574797291])

In [26]:
;;; First, we will look for the 5 nearest neighbours in our dataset. That is, we set k = 5.
;;; Later, we will make a generic function for any value of k.

In [27]:
(def index-of-max-values (reduce (fn [v l] (conj v (first l))) [] (take 5 max-values-with-index))) 
(take 3 index-of-max-values)

(84 70 66)

In [28]:
;;; Now, using the above indices, we will access the data points of our iris data. 

In [29]:
(def top-5  (mapv reshaped-iris-data index-of-max-values));# To get the values from vector of maps using index
(prn top-5)

[{:name "Iris-versicolor", :value [5.4 3 3 4.5]} {:name "Iris-versicolor", :value [5.9 3.2 3.2 4.8]} {:name "Iris-versicolor", :value [5.6 3 3 4.5]} {:name "Iris-virginica", :value [6.2 3.4 3.4 5.4]} {:name "Iris-virginica", :value [6.3 3.4 3.4 5.6]}]


In [30]:
;;; Therefore, above are the five closest neighbours of our chosen test data. 

In [31]:
;;; Now, we will select only the names of the neighbours and take the mode of the name, as this is a classification 
;;; problem. 

In [32]:
(def final-5 (reduce (fn [v m] (conj v (:name m))) [] top-5))
(prn final-5)

["Iris-versicolor" "Iris-versicolor" "Iris-versicolor" "Iris-virginica" "Iris-virginica"]


In [33]:
;Above 5 are the five closest neighbours of our chosen data point. 
;Now, to select the one with highest mode, we sort the labels on the basis of frequency and
;;; take the first one (sorted on the basis of descending order.)

In [34]:
(first (first (sort-by val >(frequencies final-5))))

"Iris-versicolor"

In [35]:
;;; Therefore, the nearest neighbour of our chosen test data point is Iris-Virginica. 

In [36]:
;;; Now we will create a generic function that works for any value of k.

In [37]:
;;; Our function will take a 3 argument.
;;; test-data : It will be a data point stored on the form of a vector.
;;; train-data : It will be a list of maps, with :name having name and :value having data points. 
;;; ({:name "Iris-setosa", :value [5.1 3.5 3.5 1.4]} {:name "Iris-setosa", :value [4.9 3 3 1.4]} )
;;; k : It will be the value of k chosen for k nearest neighbours. 

In [38]:
(defn knn [test-data train-data k]
    (let
        [only-value (reduce (fn [v m] (conj v (:value m))) [] train-data)
         cosine-values (mapv #(cosine test-data %) only-value)
         max-values-with-index (sort-by second > (map-indexed vector cosine-values))
         index-of-max-values (reduce (fn [v l] (conj v (first l))) [] (take k max-values-with-index))
         top-5-data (mapv train-data index-of-max-values)
         final-5-names (reduce (fn [v m] (conj v (:name m))) [] top-5-data)
         k-nearest-neighbour     (first (first (sort-by val >(frequencies final-5-names))))
         ]
        k-nearest-neighbour))

#'knn/knn

In [39]:
;;; Now let us try to calculate the accuracy of our algorithm. For that, we will split out dataset into training data
;;; and test data. We will create our model on training data and then test it on test data. Then comparing the
;;; predicted values with true values, we calculate the accuracy of our model. 

In [40]:
;;; We will split our data using 80:20 rule, with 80 percent of data being training data and 20 percent of data being 
;;; test data. 

In [41]:
(def iris-data-train (into [] (random-sample 0.8 reshaped-iris-data)))
(def iris-data-test (into [] (clojure.set/difference (into #{} reshaped-iris-data) (into #{}iris-data-train))))

#'knn/iris-data-test

In [42]:
(prn (count iris-data-train))
(prn (count iris-data-test))

124
23


In [43]:
(def true-values (reduce (fn [v m] (conj v (:name m))) [] iris-data-test))
(def test-data-fed (reduce (fn [v m] (conj v (:value m))) [] iris-data-test))

#'knn/test-data-fed

In [44]:
(def predicted-values (mapv #(knn % iris-data-train 10) test-data-fed))

#'knn/predicted-values

In [45]:
(defn comparison [v1 v2]
    (= v1 v2))

#'knn/comparison

In [46]:
(defn accuracy [data k split-factor]
    (let [iris-data-train (into [] (random-sample split-factor data))
          iris-data-test (into [] (clojure.set/difference (into #{} reshaped-iris-data) (into #{}iris-data-train)))
          true-values (reduce (fn [v m] (conj v (:name m))) [] iris-data-test)
          test-data-fed (reduce (fn [v m] (conj v (:value m))) [] iris-data-test)
          predicted-values (mapv #(knn % iris-data-train k) test-data-fed)
          total-true (count (filter identity (map #(comparison %1 %2) true-values predicted-values)))
          total-count (count true-values)]
        (float (* (/ total-true total-count))))) 

#'knn/accuracy

In [51]:
(accuracy reshaped-iris-data 10 0.8)

0.9259259

In [None]:
;;; Functions to read csv file in clojure. 

(defn line->stype

  [line]
  (let [[a b c d e] (str/split line #"[,]")]
    {:sepal-length a
     :sepal-width b
     :petal-length c
     :petal-width d
     :iris e
     }))


(defn load-mtcars

  []
  (if (.exists (io/file "resources/mtcars.csv"))
    (->> (io/resource "mtcars.csv")
         (io/reader)
         (line-seq)
         (map #(line->stype %)))
    nil))


Syntax error compiling at (form-init4835575241735323351.clj:6:21).
No such namespace: str
Syntax error compiling at (form-init4835575241735323351.clj:18:16).
No such namespace: io
