In [10]:
from typing import List
from collections import Counter
def raw_majority_vote(labels: List[str]) -> str:
  """Return the label that occurs most often in `labels`.

  Ties get no special treatment: the label that Counter.most_common
  lists first is returned. An empty list raises IndexError.
  """
  tally = Counter(labels)
  # most_common(1) is a one-element list of (label, count) pairs
  top_label = tally.most_common(1)[0][0]
  return top_label
assert raw_majority_vote(['a', 'b', 'c', 'b']) == 'b'

In [11]:
def majority_vote(labels: List[str]) -> str:
  """Pick the winning label, breaking ties in favour of nearer neighbours.

  Assumes `labels` is ordered from nearest to farthest. While several
  labels share the top count, the farthest remaining label is discarded
  and the vote is retaken; a single remaining label always wins.

  Implemented as a loop rather than recursion so that a long run of ties
  cannot hit Python's recursion limit, and the counts are adjusted in
  place instead of being rebuilt from a fresh slice on every round.

  Raises:
    IndexError: if `labels` is empty.
  """
  vote_counts = Counter(labels)
  remaining = len(labels)  # labels[:remaining] are still taking part in the vote
  while True:
    winner, winner_count = vote_counts.most_common(1)[0]
    num_winners = sum(1 for count in vote_counts.values() if count == winner_count)
    if num_winners == 1:
      # exactly one label holds the top count; return it
      return winner
    # tie: drop the farthest remaining label and vote again
    remaining -= 1
    vote_counts[labels[remaining]] -= 1
# 'a' and 'b' tie over all five labels, so only the first four are counted and 'b' wins
assert majority_vote(['a', 'b', 'c', 'b', 'a']) == 'b'

In [12]:
from typing import NamedTuple
from linear_algebra import Vector, distance
class LabeledPoint(NamedTuple):
  """A data point together with the label attached to it."""
  point: Vector  # the observation itself, as a Vector
  label: str     # the label associated with that observation
def knn_classify(k: int, labeled_points: List[LabeledPoint], new_point: Vector) -> str:
  """Classify `new_point` by a vote among its `k` nearest labeled points.

  Args:
    k: number of nearest neighbours that vote; must be at least 1.
    labeled_points: the labeled examples to search.
    new_point: the point to classify.

  Returns:
    The label chosen by majority_vote over the k nearest labels.

  Raises:
    ValueError: if `k` is smaller than 1.
  """
  if k < 1:
    # A negative k would make the slice below silently keep all but the
    # |k| farthest points, and k == 0 would leave nobody to vote.
    raise ValueError(f"k must be at least 1, got {k}")
  # order the labeled points from nearest to farthest
  by_distance = sorted(labeled_points, key=lambda lp: distance(lp.point, new_point))
  # keep the labels of the k nearest ones (fewer if fewer than k points exist)
  k_nearest_labels = [lp.label for lp in by_distance[:k]]
  # and let them vote
  return majority_vote(k_nearest_labels)

In [13]:
import requests
# Download the raw iris data file; the timeout keeps the cell from hanging
# forever if the server does not answer.
data = requests.get(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data",
    timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as the data set.
data.raise_for_status()
with open('iris.dat', 'w') as f:
  f.write(data.text)

In [14]:
from typing import Dict
import csv
from collections import defaultdict
def parse_iris_row(row: List[str]) -> LabeledPoint:
  """Turn one CSV row into a LabeledPoint.

  Expected column order: sepal_length, sepal_width, petal_length,
  petal_width, class.
  """
  # every column except the last one is a numeric measurement
  features = list(map(float, row[:-1]))
  # the class looks like "Iris-virginica"; keep only what follows the last "-"
  species = row[-1].rpartition("-")[-1]
  return LabeledPoint(point=features, label=species)
# newline='' lets the csv module do its own line-ending handling, as its
# documentation asks for file objects handed to csv.reader.
with open('iris.dat', newline='') as f:
  reader = csv.reader(f)
  # skip empty rows (csv.reader yields [] for blank lines)
  iris_data = [parse_iris_row(row) for row in reader if row]
# Also group the points by species/label so that they can be plotted.
points_by_species: Dict[str, List[Vector]] = defaultdict(list)
for iris in iris_data:
  points_by_species[iris.label].append(iris.point)

In [15]:
# Show the species label -> list-of-points mapping as the cell output.
points_by_species

defaultdict(list,
            {'setosa': [[5.1, 3.5, 1.4, 0.2],
              [4.9, 3.0, 1.4, 0.2],
              [4.7, 3.2, 1.3, 0.2],
              [4.6, 3.1, 1.5, 0.2],
              [5.0, 3.6, 1.4, 0.2],
              [5.4, 3.9, 1.7, 0.4],
              [4.6, 3.4, 1.4, 0.3],
              [5.0, 3.4, 1.5, 0.2],
              [4.4, 2.9, 1.4, 0.2],
              [4.9, 3.1, 1.5, 0.1],
              [5.4, 3.7, 1.5, 0.2],
              [4.8, 3.4, 1.6, 0.2],
              [4.8, 3.0, 1.4, 0.1],
              [4.3, 3.0, 1.1, 0.1],
              [5.8, 4.0, 1.2, 0.2],
              [5.7, 4.4, 1.5, 0.4],
              [5.4, 3.9, 1.3, 0.4],
              [5.1, 3.5, 1.4, 0.3],
              [5.7, 3.8, 1.7, 0.3],
              [5.1, 3.8, 1.5, 0.3],
              [5.4, 3.4, 1.7, 0.2],
              [5.1, 3.7, 1.5, 0.4],
              [4.6, 3.6, 1.0, 0.2],
              [5.1, 3.3, 1.7, 0.5],
              [4.8, 3.4, 1.9, 0.2],
              [5.0, 3.0, 1.6, 0.2],
              [5.0, 3.4, 1.6, 0.4],


In [16]:
import random
from machine_learning import split_data
# Seed the random module before splitting; presumably split_data draws from
# it, so this keeps the split repeatable — TODO confirm against split_data.
random.seed(12)
iris_train, iris_test = split_data(iris_data, 0.70)
# Compare against integer counts: float products such as 0.7 * 150 only
# equal a whole number when the rounding happens to work out.
assert len(iris_train) == 105  # 70% of the 150 rows
assert len(iris_test) == 45    # the remaining 30%