# Задача об ирисах

## Импорт библиотек

In [77]:
import pandas as pd

# Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

## Получение датасета

In [78]:
# Load the Iris dataset from a CSV file, resolved relative to the current working directory.
df = pd.read_csv('./Iris.csv')

## Вывод данных о датафрейме

### Описание столбцов

1. Id - Идентификатор записи
2. SepalLengthCm - Длина чашелистика в сантиметрах
3. SepalWidthCm - Ширина чашелистика в сантиметрах
4. PetalLengthCm - Длина лепестка в сантиметрах
5. PetalWidthCm - Ширина лепестка в сантиметрах
6. Species - Вид цветка

### Размер датасета

In [79]:
# (rows, columns) of the loaded frame.
df.shape

(150, 6)

### Информация о полях

In [80]:
# Print per-column dtype and non-null counts plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


### Первые 5 записей

In [81]:
# Preview the top of the frame; head() without an argument returns the first 5 rows.
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


### Проверка пустых значений

In [82]:
# Count missing values in each column.
df.isna().sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

Пустые значения не были найдены в датасете

## Подготовка данных для обработки

### Удаление лишних данных

In [83]:
# The record identifier is not a feature of the flower, so remove that column in place.
df.drop(columns=['Id'], inplace=True)

### Обработка классов

Переведём задачу в бинарную классификацию: Iris-setosa будем считать положительным классом (`True`), а все остальные виды — отрицательным (`False`).

In [84]:
# Boolean label: True for Iris-setosa rows, False for every other species.
is_setosa = df['Species'].eq('Iris-setosa')

# Replace the textual species column with the binary label (label ends up as the last column).
df.drop(columns='Species', inplace=True)
df['target'] = is_setosa

### Результат обработки

In [85]:
# Show the first 10 rows to check the frame after the column changes above.
df.head(10)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,target
0,5.1,3.5,1.4,0.2,True
1,4.9,3.0,1.4,0.2,True
2,4.7,3.2,1.3,0.2,True
3,4.6,3.1,1.5,0.2,True
4,5.0,3.6,1.4,0.2,True
5,5.4,3.9,1.7,0.4,True
6,4.6,3.4,1.4,0.3,True
7,5.0,3.4,1.5,0.2,True
8,4.4,2.9,1.4,0.2,True
9,4.9,3.1,1.5,0.1,True


## Построение модели

In [86]:
# Separate the label column from the feature matrix (every remaining column).
target = df['target']
features = df.drop(columns='target')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
(features_train, features_test,
 target_train, target_test) = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=27,
)

In [87]:
# Fit each candidate classifier on the training split and report its quality
# on the held-out split.
for model in [
    KNeighborsClassifier(n_neighbors=5),
    LogisticRegression()
]:
    model.fit(features_train, target_train)
    prediction = model.predict(features_test)
    # scikit-learn metrics expect (y_true, y_pred) in that order. Binary F1 is
    # symmetric, so the value is unchanged, but keeping the documented order
    # avoids silently wrong results if the metric is swapped for
    # precision/recall later.
    print("For model ", model, " f1-score equals ", f1_score(target_test, prediction))

    # Per-row class probabilities; columns are ordered as in model.classes_.
    print(model.predict_proba(features_test))

For model  KNeighborsClassifier()  f1-score equals  1.0
[[1. 0.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [1. 0.]]
For model  LogisticRegression()  f1-score equals  1.0
[[9.99557352e-01 4.42648295e-04]
 [1.73254960e-02 9.82674504e-01]
 [9.99767051e-01 2.32948728e-04]
 [9.99974456e-01 2.55439021e-05]
 [9.76815748e-01 2.31842516e-02]
 [9.90228051e-01 9.77194901e-03]
 [9.94319373e-01 5.68062657e-03]
 [9.99460829e-01 5.39171154e-04]
 [9.98637290e-01 1.36271042e-03]
 [2.29141230e-02 9.77085877e-01]
 [9.85693140e-01 1.43068597e-02]
 [9.98694800e-01 1.30519969e-03]
 [1.92560968e-02 9.80743903e-01]
 [9.90357137e-01 9.64286278e-03]
 [9.95384576e-01 4.61542432e-03]
 [9.72166207e-01 2.78337934e-02]
 [2.70192823e-02 9.72980718e-01]
 [9.99821655e-01 1.78345149e-04]
 [9.99438255e-01 5.617451