# Заполнение пропусков с помощью KNN


1. Откройте файл с данными. Посмотрите, где и в каком количестве есть пропуски. Строки, в которых пропуски есть по 2 и более признакам, удалите (т. е. это строки под номерами 3 и 271).

In [21]:
# Подключение основных библиотек
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split

import seaborn as sns
sns.set()

In [22]:
# Load the penguins dataset from the CSV file (path is relative to the working directory)
penguinsDataset = pd.read_csv('penguins.csv')

In [23]:
# Preview the first five rows
penguinsDataset.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female


In [24]:
# Count the missing values in each column
missing_per_column = penguinsDataset.isna().sum()
print(missing_per_column)

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64


In [25]:
# Summary statistics for the categorical (object-dtype) columns
categorical_summary = penguinsDataset.describe(include=['O'])
categorical_summary

Unnamed: 0,species,island,sex
count,344,344,333
unique,3,3,2
top,Adelie,Biscoe,male
freq,152,168,168


In [26]:
# Keep only rows with at most one missing value: a row survives when it
# holds at least (number of columns - 1) non-null entries.
min_non_null = penguinsDataset.shape[1] - 1
penguinsDataset = penguinsDataset.dropna(thresh=min_non_null)

In [27]:
# Summary statistics for the categorical columns, recomputed after the
# rows with two or more missing values were removed
categorical_summary = penguinsDataset.describe(include=['O'])
categorical_summary

Unnamed: 0,species,island,sex
count,342,342,333
unique,3,3,2
top,Adelie,Biscoe,male
freq,151,167,168


2. Разделите весь набор данных на 2 части: часть без пропусков data и часть data_with_nan, где имеются пропуски в столбце "sex".



In [30]:
# Rows where "sex" is known: these are used to train and evaluate the classifier
data = penguinsDataset.dropna(subset=['sex'])

# Rows where "sex" is missing: these are the rows to be filled in.
# .copy() makes this an independent frame, because the predicted "sex"
# column is written into it later; without the copy that write is a
# chained assignment on a selection of penguinsDataset.
data_with_nan = penguinsDataset[penguinsDataset['sex'].isnull()].copy()

3. Отделите целевой признак от нецелевых

In [31]:
# Separate the target column ("sex") from the feature columns
y = data['sex']
X = data.drop(columns=['sex'])

4. Разделите данные на части для обучения (train) и тестирования (test).

In [32]:
# Hold out 20% of the labelled rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

5. Проведите кодирование категориальных признаков в train, а затем в test, используя кодировщики, обученные на train.

In [33]:
# Fit the one-hot encoder on the categorical columns of the training split
# only, then encode that same split. `res` is a sparse matrix whose columns
# follow the order of `categorical_columns` (island first, then species).
categorical_columns = ['island', 'species']
onehotencoder = OneHotEncoder()
onehotencoder.fit(X_train[categorical_columns])
res = onehotencoder.transform(X_train[categorical_columns])

In [34]:
# Attach the one-hot columns to X_train, one group per encoded feature.
# The width of each group is taken from the fitted encoder's categories_
# instead of a hard-coded 3, so the slicing stays correct if the number of
# categories in the training split differs.
dense = res.toarray()
offset = 0
for categories in onehotencoder.categories_:
    width = len(categories)
    # New columns are named after the category values themselves
    X_train[categories] = dense[:, offset:offset + width]
    offset += width

In [35]:
# The raw categorical columns are now redundant: remove them in place
# and preview the encoded training features
X_train.drop(columns=['island', 'species'], inplace=True)
X_train.head(n=5)

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Biscoe,Dream,Torgersen,Adelie,Chinstrap,Gentoo
232,49.1,14.5,212.0,4625.0,1.0,0.0,0.0,0.0,0.0,1.0
84,37.3,17.8,191.0,3350.0,0.0,1.0,0.0,1.0,0.0,0.0
306,40.9,16.6,187.0,3200.0,0.0,1.0,0.0,0.0,1.0,0.0
22,35.9,19.2,189.0,3800.0,1.0,0.0,0.0,1.0,0.0,0.0
29,40.5,18.9,180.0,3950.0,1.0,0.0,0.0,1.0,0.0,0.0


In [36]:
# Encode the test split with the encoder fitted on train (no refitting).
res = onehotencoder.transform(X_test[['island', 'species']])

# Attach one group of one-hot columns per encoded feature; group widths come
# from the fitted encoder's categories_ rather than a hard-coded 3.
dense = res.toarray()
offset = 0
for categories in onehotencoder.categories_:
    width = len(categories)
    X_test[categories] = dense[:, offset:offset + width]
    offset += width

# Remove the raw categorical columns and preview the result
X_test.drop(['island', 'species'], axis=1, inplace=True)
X_test.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Biscoe,Dream,Torgersen,Adelie,Chinstrap,Gentoo
30,39.5,16.7,178.0,3250.0,0.0,1.0,0.0,1.0,0.0,0.0
320,50.9,17.9,196.0,3675.0,0.0,1.0,0.0,0.0,1.0,0.0
79,42.1,19.1,195.0,4000.0,0.0,0.0,1.0,1.0,0.0,0.0
202,46.6,14.2,210.0,4850.0,1.0,0.0,0.0,0.0,0.0,1.0
63,41.1,18.2,192.0,4050.0,1.0,0.0,0.0,1.0,0.0,0.0


6. Проведите нормирование нецелевых признаков в train, а затем в test, используя нормировщик, обученный на train.

In [37]:
# Fit the min-max scaler on the training features only, then scale them.
# The scaled values are wrapped in a new DataFrame that reuses the feature
# names seen during fit; its row index is a fresh default index.
scaler_mm = MinMaxScaler()
scaler_mm.fit(X_train)

X_train_norm = pd.DataFrame(
    scaler_mm.transform(X_train),
    columns=scaler_mm.feature_names_in_,
)
X_train_norm.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Biscoe,Dream,Torgersen,Adelie,Chinstrap,Gentoo
0,0.618182,0.166667,0.677966,0.574627,1.0,0.0,0.0,0.0,0.0,1.0
1,0.189091,0.559524,0.322034,0.19403,0.0,1.0,0.0,1.0,0.0,0.0
2,0.32,0.416667,0.254237,0.149254,0.0,1.0,0.0,0.0,1.0,0.0
3,0.138182,0.72619,0.288136,0.328358,1.0,0.0,0.0,1.0,0.0,0.0
4,0.305455,0.690476,0.135593,0.373134,1.0,0.0,0.0,1.0,0.0,0.0


In [38]:
# Scale the test features with the scaler fitted on train (transform only)
X_test_norm = pd.DataFrame(
    scaler_mm.transform(X_test),
    columns=scaler_mm.feature_names_in_,
)
X_test_norm.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Biscoe,Dream,Torgersen,Adelie,Chinstrap,Gentoo
0,0.269091,0.428571,0.101695,0.164179,0.0,1.0,0.0,1.0,0.0,0.0
1,0.683636,0.571429,0.40678,0.291045,0.0,1.0,0.0,0.0,1.0,0.0
2,0.363636,0.714286,0.389831,0.38806,0.0,0.0,1.0,1.0,0.0,0.0
3,0.527273,0.130952,0.644068,0.641791,1.0,0.0,0.0,0.0,0.0,1.0
4,0.327273,0.607143,0.338983,0.402985,1.0,0.0,0.0,1.0,0.0,0.0


7. Проведите подбор гиперпараметров (количество соседей и метрика расстояния) в KNN, обучите классификатор с наилучшими гиперпараметрами на train, проверьте его качество на test

In [39]:
# Fit a k-nearest-neighbours classifier on the scaled training features and
# report accuracy on the train and test splits.
# NOTE(review): the task text for this step asks for a search over the
# number of neighbours and the distance metric; this cell fits
# KNeighborsClassifier with its default settings only. Consider a
# cross-validated search (e.g. sklearn.model_selection.GridSearchCV).
KNNClassifier = KNeighborsClassifier()
KNNClassifier.fit(X_train_norm, y_train)

train_accuracy = KNNClassifier.score(X_train_norm, y_train)
test_accuracy = KNNClassifier.score(X_test_norm, y_test)
print(train_accuracy, test_accuracy)

0.9360902255639098 0.8955223880597015


In [40]:
# Preview the scaled training features again
X_train_norm.head(n=5)

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Biscoe,Dream,Torgersen,Adelie,Chinstrap,Gentoo
0,0.618182,0.166667,0.677966,0.574627,1.0,0.0,0.0,0.0,0.0,1.0
1,0.189091,0.559524,0.322034,0.19403,0.0,1.0,0.0,1.0,0.0,0.0
2,0.32,0.416667,0.254237,0.149254,0.0,1.0,0.0,0.0,1.0,0.0
3,0.138182,0.72619,0.288136,0.328358,1.0,0.0,0.0,1.0,0.0,0.0
4,0.305455,0.690476,0.135593,0.373134,1.0,0.0,0.0,1.0,0.0,0.0


8. Используя обученный классификатор, заполните пропуски пола в data_with_nan:
  * у data_with_nan удалить столбец "sex";
  * провести кодирование нецелевых признаков обученным на train кодировщиком;

In [41]:
# Drop the (entirely missing) "sex" column; drop() returns a new frame
data_with_nan_no_sex = data_with_nan.drop('sex', axis=1)

# Encode the categorical columns with the encoder fitted on train
res = onehotencoder.transform(data_with_nan_no_sex[['island', 'species']])

# Attach one group of one-hot columns per encoded feature; group widths come
# from the fitted encoder's categories_ rather than a hard-coded 3.
dense = res.toarray()
offset = 0
for categories in onehotencoder.categories_:
    width = len(categories)
    data_with_nan_no_sex[categories] = dense[:, offset:offset + width]
    offset += width

# Remove the raw categorical columns and preview the result
data_with_nan_no_sex.drop(['island', 'species'], axis=1, inplace=True)
data_with_nan_no_sex.head()


Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Biscoe,Dream,Torgersen,Adelie,Chinstrap,Gentoo
8,34.1,18.1,193.0,3475.0,0.0,0.0,1.0,1.0,0.0,0.0
9,42.0,20.2,190.0,4250.0,0.0,0.0,1.0,1.0,0.0,0.0
10,37.8,17.1,186.0,3300.0,0.0,0.0,1.0,1.0,0.0,0.0
11,37.8,17.3,180.0,3700.0,0.0,0.0,1.0,1.0,0.0,0.0
47,37.5,18.9,179.0,2975.0,0.0,1.0,0.0,1.0,0.0,0.0


9. Провести нормировку обученным на train нормализатором.

In [42]:
# Scale the rows with missing "sex" using the scaler fitted on train
# (transform only), and wrap the result in a DataFrame with the fitted
# feature names.
data_with_nan_no_sex_norm = pd.DataFrame(
    scaler_mm.transform(data_with_nan_no_sex),
    columns=scaler_mm.feature_names_in_,
)
data_with_nan_no_sex_norm.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Biscoe,Dream,Torgersen,Adelie,Chinstrap,Gentoo
0,0.072727,0.595238,0.355932,0.231343,0.0,0.0,1.0,1.0,0.0,0.0
1,0.36,0.845238,0.305085,0.462687,0.0,0.0,1.0,1.0,0.0,0.0
2,0.207273,0.47619,0.237288,0.179104,0.0,0.0,1.0,1.0,0.0,0.0
3,0.207273,0.5,0.135593,0.298507,0.0,0.0,1.0,1.0,0.0,0.0
4,0.196364,0.690476,0.118644,0.08209,0.0,1.0,0.0,1.0,0.0,0.0


10. После этого предскажите обученным классификатором значение целевого признака на данных из data_with_nan

In [43]:
# Predict the missing "sex" values with the trained classifier
predicted_sex = KNNClassifier.predict(X=data_with_nan_no_sex_norm)

11. Объедините нецелевые столбцы из data_with_nan и предсказанные значения пола, выведите их. Сравните с заполнением, которое мы делали, ориентируясь на вес пингвина

In [44]:
# Write the predicted values back into data_with_nan as its "sex" column
data_with_nan['sex'] = predicted_sex

# Select the original feature columns together with the filled-in target
# and print the resulting table
output_columns = [
    'species',
    'island',
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
    'sex',
]
result_data = data_with_nan[output_columns]
print(result_data)

    species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
8    Adelie  Torgersen            34.1           18.1              193.0   
9    Adelie  Torgersen            42.0           20.2              190.0   
10   Adelie  Torgersen            37.8           17.1              186.0   
11   Adelie  Torgersen            37.8           17.3              180.0   
47   Adelie      Dream            37.5           18.9              179.0   
178  Gentoo     Biscoe            44.5           14.3              216.0   
218  Gentoo     Biscoe            46.2           14.4              214.0   
256  Gentoo     Biscoe            47.3           13.8              216.0   
268  Gentoo     Biscoe            44.5           15.7              217.0   

     body_mass_g     sex  
8         3475.0  female  
9         4250.0    male  
10        3300.0  female  
11        3700.0  female  
47        2975.0  female  
178       4100.0  female  
218       4650.0  female  
256       4725.0  female  
268       4875.0    male  

In [45]:
# Display the assembled table (feature columns plus the predicted "sex") as the cell output
result_data

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
8,Adelie,Torgersen,34.1,18.1,193.0,3475.0,female
9,Adelie,Torgersen,42.0,20.2,190.0,4250.0,male
10,Adelie,Torgersen,37.8,17.1,186.0,3300.0,female
11,Adelie,Torgersen,37.8,17.3,180.0,3700.0,female
47,Adelie,Dream,37.5,18.9,179.0,2975.0,female
178,Gentoo,Biscoe,44.5,14.3,216.0,4100.0,female
218,Gentoo,Biscoe,46.2,14.4,214.0,4650.0,female
256,Gentoo,Biscoe,47.3,13.8,216.0,4725.0,female
268,Gentoo,Biscoe,44.5,15.7,217.0,4875.0,male
