Задание 1

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Default figure size (width, height) applied to every plot in this notebook.
sns.set(rc={'figure.figsize':(10, 8)});

from scipy.stats import normaltest
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

In [None]:
import os

# Show which files/directories are available under the input folder.
input_entries = os.listdir("../input")
print(input_entries)

Загружаем датасет

In [None]:
# Load the red-wine quality table into `df`, the frame used by every later cell.
df = pd.read_csv(
    '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
)
# First 20 rows, transposed so that each column of the table reads as a row.
df.iloc[:20].transpose()

In [None]:
# Column names, dtypes and non-null counts -- a quick check for missing values.
df.info()

In [None]:
# Summary statistics per column, transposed so each feature is one row.
df.describe().T

In [None]:
# Distribution of the target.
# NOTE(review): assumes `quality` takes integer values (it is used as a class
# label further down) -- confirm against df.describe().
# Unit-width bins centred on each score avoid the empty gaps that an arbitrary
# number of equal-width bins produces on a discrete variable.
quality_scores = df['quality']
bin_edges = np.arange(quality_scores.min() - 0.5, quality_scores.max() + 1.5)
ax = quality_scores.hist(bins=bin_edges)
ax.set(title='Distribution of wine quality', xlabel='quality', ylabel='count');

In [None]:
# scipy.stats.normaltest: the null hypothesis is that the sample comes from a
# normal distribution, so a small p-value is evidence against normality.
normaltest(df['quality'])

Посмотрев на график и значение p-value, делаем вывод, что распределение выборки не является нормальным.

Предобработка и масштабирование данных

In [None]:
# Store a log-transformed copy of the target as a new column of `df`.
# Note that this mutates `df`: from here on it holds both `quality` and
# `quality_log`, and both must be excluded when building feature matrices.
df['quality_log'] = df['quality'].apply(np.log)
df_target_log = df.loc[:, 'quality_log']
# Print the raw target followed by its log-transformed version.
print(df['quality'], df_target_log, sep='\n')

In [None]:
# Standardised feature matrix for the log-target experiment.
# Both target columns are dropped in a single call. Dropping them in two
# separate `df.drop(...)` calls that each start from `df` keeps only the effect
# of the last call, which leaves raw `quality` among the features and leaks the
# target into the model.
scaler = StandardScaler()
df_scaled = df.drop(columns=['quality', 'quality_log'])
df_scaled_fin = scaler.fit_transform(df_scaled)
df_scaled_fin

Разобьем набор данных на обучающую и валидационную (тестовую) выборки.

In [None]:
scaler = StandardScaler()

y = df['quality']
# Exclude the target AND `quality_log`: the latter was added to `df` earlier in
# the notebook and is a deterministic function of the target, so keeping it as a
# feature lets every model read the answer directly (target leakage).
# errors='ignore' keeps this cell runnable even if the log column was not created.
X = df.drop(columns=['quality', 'quality_log'], errors='ignore')
# NOTE(review): the scaler is fitted on the full dataset before the
# train/validation split, so validation rows influence the scaling statistics.
X_new = scaler.fit_transform(X)
# Peek at the top-left corner of the scaled matrix.
print(X_new[:6, :6])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_new, y, test_size=0.2, random_state=42
)

Обучим алгоритм регрессии.

In [None]:
# Baseline kNN regressor; k=100 is an initial guess that is tuned by the grid
# search further down.
knn = KNeighborsRegressor(n_neighbors=100)

In [None]:
# Fit on the training split and predict the held-out rows
# (`fit` returns the estimator itself, so the calls can be chained).
y_pred = knn.fit(X_train, y_train).predict(X_valid)

In [None]:
# For scikit-learn regressors `score` returns the coefficient of determination
# (R^2) on the given data.
knn.score(X_valid, y_valid)

In [None]:
# Validation MSE of the baseline model, in squared units of `quality`.
mean_squared_error(y_valid, y_pred)

Задание 2
Настройка оптимального числа ближайших соседей в методе kNN

In [None]:
# Shuffled 5-fold splitter; the same `kf` object is reused by the grid searches
# below so that all of them are evaluated on identical folds.
kf = KFold(n_splits=5, shuffle=True, random_state=412)
knn = KNeighborsRegressor(n_neighbors=100)
# NOTE(review): this baseline is cross-validated on the full scaled dataset with
# the log-transformed target, whereas the grid search below is fitted on
# X_train / y_train with the raw target -- the two scores are on different
# scales and are not directly comparable.
scores = cross_val_score(
    estimator=knn,
    X=df_scaled_fin,
    y=df_target_log,
    scoring='neg_mean_squared_error',
    cv=kf,
)
# Mean negated MSE across the folds (closer to zero is better).
np.mean(scores)

In [None]:
# Candidate neighbourhood sizes: k = 1 .. 50.
knn_params = {'n_neighbors': np.arange(1, 51)}
# Exhaustive search over k, scored by negated MSE on the shared `kf` folds.
knn_grid = GridSearchCV(
    estimator=knn,
    param_grid=knn_params,
    cv=kf,
    scoring='neg_mean_squared_error',
)
knn_grid.fit(X_train, y_train)

In [None]:
# Regressor configured with the best `n_neighbors` found by the grid search.
knn_grid.best_estimator_

In [None]:
# Mean cross-validated score of the best candidate; the grid was scored with
# 'neg_mean_squared_error', so this is a negated MSE (closer to zero is better).
knn_grid.best_score_

Наилучшее качество мы получили при количестве соседей = 3. Это значение, при котором оценка модели самая высокая.

In [None]:
# Full cross-validation results as a table (one row per candidate k).
# NOTE(review): `results_df` is not used by any later cell visible in this
# notebook; the plotting cell builds the same frame again as `grid_results`.
results_df = pd.DataFrame(knn_grid.cv_results_)

In [None]:
# Mean CV score (negated MSE) as a function of the number of neighbours.
grid_results = pd.DataFrame(knn_grid.cv_results_)
fig, ax = plt.subplots()
ax.plot(grid_results['param_n_neighbors'], grid_results['mean_test_score'])
ax.set_xlabel('n_neighbors')
ax.set_ylabel('score')
plt.show()

Задание 3 
Выбор метрики в методе kNN

In [None]:
# 1-nearest-neighbour classifier; the grid searches over the exponent `p` of the
# Minkowski distance using 200 evenly spaced values in [1, 10].
knn2 = KNeighborsClassifier(weights='distance', n_neighbors=1)
knn2_params = {'p': np.linspace(1, 10, 200)}
knn2_grid = GridSearchCV(
    estimator=knn2,
    param_grid=knn2_params,
    cv=kf,
    scoring='accuracy',
)
knn2_grid.fit(X_train, y_train)

In [None]:
# Value of `p` that achieved the highest mean cross-validated accuracy.
knn2_grid.best_params_

In [None]:
# Mean cross-validated accuracy obtained with the best `p`.
knn2_grid.best_score_

При значении параметра p = 7.467336683417086 достигается оптимальное значение score = 0.924935661764706.

In [None]:
# Mean cross-validated accuracy as a function of the Minkowski exponent `p`.
grid_results2 = pd.DataFrame(knn2_grid.cv_results_)
plt.plot(grid_results2['param_p'], grid_results2['mean_test_score'])
# The x-axis shows the searched parameter `p`, not the number of neighbours.
plt.xlabel('p')
plt.ylabel('score')
plt.show()

Задание 4
Другие метрические методы

Используем метод NearestCentroid

In [None]:
from sklearn.neighbors import NearestCentroid

# Fit the nearest-centroid classifier on the training split
# (`fit` returns the estimator itself, so construction and fitting are chained).
nc = NearestCentroid().fit(X_train, y_train)
y3_pred = nc.predict(X_valid)
# Score of the classifier on the held-out split.
nc.score(X_valid, y_valid)

In [None]:
# Accuracy of the NearestCentroid predictions on the validation split.
# NOTE(review): presumably equal to `nc.score(X_valid, y_valid)` above, since a
# classifier's `score` is its mean accuracy -- confirm.
accuracy_score(y_valid, y3_pred)