### _imports_

In [1]:
import warnings
warnings.filterwarnings('ignore')

## 1. _Перебор параметров_

In [2]:
import functools
import itertools
import operator

def produce_params(grid):
    return [ dict(zip(grid.keys(), x)) for x in itertools.product(*grid.values()) ]

In [3]:
def square_distance(x):
    return 1/x*2

params_grid = {
    'n_neighbors': range(1,101,2), # Перебираем от 1 до 100 с шагом 2
    'weights': ['uniform', 'distance', square_distance], # с каким весом берём каждого соседа. uniform - с одинаковым, distance - обратно пропорционально расстоянию. Ещё можно передать свою функцию
    'algorithm': ['auto'], # алгоритм поиска ближайших соседей. Влияет только на скорость обучения
    'leaf_size': [30], # параметры некоторых алгоритмов поиска ближ соседей. Влияет только на скорость обучения
    'metric': ['minkowski'], # метрика расстояния, используемая в алгоритме поиска ближайших соседей
    'p': [2] # параметр метрики, используемой в методе ближайших соседей 
}

In [4]:
param_dicts = produce_params(params_grid)

assert len(param_dicts) == functools.reduce(operator.mul, [len(param_values) for param_values in params_grid.values()], 1)

for params in param_dicts[:5]:
    print(params)

{'n_neighbors': 1, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'p': 2}
{'n_neighbors': 1, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'p': 2}
{'n_neighbors': 1, 'weights': <function square_distance at 0x000002421338C620>, 'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'p': 2}
{'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'p': 2}
{'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'p': 2}


## 2. _Titanic_ 

Предсказание выживших титаника. Загрузите файл и по выбранным вами числовым признакам постройте наилучшую модель ближайших соседей для предсказания колонки *Survived*

In [5]:
import pandas as pd
import numpy as np
import datetime as dt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [6]:
titanic_df = pd.read_csv('titanic.csv')
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
def prepare_titanic_dataset(df):
    # age
    df['Age'] = df['Age'].fillna(0)
    # sex
    df = pd.get_dummies(df, columns=['Sex'])
    # deck
    df[['Cabin', 'Embarked']] = df[['Cabin', 'Embarked']].fillna('Unknown')
    df['Deck'] = df['Cabin'].apply(lambda s: s[0])
    le = LabelEncoder().fit(df['Deck'])
    df['Deck'] = le.transform(df['Deck'])
    
    return df

In [8]:
titanic = prepare_titanic_dataset(titanic_df)
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_female,Sex_male,Deck
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,Unknown,S,0,1,7
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C,1,0,2
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,Unknown,S,1,0,7
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,S,1,0,2
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,Unknown,S,0,1,7


In [9]:
knn = KNeighborsClassifier()

features_list = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female', 'Sex_male', 'Deck']

X, y = titanic[features_list], titanic['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [10]:
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print( acc )

0.65625



Добейтесь наилучшего качества (accuracy) на тестовой выборке

In [11]:
def best_params_score(param_dicts):
    best_acc = 0
    best_params = {}

    for params in param_dicts:
        knn = KNeighborsClassifier(**params)
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        if acc > best_acc:
            best_acc = acc
            best_params = params

    return (best_acc, best_params)

best_acc, best_params = best_params_score(param_dicts)
print('Best accuracy: %f' % best_acc)
print(best_params)

Best accuracy: 0.718750
{'n_neighbors': 1, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'p': 2}



## 3.$^{*}$ _Las Vegas_

[Las Vegas Strip Data Set](https://archive.ics.uci.edu/ml/datasets/Las+Vegas+Strip)

_Попробуем предсказывать, что рейтинг (**Score**) на TripAdvisor > 3_.

In [12]:
lasvegas_df = pd.read_csv('LasVegasTripAdvisorReviews-Dataset.csv', sep=';')
lasvegas_df.head(2)

Unnamed: 0,User country,Nr. reviews,Nr. hotel reviews,Helpful votes,Score,Period of stay,Traveler type,Pool,Gym,Tennis court,Spa,Casino,Free internet,Hotel name,Hotel stars,Nr. rooms,User continent,Member years,Review month,Review weekday
0,USA,11,4,13,5,Dec-Feb,Friends,NO,YES,NO,NO,YES,YES,Circus Circus Hotel & Casino Las Vegas,3,3773,North America,9,January,Thursday
1,USA,119,21,75,3,Dec-Feb,Business,NO,YES,NO,NO,YES,YES,Circus Circus Hotel & Casino Las Vegas,3,3773,North America,3,January,Friday


In [13]:
lasvegas_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504 entries, 0 to 503
Data columns (total 20 columns):
User country         504 non-null object
Nr. reviews          504 non-null int64
Nr. hotel reviews    504 non-null int64
Helpful votes        504 non-null int64
Score                504 non-null int64
Period of stay       504 non-null object
Traveler type        504 non-null object
Pool                 504 non-null object
Gym                  504 non-null object
Tennis court         504 non-null object
Spa                  504 non-null object
Casino               504 non-null object
Free internet        504 non-null object
Hotel name           504 non-null object
Hotel stars          504 non-null object
Nr. rooms            504 non-null int64
User continent       504 non-null object
Member years         504 non-null int64
Review month         504 non-null object
Review weekday       504 non-null object
dtypes: int64(6), object(14)
memory usage: 78.8+ KB


In [14]:
lasvegas_df.describe(include=[np.object, pd.Categorical]).T

Unnamed: 0,count,unique,top,freq
User country,504,48,USA,217
Period of stay,504,4,Mar-May,128
Traveler type,504,5,Couples,214
Pool,504,2,YES,480
Gym,504,2,YES,480
Tennis court,504,2,NO,384
Spa,504,2,YES,384
Casino,504,2,YES,456
Free internet,504,2,YES,480
Hotel name,504,21,Encore at wynn Las Vegas,24


In [15]:
def prepare_lasvegas_dataset(df, ohe_cols, yesno_cols):
    # score (what we'll try to predict)
    df['Score over 3'] = df['Score'] > 3
    
    # hotel stars
    df['Hotel stars'] = df['Hotel stars'].apply(lambda s: np.float16(s.replace(',','.')))
    # OHE
    df = pd.get_dummies(df, columns=ohe_cols)
    # label encoding
    # # YES, NO - good candidates for label encoding
    le = LabelEncoder().fit(df[yesno_cols[0]])
    for yn in yesno_cols:
        df[yn] = le.transform(df[yn])
    # month
    df['Review month'] = df['Review month'].apply(lambda m: dt.datetime.strptime(m, '%B').month)
    # weekday
    wd = [ 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday' ]
    df['Review weekday'] = df['Review weekday'].map(dict(zip(*(wd, range(len(wd))))))
    
    return df

In [16]:
lasvegas = prepare_lasvegas_dataset(lasvegas_df, 
                                    ['User country', 'Period of stay', 'Traveler type', 'Hotel name', 'User continent'], # OHE 
                                    ['Pool', 'Gym', 'Tennis court', 'Spa', 'Casino', 'Free internet']) # YES|NO label

In [17]:
knn = KNeighborsClassifier()

X, y = lasvegas.drop('Score over 3', axis=1), lasvegas['Score over 3']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

_KNN с параметрами по умолчаиню_.

In [18]:
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print( acc )

0.7920792079207921


_Найдём наилучшие параметры из того же самого набора возможных комбинаций_.

In [19]:
best_acc, best_params = best_params_score(param_dicts)
print('Best accuracy: %f' % best_acc)
print(best_params)

Best accuracy: 0.861386
{'n_neighbors': 25, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'p': 2}
