### Читаем данные

In [48]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_percentage_error

data = pd.read_csv("data/train.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73995 entries, 0 to 73994
Data columns (total 72 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   user_id                         73995 non-null  uint64 
 1   org_id                          73995 non-null  uint64 
 2   rating                          73995 non-null  float64
 3   ts                              73995 non-null  int64  
 4   user_city                       73995 non-null  object 
 5   org_city                        73995 non-null  object 
 6   average_bill                    45433 non-null  float64
 7   rating_org                      73995 non-null  float64
 8   rubrics                         73995 non-null  object 
 9   food_delivery                   73995 non-null  int64  
 10  breakfast                       73995 non-null  int64  
 11  takeaway                        73995 non-null  int64  
 12  summer_terrace                  

In [49]:
data.corrwith(data.rating_org)

  data.corrwith(data.rating_org)


user_id                  0.004119
org_id                  -0.021809
rating                   0.269386
ts                       0.040647
average_bill             0.068525
                           ...   
accepted_credit_cards   -0.015476
kalyan                   0.019148
teahouse                 0.022627
bread_from_tandoor       0.014323
handmade_goods           0.028334
Length: 69, dtype: float64

### Функция преобразований для улучшения результата

In [50]:
columns_to_drop = [
    "user_id",
    "org_id",
    "ts",
    "breakfast",
    "business_lunch",
    "closed_for_quarantine",
    "special_menu",
    "sports_broadcasts",
    "projector",
    "vinotheque",
    "handmade_goods",
    "parking_disabled", "toilet_for_disabled", "wheelchair_accessible", "automatic_door",
]


def manipulate(data):
    data = data.assign(
        disabled=(
                (
                        data.parking_disabled + data.toilet_for_disabled + data.wheelchair_accessible + data.automatic_door
                ) / 2
        ).astype(bool),
    )
    data.average_bill = pd.cut(
        data.average_bill.fillna(0),
        bins=[0, 250, 500, 1500],
        labels=["bill_0", "bill_1", "bill_2"],
        ordered=False
    )
    # Нормализация rating
    minimum, maximum = data.rating.min(), data.rating.max()
    data.rating = data.rating.apply(lambda x: (x - minimum) / (maximum - minimum))
    data = pd.get_dummies(data, columns=["user_city", "org_city", "rubrics", "average_bill"])
    data = data.drop(columns=columns_to_drop)
    return data

In [51]:
data = manipulate(data)
data.corrwith(data.rating_org)

rating                 0.269386
rating_org             1.000000
food_delivery         -0.105047
takeaway               0.058840
summer_terrace        -0.022908
                         ...   
rubrics_Столовая      -0.078229
rubrics_Суши-бар      -0.084240
average_bill_bill_0         NaN
average_bill_bill_1   -0.166023
average_bill_bill_2    0.049127
Length: 75, dtype: float64

### Находим оптимальное `n_neighbors`

In [52]:
def predict_for_n_neighbors(train_X, train_Y, test_X, test_Y, n_neighbors):
    model = KNeighborsRegressor(n_neighbors=n_neighbors).fit(train_X, train_Y)
    predicted = model.predict(test_X)
    mape = mean_absolute_percentage_error(test_Y, predicted)
    return model, mape

In [53]:
train_size = int(0.8 * len(data))
train_X, train_Y = data.drop(columns=["rating_org"])[:train_size], data.rating_org[:train_size]
test_X, test_Y = data.drop(columns=["rating_org"])[train_size:], data.rating_org[train_size:]

best_model, best_mape = None, None
n_neighbors = 1
while n_neighbors <= 1000:
    model, mape = predict_for_n_neighbors(train_X, train_Y, test_X, test_Y, n_neighbors)
    if best_mape is None or mape < best_mape:
        best_model, best_mape = model, mape
    print("CURRENT:", model, mape)
    print("BEST:", best_model, best_mape)
    n_neighbors += n_neighbors // 10 + 1

CURRENT: KNeighborsRegressor(n_neighbors=1) 0.03690617706144547
BEST: KNeighborsRegressor(n_neighbors=1) 0.03690617706144547
CURRENT: KNeighborsRegressor(n_neighbors=2) 0.03550802695628921
BEST: KNeighborsRegressor(n_neighbors=2) 0.03550802695628921
CURRENT: KNeighborsRegressor(n_neighbors=3) 0.035289701135277886
BEST: KNeighborsRegressor(n_neighbors=3) 0.035289701135277886
CURRENT: KNeighborsRegressor(n_neighbors=4) 0.0350714902132443
BEST: KNeighborsRegressor(n_neighbors=4) 0.0350714902132443
CURRENT: KNeighborsRegressor() 0.03503407657142852
BEST: KNeighborsRegressor() 0.03503407657142852
CURRENT: KNeighborsRegressor(n_neighbors=6) 0.03499214358147343
BEST: KNeighborsRegressor(n_neighbors=6) 0.03499214358147343
CURRENT: KNeighborsRegressor(n_neighbors=7) 0.03507423897444114
BEST: KNeighborsRegressor(n_neighbors=6) 0.03499214358147343
CURRENT: KNeighborsRegressor(n_neighbors=8) 0.035390836728822136
BEST: KNeighborsRegressor(n_neighbors=6) 0.03499214358147343
CURRENT: KNeighborsRegres

### Смотрим счет модели с оптимальным числом `n_neighbors`

In [54]:
X = data.drop(columns=["rating_org"])
Y = data.rating_org

model = best_model.fit(X, Y)
model.score(X, Y)

0.5125856486600303

### Запускаем на тестовых данных

In [55]:
test_data = pd.read_csv("data/test_x.csv")
test_data = manipulate(test_data)
result = model.predict(test_data)

### Записываем результаты в `result.csv`

In [56]:
sample = pd.read_csv("data/sample_submission.csv")
sample["rating_org"] = result[:len(sample)]
sample.to_csv("result.csv", index=False)