### Читаем данные

In [68]:
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_percentage_error

# Load the training set and print its schema (columns, non-null counts, dtypes).
data = pd.read_csv("data/train.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206474 entries, 0 to 206473
Data columns (total 76 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   city                                 206474 non-null  object 
 1   floor                                76209 non-null   float64
 2   id                                   206474 non-null  object 
 3   lat                                  206474 non-null  float64
 4   lng                                  206474 non-null  float64
 5   osm_amenity_points_in_0.001          206474 non-null  int64  
 6   osm_amenity_points_in_0.005          206474 non-null  int64  
 7   osm_amenity_points_in_0.0075         206474 non-null  int64  
 8   osm_amenity_points_in_0.01           206474 non-null  int64  
 9   osm_building_points_in_0.001         206474 non-null  int64  
 10  osm_building_points_in_0.005         206474 non-null  int64  
 11  osm_building_

In [69]:
# Correlation of every numeric column with the target price.
# numeric_only=True makes the exclusion of object-dtype columns (e.g. city, id)
# explicit instead of relying on the deprecated implicit dropping, which emits a
# FutureWarning on pandas 1.5 and is no longer the default on pandas 2.x.
data.corrwith(data.per_square_meter_price, numeric_only=True)

  data.corrwith(data.per_square_meter_price)


floor                             0.185877
lat                               0.133961
lng                              -0.223485
osm_amenity_points_in_0.001       0.439348
osm_amenity_points_in_0.005       0.489408
                                    ...   
reform_mean_floor_count_500       0.114983
reform_mean_year_building_1000   -0.083208
reform_mean_year_building_500    -0.073135
total_square                     -0.028438
realty_type                       0.138283
Length: 70, dtype: float64

### Функция предобработки: заполнение пропусков, логарифмирование, min-max нормализация, удаление лишних столбцов

In [70]:
# Columns removed by `manipulate` before modelling. They are also skipped by the
# min-max scaling step, so their contents are never touched.
columns_to_drop = [
    "floor",
    "reform_house_population_1000",
    "reform_house_population_500",
    "reform_mean_floor_count_1000",
    "reform_mean_floor_count_500",
    "reform_mean_year_building_1000",
    "reform_mean_year_building_500",
    "street",
    "city",
    "region",
    "date",
    "id",
    "osm_city_nearest_name",
]


def manipulate(data):
    """Return a preprocessed copy of ``data``; the passed frame is not modified.

    Steps, in order:
      1. Fill missing ``osm_city_nearest_population`` values with the column
         mean and replace the column with its base-2 logarithm.
      2. Min-max scale every column except ``per_square_meter_price`` and the
         ones listed in ``columns_to_drop``. A constant column is mapped to 0
         rather than to NaN (0 / 0).
      3. Drop the columns listed in ``columns_to_drop``.

    The min/max used for scaling are computed from the frame passed in, so
    frames processed by separate calls are each scaled by their own statistics.

    Args:
        data: DataFrame containing ``osm_city_nearest_population`` and every
            column named in ``columns_to_drop``.

    Returns:
        A new DataFrame without the ``columns_to_drop`` columns.

    Raises:
        KeyError: if ``osm_city_nearest_population`` or a column from
            ``columns_to_drop`` is missing.
    """
    population = data["osm_city_nearest_population"]
    # assign() builds a new frame, so the caller's DataFrame stays untouched
    # (attribute assignment on the argument would have written into it).
    data = data.assign(
        osm_city_nearest_population=np.log2(population.fillna(population.mean())),
    )

    # Normalization
    excluded = {"per_square_meter_price", *columns_to_drop}
    for column in data.columns:
        if column in excluded:
            continue
        minimum, maximum = data[column].min(), data[column].max()
        spread = maximum - minimum
        # Zero spread means a constant column: divide by 1 so it becomes 0, not NaN.
        data[column] = (data[column] - minimum) / (spread if spread != 0 else 1)

    return data.drop(columns=columns_to_drop)

In [71]:
# Preprocess the training frame, then put the target on a log2 scale.
data = manipulate(data).assign(
    per_square_meter_price=lambda frame: np.log2(frame["per_square_meter_price"]),
)
# Correlation of every remaining column with the log2 target.
data.corrwith(data["per_square_meter_price"])

lat                             0.156673
lng                            -0.275346
osm_amenity_points_in_0.001     0.357936
osm_amenity_points_in_0.005     0.428900
osm_amenity_points_in_0.0075    0.438521
                                  ...   
per_square_meter_price          1.000000
reform_count_of_houses_1000     0.192430
reform_count_of_houses_500      0.136675
total_square                   -0.119264
realty_type                     0.168014
Length: 63, dtype: float64

### Подбор параметров

In [72]:
def predict_with_param(train_X, train_Y, test_X, test_Y, alpha, max_iter, learning_rate, random_state=None):
    """Fit an ``SGDRegressor`` and evaluate it on a hold-out set.

    Args:
        train_X, train_Y: features and target used for fitting.
        test_X, test_Y: features and target used for evaluation.
        alpha: regularization strength passed to ``SGDRegressor``.
        max_iter: maximum number of passes over the training data.
        learning_rate: learning-rate schedule name passed to ``SGDRegressor``.
        random_state: optional seed for the estimator's data shuffling. The
            default ``None`` keeps the previous (unseeded) behavior; pass an
            integer to make repeated runs comparable.

    Returns:
        ``(model, mape)``: the fitted estimator and the value returned by
        ``mean_absolute_percentage_error(test_Y, predictions)``. The error is
        computed in whatever scale ``test_Y`` is given in.
    """
    model = SGDRegressor(
        alpha=alpha,
        max_iter=max_iter,
        learning_rate=learning_rate,
        random_state=random_state,
    ).fit(train_X, train_Y)
    predicted = model.predict(test_X)
    mape = mean_absolute_percentage_error(test_Y, predicted)
    return model, mape

In [73]:
# Positional hold-out split: the first 80% of rows train, the remaining 20% validate.
train_size = int(0.8 * len(data))
features = data.drop(columns=["per_square_meter_price"])
target = data.per_square_meter_price
train_X, test_X = features.iloc[:train_size], features.iloc[train_size:]
train_Y, test_Y = target.iloc[:train_size], target.iloc[train_size:]

# Full grid: alpha 0.0001..0.001 (outer), max_iter 100..1000 (middle),
# learning-rate schedule (inner) - 300 combinations in total.
params_permutations = [
    (alpha_step / 10000, iterations, schedule)
    for alpha_step in range(1, 11)
    for iterations in range(100, 1001, 100)
    for schedule in ["constant", "invscaling", "adaptive"]
]

# Keep the model with the lowest hold-out MAPE; log every candidate and the running best.
best_model, best_mape = None, None
for alpha, max_iter, learning_rate in params_permutations:
    model, mape = predict_with_param(
        train_X, train_Y, test_X, test_Y, alpha, max_iter, learning_rate
    )
    is_improvement = best_mape is None or mape < best_mape
    if is_improvement:
        best_model, best_mape = model, mape
    print("CURRENT:", model, mape)
    print("BEST:", best_model, best_mape)

CURRENT: SGDRegressor(learning_rate='constant', max_iter=100) 0.06748472290323168
BEST: SGDRegressor(learning_rate='constant', max_iter=100) 0.06748472290323168
CURRENT: SGDRegressor(max_iter=100) 0.0677113663684961
BEST: SGDRegressor(learning_rate='constant', max_iter=100) 0.06748472290323168
CURRENT: SGDRegressor(learning_rate='adaptive', max_iter=100) 0.06711585543185275
BEST: SGDRegressor(learning_rate='adaptive', max_iter=100) 0.06711585543185275
CURRENT: SGDRegressor(learning_rate='constant', max_iter=200) 0.06737193543632625
BEST: SGDRegressor(learning_rate='adaptive', max_iter=100) 0.06711585543185275
CURRENT: SGDRegressor(max_iter=200) 0.06762214185526637
BEST: SGDRegressor(learning_rate='adaptive', max_iter=100) 0.06711585543185275
CURRENT: SGDRegressor(learning_rate='adaptive', max_iter=200) 0.06711925016338778
BEST: SGDRegressor(learning_rate='adaptive', max_iter=100) 0.06711585543185275
CURRENT: SGDRegressor(learning_rate='constant', max_iter=300) 0.06753969506195684
BEST:

### Смотрим качество модели (R² на обучающих данных)

In [74]:
# Refit the selected estimator on the whole training frame and report its
# score on that same data (an in-sample figure, not a validation result).
Y = data["per_square_meter_price"]
X = data.drop(columns=["per_square_meter_price"])

# fit() hands back the estimator it was called on, so `model` is `best_model` refitted.
model = best_model.fit(X, Y)
model.score(X, Y)

0.41766414263378204

### Запускаем на тестовых данных

In [75]:
# Run the submission features through the same preprocessing and predict.
# The model was trained on a log2 target, so `result` is on the log2 scale.
test_data = manipulate(pd.read_csv("data/test_x.csv"))
result = model.predict(test_data)

### Записываем результаты в `result.csv`

In [86]:
# Fill the submission template with predictions and save it.
sample = (
    pd.read_csv("data/sample_submission.csv")
    # Predictions are matched to template rows by position; extra predictions are cut off.
    .assign(per_square_meter_price=lambda frame: result[: len(frame)])
    # Undo the log2 transform of the target.
    # NOTE(review): the extra division by 4 is not explained in the notebook - confirm it is intentional.
    .assign(
        per_square_meter_price=lambda frame: frame["per_square_meter_price"].apply(
            lambda log_price: 2 ** log_price / 4
        )
    )
)
sample.to_csv("result.csv", index=False)