## Прогнозирование цен домов методами глубокого машинного обучения и с помощью автоэнкодера

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import datetime

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, Normalizer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

import torch
import torchvision
from torch.utils.data import Dataset, DataLoader

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchvision import transforms as T

import torchmetrics
from torchmetrics import MeanSquaredLogError

# from pytorchtools import EarlyStopping

In [3]:
#pip install category-encoders

In [4]:
#pip install torchmetrics

In [5]:
# pip install pytorchtools

In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [7]:
# если надо запустить в гугл сколар
# from google.colab import drive
# drive.mount('/content/drive')

In [8]:
data_train = pd.read_csv('train.csv.zip', engine='python', sep = ';') # index_col=0? /content/drive/MyDrive/Colab Notebooks/
data_train.head()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f283,f284,f285,f286,f287,f288,f289,f290,f291,target
0,0.0,19.08.2014,37.0,19.0,5.0,16.0,1.0,1984.0,1.0,9.0,...,6.0,2.0,0.0,4.0,8.0,0.0,2.0,21.0,5.0,4900000.0
1,1.0,22.01.2014,53.0,30.0,10.0,12.0,1.0,1983.0,2.0,8.0,...,290.0,120.0,24.0,92.0,165.0,1.0,75.0,167.0,13.0,13500000.0
2,2.0,30.07.2012,38.0,17.0,11.0,,,,,,...,15.0,4.0,0.0,13.0,28.0,1.0,2.0,82.0,9.0,6100000.0
3,3.0,21.08.2012,75.0,,2.0,,,,,,...,4.0,2.0,0.0,3.0,15.0,1.0,0.0,24.0,4.0,5662500.0
4,4.0,31.12.2013,35.0,19.0,6.0,12.0,1.0,1971.0,1.0,10.0,...,18.0,1.0,0.0,5.0,14.0,0.0,4.0,44.0,5.0,4999000.0


In [9]:
data_test = pd.read_csv('test.csv.zip', engine='python', sep = ';') # index_col=0
data_test.head()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f282,f283,f284,f285,f286,f287,f288,f289,f290,f291
0,19999,19.06.2014,41,20.0,9.0,14.0,1.0,1984.0,1.0,8.0,...,33,7,4,0,10,15,0,0,40,3
1,20000,25.02.2014,58,43.0,2.0,9.0,1.0,1971.0,3.0,6.0,...,20,5,3,0,7,25,0,10,56,7
2,20001,29.10.2014,35,14.0,4.0,23.0,1.0,2005.0,1.0,9.0,...,28,9,3,0,11,23,0,5,77,17
3,20002,27.11.2013,33,19.0,3.0,9.0,1.0,1971.0,1.0,6.0,...,19,13,8,1,8,13,0,2,46,0
4,20003,24.04.2013,53,30.0,11.0,,,,,,...,45,15,4,0,13,28,1,0,72,5


In [10]:
id_test = data_test['id']

## Обработка данных

In [12]:
# номера столбцов с временем 1,154,155,177,178,200,201,223,224,246,247,269,270
# кроме 1го все остальные даты битые (в столбиках некоторые даты даны полностью, а некоторые частично дд/гггг или дд/гг)

# Остальные столбики:
# 11 инвестиция или собственность: перекодировать 0-1
# 12 названия районов? one-hot-encoder
# 29, 33-40, 107, 115, 119 yes-no: перекод 1-0
# 75 имеет к числам странную приписку FDKEFFLd
# 153 - no data, poor, satisfactory, good, excellent

In [13]:
# удаляем строки дубликаты
data_train = data_train.drop_duplicates(keep='first')

In [14]:
# удаляем столбцы с битыми датами
data_train = data_train.drop(columns=['f154', 'f155', 'f177', 'f178', 'f200', 'f201', 'f223', 'f224', 'f246', 'f247', 'f269', 'f270'])

In [15]:
# 75 имеет к числам странную приписку FDKEFFLd
# удалим буквы после числа в 75м столбике
# data_train['f75'].notna() = int(str(data_train['f75'].notna())[:-8])

def remove_letters(s):
    return s[:-8] if isinstance(s, str) else s

# Применяем функцию к столбцу датафрейма
data_train['f75'] = data_train['f75'].apply(lambda x: remove_letters(x))

In [16]:
# удалим строки где таргет = nan
data_train = data_train.dropna(subset=['target'])

In [17]:
# ищем дублированные столбцы
duplicates = data_train.columns[data_train.T.duplicated(keep=False).values]
duplicates

Index(['f17', 'f20', 'f53', 'f56'], dtype='object')

In [18]:
# удаляем одинаковые столбцы (удалилось 2шт 'f53', 'f56')
data_train = data_train.T.drop_duplicates(keep='first').T

In [19]:
# Находим количество пропущенных данных в каждом столбце
missing_data = data_train.isnull().sum()

# т.к. у нас осталось почти 20000 строк, то лучше удалить столбцы, где больше 30% (6000 строк) пропущ данных
# лучше не заполнять эти 30% nan какими-то значениями, т.к. это может исказить данные
missing_data.sort_values(ascending=False)[:15]

# удаляем эти столбцы
data_train = data_train.drop(columns=['f24', 'f7', 'f10', 'f162', 'f163', 'f161', 'f6', 'f8', 'f9', 'f5'])

In [20]:
missing_data.sort_values(ascending=False)[:12]

f24     9423
f7      8889
f10     8809
f161    8704
f162    8704
f163    8704
f5      6223
f6      6223
f8      6223
f9      6223
f18     4340
f21     4338
dtype: int64

In [21]:
# Находим количество нулей в каждом столбце
zeros_count = data_train.eq(0).sum()
zeros_count.sort_values(ascending=False)[:15]

# есть столбцы, где очень много нулей
# те столбцы, где больше 90% нулей (18000 строк) лучше удалить, так как они могут зашумлять данные
data_train = data_train.drop(columns=['f173', 'f196', 'f170', 'f219', 'f193', 'f174', 'f169', 'f30', 'f216', 'f239', 'f242'])

In [22]:
zeros_count.sort_values(ascending=False)[:13]

f173    19905
f196    19616
f170    19458
f219    19250
f193    19108
f174    18980
f169    18878
f30     18752
f216    18723
f239    18276
f242    18239
f26     17970
f23     17944
dtype: int64

In [23]:
data_train

Unnamed: 0,id,f1,f2,f3,f4,f11,f12,f13,f14,f15,...,f283,f284,f285,f286,f287,f288,f289,f290,f291,target
0,0.0,19.08.2014,37.0,19.0,5.0,Investment,Birjulevo Zapadnoe,8464344.053,85721.0,0.075206,...,6.0,2.0,0.0,4.0,8.0,0.0,2.0,21.0,5.0,4900000.0
1,1.0,22.01.2014,53.0,30.0,10.0,Investment,Tverskoe,7307410.574,75377.0,0.065444,...,290.0,120.0,24.0,92.0,165.0,1.0,75.0,167.0,13.0,13500000.0
2,2.0,30.07.2012,38.0,17.0,11.0,Investment,Beskudnikovskoe,3292112.223,73148.0,0.063747,...,15.0,4.0,0.0,13.0,28.0,1.0,2.0,82.0,9.0,6100000.0
3,3.0,21.08.2012,75.0,,2.0,OwnerOccupier,Poselenie Sosenskoe,66772450.69,9553.0,0.336177,...,4.0,2.0,0.0,3.0,15.0,1.0,0.0,24.0,4.0,5662500.0
4,4.0,31.12.2013,35.0,19.0,6.0,Investment,Ochakovo-Matveevskoe,17526506.21,118843.0,0.133373,...,18.0,1.0,0.0,5.0,14.0,0.0,4.0,44.0,5.0,4999000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19994,19994.0,28.03.2012,45.0,25.0,3.0,Investment,Kuz'minki,7984443.568,142243.0,0.372735,...,7.0,2.0,0.0,6.0,23.0,0.0,7.0,62.0,13.0,6100000.0
19995,19995.0,01.08.2014,48.0,30.0,3.0,Investment,Gol'janovo,14286990.83,157010.0,0.389354,...,5.0,2.0,0.0,6.0,12.0,0.0,2.0,44.0,5.0,1000000.0
19996,19996.0,21.12.2011,52.0,30.0,11.0,Investment,Severnoe Butovo,8889466.752,90114.0,0.579645,...,3.0,1.0,0.0,5.0,12.0,1.0,0.0,27.0,3.0,2000000.0
19997,19997.0,02.06.2015,27.0,14.0,3.0,Investment,Ochakovo-Matveevskoe,17526506.21,118843.0,0.133373,...,32.0,12.0,3.0,10.0,25.0,1.0,8.0,83.0,2.0,1500000.0


In [24]:
# Преобразуем столбец с датой в тип данных datetime
data_train['f1'] = pd.to_datetime(data_train['f1'], dayfirst=True)

# Создаем новые столбцы 'day', 'month', 'year' и заполняем их данными из столбца 'date'
data_train['day'] = data_train['f1'].dt.day
data_train['month'] = data_train['f1'].dt.month
data_train['year'] = data_train['f1'].dt.year

# Удаляем столбец 'date', так как теперь у нас есть отдельные столбцы для дня, месяца и года
data_train = data_train.drop(columns=['f1'])

In [25]:
# Эти столбцы будем преобразовывать в трансформере
categorical_features = ['f107', 'f11', 'f115', 'f119', 'f29', 'f33',
       'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40']
      #  f153 f12
# Находим все столбцы с символьным типом данных
all_categorical_features = ['f107', 'f11','f12', 'f115', 'f119', 'f153', 'f29', 'f33',
       'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40']

In [26]:
# все столбцы с числовыми типами данных для трансформера
numerical_features = data_train.columns.difference(['target', 'id']).difference(all_categorical_features)
numerical_features

Index(['day', 'f100', 'f101', 'f102', 'f103', 'f104', 'f105', 'f106', 'f108',
       'f109',
       ...
       'f92', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99', 'month',
       'year'],
      dtype='object', length=243)

In [27]:
numerical = Pipeline(
    steps=[
    ('missing_num', SimpleImputer(strategy = 'most_frequent')),
    ('scaler', StandardScaler())]
)

# status_map = [{
#     'col': ['f11'],
#     'mapping': {'Investment':0, 'OwnerOccupier':1}
#     }]

# status_transformer = Pipeline(
#     steps=[
#     ('missing_num', SimpleImputer(strategy = 'most_frequent')),
#     ('ce',ce.OrdinalEncoder(mapping=status_map)),
#     ('scaler', StandardScaler())
#     ])

# 153 столбик - no data? заменить на nan?, poor, satisfactory, good, excellent
class_map = [{
    'col': 'f153',
    'mapping': {'no data': np.NaN, 'poor':2, 'satisfactory':3, 'good':4, 'excellent':5}
    }]

class_transformer = Pipeline(
    steps=[
    ('missing_num', SimpleImputer(strategy = 'most_frequent')),
    ('ce',ce.OrdinalEncoder(mapping=class_map)),
    ('scaler', StandardScaler())
    ])

# Получение категорий по районам из тренировочного набора данных
categories = [data_train['f12'].unique()]

# Создание объекта OneHotEncoder с параметром categories по районам
district = Pipeline(
    steps=[('enc', OneHotEncoder(handle_unknown='ignore', sparse_output=False, categories=categories).set_output(transform="pandas")),
    ('scaler', StandardScaler())]
)

data_transformer = ColumnTransformer([
        ("numerical", numerical, numerical_features),
        # убрать быстую разряженную и затем добавить тип pandas
        ("categorical", OneHotEncoder(sparse_output=False).set_output(transform="pandas"), categorical_features),
        ("district", district, ['f12']),
#         ("ordinal_map", status_transformer, ['f11']),
        ("Class_map", class_transformer, ['f153'])
        ]).set_output(transform='pandas')

In [28]:
df = data_transformer.fit_transform(data_train)
df

Unnamed: 0,numerical__day,numerical__f100,numerical__f101,numerical__f102,numerical__f103,numerical__f104,numerical__f105,numerical__f106,numerical__f108,numerical__f109,...,district__f12_Jakimanka,district__f12_Poselenie Kievskij,district__f12_Vostochnoe,district__f12_Poselenie Shhapovskoe,district__f12_Arbat,district__f12_Poselenie Krasnopahorskoe,district__f12_Molzhaninovskoe,district__f12_Poselenie Klenovskoe,district__f12_Poselenie Mihajlovo-Jarcevskoe,Class_map__f153
0,0.296777,0.678578,-0.911252,-0.726258,0.350472,-0.147267,-0.147267,-0.870920,-1.114427,0.334260,...,-0.050064,-0.007071,-0.015814,-0.010001,-0.022367,-0.027397,-0.010001,-0.007071,-0.007071,0.363201
1,0.642170,1.573793,-0.358610,-0.247537,1.077324,-0.191602,-0.191602,0.751426,1.237313,-1.314839,...,-0.050064,-0.007071,-0.015814,-0.010001,-0.022367,-0.027397,-0.010001,-0.007071,-0.007071,-0.112356
2,1.563218,0.893430,-0.689020,-0.554322,0.524917,-0.175043,-0.175043,-0.242505,-0.130364,-0.475374,...,-0.050064,-0.007071,-0.015814,-0.010001,-0.022367,-0.027397,-0.010001,-0.007071,-0.007071,-0.112356
3,0.527039,0.284684,1.887875,1.019428,0.786584,1.243843,1.243843,2.908965,-0.734526,0.541527,...,-0.050064,-0.007071,-0.015814,-0.010001,-0.022367,-0.027397,-0.010001,-0.007071,-0.007071,-1.539027
4,1.678349,-0.216637,-0.856065,-0.893701,-0.376380,-0.160424,-0.160424,-0.018298,-0.972694,-0.325357,...,-0.050064,-0.007071,-0.015814,-0.010001,-0.022367,-0.027397,-0.010001,-0.007071,-0.007071,0.363201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19994,1.332956,-1.290895,-0.410180,-0.356632,0.001583,-0.167827,-0.167827,2.315590,-0.474208,-0.892050,...,-0.050064,-0.007071,-0.015814,-0.010001,-0.022367,-0.027397,-0.010001,-0.007071,-0.007071,-0.112356
19995,-1.775582,-0.753766,-0.086783,-0.083318,-0.812491,-0.113438,-0.113438,1.198027,-0.642210,-0.430549,...,-0.050064,-0.007071,-0.015814,-0.010001,-0.022367,-0.027397,-0.010001,-0.007071,-0.007071,0.838758
19996,0.527039,0.284684,-0.244790,-0.384062,0.786584,-0.256623,-0.256623,-1.014082,-1.165465,0.347774,...,-0.050064,-0.007071,-0.015814,-0.010001,-0.022367,-0.027397,-0.010001,-0.007071,-0.007071,0.838758
19997,-1.660451,0.857621,-0.805801,-0.567836,0.495843,-0.217109,-0.217109,-0.456795,-0.174567,-0.904039,...,-0.050064,-0.007071,-0.015814,-0.010001,-0.022367,-0.027397,-0.010001,-0.007071,-0.007071,0.363201


In [29]:
df['target']=data_train['target']

In [30]:
# Делаем все то же самое с тестовым набором данных

data_test = data_test.drop(columns=['f154', 'f155', 'f177', 'f178', 'f200', 'f201', 'f223', 'f224', 'f246', 'f247', 'f269', 'f270'])
data_test['f75'] = data_test['f75'].apply(lambda x: remove_letters(x))
data_test = data_test.drop(columns=['f24', 'f7', 'f10', 'f162', 'f163', 'f161', 'f6', 'f8', 'f9', 'f5', 'f53', 'f56',
                                     'f173', 'f196', 'f170', 'f219', 'f193', 'f174', 'f169', 'f30', 'f216', 'f239', 'f242'])
# Преобразуем столбец с датой в тип данных datetime
data_test['f1'] = pd.to_datetime(data_test['f1'], dayfirst=True)

# Создаем новые столбцы 'day', 'month', 'year' и заполняем их данными из столбца 'date'
data_test['day'] = data_test['f1'].dt.day
data_test['month'] = data_test['f1'].dt.month
data_test['year'] = data_test['f1'].dt.year

# Удаляем столбец 'date', так как теперь у нас есть отдельные столбцы для дня, месяца и года
data_test = data_test.drop(columns=['f1'])

data_test = data_transformer.transform(data_test)
data_test

Unnamed: 0,numerical__day,numerical__f100,numerical__f101,numerical__f102,numerical__f103,numerical__f104,numerical__f105,numerical__f106,numerical__f108,numerical__f109,...,district__f12_Jakimanka,district__f12_Poselenie Kievskij,district__f12_Vostochnoe,district__f12_Poselenie Shhapovskoe,district__f12_Arbat,district__f12_Poselenie Krasnopahorskoe,district__f12_Molzhaninovskoe,district__f12_Poselenie Klenovskoe,district__f12_Poselenie Mihajlovo-Jarcevskoe,Class_map__f153
0,0.296777,0.463727,-0.561364,-0.479834,0.176028,-0.203848,-0.203848,0.246323,-1.007603,0.053718,...,-0.050064,-0.007071,-0.015814,-0.010001,-0.022367,-0.027397,-0.010001,-0.007071,-0.007071,-0.112356
1,0.987563,-0.324063,-0.737485,-0.823908,-0.463602,-0.279078,-0.279078,1.768835,-0.954210,-0.590208,...,-0.050064,-0.007071,-0.015814,-0.010001,-0.022367,-0.027397,-0.010001,-0.007071,-0.007071,-0.112356
2,1.448087,-1.290895,-0.697791,-0.789605,-1.248603,-0.238299,-0.238299,0.969924,-0.217018,-1.053253,...,-0.050064,-0.007071,-0.015814,-0.010001,-0.022367,-0.027397,-0.010001,-0.007071,-0.007071,-0.112356
3,1.217825,1.752836,-1.073843,-1.092562,1.222695,-0.176243,-0.176243,0.304977,-0.950661,-0.350683,...,-0.050064,-0.007071,-0.015814,-0.010001,-0.022367,-0.027397,-0.010001,-0.007071,-0.007071,0.363201
4,0.872432,-1.362512,-0.620924,-0.115208,-1.306751,-0.245373,-0.245373,1.154577,-0.438988,-0.352353,...,-0.050064,-0.007071,-0.015814,-0.010001,-0.022367,-0.027397,-0.010001,-0.007071,-0.007071,0.838758
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4373,-1.660451,-0.216637,-0.600847,-0.443184,0.495843,-0.241581,-0.241581,0.186655,-0.544595,-0.634949,...,-0.050064,-0.007071,-0.015814,-0.010001,-0.022367,-0.027397,-0.010001,-0.007071,-0.007071,0.363201
4374,-1.660451,0.284684,1.301815,1.603085,0.030657,0.332378,0.332378,0.455313,-0.329465,0.792188,...,-0.050064,-0.007071,-0.015814,-0.010001,-0.022367,-0.027397,-0.010001,-0.007071,-0.007071,-1.539027
4375,-0.739403,0.606961,-0.635434,-0.367700,0.292324,-0.172995,-0.172995,0.180174,-1.049232,-0.265311,...,-0.050064,-0.007071,-0.015814,-0.010001,-0.022367,-0.027397,-0.010001,-0.007071,-0.007071,0.838758
4376,1.217825,-1.040234,-0.439185,-0.338703,-1.045084,-0.176091,-0.176091,1.275933,-0.744410,-0.461239,...,-0.050064,-0.007071,-0.015814,-0.010001,-0.022367,-0.027397,-0.010001,-0.007071,-0.007071,0.838758


## Обучение на части выборки и проверка на валидационном наборе
Тут подбираются параметры и идет проверка на валидационной выборке.
Представлена модель уже с подобранными параметрами. 

In [32]:
# Создание PyTorch Dataset
class TensorDataset(Dataset):
    def __init__(self, features, target):
        self.features = features
        self.target = target

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.target[idx]
    
# Преобразование данных в PyTorch Tensor
features_tensor = torch.tensor(df.drop('target', axis=1).values, dtype=torch.float32)
target_tensor = torch.tensor(df['target'], dtype=torch.float32)
new_shape = (len(target_tensor), 1)
target_tensor = target_tensor.view(new_shape)

# Определяем размер тренировочного и валидационного наборов и делим их
train_size = int(0.9 * len(features_tensor)) # выбираем 90% тренировочный
val_size = len(features_tensor) - train_size

train_dataset, val_dataset = torch.utils.data.random_split(
    TensorDataset(features_tensor, target_tensor), [train_size, val_size]
)

# Создание DataLoader из Dataset
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [33]:
len(train_dataset)

17999

In [34]:
# количество входных признаков в модель
INP_SHAPE = len(df.axes[1])-1
INP_SHAPE

416

### Автоэнкодер

In [36]:
class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
#              nn.Conv1d(in_channels=1, out_channels=4, kernel_size=3),  
#             nn.ReLU(),
#             nn.MaxPool1d(kernel_size=2), 
            nn.Linear(INP_SHAPE, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, INP_SHAPE),
            nn.ReLU()
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

autoencoder = Autoencoder().to(device)
optimizer = optim.Adam(autoencoder.parameters(), lr=0.001) 
criterion = nn.MSELoss()

epochs = 10
for epoch in range(epochs):
    
    epoch_loss = 0
    autoencoder.train()
    for batch_features, _ in train_loader:
        
        optimizer.zero_grad()

        batch_features = batch_features.to(device)

        predictions = autoencoder(batch_features)
        
        loss = criterion(predictions, batch_features)

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()

    avg_epoch_loss = epoch_loss / len(train_loader)

    print("epoch : {}/{}, loss = {:.6f}".format(epoch + 1, epochs, avg_epoch_loss))
    

epoch : 1/10, loss = 0.629663
epoch : 2/10, loss = 0.563265
epoch : 3/10, loss = 0.559843
epoch : 4/10, loss = 0.555662
epoch : 5/10, loss = 0.551535
epoch : 6/10, loss = 0.548386
epoch : 7/10, loss = 0.549336
epoch : 8/10, loss = 0.548165
epoch : 9/10, loss = 0.544407
epoch : 10/10, loss = 0.544564


### Модель

In [38]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        
        self.model = nn.Sequential(
            nn.Linear(128, 84),
            nn.ReLU(),
            nn.Linear(84, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
#             nn.Dropout(p=0.2),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 4),
            nn.ReLU(),
            nn.Linear(4, 1),
            nn.Softplus()
        )
        
#         инициализация весов нейронной сети
        for m in self.model:
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0.0) #constant_(m.bias, 0.0) 

    def forward(self, x):
        return self.model(x)
    

model = NeuralNetwork().to(device)

# optimizer = optim.Adam(list(autoencoder.encoder.parameters()) + list(model.parameters()), lr=0.01) лучше метрика
optimizer = optim.Adam(model.parameters(), lr=0.01) 
# выбираем Адам т.к. чаще всего он самый оптимальный
# пробовала lr 1е-2 и 1е-4, оба очень сильно ухудшают метрику и увеличивают loss

# optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) плохо работает
# optimizer = optim.Adagrad(model.parameters(), lr=0.01) Адаград неплохо отработал и показал хорошие рез-ты, можно взять на вооружение
# другие оптимайзеры работают хуже

# criterion = nn.MSELoss()
criterion = MeanSquaredLogError()

# early_stopping = EarlyStopping(patience=5) НЕ РАБОТАЕТ!!! НАПИСАЛА СОБСТВЕННОЕ УСЛОВИЕ ОСТАНОВКИ ОБУЧЕНИЯ
patience = 10
best_loss = float('inf')
counter = 0


epochs = 10
for epoch in range(epochs):
    
    train_losses = 0
    model.train()
    for batch_features, targets in train_loader:
        
        optimizer.zero_grad()

        batch_features = batch_features.to(device)
        batch_features = autoencoder.encoder(batch_features)

        outputs = model(batch_features)
        outputs = outputs.view(-1, 1)

        train_loss = criterion(outputs, targets)

        train_loss.backward()

        optimizer.step()

        train_losses += train_loss.item()

    # compute the epoch training loss
    train_losses = train_losses / len(train_loader)

    # display the epoch training loss
    print("epoch : {}/{}, loss_train = {:.6f}".format(epoch + 1, epochs, train_losses))
    
    model.eval() 
    val_losses = 0
    with torch.no_grad():
        
        for batch_features, targets in val_loader:

            batch_features = batch_features.to(device)
            batch_features = autoencoder.encoder(batch_features)

            outputs = model(batch_features)
            outputs = outputs.view(-1, 1)
            targets = targets.view(-1, 1)

            # compute training reconstruction loss
            val_loss = criterion(outputs, targets) #MeanSquaredLogError(outputs, targets)

            # add the mini-batch training loss to epoch loss
            val_losses += val_loss.item()

            # compute the epoch training loss
        val_losses = val_losses / len(val_loader)
        print("epoch : {}/{}, loss_test = {:.6f}".format(epoch + 1, epochs, val_losses)) 
        
#     # early_stopping needs the validation loss to check if it has decresed,
#     # and if it has, it will make a checkpoint of the current model
#     early_stopping(val_losses, model)

#     if early_stopping.early_stop:
#         print("Early stopping")
#         break

#  early_stopping не работал, пришлось написать такое условие
        if val_losses < best_loss:
            best_loss = val_losses
            counter = 0
        else:
            counter += 1

        if counter >= patience:
            print("No improvement in loss for 10 epochs. Stopping training.")
            break

epoch : 1/10, loss_train = 13.226584
epoch : 1/10, loss_test = 0.401345
epoch : 2/10, loss_train = 0.406940
epoch : 2/10, loss_test = 0.393676
epoch : 3/10, loss_train = 0.400343
epoch : 3/10, loss_test = 0.384292
epoch : 4/10, loss_train = 0.377174
epoch : 4/10, loss_test = 0.343056
epoch : 5/10, loss_train = 0.336157
epoch : 5/10, loss_test = 0.318789
epoch : 6/10, loss_train = 0.323668
epoch : 6/10, loss_test = 0.312357
epoch : 7/10, loss_train = 0.318181
epoch : 7/10, loss_test = 0.308229
epoch : 8/10, loss_train = 0.314232
epoch : 8/10, loss_test = 0.305775
epoch : 9/10, loss_train = 0.312195
epoch : 9/10, loss_test = 0.304354
epoch : 10/10, loss_train = 0.310845
epoch : 10/10, loss_test = 0.303180


## Предикты

In [40]:
test_inputs = torch.Tensor(data_test.values) # test_data - данные для предсказания
test_inputs = autoencoder.encoder(test_inputs)
targets = model(test_inputs)

In [41]:
targets

tensor([[5434385.0000],
        [6943065.5000],
        [5548428.5000],
        ...,
        [5679485.0000],
        [7022287.0000],
        [4894226.0000]], grad_fn=<SoftplusBackward0>)

In [42]:
sub = pd.DataFrame({"id": id_test, "prediction": targets.detach().numpy().squeeze()})
sub.to_csv('sample_submission.csv',index=False)
# файл скачается в ту же папку, где находится программный код