In [4]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from PIL import Image

data_path = '/home/jupyter/datasphere/project/new_sorted/new_sorted'
data = pd.read_csv('preprocessed_data.csv')
data.drop(columns=["Unnamed: 0", "link", "photos", 'is_auction'], inplace=True)
data['is_complete'] = data['is_complete'].replace({'True': 1, 'False': 0, '0.0': 0, '1.0': 1})
data['is_complete'] = data['is_complete'].astype(float)
data['price'] = np.log1p(data['price'])

train_df, temp_df = train_test_split(data, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Создаем кастомный Dataset
class HouseDataset(Dataset):
    def __init__(self, dataframe, data_path, transform=None):
        self.dataframe = dataframe
        self.data_path = data_path
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        tabular_data = row.drop(['id', 'price']).values.astype(np.float32) 
        price = np.float32(row['price'])  

        folder_path = os.path.join(self.data_path, str(int(row['id'])))
        images = []
        for img_name in os.listdir(folder_path):
            img_path = os.path.join(folder_path, img_name)
            if os.path.isfile(img_path) and img_name.lower().endswith(('png', 'jpg', 'jpeg')): 
                image = Image.open(img_path).convert('RGB')
                if self.transform:
                    image = self.transform(image)
                images.append(image)

        if len(images) == 0:
            raise ValueError(f"No valid images found in folder {folder_path}") 

        images = torch.stack(images) 
        return images, tabular_data, price



# Преобразования для изображений
image_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

train_dataset = HouseDataset(train_df, data_path, transform=image_transforms)
val_dataset = HouseDataset(val_df, data_path, transform=image_transforms)
test_dataset = HouseDataset(test_df, data_path, transform=image_transforms)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Модель для обработки изображений
class CombinedModel(nn.Module):
    def __init__(self, tabular_input_size):
        super(CombinedModel, self).__init__()
        self.image_model = models.mobilenet_v3_large(pretrained=True)
        self.image_model.classifier = nn.Sequential(
            nn.Linear(self.image_model.classifier[0].in_features, 256),
            nn.ReLU()
        )
        self.tabular_model = nn.Sequential(
            nn.Linear(tabular_input_size, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU()
        )
        self.fc = nn.Sequential(
            nn.Linear(256 + 64, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, images, tabular_data):
        batch_size, num_images, C, H, W = images.size()
        images = images.view(-1, C, H, W) 
        image_features = self.image_model(images)
        image_features = image_features.view(batch_size, num_images, -1).mean(1)  # усредняем по картинкам
        tabular_features = self.tabular_model(tabular_data)
        combined = torch.cat([image_features, tabular_features], dim=1)
        output = self.fc(combined)
        return output

num_tabular_features = train_df.shape[1] - 2 
model = CombinedModel(num_tabular_features).to('cuda')

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Функция обучения
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=8):
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for images, tabular_data, targets in train_loader:
            images, tabular_data, targets = images.to('cuda'), tabular_data.to('cuda'), targets.to('cuda')

            optimizer.zero_grad()
            outputs = model(images, tabular_data)
            loss = criterion(outputs.squeeze(), targets)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        val_loss = 0
        model.eval()
        with torch.no_grad():
            for images, tabular_data, targets in val_loader:
                images, tabular_data, targets = images.to('cuda'), tabular_data.to('cuda'), targets.to('cuda')
                outputs = model(images, tabular_data)
                loss = criterion(outputs.squeeze(), targets)
                val_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}")

train_model(model, train_loader, val_loader, criterion, optimizer, epochs=8)

model.eval()
test_loss = 0
with torch.no_grad():
    for images, tabular_data, targets in test_loader:
        images, tabular_data, targets = images.to('cuda'), tabular_data.to('cuda'), targets.to('cuda')
        outputs = model(images, tabular_data)
        loss = criterion(outputs.squeeze(), targets)
        test_loss += loss.item()

print(f"Test Loss: {test_loss/len(test_loader):.4f}")



Epoch 1/8, Train Loss: 75.3947, Val Loss: 5.3133
Epoch 2/8, Train Loss: 0.4976, Val Loss: 0.8816
Epoch 3/8, Train Loss: 0.2933, Val Loss: 0.5761
Epoch 4/8, Train Loss: 0.2073, Val Loss: 0.8307
Epoch 5/8, Train Loss: 0.1712, Val Loss: 0.5920
Epoch 6/8, Train Loss: 0.1415, Val Loss: 0.6308
Epoch 7/8, Train Loss: 0.1368, Val Loss: 0.4697
Epoch 8/8, Train Loss: 0.1398, Val Loss: 0.4917
Test Loss: 0.5527


In [10]:
model.eval()
y_true_list, y_pred_list = [], []
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

with torch.no_grad():
    for tabular_features, images, labels in test_loader:
        tabular_features, images, labels = tabular_features.to(device), images.to(device), labels.to(device)
        outputs = model(tabular_features, images)
        y_true_list.extend(labels.cpu().numpy()) 
        y_pred_list.extend(torch.clamp(outputs, min=0).cpu().numpy()) 

# Метрики на логарифмах
msle = mean_squared_log_error(y_true_list, y_pred_list)
mae = mean_absolute_error(y_true_list, y_pred_list)

print(f"Test MSLE: {msle:.4f}")
print(f"Test MAE: {mae:.4f}")

print("Sample predictions (log prices):")
print(f"True values (log): {y_true_list[:10]}")
print(f"Predicted values (log): {y_pred_list[:10]}")

Test MSLE: 0.0013
Test MAE: 0.5419
Sample predictions (log prices):
True values (log): [18.865751, 17.588272, 16.852491, 18.603003, 16.1593, 16.296242, 17.529083, 18.033195, 15.935886, 16.29205]
Predicted values (log): [array([17.726133], dtype=float32), array([17.673765], dtype=float32), array([16.90988], dtype=float32), array([19.279734], dtype=float32), array([16.628782], dtype=float32), array([17.376747], dtype=float32), array([18.088667], dtype=float32), array([18.041325], dtype=float32), array([16.897636], dtype=float32), array([18.024426], dtype=float32)]
