In [1]:
import os
from copy import deepcopy

import random

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

import torch
from torch import nn
from torch.optim import Adam
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

Код, который объединяет решения `classification`, `model_0`, `model_1`.
Для тестовых данных выполняем следующие шаги:
1. Решаем задачу классификации (определяем, будет ли точка близка к медианному значению или нет).
2. Для точек, близких к медианному значению, для предсказания используем `model_0`.
3. Для точек, не близких к медианному значению, для предсказания используем `model_1`.

Оказывается, что такой комбинированный подход работает лучше, чем просто одна нейронная сеть, обученная на всех данных.

In [2]:
# Код из classification.ipynb
class ClassificationDataset(Dataset):
    """Map-style dataset pairing feature rows with integer class labels."""

    def __init__(self, X, y):
        # X and y are stored as given and indexed in parallel by __getitem__.
        self.X, self.y = X, y

    def __len__(self):
        # The sample count is the length of the feature container.
        return len(self.X)

    def __getitem__(self, i):
        """Return sample ``i`` as a (float32 features, int64 label) tensor pair."""
        features = torch.tensor(self.X[i]).to(torch.float32)
        label = torch.tensor(self.y[i]).to(torch.int64)
        return features, label

In [3]:
# Код из model_0.ipynb и model_1.ipynb
class RegressionDataset(Dataset):
    """Map-style dataset pairing feature rows with real-valued targets."""

    def __init__(self, X, y):
        # X and y are stored as given and indexed in parallel by __getitem__.
        self.X, self.y = X, y

    def __len__(self):
        # The sample count is the length of the feature container.
        return len(self.X)

    def __getitem__(self, i):
        """Return sample ``i`` as a (float32 features, float32 target) tensor pair."""
        features = torch.tensor(self.X[i]).to(torch.float32)
        target = torch.tensor(self.y[i]).to(torch.float32)
        return features, target

In [4]:
# Код из classification.ipynb
class NNClassification:
    """Fully connected neural-network classifier with a fit/predict interface.

    The network is created inside ``fit``. After training, ``self.model`` holds
    a copy of the network from the epoch with the lowest validation loss.
    """

    def __init__(self, n_hidden=128, n_hidden_layers=2, lr=1e-3, n_epochs=200, batch_size=32, sigmoid_rate=0):
        """Store hyperparameters; nothing is built or trained yet.

        Args:
            n_hidden: width of every hidden layer.
            n_hidden_layers: number of hidden->hidden blocks after the input layer.
            lr: learning rate passed to Adam.
            n_epochs: number of passes over the training data.
            batch_size: batch size for both the training and validation loaders.
            sigmoid_rate: probability that a hidden block uses Sigmoid instead of ReLU.
        """
        self.n_hidden = n_hidden
        self.n_hidden_layers = n_hidden_layers
        self.lr = lr
        self.n_epochs = n_epochs
        self.batch_size = batch_size
        self.sigmoid_rate = sigmoid_rate
        # Filled in by fit().
        self.model = None
        self.train_loss = None
        self.val_loss = None

    def fit(self, X_train, y_train, X_val, y_val):
        """Train with Adam on cross-entropy and keep the best-on-validation network.

        ``self.train_loss`` / ``self.val_loss`` receive one entry per epoch (the
        mean of the per-batch losses). The validation loss is printed every 20
        epochs and a summary of the best epoch is printed at the end.
        """
        in_features = X_train.shape[-1]
        # One output logit per distinct label found in the training targets.
        n_classes = len(np.unique(y_train))

        # Input layer -> hidden blocks -> output logits.
        layers = [nn.Linear(in_features, self.n_hidden), nn.ReLU()]
        for _ in range(self.n_hidden_layers):
            layers.append(nn.Linear(self.n_hidden, self.n_hidden))
            # The activation of each hidden block is drawn at random.
            use_sigmoid = random.random() < self.sigmoid_rate
            layers.append(nn.Sigmoid() if use_sigmoid else nn.ReLU())
        layers.append(nn.Linear(self.n_hidden, n_classes))
        self.model = nn.Sequential(*layers)

        optimizer = Adam(self.model.parameters(), lr=self.lr)
        train_loader = DataLoader(
            ClassificationDataset(X_train, y_train), shuffle=True, batch_size=self.batch_size
        )
        self.train_loss = []

        val_loader = DataLoader(ClassificationDataset(X_val, y_val), batch_size=self.batch_size)
        self.val_loss = []

        best_model, best_num_epoch, best_val_loss = None, None, None

        for epoch in range(1, self.n_epochs + 1):
            # --- one optimisation pass over the training set ---
            batch_losses = []
            for features, targets in train_loader:
                loss = F.cross_entropy(self.model(features), targets)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                batch_losses.append(loss.detach().item())
            self.train_loss.append(np.mean(batch_losses))

            # --- validation pass, no gradients needed ---
            with torch.no_grad():
                val_batch_losses = [
                    F.cross_entropy(self.model(features), targets).detach().item()
                    for features, targets in val_loader
                ]
            self.val_loss.append(np.mean(val_batch_losses))

            # Snapshot the network whenever the validation loss improves.
            current_val_loss = self.val_loss[-1]
            if best_val_loss is None or current_val_loss < best_val_loss:
                best_val_loss, best_num_epoch = current_val_loss, epoch
                best_model = deepcopy(self.model)
            if epoch % 20 == 0:
                print(f"Epoch {epoch}, loss {self.val_loss[-1]}")
        self.model = deepcopy(best_model)
        print(f"Best loss was at {best_num_epoch} epoch: {best_val_loss}")

    def predict(self, X):
        """Return the index of the largest logit for every row of ``X``."""
        with torch.no_grad():
            logits = self.model(torch.tensor(X).float()).numpy()
        return logits.argmax(axis=-1)

In [5]:
# Код из model_0.ipynb и model_1.ipynb
class NNRegression:
    """Fully connected neural-network regressor with a fit/predict interface.

    The network is created inside ``fit``. After training, ``self.model`` holds
    a copy of the network from the epoch with the lowest validation loss.
    """

    def __init__(self, n_hidden=128, n_hidden_layers=2, lr=1e-3, n_epochs=200, batch_size=32, sigmoid_rate=0):
        """Store hyperparameters; nothing is built or trained yet.

        Args:
            n_hidden: width of every hidden layer.
            n_hidden_layers: number of hidden->hidden blocks after the input layer.
            lr: learning rate passed to Adam.
            n_epochs: number of passes over the training data.
            batch_size: batch size for both the training and validation loaders.
            sigmoid_rate: probability that a hidden block uses Sigmoid instead of ReLU.
        """
        self.n_hidden = n_hidden
        self.n_hidden_layers = n_hidden_layers
        self.lr = lr
        self.n_epochs = n_epochs
        self.batch_size = batch_size
        self.sigmoid_rate = sigmoid_rate
        # Filled in by fit().
        self.model = None
        self.train_loss = None
        self.val_loss = None

    def fit(self, X_train, y_train, X_val, y_val):
        """Train with Adam on MSE and keep the best-on-validation network.

        ``y_train`` must expose ``shape``; its last dimension sets the number of
        network outputs. ``self.train_loss`` / ``self.val_loss`` receive one
        entry per epoch (the mean of the per-batch losses). The rounded
        validation loss is printed every 100 epochs and a summary of the best
        epoch is printed at the end.
        """
        in_features = X_train.shape[-1]
        out_features = y_train.shape[-1]

        # Input layer -> hidden blocks -> linear output.
        layers = [nn.Linear(in_features, self.n_hidden), nn.ReLU()]
        for _ in range(self.n_hidden_layers):
            layers.append(nn.Linear(self.n_hidden, self.n_hidden))
            # The activation of each hidden block is drawn at random.
            use_sigmoid = random.random() < self.sigmoid_rate
            layers.append(nn.Sigmoid() if use_sigmoid else nn.ReLU())
        layers.append(nn.Linear(self.n_hidden, out_features))
        self.model = nn.Sequential(*layers)

        optimizer = Adam(self.model.parameters(), lr=self.lr)
        train_loader = DataLoader(
            RegressionDataset(X_train, y_train), shuffle=True, batch_size=self.batch_size
        )
        self.train_loss = []

        val_loader = DataLoader(RegressionDataset(X_val, y_val), batch_size=self.batch_size)
        self.val_loss = []

        best_model, best_num_epoch, best_val_loss = None, None, None

        for epoch in range(1, self.n_epochs + 1):
            # --- one optimisation pass over the training set ---
            batch_losses = []
            for features, targets in train_loader:
                loss = F.mse_loss(self.model(features), targets)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                batch_losses.append(loss.detach().item())
            self.train_loss.append(np.mean(batch_losses))

            # --- validation pass, no gradients needed ---
            with torch.no_grad():
                val_batch_losses = [
                    F.mse_loss(self.model(features), targets).detach().item()
                    for features, targets in val_loader
                ]
            self.val_loss.append(np.mean(val_batch_losses))

            # Snapshot the network whenever the validation loss improves.
            current_val_loss = self.val_loss[-1]
            if best_val_loss is None or current_val_loss < best_val_loss:
                best_val_loss, best_num_epoch = current_val_loss, epoch
                best_model = deepcopy(self.model)
            if epoch % 100 == 0:
                print(f"Epoch {epoch}, loss {round(self.val_loss[-1])}")
        self.model = deepcopy(best_model)
        print(f"Best loss was at {best_num_epoch} epoch: {round(best_val_loss)}")

    def predict(self, X):
        """Run the network on ``X`` and return its raw outputs as a NumPy array."""
        with torch.no_grad():
            inputs = torch.tensor(X).float()
            return self.model(inputs).numpy()

In [6]:
# Selection thresholds for the saved models.
MIN_CLASSIFICATION_ACCURACY = 0.91  # keep classifiers with accuracy above 91 %
MAX_MODEL_0_MSE = 100               # keep group-0 regressors with MSE below 100
MAX_MODEL_1_MSE = 5000              # keep group-1 regressors with MSE below 5000


def _select_models(info_path, name_template, keep):
    """Pick the saved models whose score passes ``keep``.

    Every non-empty line of ``info_path`` is parsed as: the model id is the
    text before the first '.', the score is the text after the last space.

    Args:
        info_path: path of the ``info.txt`` file to read.
        name_template: ``str.format`` template turning an id into a checkpoint
            file name, e.g. ``'model_0_{}.pth'``.
        keep: predicate applied to the score (as ``float``); the model is
            selected when it returns a truthy value.

    Returns:
        dict mapping checkpoint file name -> integer model id, in file order.
    """
    selected = {}
    with open(info_path, 'r') as info_file:
        for raw_line in info_file:
            entry = raw_line.strip()
            if not entry:
                continue
            model_id = int(entry.partition('.')[0])
            # float() accepts both integer and fractional score strings.
            score = float(entry.rsplit(' ', 1)[-1])
            if keep(score):
                selected[name_template.format(model_id)] = model_id
    return selected


classifications = _select_models(
    './classifications/info.txt',
    'classification_{}.pth',
    lambda score: score > MIN_CLASSIFICATION_ACCURACY,
)
models_0 = _select_models(
    './models_0/info.txt',
    'model_0_{}.pth',
    lambda score: score < MAX_MODEL_0_MSE,
)
models_1 = _select_models(
    './models_1/info.txt',
    'model_1_{}.pth',
    lambda score: score < MAX_MODEL_1_MSE,
)

# Show the models that will be used to generate the submissions.
print(classifications)
print(models_0)
print(models_1)


{'classification_1.pth': 1, 'classification_4.pth': 4, 'classification_8.pth': 8, 'classification_12.pth': 12, 'classification_17.pth': 17, 'classification_18.pth': 18}
{'model_0_2.pth': 2}
{'model_1_2.pth': 2, 'model_1_5.pth': 5}


In [7]:
# Read the test features (the csv has no header row), convert them to a NumPy
# array for the models, and remember how many rows there are.
df_X_test = pd.read_csv("./data/x_test.csv", header=None)
np_X_test = df_X_test.to_numpy()
n = len(np_X_test)

In [8]:
# NOTE(security): torch.load(..., weights_only=False) unpickles arbitrary Python
# objects and can execute code; only load checkpoints produced by trusted notebooks.
def _load_models(directory, file_names):
    """Load each pickled model in ``file_names`` from ``directory`` exactly once."""
    return {
        file_name: torch.load(os.path.join(directory, file_name), weights_only=False)
        for file_name in file_names
    }


# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('./submissions', exist_ok=True)

# The regressors do not depend on the classifier, so load them once up front
# instead of re-reading the checkpoints inside the nested loops.
loaded_models_0 = _load_models('./models_0/', models_0)
loaded_models_1 = _load_models('./models_1/', models_1)

for classification_name in classifications:
    model_classification = torch.load(os.path.join('./classifications/', classification_name), weights_only=False)
    # 1. Classification: label 0 goes to model_0, any other label to model_1.
    y_bin = model_classification.predict(np_X_test)
    is_group_0 = (y_bin == 0)

    # Boolean masks keep the 2-D (rows, features) shape even when a group is
    # empty, so predict() still works if the classifier puts every row in one group.
    X_test_0 = np_X_test[is_group_0]
    X_test_1 = np_X_test[~is_group_0]

    # 2. / 3. Regression for each group, computed once per (classifier, regressor).
    # Only the first output column of each regressor is used.
    preds_0 = {name_0: model.predict(X_test_0)[:, 0] for name_0, model in loaded_models_0.items()}
    preds_1 = {name_1: model.predict(X_test_1)[:, 0] for name_1, model in loaded_models_1.items()}

    for model_0_name in models_0:
        y_pred_0 = preds_0[model_0_name]
        for model_1_name in models_1:
            y_pred_1 = preds_1[model_1_name]

            # Merge both groups back into the original row order.
            y_pred = np.empty(len(y_bin), dtype=np.result_type(y_pred_0, y_pred_1))
            y_pred[is_group_0] = y_pred_0
            y_pred[~is_group_0] = y_pred_1
            df_y_pred = pd.DataFrame(y_pred)
            # Save the predictions; the file name encodes the three model ids.
            name = f'./submissions/submission_{classifications[classification_name]}_{models_0[model_0_name]}_{models_1[model_1_name]}.csv'
            # Print the file name
            print(name)
            df_y_pred.to_csv(name, header=False, index=False)
            

./submissions/submission_1_2_2.csv
./submissions/submission_1_2_5.csv
./submissions/submission_4_2_2.csv
./submissions/submission_4_2_5.csv
./submissions/submission_8_2_2.csv
./submissions/submission_8_2_5.csv
./submissions/submission_12_2_2.csv
./submissions/submission_12_2_5.csv
./submissions/submission_17_2_2.csv
./submissions/submission_17_2_5.csv
./submissions/submission_18_2_2.csv
./submissions/submission_18_2_5.csv


In [9]:
# Build an "averaged" submission.
# Only the models listed below take part in the average.
clf_list = [1, 4]
m0_list = [2]
m1_list = [2, 5]

# Every (classifier, model_0, model_1) combination, in nested-loop order.
submission_paths = [
    f"./submissions/submission_{clf}_{m0}_{m1}.csv"
    for clf in clf_list
    for m0 in m0_list
    for m1 in m1_list
]

# Accumulate the first column of each submission, then divide by their count.
a = np.zeros(n)
num = 0
for submission_path in submission_paths:
    a += pd.read_csv(submission_path, header=None).to_numpy()[:, 0]
    num += 1

a /= num
# a is the array of averaged predictions
pd.DataFrame(a).to_csv("./submissions/submission_avg.csv", header=False, index=False)

Решение `./submissions/submission_avg.csv` демонстрирует достаточно высокую точность предсказаний и оказывается лучше других.

In [10]:
import shutil
from zipfile import ZipFile

# Convert every submission .csv found in 'submissions' into the required .zip
# layout under 'send': ./send/<name>/output.zip with the predictions stored as
# a single archive entry named '01.out'.

for filename in os.listdir('./submissions'):
    if not (filename.startswith('submission') and filename.endswith('.csv')):
        continue
    name = filename.split('.')[0]
    path = os.path.join('./send', name)
    # Start from a clean directory so a stale archive never survives a re-run.
    if os.path.exists(path):
        shutil.rmtree(path)
    # makedirs (unlike mkdir) also creates './send' itself when it is missing.
    os.makedirs(path)
    # Write the csv straight into the archive under the required entry name;
    # no temporary copy / rename / delete of '01.out' is needed.
    with ZipFile(os.path.join(path, 'output.zip'), 'w') as mz:
        mz.write(os.path.join('./submissions', filename), '01.out')
