In [None]:
import copy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Загрузка данных

In [None]:
# Read the competition tables. train/test use the 'Id' column as their index;
# the sample submission is read with the default index.
input_dir = "/kaggle/input/tabular-playground-series-dec-2021/"
train, test = (
    pd.read_csv(f"{input_dir}{split_name}.csv", index_col='Id')
    for split_name in ("train", "test")
)
sub = pd.read_csv(f"{input_dir}sample_submission.csv")

# Выделили непрерывные колонки, бинарные и целевую

In [None]:
# Continuous (numeric) feature columns.
cont_cols = [
    "Elevation", "Aspect", "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Horizontal_Distance_To_Fire_Points",
    "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
]

# Indicator feature columns: Wilderness_Area1..4 followed by Soil_Type1..40.
# (The prefix used to be misspelled "Soli_Type", producing names that are not
# dataset columns.)
binary_cols = (
    [f"Wilderness_Area{i}" for i in range(1, 5)]
    + [f"Soil_Type{i}" for i in range(1, 41)]
)

# Name of the label column.
target_col = "Cover_Type"

In [None]:
# Show how many training rows fall into each target class.
train[target_col].value_counts()

# Искусственно добавили 20 строчек класса 5

In [None]:
# Duplicate the rows labelled 5: stack 20 extra copies of them under the
# training frame and renumber the index from 0.
# One pd.concat call replaces the former loop over DataFrame.append, which was
# removed in pandas 2.0 and re-copied the whole frame on every iteration.
row_5 = train[train[target_col] == 5]
train = pd.concat([train] + [row_5] * 20, ignore_index=True)

In [None]:
# Show the per-class row counts again, now that the class-5 rows were duplicated.
train[target_col].value_counts()

In [None]:
# Shift every label down by one so class ids start one lower than in the raw
# data (the submission cell adds the 1 back). Note: re-running this cell
# shifts the labels again.
train[target_col] = train[target_col].sub(1)

# Поделили train/val 0.9/0.1

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 90/10 split of the training frame into train and validation parts.
# Only the frame itself is split (the label column travels with it), and the
# seed is fixed so the same rows land in each part on every run.
train, val = train_test_split(
    train,
    test_size=0.1,
    stratify=train[target_col],
    random_state=42,
)

# Нормируем непрерывные колонки

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to the continuous columns of train, validation and test.
scaler = MinMaxScaler().fit(train[cont_cols])
train[cont_cols] = scaler.transform(train[cont_cols])
val[cont_cols] = scaler.transform(val[cont_cols])
test[cont_cols] = scaler.transform(test[cont_cols])

In [None]:
# Complete feature list: continuous columns first, then the indicator columns.
all_cols = [*cont_cols, *binary_cols]
# Number of distinct label values present in the training split.
n_classes = train[target_col].nunique(dropna=False)

# Класс Dataset (особенность pytorch - нужно переопределять класс dataset)

In [None]:
import time
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch import optim

class ForestDataset(Dataset):
    """Map-style dataset that serves rows of a DataFrame as (features, label) pairs.

    Parameters
    ----------
    csv : pandas.DataFrame
        Table of feature columns, optionally containing the label column.
    target : str, optional
        Name of the label column. When omitted, the notebook-level
        ``target_col`` is used.

    If the label column is present it is split off into ``self.y`` and the
    remaining columns become ``self.X``. If it is absent (unlabeled data),
    every column goes into ``self.X`` and ``self.y`` is filled with zeros as a
    placeholder. The input frame is never modified.
    """
    def __init__(self, csv, target=None):
        if target is None:
            target = target_col
        if target in csv.columns:
            self.X = csv.drop(columns=[target]).values
            self.y = csv[target].values
        else:
            self.X = csv.values
            # Placeholder labels. Built as a separate array instead of writing
            # a new column into the caller's DataFrame.
            self.y = np.zeros(len(csv), dtype=np.int64)
    def __len__(self):
        """Return the number of rows."""
        return len(self.y)
    def __getitem__(self, idx):
        """Return the (feature row, label) pair at position ``idx``."""
        return self.X[idx], self.y[idx]
    
# Wrap the training and validation frames as datasets. The test frame is
# wrapped later, in the inference section.
train_dataset, val_dataset = (ForestDataset(split) for split in (train, val))

# Класс модели (полносвязная сеть)

In [None]:
class MultiLayerPerceptron(nn.Module):
    """Fully connected classifier: two Tanh hidden layers and a linear output layer.

    Parameters
    ----------
    len_fc1 : int
        Width of the first hidden layer.
    len_fc2 : int
        Width of the second hidden layer.
    n_features : int, optional
        Size of the input vector. Defaults to ``len(all_cols)`` from the notebook.
    n_outputs : int, optional
        Number of output scores. Defaults to the notebook-level ``n_classes``.

    ``forward`` returns the raw output of the last linear layer; no softmax is
    applied inside the model.
    """
    def __init__(self, len_fc1, len_fc2, n_features=None, n_outputs=None):
        super().__init__()
        # Fall back to the notebook globals so existing two-argument calls
        # keep building exactly the same network.
        if n_features is None:
            n_features = len(all_cols)
        if n_outputs is None:
            n_outputs = n_classes
        self.fc1 = nn.Linear(n_features, len_fc1)
        self.act1 = nn.Tanh()
        self.fc2 = nn.Linear(len_fc1, len_fc2)
        self.act2 = nn.Tanh()
        self.fc3 = nn.Linear(len_fc2, n_outputs)

    def forward(self, x):
        """Map a batch of feature vectors to one score per class."""
        hidden = self.act1(self.fc1(x))
        hidden = self.act2(self.fc2(hidden))
        return self.fc3(hidden)

# Модель, лосс, оптимизатор

In [None]:
# Run on the GPU when one is available, otherwise on the CPU, instead of
# failing outright on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Both hidden layers are three times as wide as the input feature vector.
# The width is derived from all_cols (the model's input size) rather than
# test.columns, so it cannot drift if a column is ever added to `test`.
hidden_size = 3 * len(all_cols)
mlp_model = MultiLayerPerceptron(hidden_size, hidden_size).to(device)

# Cross-entropy on the model's raw scores, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(mlp_model.parameters(), lr=1e-4)

# функция, которая выполняет 1 эпоху train

In [None]:
def train_epoch(model,criterion,optimizer,dataset,epoch,batch_size=32):
    """Train ``model`` for one pass over ``dataset`` and report the averages.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise; batches are moved to whatever device its
        parameters live on.
    criterion : callable
        Loss taking ``(outputs, labels)`` and returning a scalar tensor.
    optimizer : torch.optim.Optimizer
        Optimizer stepping the model's parameters.
    dataset : torch.utils.data.Dataset
        Source of ``(features, label)`` pairs.
    epoch : int
        Epoch number, used only in the printed progress messages.
    batch_size : int, optional
        Mini-batch size (default 32).

    Returns
    -------
    tuple
        ``(model, epoch_loss, epoch_acc)`` where the loss and accuracy are
        plain Python floats averaged over all samples.
    """
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4)
    dataset_size = len(dataset)
    # Follow the model's own device instead of assuming CUDA is present.
    device = next(model.parameters()).device
    print(f"Epoch#{epoch}. Train")
    start_time = time.time()
    model.train()
    running_loss = 0.0   # sum of per-sample losses
    running_correct = 0  # number of correctly classified samples

    for inputs, labels in tqdm(data_loader):
        inputs = inputs.to(device, dtype=torch.float)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()   # backpropagate gradients
        optimizer.step()  # optimizer update
        # The loss is a batch mean; weight it by the batch size so that the
        # final division gives a per-sample average.
        running_loss += loss.item() * inputs.size(0)

        preds = outputs.argmax(dim=1)
        # .item() keeps the counter a Python int rather than a device tensor,
        # so the returned accuracy is a float that can be stored or plotted.
        running_correct += (preds == labels).sum().item()

    epoch_loss = running_loss / dataset_size
    epoch_acc = running_correct / dataset_size
    print(f'Loss (cross-entropy): { epoch_loss }')
    print(f"Accuracy (multiclass): { epoch_acc }")
    print(f"Epoch#{epoch} (Train) completed. {round(time.time()-start_time,3)}s ")
    return model, epoch_loss, epoch_acc

# функция, которая считает 1 эпоху валидации

In [None]:
def valid_epoch(model,criterion,optimizer,dataset,epoch,batch_size=32):
    """Evaluate ``model`` on ``dataset`` without updating its weights.

    Parameters
    ----------
    model : torch.nn.Module
        Network to evaluate; batches are moved to whatever device its
        parameters live on.
    criterion : callable
        Loss taking ``(outputs, labels)`` and returning a scalar tensor.
    optimizer : object
        Unused; accepted only so the signature mirrors ``train_epoch``.
    dataset : torch.utils.data.Dataset
        Source of ``(features, label)`` pairs.
    epoch : int
        Epoch number, used only in the printed progress messages.
    batch_size : int, optional
        Mini-batch size (default 32).

    Returns
    -------
    tuple
        ``(model, epoch_loss, epoch_acc)`` where the loss and accuracy are
        plain Python floats averaged over all samples.
    """
    # No shuffling: the metrics are sums over all samples, so order is irrelevant.
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4)
    dataset_size = len(dataset)
    # Follow the model's own device instead of assuming CUDA is present.
    device = next(model.parameters()).device
    print(f"Epoch#{epoch}. Validation")
    start_time = time.time()
    model.eval()
    running_loss = 0.0   # sum of per-sample losses
    running_correct = 0  # number of correctly classified samples
    with torch.no_grad():
        for inputs, labels in tqdm(data_loader):
            inputs = inputs.to(device, dtype=torch.float)
            labels = labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_loss += loss.item() * inputs.size(0)
            preds = outputs.argmax(dim=1)
            # .item() keeps the counter a Python int rather than a device
            # tensor, so the returned accuracy is a plain float.
            running_correct += (preds == labels).sum().item()

    epoch_loss = running_loss / dataset_size
    epoch_acc = running_correct / dataset_size
    print(f'Loss (cross-entropy): { epoch_loss } ')
    print(f"Accuracy (multiclass): { epoch_acc }")
    print(f"Epoch#{epoch} (Validation) completed. {round(time.time()-start_time,3)}s ")
    return model, epoch_loss, epoch_acc

# train-loop модели (выбор лучшей за n эпох - по accuracy на валидации)

In [None]:
# Bookkeeping for model selection: the snapshot with the highest validation
# accuracy seen so far, the epoch it came from, and its metrics.
best_model = mlp_model
best_epoch = 1
best_loss = 1000000
best_acc = 0
num_epochs = 10

train_loss_history = []
val_loss_history = []

train_acc_history = []
val_acc_history = []

for epoch in range(1,num_epochs+1):
    # training pass
    mlp_model, train_loss, train_acc = train_epoch(mlp_model,criterion,optimizer,train_dataset,epoch)
    # float() accepts both a Python number and a 0-dim tensor, so the
    # histories always hold plain numbers.
    train_acc = float(train_acc)
    train_loss_history.append(train_loss)
    train_acc_history.append(train_acc)

    # validation pass
    mlp_model, val_loss, val_acc = valid_epoch(mlp_model,criterion,optimizer,val_dataset,epoch)
    val_acc = float(val_acc)
    val_loss_history.append(val_loss)
    val_acc_history.append(val_acc)

    if val_acc > best_acc:
        # Deep-copy the weights: a plain assignment would only alias
        # mlp_model, which keeps training, so "best" would silently become
        # the last epoch.
        best_model = copy.deepcopy(mlp_model)
        best_epoch = epoch
        # Record the new best so later epochs are compared against it
        # (best_loss is the validation loss at that same epoch).
        best_acc = val_acc
        best_loss = val_loss

# Сохранили модель в файл

In [None]:
#saving
# Serialise the selected model object (not just its weights) to a file in the
# working directory.
output_model_file = 'best_model.bin'
torch.save(obj=best_model, f=output_model_file)

# Inference - для kaggle соревнования, вывод на test-выборке

In [None]:
# Wrap the test frame as a dataset and show how many rows it holds
# (the dataset object itself has no informative repr to display).
test_dataset = ForestDataset(test)
len(test_dataset)

In [None]:
# Predict a class index for every test row, in the original row order
# (shuffle=False), collecting one prediction tensor per batch.
data_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4)
dataset_size = len(test_dataset)
best_model.eval()
# Send batches to the device the model lives on instead of assuming CUDA.
device = next(best_model.parameters()).device

preds_list = []
with torch.no_grad():
    # The labels yielded by the loader are not needed for prediction, so they
    # are ignored rather than copied to the device.
    for inputs, _labels in tqdm(data_loader):
        inputs = inputs.to(device, dtype=torch.float)
        outputs = best_model(inputs)
        preds_list.append(outputs.argmax(dim=1))
torch.cat(preds_list)

In [None]:
# Join the per-batch predictions, bring them to host memory as a NumPy array,
# and add 1 to undo the earlier label shift before storing them in the
# submission frame.
sub["Cover_Type"] = torch.cat(preds_list).cpu().detach().numpy() + 1

In [None]:
# Write the submission table to disk without the DataFrame index column.
sub.to_csv(path_or_buf="submission_mlp.csv", index=False)