In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

import os

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

import catalyst
from catalyst import dl, utils

from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, accuracy_score

import gc

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
%matplotlib inline

# Load data

In [None]:
# Read the training set and discard the 'Id' column in one chained expression.
data = (
    pd.read_csv('/kaggle/input/tabular-playground-series-dec-2021/train.csv')
    .drop(columns='Id')
)

data

In [None]:
test_data = pd.read_csv('/kaggle/input/tabular-playground-series-dec-2021/test.csv')
# Keep the Ids aside: they label the rows of the submission file, while the
# 'Id' column itself is dropped from the features before inference.
test_ids = test_data['Id']
test_data.head()

# EDA

In [None]:
len(data.columns)

In [None]:
data.dtypes

In [None]:
data.describe()

In [None]:
# Number of distinct values per column, one line per column.
for column, unique_num in data.nunique().items():
    print('Column {} has {} unique values'.format(column, unique_num))

In [None]:
# Number of missing values per column, one line per column.
for column, nan_num in data.isnull().sum().items():
    print('Column {} has {} NaN values'.format(column, nan_num))

### No NaN values - great!

In [None]:
data.hist(figsize=(40,40))

In [None]:
item_counts = data['Cover_Type'].value_counts(sort=False)
item_counts

In [None]:
# Relative frequency of every Cover_Type class, printed as a percentage.
# Each percentage is printed with the label taken from the Series index
# (sorted by label). A separate running counter would only be right if
# value_counts returned the classes in exactly 1, 2, 3, ... order, which it
# does not promise.
item_counts_frequencies = data['Cover_Type'].value_counts(normalize=True).sort_index()
for cover_type, frequency in item_counts_frequencies.items():
    print('{}: {:.8f}%'.format(cover_type, frequency * 100))

### We can draw two conclusions
1.   Two features (Wilderness_Area and Soil_Type) are one-hot encoded.
2.   Cover_Type is very imbalanced. Class '6' is at noise level, and class '5' has only one sample! Let's delete class '5'.

In [None]:
# Columns treated as continuous (everything else is handled as a small integer).
cont_features = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]

In [None]:
# Narrow the integer widths: continuous features become int16, every other
# column (including Cover_Type) becomes int8 - presumably to reduce memory use.
other_columns = data.columns[~data.columns.isin(cont_features)]
data[cont_features] = data[cont_features].astype(np.int16)
data[other_columns] = data[other_columns].astype(np.int8)
data.dtypes

In [None]:
corr_matr = data[cont_features + ['Cover_Type']].corr()
corr_matr

In [None]:
plt.figure(figsize=(10,10))
sns.heatmap(corr_matr)

### All features have low correlation.

# Let's make a simple PyTorch model with training in Catalyst framework

### The target labels must be shifted down to a contiguous range because, according to the PyTorch docs, CrossEntropyLoss expects class indices from 0 to C-1. After the row with target '5' is deleted, the class indices are 1, 2, 3, 4, 6, 7, which is not valid input for the loss function, so they are remapped to 0-5

In [None]:
# Remove every row whose target is class 5; the remaining rows keep their
# original index labels.
class_5_rows = data.index[data['Cover_Type'] == 5]
data = data.drop(index=class_5_rows)

In [None]:
# Shift the labels onto a contiguous range starting at 0:
# labels below 5 move down by one (1..4 -> 0..3), the rest by two (6..7 -> 4..5).
#
# This is a single vectorised assignment on purpose. Once a row has been
# dropped the index labels are no longer 0..len(data)-1, so counting rows with
# range(len(data)) and looking them up by label needs a hard-coded skip for the
# missing label and never reaches the highest label. Element-wise chained
# assignment (data[col][i] = ...) is also not guaranteed by pandas to write
# back to the frame, and is very slow on millions of rows.
#
# NOTE: like any in-place shift this is not idempotent - run it once per load.
cover_type = data['Cover_Type']
data['Cover_Type'] = np.where(
    cover_type < 5, cover_type - 1, cover_type - 2
).astype(cover_type.dtype)
data['Cover_Type']

In [None]:
# Separate the target column from the feature matrix.
y = data['Cover_Type']
X = data.drop(columns='Cover_Type')

In [None]:
class TPS_Dec_2021(Dataset):
    """Map-style dataset over rows of the tabular data.

    For ``dataset_type`` 'train', 'valid' or 'test' each item is a
    ``(features, target)`` pair; for any other value (the notebook passes
    'submission') ``y`` is ignored and each item is the features alone.
    Tensors are moved to the module-level ``device`` when an item is fetched.
    """

    # Dataset types for which targets are stored and returned.
    _LABELLED = ('train', 'valid', 'test')

    def __init__(self, X, y, dataset_type):
        """Store ``X`` (and ``y`` for labelled types) as NumPy arrays."""
        self.dataset_type = dataset_type
        self.X = np.asarray(X)
        if dataset_type in self._LABELLED:
            print(self.X.shape)
            self.y = np.asarray(y)
            print(self.y.shape)

    def __len__(self):
        """Number of rows in ``X``."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return row ``idx`` as float features, plus a long target if labelled."""
        features = torch.tensor(self.X[idx], dtype=torch.float).to(device)
        if self.dataset_type not in self._LABELLED:
            return features
        target = torch.tensor(self.y[idx], dtype=torch.long).to(device)
        return features, target

In [None]:
# model definition
class model_catalyst(nn.Module):
    """Fully connected classifier: six hidden layers followed by a linear head.

    A ReLU follows every hidden layer. Without a non-linearity between them,
    stacked ``nn.Linear`` layers compose into a single affine map, so the
    extra depth would add parameters but no modelling capacity.

    Args:
        in_features: size of the last dimension of the input (default 54).
        hidden_features: width of every hidden layer (default 108).
        num_classes: number of output logits (default 6).
    """

    def __init__(self, in_features=54, hidden_features=108, num_classes=6):
        super(model_catalyst, self).__init__()
        self.linear1 = nn.Linear(in_features, hidden_features)
        self.linear2 = nn.Linear(hidden_features, hidden_features)
        self.linear3 = nn.Linear(hidden_features, hidden_features)
        self.linear4 = nn.Linear(hidden_features, hidden_features)
        self.linear5 = nn.Linear(hidden_features, hidden_features)
        self.linear6 = nn.Linear(hidden_features, hidden_features)
        self.out = nn.Linear(hidden_features, num_classes)
        # Parameter-free, so the state_dict keys stay linear1..linear6 and out.
        self.activation = nn.ReLU()

    def forward(self, X):
        """Return raw (unnormalised) class logits for the batch ``X``."""
        hidden = X
        hidden_layers = (
            self.linear1, self.linear2, self.linear3,
            self.linear4, self.linear5, self.linear6,
        )
        for layer in hidden_layers:
            hidden = self.activation(layer(hidden))
        # No activation on the head: CrossEntropyLoss expects raw logits.
        return self.out(hidden)

In [None]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

### Split dataset to train, valid and test

In [None]:
# First split: 70 % of the rows go to training, 30 % to a temporary buffer.
X_train, X_buf, y_train, y_buf = train_test_split(X, y, test_size=0.3, random_state=42)
# Second split: the buffer is halved into validation and hold-out test sets
# (about 15 % of the rows each). Neither split is stratified by class.
X_valid, X_test, y_valid, y_test = train_test_split(X_buf, y_buf, test_size=0.5, random_state=42)

In [None]:
train_dataset = TPS_Dec_2021(X_train, y_train, 'train')
valid_dataset = TPS_Dec_2021(X_valid, y_valid, 'valid')
test_dataset = TPS_Dec_2021(X_test, y_test, 'test')

In [None]:
# Reshuffle the training rows every epoch so the mini-batches differ between
# epochs. Evaluation loaders keep their order so that predictions line up with
# the targets they are compared against.
trainloader = DataLoader(train_dataset, batch_size=512, shuffle=True)
validloader = DataLoader(valid_dataset, batch_size=512)
# One row per batch: the inference cells read pred[0] from every batch.
testloader = DataLoader(test_dataset, batch_size=1)

# Only these two loaders are handed to the Catalyst runner.
loaders = {
    "train": trainloader,
    "valid": validloader,
}

In [None]:
model = model_catalyst().to(device)
model

### Train our model

In [None]:
gc.collect()

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

In [None]:
runner = dl.SupervisedRunner(
    input_key="features", output_key="logits", target_key="targets", loss_key="loss"
)

In [None]:
# Train for 10 epochs on the "train" loader and evaluate on "valid" each epoch.
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    loaders=loaders,
    num_epochs=10,
    callbacks=[
        # Computes the criterion on logits vs. targets and logs it as "loss".
        dl.CriterionCallback(
            input_key="logits", target_key="targets", metric_key="loss"
        ),
        # Top-k accuracy for k = 1..6; with 6 classes the top-6 value is always 100 %.
        dl.AccuracyCallback(input_key="logits", target_key="targets", num_classes=6, topk_args=[1,2,3,4,5,6])
    ],
    logdir="./logs",
    # Model selection: the epoch with the lowest "loss" on the "valid" loader wins.
    valid_loader="valid",
    valid_metric="loss",
    minimize_valid_metric=True,
    verbose=True,
    # Presumably reloads the best checkpoint into `model` after training -
    # confirm against the installed Catalyst version.
    load_best_on_end=True,
    seed=42,
)

### Inference

In [None]:
# Collect the raw logits for the hold-out split, one NumPy array per batch.
preds = []
model.eval()
with torch.no_grad():
    # The loop variable is deliberately not called `data`: that name holds the
    # training DataFrame, and rebinding it would break re-running earlier cells.
    for batch in tqdm(testloader):
        # Each batch is a (features, targets) pair; only the features are needed.
        outputs = model(batch[0])
        # No .detach() needed: nothing is tracked by autograd under no_grad().
        preds.append(outputs.cpu().numpy())

In [None]:
preds[:10]

In [None]:
# Predicted class index = position of the largest logit in each batch's first row.
test_preds = [np.argmax(pred[0]) for pred in preds]

In [None]:
test_preds[:10]

In [None]:
balanced_accuracy_score(y_test, test_preds)

In [None]:
accuracy_score(y_test, test_preds)

In [None]:
gc.collect()

### Submission

In [None]:
# Submission frame starts with the Ids; the features are everything except 'Id'.
out_data = pd.DataFrame({'Id': test_ids})
test_data = test_data.drop(columns='Id')
X_subm = test_data.to_numpy()
X_subm

In [None]:
submission_dataset = TPS_Dec_2021(X_subm, None, 'submission')

In [None]:
submissionloader = DataLoader(submission_dataset, batch_size=1)

In [None]:
# Collect the raw logits for the competition test set, one NumPy array per batch.
preds = []
model.eval()
with torch.no_grad():
    # The loop variable is deliberately not called `data`: that name holds the
    # training DataFrame, and rebinding it would break re-running earlier cells.
    for features in tqdm(submissionloader):
        outputs = model(features)
        # No .detach() needed: nothing is tracked by autograd under no_grad().
        preds.append(outputs.cpu().numpy())

In [None]:
preds[:10]

In [None]:
# Undo the label shift applied before training:
# model indices 0..3 map back to classes 1..4, indices 4..5 to classes 6..7.
final_preds = []
for pred in preds:
    class_idx = np.argmax(pred[0])
    final_preds.append(class_idx + 2 if class_idx > 3 else class_idx + 1)

In [None]:
final_preds[:10]

In [None]:
out_data['Cover_Type'] = final_preds

In [None]:
out_data

In [None]:
out_data.to_csv('torch_baseline.csv',index=None)

In [None]:
gc.collect()

# LB is 0.90618