# Imports

In [None]:
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import math
import mpmath
import torch
import torch.nn.functional as F
from tqdm import tqdm
import torch.nn as nn
from copy import deepcopy
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import NeighbourhoodCleaningRule

In [None]:
# Read the competition's train/test tables and the sample submission file
# from the Kaggle input directory into pandas DataFrames.
raw_train = pd.read_csv('../input/tabular-playground-series-dec-2021/train.csv')
raw_test = pd.read_csv('../input/tabular-playground-series-dec-2021/test.csv')
submission = pd.read_csv('../input/tabular-playground-series-dec-2021/sample_submission.csv')

In [None]:
# Use the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = 'cpu'

# Mini-batch size handed to every DataLoader and the epoch budget passed to batch_gd.
BATCH_SIZE = 256
EPOCHS = 15

print(device)

# Basic EDA and Feature Engineering

In [None]:
# Summary statistics of the training table (displayed as the cell output).
raw_train.describe()

In [None]:
# Summary statistics of the test table (displayed as the cell output).
raw_test.describe()

In [None]:
# Object-typed columns of the training table and how many distinct values each holds.
categorical_cols = raw_train.select_dtypes(include=['object'])
print(f'Categorical Columns: {len(categorical_cols.columns)}')
for col, values in categorical_cols.items():
    print(f'{col}: {len(values.unique())} unique labels')

# Same report for the numeric columns.
numerical_cols = raw_train.select_dtypes(include=['number'])
print(f'\nNumerical Columns: {len(numerical_cols.columns)}')
for col, values in numerical_cols.items():
    print(f'{col}: {len(values.unique())}')

In [None]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df, use_float16=True):
    """Downcast the numeric columns of ``df`` in place to the narrowest fitting dtype.

    Integer columns (signed or unsigned) become the smallest signed integer
    type whose range contains the column's min and max (bounds inclusive).
    Floating columns become float16/float32 when their range fits, else
    float64. Columns of any other dtype (bool, object, datetime, extension
    dtypes, ...) are left untouched. Memory usage before and after is printed.

    Args:
        df: DataFrame to shrink. It is modified in place.
        use_float16: When True (default, the historical behaviour) float
            columns whose range fits are stored as float16. The check is on
            range only, so float16 keeps about three significant decimal
            digits; pass False to stop at float32.

    Returns:
        The same ``df`` object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        series = df[col]
        col_type = series.dtype

        # Only plain NumPy integer/float columns are downcast; checking the
        # dtype kind keeps bool, object and extension dtypes out of the
        # numeric comparisons below.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue
        if series.empty:
            continue

        if col_type.kind in 'iu':
            # Python ints avoid any overflow when comparing against limits.
            c_min = int(series.min())
            c_max = int(series.max())
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = series.astype(candidate)
                    break
            # No candidate fits (e.g. huge uint64 values): keep the dtype.
        else:
            c_min = float(series.min())
            c_max = float(series.max())
            # NaN/inf bounds fail every comparison and therefore end at float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = series.astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))
    return df

In [None]:
def target_dist(df):
    """Show a count plot of ``df.Cover_Type`` with each bar's count written above it."""
    sns.set_style("whitegrid")
    flare = sns.color_palette("flare")
    plt.figure(figsize=(12, 7))
    ax = sns.countplot(x=df.Cover_Type, palette=flare, orient="h")
    for bar in ax.patches:
        bar_height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2.
        # Label sits 10 points above the top-centre of the bar.
        ax.annotate(format(bar_height, '.2f'), (bar_center, bar_height),
                    ha='center', va='center', xytext=(0, 10), textcoords='offset points')
    plt.show()


target_dist(raw_train)

In [None]:
# Downcast the numeric columns of both raw tables (train first, then test).
raw_train, raw_test = reduce_mem_usage(raw_train), reduce_mem_usage(raw_test)

Credits:
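* @chryzal [https://www.kaggle.com/chryzal/features-engineering-for-you/notebook](https://www.kaggle.com/chryzal/features-engineering-for-you/notebook)
* @gulshanmishra [https://www.kaggle.com/gulshanmishra/tps-dec-21-tensorflow-nn-feature-engineering](https://www.kaggle.com/gulshanmishra/tps-dec-21-tensorflow-nn-feature-engineering)
* @sergiosaharovskiy [https://www.kaggle.com/sergiosaharovskiy/tps-dec-2021-a-complete-guide-eda-pytorch#5.-Feature-engineering](https://www.kaggle.com/sergiosaharovskiy/tps-dec-2021-a-complete-guide-eda-pytorch#5.-Feature-engineering)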

In [None]:
def feature_engineering_copied(train, test):
    """Add the hydrology Manhattan-distance feature and repair out-of-range columns.

    Both frames are modified in place and also returned. For each frame:

    * ``mht_dist_hydrlgy`` = |horizontal distance| + |vertical distance| to
      hydrology. The two source columns are looked up by name; if either name
      is missing the historical positional lookup (columns 3 and 4) is used.
    * every column of ``train`` whose name starts with ``Hill`` is clipped to
      [0, 255] in both frames;
    * ``Aspect`` is wrapped by one turn: negative values get +360 and values
      above 359 get -360.

    Args:
        train: Training feature frame.
        test: Test feature frame with the same columns.

    Returns:
        Tuple ``(train, test)`` - the same objects that were passed in.
    """
    horizontal = 'Horizontal_Distance_To_Hydrology'
    vertical = 'Vertical_Distance_To_Hydrology'
    aspect = 'Aspect'
    # Taken from train only, so both frames are clipped on the same columns.
    hillshades = [col for col in train.columns if col.startswith('Hill')]

    for frame in (train, test):
        # Manhattan distance to Hydrology. Name-based lookup keeps the feature
        # correct even when columns are reordered or dropped upstream.
        if horizontal in frame.columns and vertical in frame.columns:
            dist_h, dist_v = frame[horizontal], frame[vertical]
        else:
            dist_h, dist_v = frame.iloc[:, 3], frame.iloc[:, 4]
        frame["mht_dist_hydrlgy"] = np.abs(dist_h) + np.abs(dist_v)

        # Clips hillshades to the 0..255 index range.
        if hillshades:
            frame[hillshades] = frame[hillshades].clip(0, 255)

        # Wraps 'Aspect' back into 0..359 degrees (a single turn either way).
        frame.loc[frame[aspect] < 0, aspect] += 360
        frame.loc[frame[aspect] > 359, aspect] -= 360

    return train, test

In [None]:
gc.collect()

# Rows labelled Cover_Type 5 are excluded from training. A boolean mask is
# used instead of int(np.where(...)[0]) so the cell works for any number of
# such rows and does not mix positions with index labels.
keep_rows = raw_train["Cover_Type"] != 5
y = raw_train.loc[keep_rows, 'Cover_Type']

# Candidate features: everything except the identifier and the target.
FEATURES = [col for col in raw_train.columns if col not in ['Id','Cover_Type']]

# Columns holding a single distinct value cannot help the model; drop them.
useless_feat = [col for col in raw_train.columns if len(raw_train[col].unique())<2]
FEATURES = [col for col in FEATURES if col not in useless_feat]

# Independent copies, so the feature engineering below never writes into the raw tables.
train_df = raw_train.loc[keep_rows, FEATURES].copy()
test_df = raw_test[FEATURES].copy()

# feature eng.
train_df, test_df = feature_engineering_copied(train_df, test_df)
# reduction
train_df = reduce_mem_usage(train_df)
test_df = reduce_mem_usage(test_df)

This approach is one of the combined under-sampling approaches; it uses ENN and kNN to remove noisy samples. I've been experimenting with sampling strategies and haven't found a robust approach: the strategies I've tried (including this one) give more or less the same test score.

In [None]:
# Class counts before cleaning.
print(f'Count samples WRT classes: {dict(y.value_counts())}')

# Under-sample with imbalanced-learn's NeighbourhoodCleaningRule. The list
# passed as sampling_strategy names classes 1 and 2 as the ones to clean.
ncr_ = NeighbourhoodCleaningRule(sampling_strategy=[1,2], kind_sel="all", n_jobs=-1)
train_df_res, y_res = ncr_.fit_resample(train_df, y)
# Report the resampled shapes and how many rows were removed.
print(f'resampled data shape: ({train_df_res.shape}) ({y_res.shape})')
print(f'reduction from: {train_df.shape[0]} to: {train_df_res.shape[0]}, difference: {y.shape[0] - y_res.shape[0]}')

Saving the feature-engineered data to a CSV and loading it back, since the Kaggle kernel keeps restarting due to memory issues.

In [None]:
# Write the resampled features plus their labels (as a `target` column) to
# cleaned.csv; working on a deep copy leaves train_df_res itself unchanged.
temp_df = train_df_res.copy(deep=True)
temp_df['target'] = y_res
temp_df.to_csv('cleaned.csv', index=False)

In [None]:
# Reload the cached, cleaned training data and split the label column back
# off; pop() returns the `target` column and removes it from the frame.
train_df_res = pd.read_csv('../input/tpsdeccleanedfeateng/cleaned.csv')
y_res = train_df_res.pop('target')
train_df_res = reduce_mem_usage(train_df_res)

In [None]:
print(f'Count samples WRT resampled classes: {dict(y_res.value_counts())}')

gc.collect()

# Map the class labels to consecutive integers (returned as a NumPy array).
encoder_ = LabelEncoder()
y_res = encoder_.fit_transform(y_res)

# Standardise every feature; the scaler is fitted on the training rows only
# and then applied unchanged to the test rows.
scaler_ = StandardScaler()
FEATURES = list(train_df_res.columns)
train_df_res[FEATURES] = scaler_.fit_transform(train_df_res[FEATURES])
test_df[FEATURES] = scaler_.transform(test_df[FEATURES])

print(f'{"-"*30}')
train_df_res, test_df = reduce_mem_usage(train_df_res), reduce_mem_usage(test_df)

In [None]:
class CustomDataset:
    """Minimal map-style dataset over an indexable ``X`` and optional labels ``y``.

    Indexing returns ``X[idx]`` when no labels were given, otherwise the pair
    ``(X[idx], y[idx])``. The length is that of ``X``.
    """

    def __init__(self, X, y=None):
        self.X, self.y = X, y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sample = self.X[idx]
        if self.y is not None:
            return sample, self.y[idx]
        return sample

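Weight initialization based on the uniform rule: each linear layer's weights are drawn from $U(-1/\sqrt{n},\ 1/\sqrt{n})$, where $n$ is the layer's number of input features, and its biases are set to zero.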

In [None]:
# takes in a module and applies the specified weight initialization
def weights_init_uniform_rule(m):
    """Initialise an ``nn.Linear`` layer with the uniform rule; ignore other modules.

    Weights are drawn from U(-1/sqrt(n), 1/sqrt(n)) with n = ``m.in_features``
    and the bias, when the layer has one, is set to zero. Intended to be
    passed to ``Module.apply``, which calls it on every sub-module.

    Args:
        m: Any ``nn.Module``. Only instances of ``nn.Linear`` (including
            subclasses) are modified, in place.
    """
    # isinstance instead of a class-name substring match: the latter also
    # matched e.g. nn.Bilinear, which has no `in_features` attribute.
    if not isinstance(m, nn.Linear):
        return
    bound = float(1.0 / np.sqrt(m.in_features))
    with torch.no_grad():
        m.weight.uniform_(-bound, bound)
        # Layers built with bias=False have m.bias set to None.
        if m.bias is not None:
            m.bias.zero_()

# Model and Training

In [None]:
def fc_block(in_f, out_f):
    """Build one hidden block: Linear(in_f, out_f) -> ReLU -> BatchNorm1d(out_f)."""
    layers = [
        nn.Linear(in_f, out_f),
        nn.ReLU(),
        nn.BatchNorm1d(out_f),
    ]
    return nn.Sequential(*layers)
class Net(nn.Module):
    """Feed-forward classifier: four ``fc_block`` stages followed by a linear head.

    The hidden widths are 156, 108, 64 and 48; the head maps the last hidden
    width to ``output_classes`` raw scores (no softmax is applied here).
    """

    def __init__(self, num_features, output_classes):
        super().__init__()
        self.flatten = nn.Flatten()
        self.fc1 = fc_block(num_features, 156)
        self.fc2 = fc_block(156, 108)
        self.fc3 = fc_block(108, 64)
        self.fc4 = fc_block(64, 48)
        self.out = nn.Linear(48, output_classes)

    def forward(self, x):
        """Flatten ``x``, run it through the four hidden stages and return the head's output."""
        hidden = self.flatten(x)
        for stage in (self.fc1, self.fc2, self.fc3, self.fc4):
            hidden = stage(hidden)
        return self.out(hidden)

# Size the output layer from the classes the label encoder was fitted on:
# those encoded labels are the targets the network is trained against, so the
# two always agree (the pre-resampling `y` may be stale or undefined when the
# cached CSV path is used).
net_model = Net(len(FEATURES), len(encoder_.classes_)).to(device)
net_model.apply(weights_init_uniform_rule)

In [None]:
def multi_acc(y_pred, y_test):
    """Return the percentage of rows whose top-scoring class equals the target.

    The arg-max is taken over the raw scores directly: applying (log-)softmax
    first is monotonic per row and cannot change which class wins, and the
    inference loop in this notebook also takes the max of the raw outputs.

    Args:
        y_pred: Tensor of per-class scores, classes along dim 1.
        y_test: Tensor of integer class indices, one per row of ``y_pred``.

    Returns:
        0-d float tensor with the accuracy in percent (0-100).
    """
    y_pred_tags = torch.argmax(y_pred, dim=1)
    correct_pred = (y_pred_tags == y_test).float()
    return correct_pred.mean() * 100

In [None]:
def batch_gd(model, train_loader, test_loader, epochs, val_score_best, lr_scheduler, patience=6):
    """Train ``model`` with per-epoch validation, checkpointing and early stopping.

    Each epoch runs one pass over ``train_loader`` in train mode, then one
    gradient-free pass over ``test_loader`` in eval mode. The mean validation
    loss is fed to ``lr_scheduler.step``; whenever it beats the best value seen
    so far the model's state dict is saved to ``best_weights.pth``.

    Note: the function reads the module-level names ``optimizer``,
    ``criterion`` and ``device``; they must be defined before it is called.

    Args:
        model: Network to train (modified in place).
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        test_loader: Iterable of ``(inputs, targets)`` validation batches.
        epochs: Maximum number of epochs.
        val_score_best: Initial best validation loss (e.g. ``math.inf``).
        lr_scheduler: Scheduler whose ``step`` accepts the validation loss.
        patience: Stop once this many consecutive epochs failed to improve
            the validation loss (default 6, the previously hard-coded value).

    Returns:
        ``(train_losses, test_losses)``: two arrays of length ``epochs`` with
        the mean loss per epoch; entries of epochs skipped by early stopping
        remain 0.
    """
    train_losses = np.zeros(epochs)
    test_losses = np.zeros(epochs)
    epochs_no_improve = 0
    for it in range(epochs):
        t0 = datetime.now()

        # ---- training pass ----
        model.train()
        train_loss = []
        train_acc = []
        for inputs, targets in train_loader:
            # move data to the selected device
            inputs, targets = inputs.to(device), targets.to(device)
            # zero the parameter gradients
            optimizer.zero_grad()
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            # Backward and optimize
            loss.backward()
            optimizer.step()
            # .item() keeps plain Python floats instead of device tensors.
            train_loss.append(loss.item())
            train_acc.append(multi_acc(outputs, targets).item())

        # ---- validation pass (no autograd graph is built) ----
        model.eval()
        test_loss = []
        test_acc = []
        with torch.no_grad():
            for inputs, targets in test_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                test_loss.append(criterion(outputs, targets).item())
                test_acc.append(multi_acc(outputs, targets).item())

        # get train and test loss (unweighted mean over batches)
        test_loss = np.mean(test_loss)
        train_loss = np.mean(train_loss)
        lr_scheduler.step(test_loss)
        print('learning_rate: {}'.format(optimizer.state_dict()['param_groups'][0]['lr']))

        # Save losses
        train_losses[it] = train_loss
        test_losses[it] = test_loss
        test_accuracy = np.mean(test_acc)
        train_accuracy = np.mean(train_acc)

        # saving best weights
        if test_loss < val_score_best:
            epochs_no_improve = 0
            val_score_best = test_loss
            print(f'--- saving best weights ---')
            torch.save(model.state_dict(), 'best_weights.pth')
        else:
            epochs_no_improve += 1

        # getting the duration
        dt = datetime.now() - t0
        print(f'Epoch {it+1}/{epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, \
                    Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}, Improvement: {epochs_no_improve}, Duration: {dt}')
        if epochs_no_improve >= patience:
            print(f'Early Stopping..\n')
            break
    return train_losses, test_losses

In [None]:
# garbage collection
gc.collect()
# creating and loading test data (columns in the same order the model is trained on)
test_dataset = CustomDataset(torch.from_numpy(test_df[FEATURES].to_numpy()).to(torch.float32))
test_loader = DataLoader(test_dataset, batch_size = BATCH_SIZE)
# per-fold loss curves
folds_train_losses = {}
folds_test_losses = {}
# per-fold test predictions (one list of class indices per fold)
test_predictions = []
# defining skfolds
skf = StratifiedKFold(n_splits=5, random_state=47, shuffle=True)
for fold, (train_idx, val_idx) in tqdm(enumerate(skf.split(train_df_res, y_res)),
                                       total=skf.get_n_splits(), leave=False):
    X_train, y_train = train_df_res.iloc[train_idx], y_res[train_idx]
    X_val, y_val = train_df_res.iloc[val_idx], y_res[val_idx]
    train_dataset = CustomDataset(X=torch.from_numpy(X_train.to_numpy()).to(torch.float32), y=torch.from_numpy(y_train).to(torch.long))
    val_dataset = CustomDataset(X=torch.from_numpy(X_val.to_numpy()).to(torch.float32), y=torch.from_numpy(y_val).to(torch.long))
    # Training batches are reshuffled every epoch; validation order is irrelevant.
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # A fresh, re-initialised network per fold. Re-using one model would start
    # every later fold from weights already trained on that fold's validation
    # rows, making its validation loss (and checkpoint choice) optimistic.
    net_model = Net(len(FEATURES), len(encoder_.classes_)).to(device)
    net_model.apply(weights_init_uniform_rule)

    criterion = nn.CrossEntropyLoss(ignore_index=-1)
    optimizer = torch.optim.AdamW(net_model.parameters(), lr=1e-3)
    # `verbose` is deprecated in recent PyTorch; batch_gd already prints the
    # learning rate after every epoch.
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=1,
                                                              min_lr=1e-7, mode='min')
    # training and validation
    val_score_best = math.inf
    train_losses, test_losses = batch_gd(net_model, train_loader, val_loader, EPOCHS, val_score_best, lr_scheduler)
    folds_train_losses[fold] = train_losses
    folds_test_losses[fold] = test_losses

    # loading best weights
    print(f'--- loading best weights ---')
    net_model.load_state_dict(torch.load('best_weights.pth', map_location=device))

    # prediction on test data
    test_preds = []
    net_model.eval()
    with torch.no_grad():
        for batch_tensor in test_loader:
            batch_tensor = batch_tensor.to(device)
            y_test_pred = net_model(batch_tensor)
            y_pred_tags = torch.argmax(y_test_pred, dim=1)
            test_preds.extend(y_pred_tags.cpu().tolist())
    test_predictions.append(test_preds)
    gc.collect()

# Submission

In [None]:
# One row per test sample, one column per fold (p1, p2, ...). The column
# names are derived from the number of folds actually run instead of being
# hard-coded for exactly five.
fold_columns = [f'p{i + 1}' for i in range(len(test_predictions))]
predictions_df = pd.DataFrame(data=(np.array(test_predictions).astype(np.float32)).T, columns=fold_columns)
predictions_df.to_csv('predictions_5fold.csv', index=None)

In [None]:
# Combine the folds by majority vote. The stored values are label-encoded
# class indices, i.e. nominal categories: averaging and rounding them (two
# folds voting 0 and 4 would "average" to class 2) does not give a meaningful
# class, so each class's votes are counted instead. Ties go to the lowest
# class index (argmax returns the first maximum).
fold_votes = predictions_df.filter(regex=r'^p\d+$').to_numpy().astype(np.int64)
vote_counts = np.stack(
    [(fold_votes == cls).sum(axis=1) for cls in range(len(encoder_.classes_))],
    axis=1,
)
predictions_df['stack'] = vote_counts.argmax(axis=1)
submission['Cover_Type'] = encoder_.inverse_transform(predictions_df['stack'].to_numpy().astype(np.int32))

In [None]:
# NOTE(review): this replaces the `Cover_Type` values assigned from the
# `stack` column in the previous cell with the predictions of a single fold
# (column `p4`) before the file is written - confirm that is intended.
submission['Cover_Type'] = encoder_.inverse_transform(predictions_df['p4'].to_numpy().astype(np.int16))
submission.to_csv('submission.csv',index=None)