In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px
from warnings import filterwarnings as filt 
from scipy.stats import skew, norm 

plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (12, 6)
filt('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the mounted input directory tree and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Directory holding the Tabular Playground Series (Dec 2021) files: train, test and sample submission.
base_path = '../input/tabular-playground-series-dec-2021'
# Directory holding the Forest Cover Type Prediction files (plotted below as the 'original dataset').
base_path2 = '../input/forest-cover-type-prediction'

In [None]:
# traindf / testdf come from the tabular playground directory, traindf2 from the original dataset directory.
traindf  = pd.read_csv(f'{base_path}/train.csv')
traindf2 = pd.read_csv(f'{base_path2}/train.csv')
testdf   = pd.read_csv(f'{base_path}/test.csv')

# Show (rows, columns) of the three frames.
traindf.shape, traindf2.shape, testdf.shape

In [None]:
# Number of rows per Cover_Type value in the tabular training set, shown as a bar chart.
cover_count = traindf.Cover_Type.value_counts()
px.bar(x = cover_count.index, y = cover_count, color = cover_count.index, title = 'tabular dataset')

In [None]:
# Same per-class row count, this time for the original dataset.
cover_count2 = traindf2.Cover_Type.value_counts()
px.bar(x = cover_count2.index, y = cover_count2, color = cover_count2.index, title = 'original dataset')

Let's merge classes 4 and 5 from the original dataset into the tabular dataset, since the tabular dataset has very few samples for classes 4 and 5.

In [None]:
# Keep only the rows of the original dataset whose Cover_Type is 4 or 5.
cotton_aspen = traindf2[traindf2.Cover_Type.isin([4, 5])]
# Sanity check: only the two selected classes should be listed.
cotton_aspen.Cover_Type.unique()

In [None]:
# Append the class 4/5 rows from the original dataset to the tabular training set.
# Duplicates are removed BEFORE the index is rebuilt, so the resulting index is a
# contiguous 0..n-1 range (resetting first left gaps wherever a duplicate was dropped).
# NOTE: this cell rebinds `traindf`; re-running it appends `cotton_aspen` again and relies
# on drop_duplicates to remove the repeated rows.
traindf = pd.concat([traindf, cotton_aspen]).drop_duplicates().reset_index(drop = True)
cover_count = traindf.Cover_Type.value_counts()
px.bar(x = cover_count.index, y = cover_count, color = cover_count.index, title = 'tabular dataset')

I have already made another notebook and trained the DL model with the full feature set. Although I got around 95% accuracy on the train and dev sets, I didn't get a good score on the test set after submission.

Let's drop some of the less useful features, ranking them by Pearson correlation with the target.

In [None]:
from sklearn.model_selection import train_test_split
from eli5 import show_weights
from eli5.sklearn import PermutationImportance
from sklearn.ensemble import RandomForestClassifier


def get_best_feats(x, y, top_feats):
    """Rank the columns of ``x`` by absolute Pearson correlation with ``y``.

    Parameters
    ----------
    x : pandas.DataFrame
        Candidate feature columns.
    y : array-like
        Target values, one per row of ``x``.
    top_feats : int
        Number of highest-correlated columns to keep.

    Returns
    -------
    best_feats : pandas.Index
        Names of the ``top_feats`` columns with the largest absolute correlation,
        ordered from weakest to strongest.
    corr_list : list
        Absolute correlation of every column, in column order. A column whose
        correlation is NaN is scored as 0.
    """
    corr_list = []
    for col in x.columns:
        coef = np.corrcoef(x[col], y)[0, 1]
        corr_list.append(0 if np.isnan(coef) else np.abs(coef))

    # argsort gives the full ascending ranking; argmax would return one scalar
    # index, which cannot be sliced to obtain the top-k positions.
    top_idx = np.argsort(corr_list)[-top_feats: ]
    best_feats = x.iloc[:, top_idx].columns
    return best_feats, corr_list

def sample(x, y, frac, return_val = False, random_state = None):
    """Stratified split of ``x`` / ``y`` via ``train_test_split``.

    Parameters
    ----------
    x, y :
        Features and labels; ``y`` is also used as the stratification key.
    frac :
        Passed to ``train_test_split`` as ``test_size`` (size of the held-out part).
    return_val : bool, default False
        When True, return both parts; otherwise only the non-held-out part.
    random_state : optional
        Forwarded to ``train_test_split`` so the split can be reproduced.
        The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    ``(x_kept, x_held, y_kept, y_held)`` if ``return_val`` else ``(x_kept, y_kept)``.
    """
    x_kept, x_held, y_kept, y_held = train_test_split(
        x, y, test_size = frac, stratify = y, random_state = random_state
    )
    if return_val:
        return x_kept, x_held, y_kept, y_held

    return x_kept, y_kept

def permImp(x, y, val_split = 0.2):
    """Show permutation importances of a random forest fitted on ``x`` / ``y``.

    The data is split with ``sample`` (stratified, ``val_split`` held out), a
    100-tree ``RandomForestClassifier`` is fitted on the kept part, and
    ``PermutationImportance`` (10 iterations) is computed on the held-out part.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix; its column names label the importances.
    y :
        Labels, one per row of ``x``.
    val_split : float, default 0.2
        Held-out fraction; must lie strictly between 0 and 1.

    Returns
    -------
    The ``eli5.show_weights`` rendering of the importances (top 100 features).
    """
    # The previous check used `or`, which is true for every number and so never fired.
    assert 0 < val_split < 1, 'val_split must be ( > 0 ) and ( < 1 )'
    x, xt, y, yt = sample(x, y, val_split, True)
    model = RandomForestClassifier(n_estimators = 100, bootstrap = True, random_state = 0).fit(x, y)
    perm = PermutationImportance(model, n_iter = 10).fit(xt, yt)
    return show_weights(perm, feature_names = x.columns.tolist(), top = 100)

In [None]:
# Lift pandas' column display limit so every column of the preview is shown.
pd.options.display.max_columns = None
traindf.head()

In [None]:
# Features: every column except the row identifier and the target.
x = traindf.drop(['Id', 'Cover_Type'], axis = 1)
# Target shifted down by one (the submission cells add the 1 back to the predictions).
y = traindf.Cover_Type - 1

x.shape, y.shape

In [None]:
# xx, xt, yy, yt = sample(x, y, 0.99, True)
# xx.shape, xt.shape
# permImp(xx, yy)

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tensorflow.keras.utils import to_categorical

In [None]:
# Number of rows that a 0.0055 fraction of traindf corresponds to (0.0055 is the dev split fraction used below).
traindf.shape[0] * 0.0055

In [None]:
y.unique()

In [None]:
# Hold out a small stratified dev split FIRST, then fit the scaler on the training rows only,
# so dev-set statistics do not leak into the preprocessing. (Previously the scaler was fitted
# on all rows before the split.)
dev_frac = 0.0055
x_train, x_dev, y_train, y_dev = sample(x, y, dev_frac, True)

std = StandardScaler().fit(x_train)
x_train = pd.DataFrame(std.transform(x_train), index = x_train.index, columns = x_train.columns)
x_dev = pd.DataFrame(std.transform(x_dev), index = x_dev.index, columns = x_dev.columns)

# One-hot encode the integer class ids for the categorical cross-entropy loss.
num_cls = y.unique().shape[0]
y_train = to_categorical(y_train, num_classes = num_cls)
y_dev = to_categorical(y_dev, num_classes = num_cls)

y_train.shape, y_dev.shape

In [None]:
def hardmax(y):
    """Return, for each row of ``y``, the column position of its largest value."""
    winning_column = np.argmax(y, axis = 1)
    return winning_column

In [None]:
# Class distribution of the training split: the one-hot labels are collapsed back to class ids and counted.
train_counts = pd.DataFrame(hardmax(y_train)).value_counts()
# Each index entry is indexed with [0] to pull out the bare class id.
idx = [i[0] for i in train_counts.index]
print(train_counts)
px.bar(x = idx, y = train_counts, color = idx)

In [None]:
# Class distribution of the dev split, computed the same way as for the training split.
dev_counts = pd.DataFrame(hardmax(y_dev)).value_counts()
idx = [i[0] for i in dev_counts.index]
print(dev_counts)
px.bar(x = idx, y = dev_counts, color = idx)

In [None]:
import tensorflow as tf 
import tensorflow.keras as keras 
from tensorflow.keras import Sequential
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow_addons as tfa

# Short aliases for the Keras losses and metrics modules; used when the compile_params dicts are built.
loss = keras.losses
metrics = keras.metrics

class Network:
    """Convenience wrapper that builds, compiles and trains a Keras ``Sequential`` dense network.

    Parameters
    ----------
    layers : sequence, optional
        ``[input_shape, hidden_units_1, ..., hidden_units_k, output_units]``. The first
        entry is passed to ``Input(shape=...)``, the last to the output ``Dense`` layer.
    activations : sequence, optional
        One activation per hidden layer followed by the output activation. The build
        loop indexes it by hidden-layer position, so it is expected to hold
        ``len(layers) - 1`` entries.
    dropout : sequence, optional
        Dropout rate per hidden layer; a falsy entry adds no ``Dropout`` layer. Used only
        when its length is ``len(activations) - 1``; otherwise dropout is disabled for
        every hidden layer.
    batchnorm : sequence, optional
        Truthy entry per hidden layer adds ``BatchNormalization`` after that layer. Same
        length rule and fallback as ``dropout``.
    compile_params : dict, optional
        Keyword arguments forwarded to ``model.compile``.

    The model is built and compiled inside ``__init__``; with an empty ``layers`` the
    lookup ``layers[0]`` raises ``IndexError``.
    """

    def __init__(self, layers = None, activations = None, dropout = None, batchnorm = None, compile_params = None):
        self.model = None
        self.losses = None
        self.metrics = None
        # None is the "empty" sentinel, so no list/dict object created in the signature
        # is shared between instances.
        self.layers = [] if layers is None else layers
        self.activations = [] if activations is None else activations
        self.compile_params = {} if compile_params is None else compile_params
        n_hidden = len(self.activations) - 1
        self.dropout = self._per_hidden_layer(dropout, n_hidden)
        self.batchnorm = self._per_hidden_layer(batchnorm, n_hidden)
        self.initialize_model()

    @staticmethod
    def _per_hidden_layer(values, n_hidden):
        """Return ``values`` if it has one entry per hidden layer, else ``n_hidden`` Nones."""
        values = [] if values is None else values
        if len(values) == n_hidden:
            return values
        return [None for _ in range(n_hidden)]

    def initialize_model(self):
        """Build the ``Sequential`` model from the stored spec, compile it and return it."""
        self.model = Sequential()

        # input layer
        self.model.add(Input(shape = self.layers[0], name = 'Input_Layer'))

        # hidden layers: everything between the input shape and the output units
        for idx in range(len(self.layers) - 2):
            units = self.layers[idx + 1]
            activation = self.activations[idx]
            dp = self.dropout[idx]
            bn = self.batchnorm[idx]
            self.model.add(Dense(units, activation = activation, name = f'Hidden_Layer_{idx + 1}'))
            if bn:
                self.model.add(BatchNormalization())
            if dp:
                self.model.add(Dropout(dp, name = f'Dropout_{idx + 1}_{dp}'))

        # output layer
        self.model.add(Dense(self.layers[-1], activation = self.activations[-1], name = 'Output_Layer'))

        self.model.compile(**self.compile_params)
        return self.model

    def fit(self, fit_params):
        """Train the model and return the per-epoch history as a DataFrame.

        ``fit_params`` is a dict forwarded as keyword arguments to ``model.fit``.
        History columns whose name contains ``'loss'`` are stored in ``self.losses``,
        all remaining columns in ``self.metrics``.
        """
        history = self.model.fit(**fit_params)
        his = pd.DataFrame(history.history)
        loss_cols = [c for c in his.columns if 'loss' in c]
        metric_cols = [c for c in his.columns if 'loss' not in c]
        self.losses = his[loss_cols]
        self.metrics = his[metric_cols]
        return his

    def predict(self, x, softmax = 0):
        """Predict on ``x``.

        With ``softmax == 0`` (default) return the argmax over axis 1 of the model
        output; otherwise return the raw model output.
        """
        pred = self.model.predict(x)
        if softmax == 0:
            return np.argmax(pred, axis = 1)
        return pred

    def plot_arch(self):
        """Return the ``plot_model`` rendering of the model (None if no model exists)."""
        if self.model is not None:
            return plot_model(self.model, show_shapes = True, show_layer_names = True)

    def plot_loss(self):
        """Line-plot the loss columns recorded by the last ``fit`` call; no-op before any fit."""
        if self.losses is not None:
            self.losses.plot(kind = 'line')
            plt.title('loss comparison')
            plt.legend(self.losses.columns)

    def plot_metrics(self):
        """Line-plot the non-loss columns recorded by the last ``fit`` call; no-op before any fit."""
        if self.metrics is not None:
            self.metrics.plot(kind = 'line')
            plt.title('metrics comparison')
            plt.legend(self.metrics.columns)

        
def report(yt, pred, inverse_to_cat = True):
    """Print a classification report and draw the confusion matrix as a heatmap.

    yt             : true labels; argmax-ed over axis 1 first when ``inverse_to_cat`` is True.
    pred           : predicted labels, compared against the (converted) true labels.
    inverse_to_cat : set to False when ``yt`` already holds class ids.
    """
    true_labels = np.argmax(yt, axis = 1) if inverse_to_cat else yt
    print(classification_report(true_labels, pred))
    matrix = confusion_matrix(true_labels, pred)
    sns.heatmap(matrix, fmt = '.1f', annot = True)
    plt.title('confusion matrix')

In [None]:
def select_best_feats(x, y, top_feats):
    """Pick the ``top_feats`` columns of ``x`` most correlated (in absolute value) with ``y``.

    Returns a pair: the selected column names (weakest to strongest) and the list of
    absolute Pearson correlations for every column in column order, with NaN replaced by 0.
    """
    def _abs_corr(column):
        r = np.corrcoef(x[column], y)[0, 1]
        return 0 if np.isnan(r) else np.abs(r)

    corr_list = [_abs_corr(column) for column in x.columns]
    ranking = np.argsort(corr_list)
    keep = ranking[-top_feats: ]
    return x.iloc[:, keep].columns, corr_list

In [None]:
# Keep the 10 training columns with the largest absolute Pearson correlation to the class id.
# NOTE(review): the class ids are treated as plain numbers here — confirm a linear correlation
# against a categorical label is the intended ranking criterion.
best_feats, feats_corr = select_best_feats(x_train, hardmax(y_train), 10)

In [None]:
x_train[best_feats].head()

In [None]:
# Architecture spec consumed by Network: [input shape, hidden units ..., output units].
# Input width and class count are derived from the data instead of being hard-coded (10 / 7),
# so the cell keeps working when the number of selected features changes.
n_features     = len(best_feats)
n_classes      = y_dev.shape[1]
layers         = [(n_features, ), 65, 95, 65, n_classes]
activations    = ['relu', 'relu', 'relu', 'softmax']
dropout        = []                    # empty -> Network applies no dropout
batchNorm      = [True, True, True]    # BatchNormalization after every hidden layer
compile_params = {
    'optimizer' : keras.optimizers.SGD(learning_rate = 0.001),
    'metrics'   : ['accuracy', metrics.Recall(), tfa.metrics.F1Score(num_classes = n_classes, average = 'micro')],
    'loss'      : loss.CategoricalCrossentropy()
}
model1 = Network(layers = layers, activations = activations, compile_params = compile_params, dropout = dropout, batchnorm  = batchNorm)
model1.model.summary()

In [None]:
256 * 2

In [None]:
# Keyword arguments that Network.fit forwards to the Keras model.fit call.
# Only the selected feature columns are fed to the model, for both training and validation.
fit_params = {
    'x' : x_train[best_feats],
    'y' : y_train,
    'validation_data' : (x_dev[best_feats], y_dev),
    'epochs' : 7,
    'batch_size' : 512
}

# `his` is the per-epoch history DataFrame returned by Network.fit.
his = model1.fit(fit_params)
model1.plot_loss()

The loss saturates right around the 7th epoch, so it is better to stop training here; after this epoch it only decreases slowly.

In [None]:
model1.plot_metrics()

In [None]:
# Predicted class ids for the dev split (Network.predict returns argmax class ids by default).
pred = model1.predict(x_dev[best_feats])
# Count how often each class is predicted and list which classes appear at all.
cls_counts = pd.DataFrame(pred).value_counts()
idx = [i[0] for i in cls_counts.index]
print(np.unique(pred))
px.bar(x = idx, y = cls_counts, color = idx)

In [None]:
report(y_dev, pred)

In [None]:
# Drop the Id column and scale the test features with the already-fitted scaler `std`.
x_test = testdf.drop(['Id'], axis = 1)
std_x_test = std.transform(x_test)
# Re-wrap the scaled array so columns can still be selected by name.
x_test = pd.DataFrame(std_x_test, columns = x_test.columns , index = x_test.index)
submission1 = pd.read_csv(f'{base_path}/sample_submission.csv')
# The model predicts the shifted labels (Cover_Type - 1); add 1 to get Cover_Type values back.
submission1['Cover_Type'] = model1.predict(x_test[best_feats]) + 1
x_test.shape, x_train.shape, submission1.shape

In [None]:
# index = False keeps the DataFrame's row index out of the file, so the CSV contains only
# the columns of the sample submission (no extra unnamed leading column).
submission1.to_csv('submission1.csv', index = False)
submission1.Cover_Type.value_counts()

Model 1 scored around 91% on the test dataset.

### model 2 - balanced class weights 

In [None]:
from sklearn.utils import class_weight

def create_class_weight(y, ctype = 'balanced'):
    """Compute per-class weights for the labels ``y``.

    Parameters
    ----------
    y : array-like
        Class label of every sample.
    ctype : default 'balanced'
        Passed to scikit-learn's ``compute_class_weight`` as its ``class_weight`` argument.

    Returns
    -------
    dict
        ``{position: weight}``, where ``position`` is the rank of the class among the
        sorted unique labels of ``y``. This equals the label itself only when the labels
        are exactly ``0 .. n_classes - 1``.
    """
    classes = np.unique(y)
    # `classes` and `y` are keyword-only in current scikit-learn; passing them by name
    # works on both older and newer releases.
    cls_weight = class_weight.compute_class_weight(class_weight = ctype, classes = classes, y = y)
    return {
        ind : weight for ind, weight in enumerate(cls_weight)
    }

# Class weights derived from the class ids of the one-hot training labels.
weights = create_class_weight(hardmax(y_train))
weights

In [None]:
train_counts

In [None]:
# Hand-written class-id -> weight mapping; this dict (not `weights`) is what is passed to model 2 as class_weight.
# NOTE(review): presumably adapted from the computed `weights` above — verify where these numbers come from.
weights2 = {
    0: 0.4196408633416197,
    1: 0.46188408996594064,
    2: 1.4228890265491922,
    3: 22.48530660777985,
    4: 22.92751445855215,
    5: 4.365953407676545,
    6: 1.987800652003885
}

weights2

Let's try increasing the class weights for classes 3 and 4, since they have far fewer samples than the other classes.

In [None]:
# Architecture spec consumed by Network: [input shape, hidden units ..., output units].
# Input width and class count are derived from the data instead of being hard-coded (10 / 7).
n_features     = len(best_feats)
n_classes      = y_dev.shape[1]
layers         = [(n_features, ), 65, 75, 65, n_classes]
activations    = ['relu', 'relu', 'relu', 'softmax']
dropout        = [0.25, 0.3, 0.25]     # dropout rate after each hidden layer
batchNorm      = []                    # empty -> Network applies no batch normalization
compile_params = {
    'optimizer' : keras.optimizers.SGD(learning_rate = 0.0005),
    'metrics'   : ['accuracy', metrics.Recall(), tfa.metrics.F1Score(num_classes = n_classes, average = 'micro')],
    'loss'      : loss.CategoricalCrossentropy()
}
model2 = Network(layers = layers, activations = activations, compile_params = compile_params, dropout = dropout, batchnorm  = batchNorm)
model2.model.summary()

In [None]:
512 / 2

In [None]:
# Keyword arguments that Network.fit forwards to the Keras model.fit call.
# Compared with the model 1 run, the batch size is 64 and the hand-written class weights are supplied.
fit_params = {
    'x'               : x_train[best_feats],
    'y'               : y_train,
    'validation_data' : (x_dev[best_feats], y_dev),
    'epochs'          : 7,
    'batch_size'      : 64,
    'class_weight'    : weights2 
}

# `his2` is the per-epoch history DataFrame returned by Network.fit.
his2 = model2.fit(fit_params)
model2.plot_loss()

In [None]:
model2.plot_metrics()

In [None]:
# Predicted class ids for the dev split and how often each class is predicted.
pred2 = model2.predict(x_dev[best_feats])
cls_counts2 = pd.DataFrame(pred2).value_counts()
idx2 = [i[0] for i in cls_counts2.index]
print(np.unique(pred2))
# Pass the data by keyword: the meaning of countplot's first positional argument
# differs between seaborn releases.
sns.countplot(x = pred2)
# The plotly figure must be the cell's last expression to be rendered; placed before
# the seaborn call it was built and then silently discarded.
px.bar(x = idx2, y = cls_counts2, color = idx2)

In [None]:
dev_counts

In [None]:
report(y_dev, pred2)

In [None]:
# Build the model 2 submission from the sample file; predictions are shifted labels, so add 1 back.
submission2 = pd.read_csv(f'{base_path}/sample_submission.csv')
submission2['Cover_Type'] = model2.predict(x_test[best_feats]) + 1
# Report the shape of THIS submission (the cell previously displayed submission1.shape by mistake).
x_test.shape, x_train.shape, submission2.shape

In [None]:
# index = False keeps the DataFrame's row index out of the file, so the CSV contains only
# the columns of the sample submission (no extra unnamed leading column).
submission2.to_csv('submission2.csv', index = False)
submission2.Cover_Type.value_counts()

Model 2 scored around 88%.