In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px
from warnings import filterwarnings as filt 
from scipy.stats import skew, norm 

plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (12, 6)
filt('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
base_path = "../input/tabular-playground-series-dec-2021"
base_path2 = "../input/forest-cover-type-prediction"

In [None]:
traindf = pd.read_csv(f'{base_path}/train.csv')
traindf2 = pd.read_csv(f'{base_path2}/train.csv')
testdf = pd.read_csv(f'{base_path}/test.csv')

traindf.shape, traindf2.shape, testdf.shape

In [None]:
pd.options.display.max_columns = None
traindf.head()

In [None]:
pd.DataFrame(traindf.isnull().sum(), columns = ['null count']).T

In [None]:
print('Unique cover type in traindf   : ', sorted(traindf.Cover_Type.unique()))
print('Unique cover type in traindf 2 : ', sorted(traindf2.Cover_Type.unique()))

In [None]:
# Integer class code -> readable cover type name (codes start at 1).
label_rename = dict(enumerate([
    'Spruce/Fir',
    'Lodgepole Pine',
    'Ponderosa Pine',
    'Cottonwood/Willow',
    'Aspen',
    'Douglas-fir',
    'Krummholz',
], start = 1))

# `assign` returns a new frame, so traindf / traindf2 keep their numeric labels.
df  = traindf.assign(Cover_Type = traindf.Cover_Type.replace(label_rename))
df2 = traindf2.assign(Cover_Type = traindf2.Cover_Type.replace(label_rename))

In [None]:
cover_counts = df.Cover_Type.value_counts()
px.bar(x = cover_counts.index, y = cover_counts, color = cover_counts.index, title = 'Cover Type Count in df')

In [None]:
cover_counts

In [None]:
cover_counts2 = df2.Cover_Type.value_counts()
px.bar(x = cover_counts2.index, y = cover_counts2, color = cover_counts2.index, title = 'Cover Type Count in df 2')

Since there are very few Cottonwood/Willow and Aspen samples, we can append those rows from df2 to df.

In [None]:
# Rows of the second training set labelled 4 or 5 (Cottonwood/Willow, Aspen per label_rename).
cotton_aspen = traindf2.loc[traindf2.Cover_Type.isin([4, 5])]
cotton_aspen.Cover_Type.unique()

In [None]:
# Append the extra class-4/5 rows and renumber the index from 0.
# NOTE: this rebinds `traindf`, so re-running the cell appends the rows a second time.
traindf = pd.concat([traindf, cotton_aspen], ignore_index = True)
cover_counts3 = traindf.Cover_Type.value_counts()
px.bar(x = cover_counts3.index, y = cover_counts3, color = cover_counts3.index, title = 'Cover Type Count after merging')

It's still unbalanced, but we can later merge classes 4, 5, 6 and 7 into a single class 4 if the model turns out to be too biased towards classes 1 and 2.

In [None]:
# Shift the 1-based class codes to 0-based indices (needed for the one-hot encoding below).
# Vectorized subtraction instead of a per-row Python lambda.
# NOTE: not idempotent - running this cell twice shifts the labels twice.
traindf['Cover_Type'] = traindf['Cover_Type'] - 1

In [None]:
traindf.Cover_Type.unique()

In [None]:
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split

def sample(x, y, fold, random_state = None):
    """Return a stratified random subsample of ``(x, y)``.

    Parameters
    ----------
    x : features to subsample.
    y : labels aligned with ``x``; also used as the stratification key.
    fold : size of the subsample, forwarded as ``test_size`` to
        ``train_test_split``.
    random_state : optional seed forwarded to ``train_test_split``. The default
        ``None`` keeps the previous (unseeded) behaviour.

    Returns
    -------
    tuple
        ``(x_subsample, y_subsample)`` - the "test" side of the split.
    """
    # Only the held-out side of the split is wanted; the remainder is discarded.
    _, x_sub, _, y_sub = train_test_split(
        x, y, stratify = y, test_size = fold, random_state = random_state
    )
    return x_sub, y_sub

def plot_3d(x, y, params, fold = 1):
    """Embed ``x`` with t-SNE and return a 3-D plotly scatter coloured by ``y``.

    ``params`` is unpacked into ``TSNE`` (``verbose = 1`` is added here).
    When ``fold`` is below 1 the data is first reduced with ``sample``.
    The embedding is labelled with three columns, so ``params`` must request
    three components.
    """
    features, labels = sample(x, y, fold) if fold < 1 else (x, y)

    embedding = TSNE(**params, verbose = 1).fit_transform(features)
    coords = pd.DataFrame(embedding, index = features.index, columns = ['x', 'y', 'z'])
    # Assignment aligns on the index, which is why the original index is kept above.
    coords['target'] = labels
    return px.scatter_3d(data_frame = coords, x = 'x', y = 'y', z = 'z', color = 'target')

In [None]:
# Plotting copy of the merged training data with readable class names.
df = traindf.copy()

label_rename = {
    1 : 'Spruce/Fir',
    2 : 'Lodgepole Pine',
    3 : 'Ponderosa Pine',
    4 : 'Cottonwood/Willow',
    5 : 'Aspen',
    6 : 'Douglas-fir',
    7 : 'Krummholz',
}

# traindf labels were shifted to start at 0, so move them back to the 1-based
# codes before looking up the names (vectorized instead of a per-row lambda).
df['Cover_Type'] = (df['Cover_Type'] + 1).replace(label_rename)

In [None]:
# Keyword arguments that plot_3d unpacks into TSNE.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` - confirm against the installed version.
params = dict(
    n_components = 3,
    n_iter = 2500,
    learning_rate = 150,
    perplexity = 35,
)
# plot_3d(df.drop(['Cover_Type'], axis = 1), df.Cover_Type, params, 0.005)

In [None]:
px.scatter_3d(data_frame = df, x = 'Hillshade_9am', y = 'Hillshade_3pm', z = 'Hillshade_Noon', color = 'Cover_Type', title = 'Forest Cover Hillshades')

In [None]:
hillshade_9am = df.groupby('Cover_Type')['Hillshade_9am'].mean().sort_values(ascending = False)
px.bar(x = hillshade_9am.index, y = hillshade_9am, color = hillshade_9am.index, title = 'hillshade 9am for different cover types')

In [None]:
hillshade_3pm = df.groupby('Cover_Type')['Hillshade_3pm'].mean().sort_values(ascending = False)
px.bar(x = hillshade_3pm.index, y = hillshade_3pm, color = hillshade_3pm.index, title = 'hillshade 3pm for different cover types')

In [None]:
hillshade_noon = df.groupby('Cover_Type')['Hillshade_Noon'].mean().sort_values(ascending = False)
px.bar(x = hillshade_noon.index, y = hillshade_noon, color = hillshade_noon.index, title = 'hillshade noon for different cover types')

* Cottonwood/Willow usually has a high hillshade at 9am but a lower hillshade at 3pm than the rest of the cover types
* Ponderosa Pine has, on average, a low hillshade at 9am, 12pm and 3pm

In [None]:
def show_corr_plot(df):
    """Draw an annotated heatmap of the pairwise correlations between the
    ten continuous terrain columns of ``df``.

    Only the entries strictly below the diagonal are shown; the diagonal and
    the mirrored upper half are blanked out.
    """
    continuous_cols = [
        'Elevation', 'Aspect', 'Slope',
        'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
        'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
        'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points',
    ]
    corr = df[continuous_cols].corr()

    # True strictly below the main diagonal (k = -1 excludes the diagonal itself).
    below_diagonal = np.tril(np.ones(corr.shape), k = -1).astype(bool)
    lower = corr.where(below_diagonal)
    sns.heatmap(lower, fmt = '.1f', annot = True, cmap = 'plasma')
    
show_corr_plot(df)

In [None]:
import plotly.figure_factory as ff

# slope = df.
# ff.create_distplot()
slope = df.groupby('Cover_Type')['Slope'].mean().sort_values(ascending = False)
px.bar(x = slope.index, y = slope, color = slope.index, title = 'mean slope for cover types')

In [None]:
px.scatter(data_frame = df, x = 'Horizontal_Distance_To_Roadways', y = 'Elevation', color = 'Cover_Type', title = 'distace and elevation')

In [None]:
elevation = df.groupby('Cover_Type')['Elevation'].mean().sort_values(ascending = False)
px.bar(x = elevation.index, y = elevation, color = elevation.index, title = 'mean elevation for cover type')

In [None]:
# Mean horizontal distance to the nearest roadway per cover type, largest first.
# Stored under its own name so the `elevation` series from the previous cell is not overwritten.
road_distance = df.groupby('Cover_Type')['Horizontal_Distance_To_Roadways'].mean().sort_values(ascending = False)
px.bar(x = road_distance.index, y = road_distance, color = road_distance.index, title = 'mean distance to roadways for cover type')

Among these cover types:
* Cottonwood/Willow is found at the lowest mean elevation and is also the closest to roadways
* Krummholz is found at the highest mean elevation and is also the farthest from roadways

In [None]:
pd.options.display.max_columns = None
soil_types = [c for c in df.columns if 'soil' in c.lower()]
mean_soil_type = df.groupby('Cover_Type')[soil_types].mean()
plt.figure(figsize = (18, 6))
sns.heatmap(mean_soil_type, cmap = 'plasma');

Each cover type has its own characteristic soil types, except for two of them.

On average:
* Aspen is highest in __soil type 30 ( Como family - Rock land - Legault family complex, extremely stony )__
* Cottonwood is highest in __soil type 3 ( Haploborolis - Rock outcrop complex, rubbly )__
* Douglas-fir is highest in __soil type 10 ( Bullwark - Catamount families - Rock outcrop complex, rubbly )__
* Krummholz is highest in __soil type 39 ( Cryorthents - Leighcan family complex, extremely stony )__
* Lodgepole Pine does not have any clearly dominant soil type
* Ponderosa Pine does not have any clearly dominant soil type
* Spruce/Fir does not have any clearly dominant soil type

Spruce/Fir and Lodgepole Pine contain almost the same soil types.

In [None]:
traindf.head()

In [None]:
# One row per continuous feature: distribution with a fitted normal curve on
# the left, boxen plot on the right.
cols_for_dist = df.loc[:, 'Elevation' : 'Horizontal_Distance_To_Fire_Points'].columns.tolist()
fig, ax = plt.subplots(len(cols_for_dist), 2, figsize = (12, 26))
for i, col in enumerate(cols_for_dist):
    # NOTE(review): distplot is deprecated in recent seaborn releases; kept because it supports `fit`.
    sns.distplot(df[col], ax = ax[i, 0], fit = norm)
    # Pass the data as `x` explicitly: the meaning of the first positional
    # argument differs between seaborn versions.
    sns.boxenplot(x = df[col], ax = ax[i, 1])
# tight_layout only accounts for what is already drawn, so call it after plotting.
fig.tight_layout()

In [None]:
traindf.shape

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler

In [None]:
x = traindf.drop(['Id', 'Cover_Type'], axis = 1)
y = traindf.Cover_Type
x.shape, y.shape

In [None]:
traindf.Cover_Type.unique()

In [None]:
# Standardise every feature column; the fitted scaler is reused for the test set later.
# NOTE(review): the scaler is fit on all rows of x, i.e. before the train/dev split below.
std = StandardScaler().fit(x)
stdx = pd.DataFrame(std.transform(x), index = x.index, columns = x.columns)
stdx.head()

In [None]:
stdx.shape

In [None]:
stdx.shape[0] * 0.005

In [None]:
# Stratified hold-out of 0.5% of the rows as a dev set (seeded for reproducibility).
x_train, x_dev, y_train, y_dev = train_test_split(stdx, y, test_size = 0.005, stratify = y, random_state = 0)
# Labels are 0-based class indices, so the one-hot width has to be max label + 1.
# Counting unique values would come out too small if any class index were absent.
num_cls = int(y.max()) + 1
y_train = to_categorical(y_train, num_classes = num_cls)
y_dev = to_categorical(y_dev, num_classes = num_cls)

y_train.shape, y_dev.shape

In [None]:
import tensorflow as tf 
import tensorflow.keras as keras 
from tensorflow.keras import Sequential
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization

loss = keras.losses
metrics = keras.metrics

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

class Network:
    """Small wrapper that builds, trains and inspects a Keras ``Sequential`` MLP.

    Parameters
    ----------
    layers : sequence, optional
        ``[input_shape, hidden_units_1, ..., hidden_units_k, output_units]``.
        The first entry is passed to ``Input(shape = ...)``, the last one is
        the width of the output ``Dense`` layer.
    activations : sequence, optional
        One activation per ``Dense`` layer: the leading entries are used for
        the hidden layers in order, the last entry for the output layer.
    dropout : sequence, optional
        One dropout rate per hidden layer; a falsy entry adds no ``Dropout``.
        Only honoured when its length equals ``len(activations) - 1``;
        otherwise it is ignored and no dropout is used.
    batchnorm : sequence, optional
        One flag per hidden layer; a truthy entry adds ``BatchNormalization``
        after that layer. Same length rule as ``dropout``.
    compile_params : mapping, optional
        Keyword arguments forwarded to ``model.compile``.

    Raises
    ------
    ValueError
        If ``layers`` has fewer than two entries or ``activations`` has fewer
        entries than there are ``Dense`` layers to build.
    """

    def __init__(self, layers = None, activations = None, dropout = None, batchnorm = None, compile_params = None):
        self.model = None
        self.losses = None
        self.metrics = None

        # ``None`` defaults plus shallow copies: no mutable default is shared
        # between instances and the caller's containers are never aliased.
        self.layers = list(layers) if layers is not None else []
        self.activations = list(activations) if activations is not None else []
        self.compile_params = dict(compile_params) if compile_params is not None else {}

        dropout = list(dropout) if dropout is not None else []
        batchnorm = list(batchnorm) if batchnorm is not None else []

        # A per-hidden-layer option is only used when it has exactly one entry
        # per hidden layer; anything else silently falls back to "disabled".
        n_hidden = len(self.activations) - 1
        disabled = [None for _ in range(n_hidden)]
        self.dropout = dropout if len(dropout) == n_hidden else list(disabled)
        self.batchnorm = batchnorm if len(batchnorm) == n_hidden else list(disabled)

        self.initialize_model()

    def _check_architecture(self):
        """Fail early with a readable message when the layer spec cannot be built."""
        if len(self.layers) < 2:
            raise ValueError(
                'layers must contain at least an input shape and an output size, '
                f'got {len(self.layers)} entries'
            )
        if len(self.activations) < len(self.layers) - 1:
            raise ValueError(
                f'expected at least {len(self.layers) - 1} activations '
                f'(one per Dense layer), got {len(self.activations)}'
            )

    def initialize_model(self):
        """Build and compile the ``Sequential`` model described by the constructor arguments.

        Returns the compiled model (also stored on ``self.model``).
        """
        self._check_architecture()
        self.model = Sequential()

        # input layer
        self.model.add(Input(shape = self.layers[0], name = 'Input_Layer'))

        # hidden layers: everything between the input shape and the output size
        for idx in range(len(self.layers) - 2):
            units = self.layers[idx + 1]
            activation = self.activations[idx]
            dp = self.dropout[idx]
            bn = self.batchnorm[idx]
            self.model.add(Dense(units, activation = activation, name = f'Hidden_Layer_{idx + 1}'))
            if bn:
                self.model.add(BatchNormalization())
            if dp:
                self.model.add(Dropout(dp, name = f'Dropout_{idx + 1}_{dp}'))

        # output layer
        self.model.add(Dense(self.layers[-1], activation = self.activations[-1], name = 'Output_Layer'))

        self.model.compile(**self.compile_params)
        return self.model

    def fit(self, fit_params):
        """Train the model with ``model.fit(**fit_params)``.

        The training history is returned as a DataFrame (one row per epoch).
        Its columns are also split by name into ``self.losses`` (columns
        containing ``'loss'``) and ``self.metrics`` (all other columns).
        """
        history = self.model.fit(**fit_params)
        his = pd.DataFrame(history.history)
        loss_cols = [c for c in his.columns if 'loss' in c]
        metric_cols = [c for c in his.columns if 'loss' not in c]
        self.losses = his[loss_cols]
        self.metrics = his[metric_cols]
        return his

    def predict(self, x, softmax = 0):
        """Predict for ``x``.

        With ``softmax == 0`` (default) the index of the largest output per
        row is returned; any other value returns the raw model output.
        """
        pred = self.model.predict(x)
        if softmax == 0:
            return np.argmax(pred, axis = 1)
        return pred

    def plot_arch(self):
        """Return the ``plot_model`` diagram of the network, or ``None`` if no model exists."""
        if self.model is not None:
            return plot_model(self.model, show_shapes = True, show_layer_names = True)

    def plot_loss(self):
        """Line-plot the loss columns recorded by the last ``fit`` call (no-op before fitting)."""
        if self.losses is not None:
            self.losses.plot(kind = 'line')
            plt.title('loss comparison')
            plt.legend(self.losses.columns)

    def plot_metrics(self):
        """Line-plot the non-loss columns recorded by the last ``fit`` call (no-op before fitting)."""
        if self.metrics is not None:
            self.metrics.plot(kind = 'line')
            plt.title('metrics comparison')
            plt.legend(self.metrics.columns)

        
def report(yt, pred, inverse_to_cat = True):
    """Print a classification report and draw the confusion matrix.

    Parameters
    ----------
    yt : true labels. One-hot rows when ``inverse_to_cat`` is true, class
        indices otherwise.
    pred : predicted class indices.
    inverse_to_cat : when true, ``yt`` is first collapsed to class indices
        with ``argmax`` over axis 1.
    """
    if inverse_to_cat:
        yt = np.argmax(yt, axis = 1)

    print(classification_report(yt, pred))
    # The matrix holds integer counts, so annotate them as integers rather
    # than as one-decimal floats.
    sns.heatmap(confusion_matrix(yt, pred), fmt = 'd', annot = True)
    plt.title('confusion matrix')
    # confusion_matrix puts true labels on rows and predictions on columns.
    plt.xlabel('predicted label')
    plt.ylabel('true label')
    
def hardmax(y):
    """Collapse each row of ``y`` to the index of its largest entry."""
    winning_index = np.argmax(y, axis = 1)
    return winning_index

In [None]:
# y_train[0], y[x_train.index[0]]
x_train.shape, y_train.shape

In [None]:
import tensorflow_addons as tfa

In [None]:
# Input and output widths come from the prepared data instead of being
# hard-coded, so the network follows any change in features or classes.
layers         = [(x_train.shape[1], ), 55, 55, y_train.shape[1]]
activations    = ['relu', 'relu', 'softmax']
dropout        = [0.05, 0.05]
compile_params = {
    'optimizer' : keras.optimizers.Adam(learning_rate = 0.01),
    'metrics'   : ['accuracy', metrics.Recall(), tfa.metrics.F1Score(num_classes = y_dev.shape[1], average = 'micro')],
    'loss'      : loss.CategoricalCrossentropy()  
}
model1 = Network(layers = layers, activations = activations, compile_params = compile_params, dropout = dropout)
model1.plot_arch()

In [None]:
fit_params = {
    'x' : x_train,
    'y' : y_train,
    'validation_data' : (x_dev, y_dev),
    'epochs' : 5,
    'batch_size' : 128
}

his = model1.fit(fit_params)
model1.plot_loss()

It looks like the loss saturates at around 50 epochs, with a maximum accuracy of 79%.

In [None]:
model1.plot_metrics()

In [None]:
# Metrics of the last training epoch. Indexing from the end keeps this valid
# for any `epochs` value instead of assuming exactly five.
scores = his.iloc[-1, :]
px.bar(x = scores.index, y = scores, color = scores.index)

In [None]:
'sdfsdfsdf'

In [None]:
pred = model1.predict(x_dev)
report(y_dev, pred)

In [None]:
x_test = testdf.drop(['Id'], axis = 1)
x_test = pd.DataFrame(std.transform(x_test), columns = x_test.columns)
x_test.shape, x_train.shape

In [None]:
pred = model1.predict(x_test)
np.unique(pred)

In [None]:
submission = pd.read_csv(f'{base_path}/sample_submission.csv')
submission.head()

In [None]:
submission.Cover_Type.unique()

In [None]:
ppred = pred + 1

submission['Cover_Type'] = ppred
submission.Cover_Type.unique()

In [None]:
submission.to_csv('submission.csv', index = False)

Let's try to drop some useless features using Pearson correlation.