In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

%matplotlib inline

import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots

#Some Styling
import plotly.io as pio
pio.templates.default = "plotly_dark"
sns.set_style("darkgrid")


#displaying markdown
from IPython.display import Markdown
def bold(string):
    """Render ``string`` as Markdown in the current cell output.

    Despite the name, no bold markup is added; callers pass any ``**...**``
    emphasis themselves.
    """
    rendered = Markdown(string)
    display(rendered)
    

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for input_file in input_files:
    print(input_file)
        
# Disable warnings 
import warnings
warnings.filterwarnings('ignore')

In [None]:
%matplotlib inline
from IPython.core.display import display, HTML, Javascript

html_contents ="""
<!DOCTYPE html>
<html lang="en">
    <head>
        <link rel="stylesheet" href="https://www.w3schools.com/w3css/4/w3.css">
        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Raleway">
        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto">
        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Verdana">
        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Open Sans">
        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css">
        <style>
        .title-section{
            font-family: "Roboto", Verdana, sans-serif;
            font-weight: bold;
            color: "#6A8CAF";
            letter-spacing: 6px;
        }
        hr { border: 1px solid #E58F65 !important;
             color: #E58F65 !important;
             background: #E58F65 !important;
           }
        body {
            font-family: "Verdana", sans-serif;
            }        
        </style>
    </head>    
</html>
"""

HTML(html_contents)

# <span class="title-section w3-xxlarge" style="color:green" id="codebook">0. Tabular Playground Series - December Edition - Forest Cover </span>

### <span class="title-section w3-large" style="color:blue" id="codebook">* Elevation - Elevation in meters </span>
### <span class="title-section w3-large" style="color:blue" id="codebook">* Aspect - Aspect in degrees azimuth</span>



##### To study how aspect works, please refer to the following website, which explains it: https://pro.arcgis.com/en/pro-app/tool-reference/3d-analyst/how-aspect-works.htm

##### To study how slope works, please refer to the following website: https://pro.arcgis.com/en/pro-app/tool-reference/3d-analyst/slope.htm

### <span class="title-section w3-large" style="color:blue" id="codebook">* Slope - Slope in degrees </span>
### <span class="title-section w3-large" style="color:blue" id="codebook">* Horizontal_Distance_To_Hydrology - Horz Dist to nearest surface water features</span>
### <span class="title-section w3-large" style="color:blue" id="codebook">* Vertical_Distance_To_Hydrology - Vert Dist to nearest surface water features</span>
### <span class="title-section w3-large" style="color:blue" id="codebook">* Horizontal_Distance_To_Roadways - Horz Dist to nearest roadway</span>
### <span class="title-section w3-large" style="color:blue" id="codebook">* Hillshade_9am (0 to 255 index) - Hillshade index at 9am, summer solstice </span>

##### To study how hillshade works, please refer to the following website: https://pro.arcgis.com/en/pro-app/tool-reference/3d-analyst/hillshade.htm

### <span class="title-section w3-large" style="color:blue" id="codebook">* Hillshade_Noon (0 to 255 index) - Hillshade index at noon, summer solstice</span>
### <span class="title-section w3-large" style="color:blue" id="codebook">* Hillshade_3pm (0 to 255 index) - Hillshade index at 3pm, summer solstice</span>
### <span class="title-section w3-large" style="color:blue" id="codebook">* Horizontal_Distance_To_Fire_Points - Horz Dist to nearest wildfire ignition points</span>
### <span class="title-section w3-large" style="color:blue" id="codebook">* Wilderness_Area (4 binary columns, 0 = absence or 1 = presence) - Wilderness area designation</span>
### <span class="title-section w3-large" style="color:blue" id="codebook">* Soil_Type (40 binary columns, 0 = absence or 1 = presence) - Soil Type designation</span>
### <span class="title-section w3-large" style="color:blue" id="codebook">* Cover_Type (7 types, integers 1 to 7) - Forest Cover Type designation</span>

#### Seven Types of Forest Cover (listed with the 0-6 labels used after this notebook remaps the raw 1-7 values)
* 0 - Spruce/Fir
* 1 - Lodgepole Pine
* 2 - Ponderosa Pine
* 3 - Cottonwood/Willow
* 4 - Aspen
* 5 - Douglas-fir
* 6 - Krummholz

#### Wilderness Areas

* 1 - Rawah Wilderness Area
* 2 - Neota Wilderness Area
* 3 - Comanche Peak Wilderness Area
* 4 - Cache la Poudre Wilderness Area

## Layout of the document
The prediction process is divided into two notebooks.

This notebook : Covers data statistics, data visualization, and feature selection

Part 2 : Covers prediction using various algorithms : 
***
## Data statistics
* Shape
* Datatypes
* Description
* Skew
* Class distribution

## Data Interaction
* Correlation
* Scatter plot

## Data Visualization
* Box and density plots
* Grouping of one hot encoded attributes

## Data Cleaning
* Remove unnecessary columns

## Data Preparation
* Original
* Delete rows or impute values in case of missing
* StandardScaler
* MinMaxScaler
* Normalizer

## Feature selection
* ExtraTreesClassifier
* GradientBoostingClassifier
* RandomForestClassifier
* XGBClassifier
* RFE
* SelectPercentile
* PCA
* PCA + SelectPercentile
* Feature Engineering

## Evaluation, prediction, and analysis
* LDA (Linear algo)
* LR (Linear algo)
* KNN (Non-linear algo)
* CART (Non-linear algo)
* Naive Bayes (Non-linear algo)
* SVC (Non-linear algo)
* Bagged Decision Trees (Bagging)
* Random Forest (Bagging)
* Extra Trees (Bagging)
* AdaBoost (Boosting)
* Stochastic Gradient Boosting (Boosting)
* Voting Classifier (Voting)
* MLP (Deep Learning)
* XGBoost

***

###### inspiration from https://www.kaggle.com/subinium/tps-may-categorical-eda, https://www.kaggle.com/nitin007/forest-cover-type-prediction-complete-part-i , https://www.kaggle.com/sharmasanthosh/exploratory-study-on-feature-selection & https://www.kaggle.com/siddheshpujari/forest-cover-eda#Descriptive-Statistics 

In [None]:
# reference : https://www.kaggle.com/subinium/dark-mode-visualization-apple-version
from cycler import cycler


raw_light_palette = [
    (0, 122, 255), # Blue
    (255, 149, 0), # Orange
    (52, 199, 89), # Green
    (255, 59, 48), # Red
    (175, 82, 222),# Purple
    (255, 45, 85), # Pink
    (88, 86, 214), # Indigo
    (90, 200, 250),# Teal
    (255, 204, 0)  # Yellow
]

raw_dark_palette = [
    (10, 132, 255), # Blue
    (255, 159, 10), # Orange
    (48, 209, 88),  # Green
    (255, 69, 58),  # Red
    (191, 90, 242), # Purple
    (94, 92, 230),  # Indigo
    (255, 55, 95),  # Pink
    (100, 210, 255),# Teal
    (255, 214, 10)  # Yellow
]

raw_gray_light_palette = [
    (142, 142, 147),# Gray
    (174, 174, 178),# Gray (2)
    (199, 199, 204),# Gray (3)
    (209, 209, 214),# Gray (4)
    (229, 229, 234),# Gray (5)
    (242, 242, 247),# Gray (6)
]

raw_gray_dark_palette = [
    (142, 142, 147),# Gray
    (99, 99, 102),  # Gray (2)
    (72, 72, 74),   # Gray (3)
    (58, 58, 60),   # Gray (4)
    (44, 44, 46),   # Gray (5)
    (28, 28, 39),   # Gray (6)
]


# Scale the 0-255 RGB triples to the 0-1 floats matplotlib works with.
light_palette, dark_palette, gray_light_palette, gray_dark_palette = (
    np.array(raw) / 255
    for raw in (raw_light_palette, raw_dark_palette,
                raw_gray_light_palette, raw_gray_dark_palette)
)

# Second-lightest gray: used for every text and axis element on the dark background.
white_color = gray_light_palette[-2]

# Global matplotlib theme: dark palette as the colour cycle, second-darkest
# gray as background, light text/axes, high DPI, no top/right spines.
mpl.rcParams.update({
    'axes.prop_cycle': cycler('color', dark_palette),
    'figure.facecolor': gray_dark_palette[-2],
    'figure.edgecolor': gray_dark_palette[-2],
    'axes.facecolor': gray_dark_palette[-2],
    'text.color': white_color,
    'axes.labelcolor': white_color,
    'axes.edgecolor': white_color,
    'xtick.color': white_color,
    'ytick.color': white_color,
    'figure.dpi': 200,
    'axes.spines.top': False,
    'axes.spines.right': False,
})

In [None]:
sns.palplot(dark_palette)

In [None]:
# reduce memory usage

def reduce_memory_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes.

    Rules, applied column by column:

    * ``object`` columns are converted to ``category``.
    * NumPy integer columns (signed or unsigned) are cast to the smallest of
      int8 / int16 / int32 / int64 whose range contains the column's min and
      max. Bounds are inclusive, so e.g. a column spanning exactly -128..127
      becomes int8.
    * NumPy float columns are cast to float16, else float32, when min and max
      fit the type's range. Only the *range* is checked: values may lose
      precision, especially in float16.
    * Everything else (bool, datetime, category, pandas extension dtypes) and
      empty columns are left untouched, which also makes a second call on an
      already reduced frame a no-op.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable ints, ...) are not NumPy dtypes;
        # min()/max() or astype() on them can fail, so leave them alone.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == 'O':
            df[col] = df[col].astype('category')
            continue

        # Only integer ('i', 'u') and float ('f') kinds are downcast; bool and
        # datetime-like columns must not be turned into floats.
        if col_type.kind not in 'iuf' or df[col].empty:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            # Plain Python ints avoid any overflow when comparing unsigned
            # values against negative bounds.
            c_min, c_max = int(c_min), int(c_max)
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NaN/inf extrema fail every comparison, leaving the column as is.
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

    return df

# <span class="title-section w3-xxlarge" style="color:green" id="codebook">1. Load the Dataset </span>

In [None]:
# Read each competition file and shrink its dtypes straight away;
# reduce_memory_usage returns the frame it was given.
train = reduce_memory_usage(pd.read_csv('../input/tabular-playground-series-dec-2021/train.csv'))
test = reduce_memory_usage(pd.read_csv('../input/tabular-playground-series-dec-2021/test.csv'))
sample_submission = pd.read_csv('../input/tabular-playground-series-dec-2021/sample_submission.csv')
# Left as the final expression so the reduced sample-submission frame is shown.
reduce_memory_usage(sample_submission)

# <span class="title-section w3-xxlarge" style="color:green" id="codebook">1.1 Try a TSNE Diagram </span>

In [None]:
# Source - https://www.kaggle.com/carlmcbrideellis/some-pretty-t-sne-plots 
# cudf / cuml are GPU libraries; NOTE(review): this cell presumably needs a
# CUDA-enabled runtime — confirm the accelerator setting before running.
import cudf
import cuml

from cuml.manifold import TSNE



# Only the first 200,000 rows are read; the first CSV column becomes the index.
# NOTE(review): Cover_Type is not dropped here, so the target appears to be
# embedded together with the features — confirm this is intended.
train_data = cudf.read_csv('../input/tabular-playground-series-dec-2021/train.csv',index_col=0, nrows=200000)
tsne    = TSNE(n_components=2, perplexity=40, learning_rate=2, random_state=101)
tsne_2D = tsne.fit_transform(train_data);
# NOTE(review): `as_matrix()` is an old DataFrame accessor that newer
# cudf/pandas releases may no longer provide — verify against the installed
# version (an explicit conversion to a NumPy array may be required).
x, y = tsne_2D.as_matrix().T
fig, ax = plt.subplots(figsize=(30, 30))
# The colour is a function of the embedding coordinates only (not of
# Cover_Type), so the colouring is decorative rather than class-based.
ax.scatter(x, y, s=0.1, c=x**2+y**2+x*y-100, cmap=plt.cm.prism)
fig.set_facecolor('#202020')
plt.xticks([])
plt.yticks([])
plt.box(False)
plt.show()
# Free the GPU frame once the figure has been drawn.
del train_data

# <span class="title-section w3-xxlarge" style="color:green" id="codebook">1.2 Try simple Dataset Exploration </span>

In [None]:
print(train.shape)
print(test.shape)

In [None]:
train.head()

In [None]:
# `Id` is a row identifier, not a feature. errors='ignore' keeps the cell
# re-runnable: a second execution no longer fails with a KeyError.
train = train.drop(columns='Id', errors='ignore')
test = test.drop(columns='Id', errors='ignore')

In [None]:
fig, ax = plt.subplots()
sns.countplot(x='Cover_Type', data=train, order=sorted(train['Cover_Type'].unique()), ax=ax)
ax.set_ylim(0, 2563000)
ax.set_title('Cover_Type Distribution', weight='bold')
plt.show()

In [None]:
train.describe().T.style.bar(subset=['mean'], color='#205ff2')\
                            .background_gradient(subset=['std'], cmap='Reds')\
                            .background_gradient(subset=['50%'], cmap='coolwarm')

In [None]:
test.describe().T.style.bar(subset=['mean'], color='#205ff2')\
                            .background_gradient(subset=['std'], cmap='Reds')\
                            .background_gradient(subset=['50%'], cmap='coolwarm')

## <span class="title-section w3-xlarge" style="color:green" id="codebook">1.2  Comparison of statistics of train and test.</span>


In [None]:
def diff_color(x):
    """Return a CSS ``color`` declaration describing the sign of ``x``.

    Negative values map to red, positive values to green, and everything else
    (zero, or values that compare as neither, such as NaN) to black.
    """
    if x < 0:
        color = 'red'
    elif x > 0:
        color = 'green'
    else:
        color = 'black'
    return f'color: {color}'

(train.describe() - test.describe())[test.columns].T.iloc[:,1:].style\
        .bar(subset=['mean', 'std'], align='mid', color=['#d65f5f', '#5fba7d'])\
        .applymap(diff_color, subset=['min', 'max'])

## <span class="title-section w3-xlarge" style="color:green" id="codebook">1.3  Zero values, Soil_Type distributions and feature correlation</span>


In [None]:
# Percentage of zero-valued entries per feature. Columns are selected by name
# rather than by position, so the result does not depend on whether `Id` has
# already been dropped, and the target never shows up as a "feature".
feature_cols = [col for col in train.columns if col not in ('Id', 'Cover_Type')]
zero_data = ((train[feature_cols] == 0).sum() / len(train) * 100)[::-1]
fig, ax = plt.subplots(1,1,figsize=(10, 19))

# Light full-width bar as the 100 % background, coloured bar on top for the zero share.
ax.barh(zero_data.index, 100, color='#dadada', height=0.6)
barh = ax.barh(zero_data.index, zero_data, color=light_palette[1], height=0.6)
ax.bar_label(barh, fmt='%.01f %%   ', color='green', fontsize=7)
ax.spines[['left', 'bottom']].set_visible(False)

ax.set_xticks([])

# The bars show percentages, so the title says "%" rather than "# of".
ax.set_title('% of Zeros (by feature)', loc='center', fontweight='bold', fontsize=15)    
plt.show()

In [None]:
fig, axes = plt.subplots(10, 4, figsize=(10, 16))

target_order = sorted(train['Cover_Type'].unique())
# For a 0/1 soil flag the per-class mean is the share of rows of that class
# having the flag set.
mean = train.groupby('Cover_Type').mean().sort_index()
std = train.groupby('Cover_Type').std().sort_index()

# There are 40 soil-type columns and 40 panels: iterate 1..40 inclusive so
# Soil_Type40 is drawn and no populated panel has to be switched off.
for idx, ax in zip(range(1, 41), axes.flatten()):
    class_means = mean[f'Soil_Type{idx}']
    # One distinct colour per class instead of cycling a 4-colour slice.
    ax.bar(class_means.index, class_means,
           color=dark_palette[:len(class_means)], width=0.6)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.margins(0.1)
    ax.spines['left'].set_visible(False)
    ax.set_title(f'Soil_Type_{idx}', loc='right', weight='bold', fontsize=11)

fig.supxlabel('Average by class (by feature)', ha='center', fontweight='bold')

fig.tight_layout()
plt.show()

In [None]:
fig, axes = plt.subplots(8,5,figsize=(14, 14))
axes = axes.flatten()

# Names of the 40 one-hot soil columns: Soil_Type1 ... Soil_Type40.
features = [f'Soil_Type{number}' for number in range(1, 41)]

# One panel per soil column, with the train and test densities overlaid.
for idx, ax in enumerate(axes, start=1):
    column = f'Soil_Type{idx}'
    for frame in (train, test):
        sns.kdeplot(data=frame, x=column, fill=True, ax=ax)

    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.spines['left'].set_visible(False)

    # Every column listed in `features` gets a full red frame.
    if column in features:
        ax.spines[:].set_visible(True)
        ax.spines[:].set_color('red')

    ax.set_title(column, loc='right', weight='bold', fontsize=10)

fig.supxlabel('Feature Distribution (by feature Soil_Type1-Soil_Type40)', ha='center', fontweight='bold')

fig.tight_layout()
plt.show()

In [None]:
label_dict = {val:idx for idx, val in enumerate(sorted(train['Cover_Type'].unique()))}
train['Cover_Type'] = train['Cover_Type'].map(label_dict)

In [None]:
fig, ax = plt.subplots(figsize=(9 , 9))

corr = train.corr()

# Hide the upper triangle (diagonal included): the matrix is symmetric, so it
# only repeats the lower half. The `np.bool` alias was removed in NumPy 1.24;
# the builtin `bool` is the supported spelling.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

sns.heatmap(corr,
        square=True, center=0, linewidth=0.2,
        cmap=sns.diverging_palette(240, 10, as_cmap=True),
        mask=mask, ax=ax) 

ax.set_title('Feature Correlation', loc='left', fontweight='bold')
plt.show()

## <span class="title-section w3-xlarge" style="color:green" id="codebook">1.4  Try a UMAP plotting</span>


In [None]:
from umap import UMAP

# Work on a fixed random sample of 10,000 rows (seeded for reproducibility).
train_sub = train.sample(10000, random_state=0)
target = train_sub['Cover_Type']
umap = UMAP(random_state=0)
# The labels are passed as the second argument together with the features.
# `iloc[:, :-1]` takes every column except the last one.
# NOTE(review): this assumes Cover_Type is the last column of `train` — confirm.
dr = umap.fit_transform(train_sub.iloc[:,:-1], target)

In [None]:
# One small panel per class actually present in the sample, instead of a
# hard-coded 5 that silently leaves the remaining cover types off the figure.
classes = sorted(target.unique())
n_classes = len(classes)
palette_size = len(dark_palette)

fig = plt.figure(figsize=(12, 12))
gs = fig.add_gridspec(5, n_classes)
# Large overview panel on the top four rows, per-class panels on the last row.
ax = fig.add_subplot(gs[:-1,:])
sub_axes = [fig.add_subplot(gs[-1, col]) for col in range(n_classes)]

for pos, cls in enumerate(classes):
    in_class = (target == cls).to_numpy()
    xs, ys = dr[in_class, 0], dr[in_class, 1]

    # Overview: every class in its own colour from the default colour cycle.
    ax.scatter(x=xs, y=ys, s=10, alpha=0.2)

    # Small panels: the class is highlighted in its own panel and drawn as a
    # faint background in all the others.
    for j, sub_ax in enumerate(sub_axes):
        is_own_panel = (pos == j)
        sub_ax.scatter(x=xs, y=ys, s=10,
                       alpha=0.4 if is_own_panel else 0.008,
                       color=dark_palette[j % palette_size] if is_own_panel else white_color,
                       zorder=int(is_own_panel))

    sub_axes[pos].set_xticks([])
    sub_axes[pos].set_yticks([])
    sub_axes[pos].set_xlabel('')
    sub_axes[pos].set_ylabel('')
    sub_axes[pos].set_title(f'Class_{cls+1}')
    sub_axes[pos].spines['right'].set_visible(True)
    sub_axes[pos].spines['top'].set_visible(True)

ax.set_title('Dimension Reduction (UMAP)', fontweight='bold', fontfamily='serif', fontsize=20, loc='left')   

ax.set_xticks([])
ax.set_yticks([])
ax.set_xlabel('')
ax.set_ylabel('')
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)

fig.tight_layout()
plt.show()

# <span class="title-section w3-xxlarge" style="color:green" id="codebook">2. Try various data visualization Plots using Plotly</span>

In [None]:
# Human-readable names for the 0-6 class labels.
cover_type_names = {0:'Spruce/Fir', 1:'Lodgepole Pine', 2:'Ponderosa Pine', 3:'Cottonwood/Willow', 4:'Aspen', 5:'Douglas-fir', 6:'Krummholz'}

# Assign the result back instead of calling replace(..., inplace=True) on
# `train['Cover_Type']`: the in-place form acts on a selected column, which is
# not guaranteed to write through to `train`. Values that are already names
# are left as they are, so re-running the cell is harmless.
train['Cover_Type'] = train['Cover_Type'].replace(cover_type_names)

# Shorter column names for the plots below.
train = train.rename(columns={"Wilderness_Area1":"Rawah_WA","Wilderness_Area2":"Neota_WA",
"Wilderness_Area3":"Comanche_Peak_WA","Wilderness_Area4":"Cache_la_Poudre_WA","Horizontal_Distance_To_Hydrology":"HD_Hydrology",
"Vertical_Distance_To_Hydrology":"VD_Hydrology","Horizontal_Distance_To_Roadways":"HD_Roadways",
                               "Horizontal_Distance_To_Fire_Points":"HD_Fire_Points"})

In [None]:

# Select the one-hot groups by name. Positional slices (11:15 / 15:55) only
# line up while the `Id` column is still present; once it has been dropped
# they shift by one and pull in the wrong columns.
wild_area_cols = ['Rawah_WA', 'Neota_WA', 'Comanche_Peak_WA', 'Cache_la_Poudre_WA']
soil_type_cols = [f'Soil_Type{number}' for number in range(1, 41)]

# idxmax(axis=1) returns the first column whose flag is set. A row with no
# flag set in a group gets that group's first column name.
train['Wild Areas'] = (train[wild_area_cols] == 1).idxmax(axis=1)

# Shorten "Soil_TypeN" to "STN" to keep the plot labels compact.
train['Soil types'] = (
    (train[soil_type_cols] == 1)
    .idxmax(axis=1)
    .str.replace('Soil_Type', 'ST', regex=False)
)

#Drop the columns which are not required now
train = train.drop(columns=wild_area_cols + soil_type_cols)

In [None]:
fig = px.histogram(train,x="Cover_Type",color="Cover_Type",height=400,width=800)
fig.show()

In [None]:
fig = px.pie(train,names="Wild Areas",height=300,width=800)
fig.show()


In [None]:
fig = px.pie(train,names="Soil types",height=400,width=850)
fig.update_traces(textposition='inside')
fig.show()

In [None]:
fig = px.pie(train,names="Wild Areas",height=300,width=800)
fig.show()




In [None]:
#fig = px.histogram(train,x="Wild Areas",color="Cover_Type",barmode="group",
#                   height=400,width=800)
#fig.show()

In [None]:
temp = train.groupby(['Cover_Type'],as_index=False)[["Elevation"]].median()
temp.sort_values(by="Elevation",ascending=False).style.background_gradient(cmap="Reds")




In [None]:
temp = train.groupby(['Wild Areas','Cover_Type'],as_index=False)[['Elevation']].median()

#Both barplot and treemap help in better understanding the features.
fig = px.bar(temp, x="Wild Areas", y="Elevation", color='Cover_Type', barmode='group',
             height=400,width=900)
fig.show()


In [None]:
fig = px.treemap(temp, path=['Wild Areas','Cover_Type'],values='Elevation',height=400,width=900)
fig.show()


In [None]:

temp = train.groupby(['Wild Areas','Cover_Type'],as_index=False)[['Aspect']].median()

fig = px.bar(temp, x="Wild Areas", y="Aspect", color='Cover_Type', barmode='group',
             height=400,width=900)
fig.show()

In [None]:

fig = px.treemap(temp, path=['Wild Areas','Cover_Type'],values='Aspect',height=400,width=900)
fig.show()

# A bare expression is rendered only when it is the last statement of a cell;
# here `del temp` follows, so the styled table must be displayed explicitly.
display(temp.style.background_gradient(cmap='YlGnBu'))
del temp

In [None]:

fig = px.histogram(train,x="Slope",color="Cover_Type",marginal='box',title="Slope Histogram",
                  height=500,width=800)
fig.show()

In [None]:

temp = train.groupby(['Wild Areas','Cover_Type'],as_index=False)[['Slope']].median()

fig = px.bar(temp, x="Wild Areas", y="Slope", color='Cover_Type', barmode='group',
             height=400,width=900)
fig.show()
del temp

In [None]:

#Let's look at their relation with wild areas.....
temp = train.groupby(['Wild Areas','Cover_Type'],as_index=False)[['HD_Hydrology']].median()

fig = px.bar(temp, x="Wild Areas", y="HD_Hydrology", color='Cover_Type', barmode='group',
             height=400,width=900)
fig.show()


In [None]:

# Display explicitly: the expression is followed by `del temp`, so it would
# otherwise be evaluated and thrown away without being rendered.
display(temp.style.background_gradient(cmap="Blues"))
del temp


In [None]:
#Let's look at their relation with wild areas.....
temp = train.groupby(['Wild Areas','Cover_Type'],as_index=False)[['VD_Hydrology']].median()

fig = px.bar(temp, x="Wild Areas", y="VD_Hydrology", color='Cover_Type', barmode='group',
             height=400,width=900)
fig.show()


In [None]:
fig = px.treemap(temp, path=['Wild Areas','Cover_Type'],values='VD_Hydrology',height=400,width=800)
fig.show()
# Not the last statement of the cell, so render the styled table explicitly.
display(temp.style.background_gradient(cmap="BuPu"))
del temp

In [None]:

#Median horizontal distance to roadways for each cover type,
#drawn as horizontal bars sorted from the largest median to the smallest.
temp = train.groupby(['Cover_Type'],as_index=False)[['HD_Roadways']].median()

fig = px.bar(temp.sort_values(by="HD_Roadways",ascending=False), x="HD_Roadways", y="Cover_Type", color='Cover_Type',orientation='h',
             height=300,width=900)
fig.show()
del temp

In [None]:

temp = train.groupby(['Wild Areas','Cover_Type'],as_index=False)[['HD_Roadways']].median()

fig = px.bar(temp, x="Wild Areas", y="HD_Roadways", color='Cover_Type', barmode='group',
             height=400,width=900)
fig.show()

In [None]:

fig = px.treemap(temp, path=['Wild Areas','Cover_Type'],values='HD_Roadways',height=400,width=800)
fig.show()
# Not the last statement of the cell, so render the styled table explicitly.
display(temp.style.background_gradient(cmap="Greys"))
del temp

In [None]:

temp = train.groupby(['Wild Areas','Cover_Type'],as_index=False)[['HD_Fire_Points']].median()

fig = px.bar(temp, x="Wild Areas", y="HD_Fire_Points", color='Cover_Type', barmode='group',
             height=400,width=900)
fig.show()

In [None]:

fig = px.treemap(temp, path=['Wild Areas','Cover_Type'],values='HD_Fire_Points',height=400,width=800)
fig.show()
# Not the last statement of the cell, so render the styled table explicitly.
display(temp.style.background_gradient(cmap='YlOrRd'))
del temp

In [None]:

temp = train.groupby(['Wild Areas','Cover_Type'],as_index=False)[['Hillshade_9am']].median()

fig = px.bar(temp, x="Wild Areas", y="Hillshade_9am", color='Cover_Type', barmode='group',
             height=400,width=900)
fig.show()
#No use of treemap as we don't see any difference in bar plots.
del temp

In [None]:

temp = train.groupby(['Wild Areas','Cover_Type'],as_index=False)[['Hillshade_Noon']].median()

fig = px.bar(temp, x="Wild Areas", y="Hillshade_Noon", color='Cover_Type', barmode='group',
             height=400,width=900)
fig.show()
del temp

In [None]:

temp = train.groupby(['Wild Areas','Cover_Type'],as_index=False)[['Hillshade_3pm']].median()

fig = px.bar(temp, x="Wild Areas", y="Hillshade_3pm", color='Cover_Type', barmode='group',
             height=400,width=900)
fig.show()

In [None]:

# Display explicitly: the expression is followed by `del temp`, so it would
# otherwise be evaluated and thrown away without being rendered.
display(temp.style.background_gradient(cmap="cividis"))
del temp

In [None]:

# `train` now also holds string columns (Cover_Type names, 'Wild Areas',
# 'Soil types'); restrict the correlation to numeric columns so it works
# regardless of how the installed pandas treats non-numeric data in corr().
train_corr = train.select_dtypes(include='number').corr()

# Not the last statement of the cell, so render the styled table explicitly.
display(train_corr.style.background_gradient(cmap="cool"))

fig, ax = plt.subplots(figsize=(12,10))
sns.heatmap(train_corr,annot=True,linewidths=.3,cmap='Spectral', ax=ax)
plt.show()

In [None]:
temp = train.groupby(['Cover_Type'],as_index=False)[["Elevation"]].median()
# Followed by `del temp`, so the styled table has to be displayed explicitly.
display(temp.sort_values(by="Elevation",ascending=False).style.background_gradient(cmap="Reds"))
del temp

In [None]:
del train, test

# <span class="title-section w3-xxlarge" style="color:green" id="codebook">3. Try classification using XGB Classifier</span>

In [None]:
train = pd.read_csv('../input/tabular-playground-series-dec-2021/train.csv')
reduce_memory_usage(train)
test = pd.read_csv('../input/tabular-playground-series-dec-2021/test.csv')
reduce_memory_usage(test)

In [None]:
# predictor
x = train.drop(columns=['Cover_Type'])

# target
y = train['Cover_Type']

# test data 
#test = test.drop(columns=['Id'])

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=123, shuffle =True)

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted tree classifier trained on the 80 % training split.
# NOTE(review): tree_method='gpu_hist' presumably requires a GPU runtime and
# may be deprecated in newer xgboost releases — confirm against the installed
# version.
# NOTE(review): y_train holds the raw Cover_Type labels (documented above as
# integers 1 to 7); newer xgboost releases may expect labels 0..n_classes-1 —
# confirm whether the labels need encoding before fit and decoding after predict.
model = XGBClassifier(learning_rate=0.3,
                      tree_method='gpu_hist',
                      random_state=0)
model.fit(X_train,y_train)

In [None]:
# validation prediction
y_pred=model.predict(X_val)

In [None]:
# validation accuracy
from sklearn.metrics import accuracy_score
print('Accuracy Score : ',accuracy_score(y_val, y_pred))

In [None]:
# test prediction
y_pred = model.predict(test)

# <span class="title-section w3-xxlarge" style="color:green" id="codebook">4. Submit the files</span>

In [None]:
# submission
submission = pd.read_csv('../input/tabular-playground-series-dec-2021/sample_submission.csv')
submission['Cover_Type'] = y_pred
submission.to_csv("submission.csv",index=False)
submission.head()

# <span class="title-section w3-xxlarge" style="color:green" id="codebook">999. Work In Progress.... More to come</span>
[![image.jpg](https://i.postimg.cc/MH4ZNgkx/image.jpg)](https://postimg.cc/9wdjq8FN)