# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as pyo
import plotly.figure_factory as ff
from sklearn import preprocessing
from plotly import tools
from plotly.subplots import make_subplots
from plotly.offline import iplot
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, plot_confusion_matrix
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn import metrics
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.decomposition import PCA as sklearnPCA
#import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
# Supress unnecessary warnings so that presentation looks clean
import warnings
warnings.filterwarnings("ignore")

# Loading Dataset

In [None]:
# Read the training split from the competition's input directory; the bare
# name on the last line makes the notebook render the frame.
df_train = pd.read_csv('../input/tabular-playground-series-dec-2021/train.csv')
df_train

In [None]:
# Read the test split from the same input directory and render it.
df_test = pd.read_csv('../input/tabular-playground-series-dec-2021/test.csv')
df_test

# Exploratory Data Analysis

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Preview the last rows of the training frame.
df_train.tail()

In [None]:
# (number of rows, number of columns) of the training frame.
df_train.shape

In [None]:
# Total number of cells in the training frame (rows x columns).
df_train.size

In [None]:
# Data type of every column.
df_train.dtypes

In [None]:
# Column labels of the training frame.
df_train.columns

In [None]:
# Concise summary of the frame: columns, non-null counts, dtypes and memory use.
df_train.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per numeric column.
df_train.describe()

In [None]:
# Skewness of each column's distribution.
df_train.skew()

In [None]:
# Number of missing values per column.
df_train.isnull().sum()

In [None]:
# Pairwise correlation matrix of the columns.
df_train.corr()

# Data Visualization

In [None]:
# Class frequencies of the target, computed once and shared by both panels.
cover_type_counts = df_train['Cover_Type'].value_counts()
count_values = cover_type_counts.values.tolist()

# Left panel: cartesian axes for the bar chart; right panel: domain for the pie.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('Countplot', 'Percentages'),
    specs=[[{"type": "xy"}, {'type': 'domain'}]],
)

# Bar chart of absolute counts, with the count printed above each bar.
bar_trace = go.Bar(
    x=cover_type_counts.index,
    y=count_values,
    text=count_values,
    textfont=dict(size=15),
    name='Cover_Type',
    textposition='outside',
    showlegend=False,
    marker=dict(color='cornflowerblue', line_color='black', line_width=3),
)
fig.add_trace(bar_trace, row=1, col=1)

# Pie chart of the same counts.
pie_trace = go.Pie(
    labels=cover_type_counts.keys(),
    values=cover_type_counts.values,
    textfont=dict(size=16),
    textposition='auto',
    showlegend=True,
    name='Cover_Type',
)
fig.add_trace(pie_trace, row=1, col=2)

fig.update_layout(
    title={'text': 'Type',
           'y': 0.9,
           'x': 0.5,
           'xanchor': 'center',
           'yanchor': 'top'},
    template='plotly_white',
)
# Fixed y-range for the bar panel.
# NOTE(review): assumes the largest class count stays below 2,400,000 - confirm.
fig.update_yaxes(range=[0, 2400000])
iplot(fig)

In [None]:
! pip install autoviz

In [None]:
from autoviz.AutoViz_Class import AutoViz_Class

AV = AutoViz_Class()

# AutoViz receives the in-memory frame through `dfte`; the file name is left
# empty.
train = df_train
filename = ""
sep = ","

# Keyword options for the AutoViz call, gathered in one place.
autoviz_options = dict(
    sep=sep,
    depVar="Cover_Type",        # target column
    dfte=train,
    header=0,
    verbose=0,
    lowess=False,
    chart_format="svg",
    max_rows_analyzed=100000,
    max_cols_analyzed=40,
)
dft = AV.AutoViz(filename, **autoviz_options)

In [None]:
# HTML is imported from the public IPython.display module rather than the
# internal IPython.core.display path.
from IPython.display import HTML

def multi_table(table_list):
    '''Lay out several tables side by side in a single HTML table row.

    Parameters
    ----------
    table_list : iterable
        Objects exposing a ``_repr_html_()`` method; each one is rendered
        inside its own ``<td>`` cell, in the order given.

    Returns
    -------
    IPython.display.HTML
        A one-row table whose cells contain the HTML of each input object.
    '''
    cells = ''.join('<td>' + table._repr_html_() + '</td>' for table in table_list)
    return HTML(
        '<table><tr style="background-color:white;">' + cells + '</tr></table>')

In [None]:
# Column labels again, as a reference for the column lists used in the cells below.
df_train.columns

In [None]:
# Wilderness_Area1-4 followed by Soil_Type1-40, generated instead of typed out
# so the dictionary and the rendered tables share one ordered list of names.
indicator_columns = ([f'Wilderness_Area{i}' for i in range(1, 5)]
                     + [f'Soil_Type{i}' for i in range(1, 41)])

# One value-count table per column, keyed by column name.
df_nunique = {var: pd.DataFrame(df_train[var].value_counts())
              for var in indicator_columns}

# Show the tables side by side; iterating over the list guarantees that every
# column appears exactly once and in a stable order.
multi_table([df_nunique[var] for var in indicator_columns])

#### Observation:

1. Soil_Type7 and Soil_Type15 contain only the value 0.

In [None]:
# Wilderness_Area1-4 followed by Soil_Type1-40, generated instead of typed out
# so the dictionary and the rendered tables share one ordered list of names.
indicator_columns = ([f'Wilderness_Area{i}' for i in range(1, 5)]
                     + [f'Soil_Type{i}' for i in range(1, 41)])

# For every column: number of rows per (column value, Cover_Type) pair.
df_groupby = {var: pd.DataFrame(df_train.groupby([var, 'Cover_Type']).size())
              for var in indicator_columns}

# Show the tables side by side; iterating over the list guarantees that every
# column appears exactly once and in a stable order.
multi_table([df_groupby[var] for var in indicator_columns])

#### Observation:
1. We can see that Cover_Type 5 does not appear in any of the other categorical columns where the value is 1.

In [None]:
# Names of the columns plotted as histograms in the next cell.
train_num_visual_0 = (
    ['Elevation', 'Aspect', 'Slope']
    + ['Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology',
       'Horizontal_Distance_To_Roadways']
    + ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']
    + ['Horizontal_Distance_To_Fire_Points']
)

In [None]:
sns.set_theme(rc = {'figure.dpi': 120, 'axes.labelsize': 8, 
                    'axes.facecolor': '#f0eee9', 'grid.color': '#fffdfa', 
                    'figure.facecolor': '#e8e6e1'}, font_scale = 0.65)

# One stacked histogram per listed column, one panel per row. The grid follows
# the length of the list; squeeze=False keeps `ax` a 2-D array even for a
# single column so that .flatten() always works.
n_panels = len(train_num_visual_0)
fig, ax = plt.subplots(n_panels, 1, figsize = (6, 3 * n_panels), squeeze = False)

for column, axes in zip(train_num_visual_0, ax.flatten()):

    sns.histplot(ax = axes, x = df_train[column], hue = df_train['Cover_Type'],
                 palette = 'magma', alpha = 0.8, multiple = 'stack')

    # Re-create the legend in the upper-right corner. Handles AND labels are
    # taken from the legend seaborn generated, so every Cover_Type class keeps
    # its own correctly named entry.
    legend = axes.get_legend()
    # `legend_handles` is the current matplotlib attribute name; fall back to
    # the older `legendHandles` spelling when it is not available.
    handles = getattr(legend, 'legend_handles', None)
    if handles is None:
        handles = legend.legendHandles
    labels = [text.get_text() for text in legend.get_texts()]
    legend.remove()
    axes.legend(handles, labels, title = 'Cover_Type', loc = 'upper right')

    # Red vertical lines at the minimum, the three quartiles and the maximum.
    quantiles = np.quantile(df_train[column], [0, 0.25, 0.50, 0.75, 1])
    for q in quantiles:
        axes.axvline(x = q, linewidth = 0.5, color = 'r')

plt.tight_layout()
plt.show()

In [None]:
# Categorical indicator columns: Wilderness_Area1..4 followed by Soil_Type1..40.
cat = (
    [f'Wilderness_Area{number}' for number in range(1, 5)]
    + [f'Soil_Type{number}' for number in range(1, 41)]
)

In [None]:
sns.set_theme(rc = {'figure.dpi': 250, 'axes.labelsize': 7, 
                    'axes.facecolor': '#f0eee9', 'grid.color': '#fffdfa', 
                    'figure.facecolor': '#e8e6e1'}, font_scale = 0.55)

# 23 x 2 grid of panels: one count plot per categorical column, split by class.
fig, ax = plt.subplots(23, 2, figsize = (6.5, 80))
panels = ax.flatten()

for indx, (column, axes) in enumerate(zip(cat, panels)):
    sns.countplot(ax = axes, x = df_train[column], hue = df_train['Cover_Type'],
                  palette = 'magma', alpha = 0.8)

# Hide the panels left over after the last plotted column.
for spare_panel in panels[indx + 1:]:
    spare_panel.set_visible(False)

axes_legend = ax.flatten()

# NOTE(review): only the second and third panels get their legend re-created
# with a title here - confirm that this selection is intentional.
for legend_panel in (axes_legend[1], axes_legend[2]):
    legend_panel.legend(title = 'Cover_Type', loc = 'upper right')

plt.tight_layout()
plt.show()

In [None]:
if len(cat) == 0 :
    print("No Categorical features")
else:
    ncols = 5
    # Ceiling division on the number of categorical columns, so a partially
    # filled last row is always allocated.
    nrows = (len(cat) + ncols - 1) // ncols

    # squeeze=False keeps `axes` two-dimensional even when there is one row.
    fig, axes = plt.subplots(nrows, ncols, figsize=(18, 45), facecolor='#EAEAF2',
                             squeeze=False)

    for position, panel in enumerate(axes.flatten()):
        if position >= len(cat):
            # Surplus panel in the last row: hide it instead of showing empty axes.
            panel.set_visible(False)
            continue
        col = cat[position]
        sns.countplot(x=df_train[col], ax=panel, palette = 'magma', label='Train data')
        panel.set_ylabel('')
        panel.set_xlabel(col, fontsize=8, fontweight='bold')
        panel.tick_params(labelsize=5, width=0.5)
        panel.xaxis.offsetText.set_fontsize(4)
        panel.yaxis.offsetText.set_fontsize(4)
    plt.show()

In [None]:
# Remove the identifier column so the test frame has the same feature columns
# as X. errors='ignore' makes the cell safe to re-run once 'Id' is already gone.
df_test = df_test.drop(columns=['Id'], errors='ignore')

# Feature Selection

In [None]:
# Target: the Cover_Type column.
y = df_train['Cover_Type']
# Features: every remaining column except the row identifier.
X = df_train.drop(columns=['Id', 'Cover_Type'])

In [None]:
from sklearn.model_selection import train_test_split

# Shuffle and hold out 20% of the rows for validation; the fixed seed keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=48,
    shuffle=True,
)

# Data Modeling

In [None]:
from xgboost import XGBClassifier

# Hyper-parameters of the gradient-boosted classifier.
# NOTE(review): tree_method='gpu_hist' needs a GPU runtime, and newer XGBoost
# releases replace it with tree_method='hist' plus device='cuda' - confirm
# against the installed version.
xgb_params = dict(
    learning_rate=0.3,
    n_jobs=-1,
    tree_method='gpu_hist',
    random_state=0,
)
model = XGBClassifier(**xgb_params)
# NOTE(review): newer XGBoost releases expect class labels starting at 0 -
# check the range of Cover_Type if fit() rejects the labels.
model.fit(X_train, y_train)

In [None]:
# Predict Cover_Type for the held-out validation rows (X_test from the split above).
pred=model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Share of validation rows whose predicted class equals the true class.
validation_accuracy = accuracy_score(y_test, pred)
print('Accuracy Score : ', validation_accuracy)

In [None]:
# Predict Cover_Type for the competition test frame.
# NOTE(review): df_test must already have its 'Id' column removed so its columns match X - confirm.
y_pred = model.predict(df_test)

In [None]:
# Start from the sample submission file, overwrite its Cover_Type column with
# the test predictions, save the result and preview it.
submission_template_path = '../input/tabular-playground-series-dec-2021/sample_submission.csv'
submission = pd.read_csv(submission_template_path)
submission['Cover_Type'] = y_pred
submission.to_csv("submission.csv", index=False)
submission.head()

## Work in Progress