<b>Problem Statement:</b>  The dataset used for this competition is synthetic, but it is based on a real dataset and was generated using a CTGAN. It is derived from the original <a href="https://www.kaggle.com/c/forest-cover-type-prediction/overview">Forest Cover Type Prediction</a> competition.

<b>Problem type:</b> A multi-class classification problem.

<b>Evaluation metric:</b> Submissions are evaluated on <b>multi-class classification accuracy</b>.

<h2 id="Approach">Approach to the problem</h2>
The idea is to develop a generalized approach for solving any multi-class classification problem.
<ol>
    <li>Performing exploratory data analysis (EDA) and Data Preparation (DP).</li>
    <ol>
        <li><a href="#FeatureSummary">Understanding Train and Test dataset features (EDA)</a></li>
        <li><a href="#Downcasting">Down Casting Train and Test datasets (DP)</a></li>
        <li><a href="#Target">Understanding Cover_Type (target) feature distribution (EDA)</a></li>
        <li><a href="#Corr">Correlation check (EDA)</a></li>
        <li><a href="#TrainVisual">Visualizing Training dataset (EDA)</a></li>
        <li><a href="#CUP">Data Clean up and Feature Tuning(DP)</a></li>
    </ol>
    <li>Feature Engineering.</li>
    <ol>
        <li><a href="#AggFeatures">Creating Aggregated features</a></li>
    </ol>
    <li>Training Linear, Gradient Boost and Ensemble models.</li>
    <ol>
        <li><a href="#Ridge">Ridge Classifier</a></li>
        <li><a href="#DTC">DecisionTree Classification</a></li>
        <li><a href="#LGBM">LGBM Classification</a></li>
        <li><a href="#XGB">XGBClassifier</a></li>
    </ol>
 </ol>

<h4>Observations</h4>
<ul>
    <li> Cross Validation score for LGBM (0.9288525) is better than Ridge (0.89618075) for 10 iterations</li>
    <li> But on the public leaderboard, the Ridge (0.88672) and LGBM (0.89817) scores differ noticeably from (are lower than) their cross validation scores</li>
    <li> Cross Validation score for Ridge (0.89611824), DecisionTree (0.94623975007), LGBM (0.9325144932) and corresponding public leaderboard scores Ridge (0.88771), DecisionTree (0.92391), LGBM (0.91971) for 15 iterations</li>
    <li> We are getting the best score with the DecisionTree classifier, without any feature tuning or feature engineering</li>
    <li> Now let's try some feature engineering and feature tuning</li>
</ul>

In [None]:
#REQUIRED LIBRARIES

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import warnings
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression,RidgeClassifier,Lasso
# from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score,accuracy_score
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb
from sklearn.utils.extmath import softmax
# import pickle
# from sklearn.externals import joblib

# import pandas_profiling as pp

warnings.filterwarnings('ignore')
gc.enable()
%matplotlib inline

In [None]:
# INVENTORY OF THE COMPETITION INPUT DIRECTORY
# One row per directory entry: name, size in MB, whether it is a file or a
# directory, and (for directories) how many entries they contain.
path = '../input/tabular-playground-series-dec-2021/'
data_files = os.listdir(path)
bytes_per_mb = 1024 * 1024

df_files = pd.DataFrame({'file_name': data_files})
df_files['size_in_mb'] = [
    round(os.path.getsize(path + entry) / bytes_per_mb, 4) for entry in data_files
]
df_files['type'] = [
    'file' if os.path.isfile(path + entry) else 'directory' for entry in data_files
]
df_files['file_count'] = [
    0 if kind == 'file' else len(os.listdir(path + entry))
    for entry, kind in zip(df_files['file_name'], df_files['type'])
]

print('Following files are available under path:',path)
display(df_files)

In [None]:
#ALL CUSTOM FUNCTIONS

#FUNCTION FOR PROVIDING FEATURE SUMMARY
def feature_summary(df_fa):
    """Build a one-row-per-column summary table for a DataFrame.

    Parameters
    ----------
    df_fa : pandas.DataFrame
        Frame whose columns are summarised.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``df_fa`` with the columns ``null``,
        ``unique_count``, ``data_type``, ``max/min``, ``mean``, ``median``,
        ``mode``, ``std``, ``skewness`` and ``sample_values``. Statistics that
        do not apply to a column are shown as ``'-'``.
    """
    # 'data_type' must match the key assigned below; a mismatched header name
    # would leave an empty placeholder column and append the real one last.
    col_list=['null','unique_count','data_type','max/min','mean','median','mode','std','skewness','sample_values']
    df=pd.DataFrame(index=df_fa.columns,columns=col_list)
    df['null']=[int(df_fa[col].isnull().sum()) for col in df_fa.columns]
    df['unique_count']=[len(df_fa[col].unique()) for col in df_fa.columns]
    df['data_type']=[df_fa[col].dtype for col in df_fa.columns]
    for col in df_fa.columns:
        series=df_fa[col]
        dtype_name=str(series.dtype)
        if 'float' in dtype_name or 'int' in dtype_name:
            df.at[col,'max/min']=str(round(series.max(),2))+'/'+str(round(series.min(),2))
            df.at[col,'mean']=round(series.mean(),4)
            df.at[col,'median']=round(series.median(),4)
            # mode() is empty for an all-NaN column; leave the cell as '-' then.
            modes=series.mode()
            if not modes.empty:
                df.at[col,'mode']=round(modes.iloc[0],4)
            df.at[col,'std']=round(series.std(),4)
            df.at[col,'skewness']=round(series.skew(),4)
        elif 'datetime64[ns]' in dtype_name:
            df.at[col,'max/min']=str(series.max())+'/'+str(series.min())
        df.at[col,'sample_values']=list(series.unique())
    return(df.fillna('-'))


def feature_compare(df_fa,df_ft):
    """Summarise train and test columns side by side.

    Prints the shapes of both frames and returns one summary row per
    (feature, dataset) pair.

    Parameters
    ----------
    df_fa : pandas.DataFrame
        Train frame; its columns define the features listed in the result.
    df_ft : pandas.DataFrame
        Test frame. Columns of ``df_fa`` that are absent here (for example the
        target) get a test row filled with ``'-'``.

    Returns
    -------
    pandas.DataFrame
        Indexed by a ``(features, dataset)`` MultiIndex with ``dataset`` in
        ``{'train', 'test'}`` and the columns ``null``, ``unique_count``,
        ``data_type``, ``max/min``, ``mean``, ``median``, ``mode``, ``std``,
        ``skewness`` and ``sample_values``; inapplicable cells hold ``'-'``.
    """
    print('Train DataFrame shape')
    print('rows:',df_fa.shape[0])
    print('cols:',df_fa.shape[1])

    print('Test DataFrame shape')
    print('rows:',df_ft.shape[0])
    print('cols:',df_ft.shape[1])

    col_list=['null','unique_count','data_type','max/min','mean','median','mode','std','skewness','sample_values']
    # Index is derived from the train frame passed in, not from a notebook global.
    index=pd.MultiIndex.from_product([df_fa.columns,['train','test']],names=['features','dataset'])

    # Rows are appended in the same (feature, train/test) order as the index.
    records=[]
    for col in df_fa.columns:
        records.append(_describe_column(df_fa[col]))
        # A column missing from the test frame yields an empty row -> all '-'.
        records.append(_describe_column(df_ft[col]) if col in df_ft.columns else {})

    df=pd.DataFrame(records,index=index,columns=col_list,dtype=object)
    return(df.fillna('-'))


def _describe_column(series):
    """Return the summary cells for one column as a dict keyed by summary column name."""
    dtype_name=str(series.dtype)
    row={
        'null':int(series.isnull().sum()),
        'unique_count':len(series.unique()),
        'data_type':series.dtype,
    }
    # The dtype of the series being described decides which statistics apply.
    if 'float' in dtype_name or 'int' in dtype_name:
        row['max/min']=str(round(series.max(),2))+'/'+str(round(series.min(),2))
        row['mean']=round(series.mean(),4)
        row['median']=round(series.median(),4)
        modes=series.mode()
        if not modes.empty:  # empty for an all-NaN column
            row['mode']=round(modes.iloc[0],4)
        row['std']=round(series.std(),4)
        row['skewness']=round(series.skew(),4)
    elif 'datetime64[ns]' in dtype_name:
        row['max/min']=str(series.max())+'/'+str(series.min())
    row['sample_values']=str(list(series.unique()))
    return row

#EXTENDING RIDGE CLASSIFIER WITH PREDICT PROBABILITY FUNCTION

class RidgeClassifierwithProba(RidgeClassifier):
    """RidgeClassifier that also exposes softmax-normalised decision scores.

    The values returned by ``predict_proba`` are a softmax over the decision
    function; they sum to 1 per row but are not calibrated probabilities.
    """

    def predict_proba(self, X):
        """Return row-wise softmax of the decision scores for ``X``.

        Returns an array with one column per class: two columns for a binary
        problem, ``n_classes`` columns for a multi-class problem.
        """
        d = self.decision_function(X)
        if d.ndim == 1:
            # Binary case: a single score per sample, so build the two-class
            # score matrix explicitly.
            d = np.c_[-d, d]
        # Multi-class case: the scores already have one column per class and
        # must be passed through unchanged.
        return softmax(d)

    
#PREDICTION FUNCTIONS

def type_predictor(X,y,test,iterations,model,model_name):
    """Run stratified K-fold training and combine the per-fold predictions.

    Parameters
    ----------
    X : numpy.ndarray
        Training feature matrix (indexed as ``X[rows, :]``).
    y : numpy.ndarray
        Class labels for ``X``.
    test : array-like
        Feature matrix to predict on with every fold's model.
    iterations : int
        Number of stratified folds.
    model : estimator
        Object with ``fit`` and ``predict``; it is refit in place on every fold.
    model_name : str
        Prefix for prediction column names and for the two CSV files written.

    Returns
    -------
    preds : pandas.Series
        Majority-vote class label per test row across all folds.
    best_model : estimator
        Snapshot of the model from the fold with the highest validation accuracy.
    x_preds : pandas.Series
        Majority-vote class label per training row across all folds. Each
        fold's model also predicts rows it was trained on, so these are not
        out-of-fold predictions.

    Side effects: writes ``<model_name>.csv`` (per-fold test predictions) and
    ``<model_name>_.csv`` (per-fold train predictions) to the working directory.
    """
    import copy  # function-scope import: only used to snapshot the best fold's model

    splits=iterations
    classes=np.unique(y)
    df_preds=pd.DataFrame()
    df_preds_x=pd.DataFrame()
    fold_scores=[]
    score=None
    best_model=None

    #CREATING STRATIFIED FOLDS
    skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=200)
    print('\nStarting KFold iterations...')
    for k,(train_index,test_index) in enumerate(skf.split(X,y),start=1):
        #FITTING MODEL
        model.fit(X[train_index,:],y[train_index])

        #CALCULATING ACCURACY ON THE HELD-OUT FOLD
        acc=accuracy_score(y[test_index],model.predict(X[test_index,:]))
        print('Iteration:',k,'  accuracy_score:',acc)

        #PER-FOLD PREDICTIONS ON THE FULL TRAIN MATRIX AND ON TEST
        df_preds_x[model_name+'xpreds_'+str(k)]=pd.Series(model.predict(X))
        df_preds[model_name+'preds_'+str(k)]=pd.Series(model.predict(test))

        fold_scores.append(acc)
        if score is None or score<acc:
            score=acc
            # The same estimator object is refit on the next fold, so keep a
            # copy; a plain reference would always end up as the last fold's fit.
            best_model=copy.deepcopy(model)
    print('\n Best score:',score,' Avg Score:',sum(fold_scores)/splits)

    #COMBINING FOLDS BY MAJORITY VOTE
    # Class labels are categorical: averaging e.g. labels 1 and 3 would yield
    # class 2, which no fold predicted. Voting keeps the result a real label.
    preds=_majority_vote(df_preds,classes)
    x_preds=_majority_vote(df_preds_x,classes)

    print('Saving test and train predictions per iteration...')
    df_preds.to_csv(model_name+'.csv',index=False)
    df_preds_x.to_csv(model_name+'_.csv',index=False)
    del df_preds,df_preds_x
    gc.collect()
    return preds,best_model,x_preds


def _majority_vote(df_votes,classes):
    """Return, per row, the label from ``classes`` predicted by the most columns.

    Ties go to the label that appears first in ``classes``.
    """
    votes=df_votes.to_numpy()
    counts=np.stack([(votes==label).sum(axis=1) for label in classes],axis=1)
    return pd.Series(np.asarray(classes)[counts.argmax(axis=1)],index=df_votes.index)

In [None]:
%%time
#READING TRAIN DATASET, TEST DATASET AND SAMPLE SUBMISSION FILE
# All three CSVs live directly under `path`; load them in one pass.
df_train, df_test, df_submission = (
    pd.read_csv(path + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

<h2 id="FeatureSummary">Understanding Train and Test dataset features</h2>
Understanding Train and Test dataset features in comparative view, using basic statistical measures.

<h4>Observations</h4>
<ul>
    <li>No missing values in train or test dataset</li>
    <li>Features <b>Soil_Type7 and Soil_Type15</b> are zero in both train and test datasets</li>
</ul>

<br><a href="#Approach">back to main menu</a>

<table align="left">
    <caption><b>TRAIN and TEST DATASET FEATURES</b></caption>
    <tr><th>Feature</th><th>Feature description</th></tr>
    <tr><td>Elevation</td><td>Elevation in meters.</td></tr>
    <tr><td>Aspect</td><td>Aspect in degrees <a href="https://www.photopills.com/sites/default/files/tutorials/2014/2-azimuth-elevation.jpg">azimuth</a> (The azimuth is the angle between North, measured clockwise around the observer's horizon.)</td></tr>
    <tr><td>Slope</td><td>Slope in degrees.</td></tr>
    <tr><td>Horizontal_Distance_To_Hydrology</td><td>Horz Dist to nearest surface water features.</td></tr>
    <tr><td>Vertical_Distance_To_Hydrology</td><td>Vert Dist to nearest surface water features.</td></tr>
    <tr><td>Horizontal_Distance_To_Roadways</td><td>Horz Dist to nearest roadway.</td></tr>
    <tr><td>Hillshade_9am (0 to 255 index)</td><td>Hillshade index at 9am, summer solstice.</td></tr>
    <tr><td>Hillshade_Noon (0 to 255 index)</td><td>Hillshade index at noon, summer solstice.</td></tr>
    <tr><td>Hillshade_3pm (0 to 255 index)</td><td>Hillshade index at 3pm, summer solstice.</td></tr>
    <tr><td>Horizontal_Distance_To_Fire_Points</td><td>Horz Dist to nearest wildfire ignition points.</td></tr>
    <tr><td>Wilderness_Area (4 binary columns, 0 = absence or 1 = presence)</td><td>Wilderness area designation.</td></tr>
    <tr><td>Soil_Type (40 binary columns, 0 = absence or 1 = presence)</td><td>Soil Type designation.</td></tr>
    <tr><td>Cover_Type (7 types, integers 1 to 7)</td><td>Forest Cover Type designation.</td></tr>
</table>


<table align="left">
    <caption><b>THE WILDERNESS AREA DETAILS</b></caption>
    <tr><th>Wilderness type</th><th>Description</th></tr>
    <tr><td>1</td><td>Rawah Wilderness Area.</td></tr>
    <tr><td>2</td><td>Neota Wilderness Area.</td></tr>
    <tr><td>3</td><td>Comanche Peak Wilderness Area.</td></tr>
    <tr><td>4</td><td>Cache la Poudre Wilderness Area.</td></tr>
</table>

<table align="left">
    <caption><b>SOIL TYPE DETAILS</b></caption>
    <tr><th>Soil Type</th><th>Description</th></tr>
    <tr><td>1</td><td>Cathedral family - Rock outcrop complex, extremely stony.</td></tr>
    <tr><td>2</td><td>Vanet - Ratake families complex, very stony.</td></tr>
    <tr><td>3</td><td>Haploborolis - Rock outcrop complex, rubbly.</td></tr>
    <tr><td>4</td><td>Ratake family - Rock outcrop complex, rubbly.</td></tr>
    <tr><td>5</td><td>Vanet family - Rock outcrop complex, rubbly.</td></tr>
    <tr><td>6</td><td>Vanet - Wetmore families - Rock outcrop complex, stony.</td></tr>
    <tr><td>7</td><td>Gothic family.</td></tr>
    <tr><td>8</td><td>Supervisor - Limber families complex.</td></tr>
    <tr><td>9</td><td>Troutville family, very stony.</td></tr>
    <tr><td>10</td><td>Bullwark - Catamount families - Rock outcrop complex, rubbly.</td></tr>
    <tr><td>11</td><td>Bullwark - Catamount families - Rock land complex, rubbly.</td></tr>
    <tr><td>12</td><td>Legault family - Rock land complex, stony.</td></tr>
    <tr><td>13</td><td>Catamount family - Rock land - Bullwark family complex, rubbly.</td></tr>
    <tr><td>14</td><td>Pachic Argiborolis - Aquolis complex.</td></tr>
    <tr><td>15</td><td>unspecified in the USFS Soil and ELU Survey.</td></tr>
    <tr><td>16</td><td>Cryaquolis - Cryoborolis complex.</td></tr>
    <tr><td>17</td><td>Gateview family - Cryaquolis complex.</td></tr>
    <tr><td>18</td><td>Rogert family, very stony.</td></tr>
    <tr><td>19</td><td>Typic Cryaquolis - Borohemists complex.</td></tr>
    <tr><td>20</td><td>Typic Cryaquepts - Typic Cryaquolls complex.</td></tr>
    <tr><td>21</td><td>Typic Cryaquolls - Leighcan family, till substratum complex.</td></tr>
    <tr><td>22</td><td>Leighcan family, till substratum, extremely bouldery.</td></tr>
    <tr><td>23</td><td>Leighcan family, till substratum - Typic Cryaquolls complex.</td></tr>
    <tr><td>24</td><td>Leighcan family, extremely stony.</td></tr>
    <tr><td>25</td><td>Leighcan family, warm, extremely stony.</td></tr>
    <tr><td>26</td><td>Granile - Catamount families complex, very stony.</td></tr>
    <tr><td>27</td><td>Leighcan family, warm - Rock outcrop complex, extremely stony.</td></tr>
    <tr><td>28</td><td>Leighcan family - Rock outcrop complex, extremely stony.</td></tr>
    <tr><td>29</td><td>Como - Legault families complex, extremely stony.</td></tr>
    <tr><td>30</td><td>Como family - Rock land - Legault family complex, extremely stony.</td></tr>
    <tr><td>31</td><td>Leighcan - Catamount families complex, extremely stony.</td></tr>
    <tr><td>32</td><td>Catamount family - Rock outcrop - Leighcan family complex, extremely stony.</td></tr>
    <tr><td>33</td><td>Leighcan - Catamount families - Rock outcrop complex, extremely stony.</td></tr>
    <tr><td>34</td><td>Cryorthents - Rock land complex, extremely stony.</td></tr>
    <tr><td>35</td><td>Cryumbrepts - Rock outcrop - Cryaquepts complex.</td></tr>
    <tr><td>36</td><td>Bross family - Rock land - Cryumbrepts complex, extremely stony.</td></tr>
    <tr><td>37</td><td>Rock outcrop - Cryumbrepts - Cryorthents complex, extremely stony.</td></tr>
    <tr><td>38</td><td>Leighcan - Moran families - Cryaquolls complex, extremely stony.</td></tr>
    <tr><td>39</td><td>Moran family - Cryorthents - Leighcan family complex, extremely stony.</td></tr>
    <tr><td>40</td><td>Moran family - Cryorthents - Rock land complex, extremely stony.</td></tr>
</table>

In [None]:
%%time
#UNDERSTANDING TRAIN AND TEST DATASET USING FEATURE BY FEATURE COMPARISON
# Lift the row limit so the full comparison table is rendered as the cell output.
pd.options.display.max_rows = None
feature_compare(df_train, df_test)

In [None]:
# Trigger an explicit garbage-collection pass.
gc.collect()

<h2 id="Downcasting">Down Casting Training and Testing datasets</h2>
Checking possibility for down casting dataset datatypes. This will help in reducing overall dataset size.

<h4>Observations</h4>
<ul>
    <li>We have only one data type in datasets, i.e., int64</li>
    <li>It is always a good idea to reduce overall dataset size by finding correct datatypes</li>
    <li>Downcasting reduced the training dataset size from 1.7 GB to 259.4 MB</li>
    <li>Downcasting reduced the testing dataset size from 419.6 MB to 63.9 MB</li>
</ul>

<br><a href="#Approach">back to main menu</a>

In [None]:
%%time
#MEMORY USAGE BEFORE DOWNCASTING
print('\ntrain dataset data usage information before downcasting\n')
df_train.info(memory_usage='deep',max_cols=1)
print('\ntest dataset data usage information before downcasting\n')
df_test.info(memory_usage='deep',max_cols=1)


#DOWNCASTING TRAIN AND TEST DATASETS
# 64-bit columns are narrowed to the smallest float/integer dtype that holds
# their values; columns of any other dtype are left untouched.
downcast_rules = {'float64': 'float', 'int64': 'integer'}
for frame in (df_train, df_test):
    for column in frame.columns:
        target = downcast_rules.get(str(frame[column].dtype))
        if target is not None:
            frame[column] = pd.to_numeric(frame[column], downcast=target)

#MEMORY USAGE AFTER DOWNCASTING
print('train dataset data usage information after downcasting\n')
df_train.info(memory_usage='deep',max_cols=1)
print('\ntest dataset data usage information after downcasting\n')
df_test.info(memory_usage='deep',max_cols=1)

In [None]:
%%time
#CREATING A FEATURE LIST EXCLUDING ID, TARGET AND THE TWO ALL-ZERO SOIL TYPES
excluded_columns = {'Id', 'Cover_Type', 'Soil_Type7', 'Soil_Type15'}
features = [col for col in df_train.columns if col not in excluded_columns]

# Column 1 of the summary holds the unique-value count; exactly two distinct
# values marks a feature as binary.
df_fs = feature_summary(df_train[features])
is_binary = (df_fs.iloc[:, 1] == 2).to_numpy()
binary_features = list(df_fs.index[is_binary])
nonbinary_features = list(df_fs.index[~is_binary])

print('Total features excluding ID and Cover_Type(target feature):',len(features))
print('Total binary features:',len(binary_features))
print('Total Non binary features:',len(nonbinary_features))

In [None]:
%%time
#VISUALIZING TRAIN AND TEST FEATURE DISTRIBUTION
# One KDE panel per non-binary feature, train vs test, with each mean marked.
# The grid is sized from the feature count (3 panels per row) so it neither
# overflows nor shows blank panels, and only one figure is created.
n_cols = 3
n_rows = max(1, -(-len(nonbinary_features) // n_cols))  # ceiling division
fig, ax = plt.subplots(n_rows, n_cols, figsize=(20, 5 * n_rows), squeeze=False)
axes = ax.ravel()

for axis, feature in zip(axes, nonbinary_features):
    sns.kdeplot(x=df_train[feature], color='red', label='train', ax=axis)
    axis.axvline(x=df_train[feature].mean(), color='yellow', linestyle='--', label='train mean')
    sns.kdeplot(x=df_test[feature], color='grey', label='test', ax=axis)
    axis.axvline(x=df_test[feature].mean(), color='orange', linestyle='--', label='test mean')
    axis.set_xlabel(feature, color='blue')
    axis.legend(loc=1, fontsize='x-small')

# Hide any panels left over when the feature count is not a multiple of n_cols.
for axis in axes[len(nonbinary_features):]:
    axis.set_visible(False)

plt.show();

In [None]:
# Trigger an explicit garbage-collection pass.
gc.collect()

<h2 id="Target">Understanding Cover_Type (target) feature distribution</h2>
Let's visualize the Cover_Type (target) feature.

<h4>Observation</h4>
 

<br><a href="#Approach">back to main menu</a>

In [None]:
#Cover types description from original Forest Cover Type Prediction competition
data={1:'Spruce/Fir',2:'Lodgepole Pine',3:'Ponderosa Pine',4:'Cottonwood/Willow',5:'Aspen',6:'Douglas-fir',7:'Krummholz'}
df_cover_type=pd.DataFrame({'Cover_Type':list(data.keys()),'Cover_Description':list(data.values())})

# Observations per cover type (groupby returns the types in ascending order),
# joined with the readable description to build "<type>-<description>" labels.
df=df_train.groupby('Cover_Type')['Id'].count().rename('Observation Count').reset_index()
df=df.merge(df_cover_type,on='Cover_Type',how='left')
df['Cover_Type_Desc']=df['Cover_Type'].astype(str)+'-'+df['Cover_Description']

#CREATING VISUALIZATION
fig=px.pie(
    df,
    names='Cover_Type_Desc',
    values='Observation Count',
    color='Cover_Type_Desc',
    hole=0.6,
    color_discrete_sequence=px.colors.qualitative.Plotly,
    width=700,
    height=700,
)

fig.update_layout(
    paper_bgcolor='#FEFBF3',
    title=dict(text="Cover_Type Distribution", x=0.5, font_color="red"),
    font_color="blue",
    legend_title="Cover_Type",
    legend_title_font_color="green",
    legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.05),
)

fig.show()

<h2 id="Corr">Correlation Check</h2>
Let's check whether there are any correlated features. If two features are highly correlated, we can remove one of them.
This will help with dimensionality reduction.

<h4>Observations</h4>
<ul>
    <li></li>
   
</ul>

<br><a href="#Approach">back to main menu</a>

In [None]:
%%time
#CORRELATION CHECK ACROSS ALL SELECTED FEATURES AND THE TARGET
corr = df_train[features+['Cover_Type']].corr()

# Generate a mask for the upper triangle (the matrix is symmetric, so only the
# lower triangle is drawn)
mask = np.triu(np.ones_like(corr, dtype=bool))

# Plotting correlation heatmap
fig,ax=plt.subplots(figsize=(20,20))
# Draw on the axes created above; `linewidths` is the heatmap's documented
# cell-border parameter.
sns.heatmap(corr,mask=mask,cmap='tab20c',linewidths=0.1,ax=ax)
# Tick font size is set AFTER drawing: the heatmap installs its own tick labels,
# so labels set beforehand would be overwritten.
ax.tick_params(axis='both',labelsize=12)
ax.set_title('CORRELATION MAP',color='blue',fontsize=12)
plt.show()

In [None]:
# The correlation matrix is no longer needed once the heatmap has been drawn;
# drop the name and trigger a garbage-collection pass.
del corr
gc.collect()

<h2 id="TrainVisual">Visualizing Training dataset</h2>
We are making use of PCA, a dimensionality reduction technique, to visualize the training dataset.<br>
Visualization is also helpful in understanding any grouping or patterns within dataset.
<h4>Observation</h4>
3D plot shows some pattern or grouping along different planes

<br><a href="#Approach">back to main menu</a>

In [None]:
%%time
# Project the training features onto the first two principal components and
# colour each point by its cover type.
X=df_train[features]

pca = PCA(n_components=2,random_state=200)
principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(data = principalComponents,columns = ['principal_component_1','principal_component_2'])
# Attach labels positionally: principalDf has a fresh 0..n-1 index, so
# label-based alignment would misplace targets whenever df_train's index is
# not contiguous (e.g. when this cell is re-run after rows were dropped).
principalDf['Cover_Type']=df_train['Cover_Type'].to_numpy()

fig = plt.figure(figsize=(15,15))
sc=plt.scatter(x=principalDf['principal_component_1'], y=principalDf['principal_component_2'],c=principalDf['Cover_Type'],cmap='Accent')
plt.legend(*sc.legend_elements(),bbox_to_anchor=(1.05, 1), loc=2)
plt.title('2D Visualization of train Dataset',color='blue',fontsize=12)
plt.show()

In [None]:
%%time
# Project the training features onto the first three principal components and
# draw a 3D scatter coloured by cover type.
X=df_train[features]

pca = PCA(n_components=3,random_state=200)
principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(data = principalComponents,columns = ['principal_component_1','principal_component_2','principal_component_3'])
# Attach labels positionally: principalDf has a fresh 0..n-1 index, so
# label-based alignment would misplace targets whenever df_train's index is
# not contiguous (e.g. when this cell is re-run after rows were dropped).
principalDf['Cover_Type']=df_train['Cover_Type'].to_numpy()

fig = plt.figure(figsize=(15,15))
ax = fig.add_subplot(111, projection = '3d')

ax.set_xlabel("principal_component_1")
ax.set_ylabel("principal_component_2")
ax.set_zlabel("principal_component_3")

sc=ax.scatter(xs=principalDf['principal_component_1'], ys=principalDf['principal_component_2'],
              zs=principalDf['principal_component_3'],c=principalDf['Cover_Type'],cmap='Accent')
plt.legend(*sc.legend_elements(), bbox_to_anchor=(1.05, 1), loc=2)
plt.title('3D Visualization of train Dataset',color='blue',fontsize=12)
plt.show()

In [None]:
# Drop the feature-frame name used for the PCA plots and trigger a
# garbage-collection pass.
del X
gc.collect()

<h2 id="CUP">Data Clean up and Feature Tuning</h2>
Correcting feature ranges and dropping rows with 'Cover_Type' 5 as there is only one record with this type.

Credit for these improvements goes to below notebook:

https://www.kaggle.com/gulshanmishra/tps-dec-21-tensorflow-nn-feature-engineering

<h4>Observations</h4>
<ul>
    <li><b>Aspect</b> is the compass direction that a terrain faces, expressed here in degrees. All the values from 0 to 359 are present, but there are also some values greater than 359 and some smaller than 0: the column spans -33 to 407 in the train dataset and -33 to 400 in the test dataset (please check the <a href="#FeatureSummary">feature summary</a> section). It is better to make all the values in this column lie in the range 0 to 359, so adding 360 to angles smaller than 0 and subtracting 360 from angles greater than 359 will do the work.</li>
    <li>Hillshading computes surface illumination as values from 0 to 255 based on a given compass direction to the sun (azimuth) and a certain altitude above the horizon (altitude). Hillshades are often used to produce maps that are visually appealing. In both train and test datasets, there are certain rows with a <b>hillshade</b> value of more than 255 or less than 0. They must be the result of recording errors and should be replaced with an appropriate value. Perhaps values less than 0 refer to the darkest shade, so replacing them with 0 should be fine. Similarly, we can assume that hillshade values of more than 255 refer to the brightest shades, so 255 should be a good replacement.</li>
 
</ul>

<br><a href="#Approach">back to main menu</a>

In [None]:
#Dropping Cover_Type 5 as there is only one record from this category
print('Train dataset shape before dropping Cover_Type 5',df_train.shape)

# Collect the row labels first, then remove them from the frame in place.
rare_class_rows = df_train.index[df_train["Cover_Type"] == 5]
df_train.drop(index=rare_class_rows, inplace=True)

print('Train dataset shape after dropping Cover_Type 5',df_train.shape)

In [None]:
#checking Aspect feature before correction
# Same summary for both datasets: print a heading, then render the table.
for label, frame in (('Train', df_train), ('Test', df_test)):
    print(label + ' dataset: Aspect feature summary before correction')
    display(feature_summary(frame[['Id', 'Aspect']]))

In [None]:
%%time
#Correcting Aspect range: wrap every angle into 0..359 degrees.
# The remainder modulo 360 adds 360 to negative angles and subtracts 360 from
# angles above 359, and it also handles angles more than one turn out of range.
# The column is assigned back as a whole rather than through chained indexing
# (df[col][mask] += ...), which pandas does not guarantee to write through to
# the frame.
for frame in (df_train, df_test):
    frame["Aspect"] = frame["Aspect"] % 360

In [None]:
#checking Aspect feature after correction
# Same summary for both datasets: print a heading, then render the table.
for label, frame in (('Train', df_train), ('Test', df_test)):
    print(label + ' dataset: Aspect feature summary after correction')
    display(feature_summary(frame[['Id', 'Aspect']]))

In [None]:
#checking hillshade features before correction
# Same summary for both datasets: print a heading, then render the table.
hillshade_view = ['Id', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']
for label, frame in (('Train', df_train), ('Test', df_test)):
    print(label + ' dataset hillshade features summary before correction')
    display(feature_summary(frame[hillshade_view]))

In [None]:
%%time
#Correcting Hillshade features
# Clamp every hillshade index into 0..255: values below 0 become 0 and values
# above 255 become 255, for both datasets.
for frame in (df_train, df_test):
    for shade_column in ("Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"):
        frame[shade_column] = frame[shade_column].clip(lower=0, upper=255)

In [None]:
#checking hillshade features after correction
# Same summary for both datasets: print a heading, then render the table.
hillshade_view = ['Id', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']
for label, frame in (('Train', df_train), ('Test', df_test)):
    print(label + ' dataset hillshade features summary after correction')
    display(feature_summary(frame[hillshade_view]))

<h2 id="AggFeatures">Creating Aggregated features</h2>
Creating aggregated features
<h4>Observation</h4>
<li>Creating aggregated features for Soil_Type, Wilderness_Area and Nonbinary columns separately</li>

<br><a href="#Approach">back to main menu</a>

In [None]:
#Creating list of Soil_Type and Wilderness_Area columns
# Columns are selected by name prefix, in the order they appear in df_train.
st_features = list(filter(lambda col: str(col).startswith('Soil_Type'), df_train.columns))
wa_features = list(filter(lambda col: str(col).startswith('Wilderness_Area'), df_train.columns))

In [None]:
%%time
#SIMPLE FEATURE ENGINEERING, CREATING SOME AGGREGATION FEATURES
# Row-wise aggregates over three column groups: soil types ('st'), wilderness
# areas ('wa') and the non-binary features ('nb'). Each new column is named
# <statistic><group suffix>, e.g. 'sumst' or 'kurtnb'.
feature_groups = (
    ('st', st_features),
    ('wa', wa_features),
    ('nb', nonbinary_features),
)
# (column-name prefix, DataFrame method computing the statistic per row)
row_aggregations = (
    ('sum', 'sum'),
    ('mean', 'mean'),
    ('std', 'std'),
    ('max', 'max'),
    ('min', 'min'),
    ('kurt', 'kurtosis'),
)

for suffix, group_columns in feature_groups:
    for prefix, method_name in row_aggregations:
        for frame in (df_train, df_test):
            frame[prefix + suffix] = getattr(frame[group_columns], method_name)(axis=1)

# Names of the created columns, in creation order.
agg_features = [prefix + suffix for suffix, _ in feature_groups for prefix, _ in row_aggregations]

In [None]:
# Manhattan and Euclidean distance to Hydrology
# The distance columns are converted to float64 first. If they are stored in a
# narrow integer dtype (e.g. int16 after downcasting), squaring them wraps
# around silently, which produces wrong distances and NaN when the wrapped sum
# turns negative under the square root.
for frame in (df_train, df_test):
    horizontal = frame['Horizontal_Distance_To_Hydrology'].astype('float64')
    vertical = frame['Vertical_Distance_To_Hydrology'].astype('float64')

    # Manhattan distance: |dx| + |dy|
    frame['manhhattan_dist_hydro'] = horizontal.abs() + vertical.abs()

    # Euclidean distance: sqrt(dx^2 + dy^2)
    frame['ecldn_dist_hydro'] = np.hypot(horizontal, vertical)

In [None]:
# Names of the two distance-to-hydrology columns (Manhattan and Euclidean).
hydro_features=['manhhattan_dist_hydro','ecldn_dist_hydro']

In [None]:
# Replace any NaN in the Euclidean hydrology distance with zero.
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated and leaves the frame unchanged when pandas Copy-on-Write is active.
df_train['ecldn_dist_hydro'] = df_train['ecldn_dist_hydro'].fillna(0)
df_test['ecldn_dist_hydro'] = df_test['ecldn_dist_hydro'].fillna(0)

In [None]:
# Run a full garbage-collection pass after feature engineering; as the last
# expression of the cell, the returned count of unreachable objects is displayed.
gc.collect()

<h2 id="Normalization">Normalizing dataset</h2>
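Using scikit-learn's StandardScaler to standardize the features (zero mean, unit variance). The scaler is fitted on the training set and the same transformation is then applied to the test set.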

<br><a href="#Approach">back to main menu</a>


In [None]:
%%time
scaler = StandardScaler()
X = scaler.fit_transform(df_train[features+agg_features+hydro_features])
test = scaler.transform(df_test[features+agg_features+hydro_features])
y=df_train['Cover_Type'].values

In [None]:
# Run a full garbage-collection pass after scaling; the cell displays the
# returned count of unreachable objects.
gc.collect()

In [None]:
# Final dataset shapes, displayed as a tuple:
# (scaled training features, training target, scaled test features)
X.shape,y.shape,test.shape

<h2 id="Ridge">Ridge Classifier</h2>
Starting with base Ridge model, without any hyperparameter tuning.

<br><a href="#Approach">back to main menu</a>

In [None]:
%%time
# Baseline model: RidgeClassifier built with its default constructor arguments.
model=RidgeClassifier()
print('Ridge Classifier parameters:\n',model.get_params())

# type_predictor is defined earlier in the notebook.
# NOTE(review): 15 is presumably the number of CV iterations/folds and 'RC' a
# label for this model; the three results are presumably (test predictions,
# best fitted model, per-iteration predictions) -- confirm against its definition.
ridge_predictions,best_ridge_model,ridge_preds=type_predictor(X,y,test,15,model,'RC')

In [None]:
# Run a full garbage-collection pass after Ridge training; the cell displays
# the returned count of unreachable objects.
gc.collect()

In [None]:
# Load the Ridge test predictions into the submission frame, round them to
# whole class labels and store them as int8.
df_submission['Cover_Type'] = ridge_predictions
df_submission['Cover_Type'] = df_submission['Cover_Type'].round(0).astype('int8')

# Save the Ridge predictions and preview the first ten rows.
df_submission.to_csv('ridge_submission.csv', index=False)
df_submission.head(10)

In [None]:
# Rank the features by the magnitude of their Ridge coefficients.
# For a multiclass RidgeClassifier coef_ holds one row of coefficients per
# class, so taking coef_[0] would plot only the first class's signed weights.
# The importance used here is the mean absolute coefficient across all classes.
class_coefs = np.atleast_2d(best_ridge_model.coef_)

df_feature_impt = pd.DataFrame({
    'features': features + agg_features + hydro_features,
    'importance': np.abs(class_coefs).mean(axis=0),
})
df_feature_impt.sort_values(by=['importance'], inplace=True, ascending=False)

# Horizontal bar chart, most important feature first, with value labels.
plt.figure(figsize=(15, 30))
ax = sns.barplot(x='importance', y='features', data=df_feature_impt)
plt.title('Feature importance Ridge Model', color='blue', fontsize=12)
# The trailing semicolon suppresses the notebook's echo of the returned labels.
ax.bar_label(ax.containers[0]);

In [None]:
# Run a full garbage-collection pass after plotting; the cell displays the
# returned count of unreachable objects.
gc.collect()

<h2 id="DTC">DecisionTree Classification</h2>

Simple DecisionTree Classification without any hyperparameter tuning.

<I><font color='red'>Note: The code in this section is commented out to reduce overall execution time</font></I>

<br><a href="#Approach">back to main menu</a>

In [None]:
# %%time
# model=DecisionTreeClassifier()
# print('DecisionTree Classifier parameters:\n',model.get_params())

# dtree_predictions,best_dtree_model,dtree_preds=type_predictor(X,y,test,15,model,'DTC')

In [None]:
# gc.collect()

In [None]:
# df_submission['Cover_Type']=dtree_predictions
# df_submission['Cover_Type']=round(df_submission['Cover_Type'],0)
# df_submission['Cover_Type']=df_submission['Cover_Type'].astype('int8')
# #SAVING DECISION TREE PREDICTIONS
# df_submission.to_csv('dtree_submission.csv',index=False)
# df_submission.head(10)

In [None]:
# df_feature_impt=pd.DataFrame()
# df_feature_impt['features']=features+agg_features+hydro_features
# df_feature_impt['importance']=best_dtree_model.feature_importances_

# df_feature_impt.sort_values(by=['importance'],inplace=True,ascending=False)
# plt.figure(figsize = (15,30))
# ax=sns.barplot(x=df_feature_impt['importance'],y=df_feature_impt['features'],data=df_feature_impt)
# plt.title('Feature importance Decision Tree Model',color='blue',fontsize=12)
# ax.bar_label(ax.containers[0]);

<h2 id="LGBM">LGBM Classification</h2>

Simple LGBMClassifier without any hyperparameter tuning.

<br><a href="#Approach">back to main menu</a>

In [None]:
%%time
# LightGBM settings: multiclass objective, trained on the GPU.
# No other hyperparameters are set here.
lgbm_params = {
    'objective' : 'multiclass',
   'device_type': 'gpu'
}

model=lgb.LGBMClassifier(**lgbm_params)
print('LGBM parameters:\n',model.get_params())

# type_predictor is defined earlier in the notebook.
# NOTE(review): 15 is presumably the number of CV iterations/folds and 'LGB' a
# label for this model; the three results are presumably (test predictions,
# best fitted model, per-iteration predictions) -- confirm against its definition.
lgb_predictions,best_lgb_model,LGBpreds=type_predictor(X,y,test,15,model,'LGB')

In [None]:
# Run a full garbage-collection pass after LGBM training; the cell displays
# the returned count of unreachable objects.
gc.collect()

In [None]:
# Load the LGBM test predictions into the submission frame, round them to
# whole class labels and store them as int8.
df_submission['Cover_Type'] = lgb_predictions
df_submission['Cover_Type'] = df_submission['Cover_Type'].round(0).astype('int8')

# Save the LGBM predictions and preview the first ten rows.
df_submission.to_csv('lgb_submission.csv', index=False)
df_submission.head(10)

In [None]:
# Pair every model input column with the importance reported by the best
# LGBM model, most important first.
df_feature_impt = pd.DataFrame({
    'features': features + agg_features + hydro_features,
    'importance': best_lgb_model.feature_importances_,
})
df_feature_impt.sort_values(by=['importance'], inplace=True, ascending=False)

# Horizontal bar chart with a value label on each bar.
plt.figure(figsize=(15, 50))
ax = sns.barplot(x='importance', y='features', data=df_feature_impt)
plt.title('Feature importance LGB Model', color='blue', fontsize=12)
# The trailing semicolon suppresses the notebook's echo of the returned labels.
ax.bar_label(ax.containers[0]);

<h2 id="XGB">XGBClassifier</h2><font color='red'>(work in progress)</font>

Simple XGBClassifier without any hyperparameter tuning.

<br><a href="#Approach">back to main menu</a>

In [None]:
%%time
# XGBoost settings: GPU histogram tree method and GPU predictor on device 0.
# NOTE(review): 'gpu_hist', 'gpu_id' and 'predictor' are the legacy GPU options;
# XGBoost 2.x appears to replace them with the single `device` parameter --
# confirm against the installed XGBoost version before changing anything.
xgb_params = {
   'tree_method': 'gpu_hist', 
   'gpu_id': 0, 
   'predictor': 'gpu_predictor', 
}

model=xgb.XGBClassifier(**xgb_params)
print('XGB parameters:\n',model.get_params())

# type_predictor is defined earlier in the notebook.
# NOTE(review): 15 is presumably the number of CV iterations/folds and 'XGB' a
# label for this model; the three results are presumably (test predictions,
# best fitted model, per-iteration predictions) -- confirm against its definition.
xgb_predictions,best_xgb_model,XGBpreds=type_predictor(X,y,test,15,model,'XGB')

In [None]:
# Run a full garbage-collection pass after XGB training; the cell displays
# the returned count of unreachable objects.
gc.collect()

In [None]:
# Load the XGB test predictions into the submission frame, round them to
# whole class labels and store them as int8.
df_submission['Cover_Type'] = xgb_predictions
df_submission['Cover_Type'] = df_submission['Cover_Type'].round(0).astype('int8')

# Save the XGB predictions and preview the first ten rows.
df_submission.to_csv('xgb_submission.csv', index=False)
df_submission.head(10)

In [None]:
# Pair every model input column with the importance reported by the best
# XGB model, most important first.
df_feature_impt = pd.DataFrame({
    'features': features + agg_features + hydro_features,
    'importance': best_xgb_model.feature_importances_,
})
df_feature_impt.sort_values(by=['importance'], inplace=True, ascending=False)

# Horizontal bar chart with a value label on each bar.
plt.figure(figsize=(15, 50))
ax = sns.barplot(x='importance', y='features', data=df_feature_impt)
plt.title('Feature importance Best XGB Model', color='blue', fontsize=12)
# The trailing semicolon suppresses the notebook's echo of the returned labels.
ax.bar_label(ax.containers[0]);