In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

The data is visualized in [this notebook](https://www.kaggle.com/kenjishioya/draft-for-forest-cover-type-prediction).

In [None]:
# Load the competition's training and test sets from the read-only "../input/" directory.
train_df = pd.read_csv('../input/forest-cover-type-prediction/train.csv')
test_df = pd.read_csv('../input/forest-cover-type-prediction/test.csv')

In [None]:
# data info
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
train_df.info()
# Leave the preview as the cell's last expression so the notebook renders it as a table.
train_df.head()

In [None]:
# Names of the columns that contain at least one missing value.
missing_mask = train_df.isna().any()
print(train_df.columns[missing_mask])

In [None]:
# outliers
class OutlierDetector():
    """Flag rows whose values fall outside IQR-based fences.

    For every inspected column the fences are ``Q1 - threshhold * IQR`` and
    ``Q3 + threshhold * IQR``; a row is an outlier when its value lies strictly
    outside them in at least one inspected column.

    Attributes set by ``fit``:
        df_: copy of the frame passed to ``fit``.
        target_columns_: the columns that were inspected.
        columns_: per-column dict with keys ``'index'`` (outlier row labels),
            ``'upper_limit'`` and ``'lower_limit'``.
        all_index_: set of row labels flagged in any inspected column.
    """

    def __init__(self, threshhold=3):
        """Store the IQR multiplier.

        The misspelled parameter name ``threshhold`` is kept so existing
        callers that pass it by keyword keep working.
        """
        self.all_index_ = set()
        self.columns_ = {}
        self.threshhold_ = threshhold

    def fit(self, df, target_columns):
        """Compute the fences and outlier labels for each column in ``target_columns``.

        Args:
            df: DataFrame to inspect; a copy is stored, the input is not modified.
            target_columns: iterable of column names in ``df`` to inspect.

        Returns:
            The detector itself.
        """
        # Start from a clean slate: without this reset, a second fit() (or a
        # re-run of the notebook cell) would keep the labels found by the
        # previous fit and drop the wrong rows later on.
        self.all_index_ = set()
        self.columns_ = {}
        self.df_ = df.copy()
        self.target_columns_ = target_columns
        for column in self.target_columns_:
            upper_limit, lower_limit, outliers = self.detect_outlier(self.df_[column])
            outlier_labels = outliers.index.tolist()
            self.all_index_.update(outlier_labels)
            self.columns_[column] = {
                'index': outlier_labels,
                'upper_limit': upper_limit,
                'lower_limit': lower_limit,
            }
        return self

    def detect_outlier(self, series):
        """Return ``(upper_limit, lower_limit, outliers)`` for one column.

        ``outliers`` is the sub-series of values strictly above the upper
        fence or strictly below the lower fence.
        """
        # One call yields both quartiles instead of converting the series twice.
        first_q, third_q = np.percentile(series.to_numpy(), [25, 75])
        IQR = third_q - first_q

        upper_limit = third_q + (self.threshhold_ * IQR)
        lower_limit = first_q - (self.threshhold_ * IQR)

        outliers = series[(series > upper_limit) | (series < lower_limit)]
        return upper_limit, lower_limit, outliers

    def get_df_without_outlier(self):
        """Return the fitted frame without any row flagged as an outlier."""
        outlier_index = self.df_.index.isin(self.all_index_)
        return self.df_.loc[~outlier_index]

    def get_outliers_info(self):
        """Return the per-column dict of outlier labels and fences."""
        return self.columns_

    # Original (misspelled) name, kept as an alias for backward compatibility.
    get_ouliers_info = get_outliers_info

    def show_outliers_label_distribution(self, target_column, label_column):
        """Plot a histogram of ``label_column`` over the outlier rows of ``target_column``.

        Returns:
            Whatever ``sns.histplot`` returns, so callers can customise the plot.
        """
        outlier_index = self.df_.index.isin(self.columns_[target_column]['index'])
        return sns.histplot(self.df_.loc[outlier_index, label_column])

In [None]:
# Build the detector with its default IQR multiplier (threshhold=3).
outlier_detector = OutlierDetector()

In [None]:
# Columns that the outlier detector inspects.
target_columns = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Horizontal_Distance_To_Fire_Points',
]

In [None]:
# Compute the IQR fences for each target column and record the outlier row labels.
# fit() returns the detector, so this last expression is echoed as the cell's output.
outlier_detector.fit(train_df, target_columns)

In [None]:
# Drop every row flagged as an outlier in at least one inspected column.
# reset_index() returns a new frame rather than modifying in place, so its
# result has to be assigned for the index to actually be renumbered.
train_df_without_outliers = (
    outlier_detector.get_df_without_outlier().reset_index(drop=True)
)
train_df_without_outliers.head()

In [None]:
# Pairwise correlation heat map of the columns listed in numeric_features.
numeric_features = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]
corr = train_df_without_outliers.loc[:, numeric_features].corr()
plt.figure(figsize=(14, 12))
colormap = plt.cm.RdBu
sns.heatmap(
    corr,
    annot=True,
    cmap=colormap,
    linecolor='white',
    linewidths=0.1,
    square=False,
)

In [None]:
class CustomPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends derived features and drops selected Soil_Type columns."""

    def fit(self, X, y=None, **fit_params):
        """Learn nothing from the data and return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with nine derived columns added and the listed Soil_Type columns removed."""
        # feature generation
        elevation = X['Elevation']
        hydro_h = X['Horizontal_Distance_To_Hydrology']
        hydro_v = X['Vertical_Distance_To_Hydrology']
        fire = X['Horizontal_Distance_To_Fire_Points']
        road = X['Horizontal_Distance_To_Roadways']

        # assign() works on a copy of X and appends the new columns in keyword order.
        engineered = X.assign(
            Ele_minus_VDtHyd=elevation - hydro_v,
            Ele_plus_VDtHyd=elevation + hydro_v,
            Distanse_to_Hydrolody=(hydro_h ** 2 + hydro_v ** 2) ** 0.5,
            Hydro_plus_Fire=hydro_h + fire,
            Hydro_minus_Fire=hydro_h - fire,
            Hydro_plus_Road=hydro_h + road,
            Hydro_minus_Road=hydro_h - road,
            Fire_plus_Road=fire + road,
            Fire_minus_Road=fire - road,
        )

        # feature selection
        dropped_soil_types = (
            1, 2, 5, 7, 8, 9, 11, 12, 13, 14, 15, 16, 18, 19, 20,
            21, 22, 24, 25, 26, 27, 28, 31, 32, 33, 34, 35, 36, 37,
        )
        columns_to_remove = [f'Soil_Type{number}' for number in dropped_soil_types]
        return engineered.drop(columns=columns_to_remove)
    

In [None]:
# Separate the label from the predictors; 'Id' is kept aside for the submission
# file and removed from the feature frames.
train_y = train_df_without_outliers['Cover_Type']
train_X = train_df_without_outliers.drop(columns=['Id', 'Cover_Type'])
test_id = test_df['Id']
test_X = test_df.drop(columns=['Id'])

In [None]:
from xgboost import XGBClassifier
# XGBoost classifier configuration.
# Values tried earlier and left disabled: learning_rate=0.02, gamma=1.
xg_model = XGBClassifier(
    n_estimators=500,
    max_depth=4,
    min_child_weight=2,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softprob',
    nthread=-1,
    scale_pos_weight=1,
)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
# Extra-Trees draws random split thresholds and feature subsets; fixing
# random_state makes the CV scores and the final submission reproducible
# across "Restart & Run All".
ex_model = ExtraTreesClassifier(max_features=0.3, n_estimators=500, random_state=42)

In [None]:
# 5-fold cross-validated accuracy of the preprocessing + Extra-Trees pipeline.
ex_pipe = Pipeline(steps=[
    ('preprocess', CustomPreprocessor()),
    ('model', ex_model),
])
scores = cross_val_score(
    estimator=ex_pipe,
    X=train_X,
    y=train_y,
    scoring='accuracy',
    cv=5,
)
print(scores)

In [None]:
# 5-fold cross-validated accuracy of the preprocessing + XGBoost pipeline.
# NOTE(review): train_y is passed as-is; some XGBoost releases presumably expect
# class labels starting at 0 — confirm against the installed version.
xg_pipe = Pipeline(steps=[
    ('preprocess', CustomPreprocessor()),
    ('model', xg_model),
])
scores = cross_val_score(
    estimator=xg_pipe,
    X=train_X,
    y=train_y,
    scoring='accuracy',
    cv=5,
)
print(scores)

In [None]:
# Train the Extra-Trees pipeline on the full training set, then label the test set.
# Pipeline.fit returns the pipeline itself, so the two calls can be chained.
predict = ex_pipe.fit(train_X, train_y).predict(test_X)

In [None]:
# Sanity check: there should be one prediction per test Id (printed one per line).
print(len(test_id), len(predict), sep='\n')

In [None]:
# Assemble the submission table (Id, predicted Cover_Type) and write it to disk.
submission = pd.DataFrame(
    data={
        'Id': test_id,
        'Cover_Type': predict,
    }
)
print(submission.head())
submission.to_csv('submission.csv', index=False)

References:  
[Forest_Prediction_Final](https://www.kaggle.com/nehabhandari1/forest-prediction-final)  
[my_first_submission](https://www.kaggle.com/jianyu/my-first-submission)  