In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import os
# Restrict CUDA device visibility for this process to device 0.
# NOTE(review): no GPU library is used in the visible part of this notebook — confirm this is still needed.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [None]:
# Load the training and test tables; the paths are relative to the notebook's working directory.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

## EDA

In [None]:
# (rows, columns) of each table, training frame first.
print(train.shape)
print(test.shape)

检查是否存在缺失值

In [None]:
# Per-column dtype and non-null count for the training frame (used here to look for missing values).
train.info()

In [None]:
# Same missing-value check for the test frame.
test.info()

No missing values were found in either table, so no imputation is needed.

判断是否存在异常值

In [None]:
# Summary statistics per numeric column; compare mean/quartiles against min/max to spot extreme values.
train.describe()

For several features the maximum is far above the mean, which suggests that the data may contain many outliers.

In [None]:
# List the column labels of the training frame.
train.columns

特征相关性分析

In [None]:
# Draw the pairwise correlation matrix of the training columns as a heat map on a 15x15 inch figure.
plt.figure(figsize=(15,15))
sns.heatmap(train.corr())

从相关性图可以看出，soil_type与其他特征相关性不强；'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'相关性挺强； 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology'相关性强；“Elevation”与其他特征相关性强

In [None]:
#函数引用于他处
from scipy.stats import gaussian_kde

def compare_dist(x,y,ax, feature, i=0):
    """Overlay the kernel-density curves of one column taken from two frames.

    Parameters:
        x: frame whose `feature` column is drawn with the label "train".
        y: frame whose `feature` column is drawn with the label "test".
        ax: Matplotlib axes that receives both curves.
        feature: column key looked up in both `x` and `y`.
        i: unused; kept so existing callers that pass a panel index keep working.
    """
    for frame, tag in ((x, "train"), (y, "test")):
        sns.kdeplot(frame[feature], label=tag, ax=ax)

def numeric_tile(x,y):
    """Draw a 2x6 grid of train-vs-test density plots, one panel per shared column.

    Parameters:
        x: "train" frame.
        y: "test" frame; its column order decides the panel order.

    Columns are chosen by name: the 'Id' column is skipped, as are columns of
    `y` that `x` does not have. The previous positional `y.columns[1:]` slice
    assumed 'Id' was always the first column, so when 'Id' had already been
    dropped it silently skipped the first real feature instead.
    At most 12 columns are plotted because the grid has 12 panels.
    """
    fig, axs = plt.subplots(2, 6, figsize=(24, 12))
    axs = axs.flatten()

    shared_cols = [col for col in y.columns.tolist() if col != 'Id' and col in x.columns]

    # zip() stops at the shorter sequence, so extra columns beyond 12 are ignored.
    for i, (ax, col) in enumerate(zip(axs, shared_cols)):
        compare_dist(x,y,ax, col, i)
        ax.set_title(col)
    plt.tight_layout()

In [None]:
list1 = ['Hillshade_9am', 'Hillshade_Noon',
       'Hillshade_3pm']
# One scatter plot per ordered pair of distinct hillshade features, coloured by Cover_Type.
# NOTE(review): `size=` was renamed `height=` in newer seaborn releases — confirm against the installed version.
for i in list1:
    for j in list1:
        # Compare the names by value; `is not` tests object identity, which is not
        # the intended check for strings.
        if i != j:
            sns.FacetGrid(train, hue="Cover_Type", size=10).map(plt.scatter, i, j).add_legend()

In [None]:
list2 = ['Elevation', 'Aspect', 'Slope',
       'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
       'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
       'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points']
# Scatter Elevation against every other continuous feature, coloured by Cover_Type.
# NOTE(review): `size=` was renamed `height=` in newer seaborn releases — confirm against the installed version.
for i in list2:
    # Value comparison: `is not 'Elevation'` tested identity against a literal, which
    # depends on string interning and raises a SyntaxWarning on Python 3.8+.
    if i != 'Elevation':
        sns.FacetGrid(train, hue="Cover_Type", size=10).map(plt.scatter, "Elevation", i).add_legend()

“Elevation”与'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology'呈强线性相关，可考虑特征组合等

In [None]:
# Horizontal vs. vertical distance to hydrology, one colour per Cover_Type.
# NOTE(review): `size=` was renamed `height=` in newer seaborn releases — confirm against the installed version.
sns.FacetGrid(train, hue="Cover_Type", size=10).map(plt.scatter,"Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology").add_legend()

In [None]:
# Overlay train/test density curves on a 2x6 grid of panels for the raw tables.
numeric_tile(train,test)

Elevation在训练集和测试集的分布有较大差异，可能对分类的影响较大，可考虑组合特征或直接丢弃；同时onehot编码有点影响分布，可以考虑转换成标签编码

## Data preprocessing

categorical feature （类别特征）

In [None]:
# Names of the 40 one-hot soil-type indicator columns: Soil_Type1 .. Soil_Type40.
soil_list = ['Soil_Type' + str(i) for i in range(1, 41)]

# Names of the 4 one-hot wilderness-area indicator columns.
wilderness_area_list = ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4']


def _compress_one_hot(df, columns, new_column):
    """Return a copy of `df` where the indicator `columns` are replaced by one coded column.

    The k-th indicator (1-based) contributes k to the code, so a row with exactly
    one flag set gets that flag's position. A row with no flag set gets 0 and a
    row with several flags gets the sum of their positions.
    The input frame is left untouched; the new column is appended last.
    """
    codes = list(range(1, len(columns) + 1))
    label = df[columns].multiply(codes, axis=1).sum(axis=1)
    out = df.drop(columns, axis=1)
    out[new_column] = label
    return out


def wilderness_compress(df):
    """Replace Wilderness_Area1..4 with a single 'Wilderness_Area' code (1-4).

    Returns a new frame; `df` itself is not modified.
    """
    return _compress_one_hot(df, wilderness_area_list, 'Wilderness_Area')


def soil_compress(df):
    """Replace Soil_Type1..40 with a single 'Soil_Type' code (1-40).

    Returns a new frame; `df` itself is not modified.
    """
    return _compress_one_hot(df, soil_list, 'Soil_Type')


def feature_compress(df):
    """Label-encode both one-hot groups: wilderness area first, then soil type.

    Returns a new frame whose last two columns are 'Wilderness_Area' and
    'Soil_Type'; the caller's frame keeps its original one-hot columns.
    """
    df = wilderness_compress(df)
    df = soil_compress(df)
    return df

In [None]:
# Collapse the one-hot Wilderness_Area*/Soil_Type* columns of `train` into two integer-coded columns.
# NOTE: not re-runnable as-is — after the first run `train` no longer has the one-hot columns.
train = feature_compress(train)

In [None]:
# Preview the compressed training frame.
train.head()

先进行归一化尝试

In [None]:
# Work on a copy so `train` keeps its Id and Cover_Type columns for the later cells.
train_copy = train.copy()
# Target labels, taken before the column is dropped from the copy.
train_label = train_copy["Cover_Type"]
train_copy.drop(["Id","Cover_Type"],inplace=True,axis=1)

In [None]:
# Preview the feature-only copy.
train_copy.head()

In [None]:
# Column-wise min-max scaling to [0, 1]: (value - column min) / (column max - column min).
# A column whose min equals its max would divide by zero here.
train_copy = (train_copy - train_copy.min())/(train_copy.max()-train_copy.min())

In [None]:
# Preview the scaled features.
train_copy.head()

In [None]:
# Check the scaled ranges: every column should now have min 0 and max 1.
train_copy.describe()

In [None]:
# Keep the test ids aside, then drop the column so only features remain.
test_Id = test["Id"]
test.drop(["Id"],inplace=True,axis=1)
test = feature_compress(test)
# Min-max scale the test features using the TEST set's own min/max.
# NOTE(review): train_copy was scaled with the training min/max, so the two frames are
# not on a common scale — keep this in mind when comparing their distributions.
test = (test - test.min())/(test.max()-test.min())

In [None]:
# Repeat the train/test density comparison on the compressed, min-max scaled frames.
numeric_tile(train_copy,test)

经过数据预处理后，训练集与测试集的特征分布趋势基本上一致了。（注意：训练集和测试集是各自按自身的最小值/最大值归一化的，因此这种一致性有一部分来自归一化本身，而不完全说明两者的原始分布相同。）

处理 Outlier （异常值）

## Feature Engineering

对“Elevation”与'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology'进行以下特征组合尝试：（具体系数引用于他处）

In [None]:
#train['Vertical_Distance_To_Hydrology'] = train['Elevation']-train['Vertical_Distance_To_Hydrology']
#train['Horizontal_Distance_To_Hydrology']=train['Elevation']- train['Horizontal_Distance_To_Hydrology']*0.2

对'Horizontal_Distance_To_Hydrology'和 'Vertical_Distance_To_Hydrology'考虑领域知识以及上面的散点图，尝试采用特征组合生成新的特征：

In [None]:
#train['fe_Distance_To_Hydrology'] = np.sqrt(train['Horizontal_Distance_To_Hydrology']**2 + train['Vertical_Distance_To_Hydrology']**2)

对'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'进行尝试：

In [None]:
#train['fe_Hillshade_Mean'] = (train['Hillshade_9am'] + train['Hillshade_Noon'] + train['Hillshade_3pm'])/3
#train['fe_Hillshade_Mean_Div_E'] = (train['fe_Hillshade_Mean'] / train['Elevation']).clip(upper=255)
#train['fe_Hillshade_Mean_Div_Aspect'] = (train['fe_Hillshade_Mean'] / train['Aspect']).clip(upper=255)

结合他人的idea，总结特征工程如下：

In [None]:
def preprocess(df_):
    """Add the engineered feature columns to `df_` in place and return the same frame.

    Required input columns: Elevation, Aspect, Horizontal_Distance_To_Hydrology,
    Vertical_Distance_To_Hydrology, Horizontal_Distance_To_Roadways,
    Horizontal_Distance_To_Fire_Points, Hillshade_9am, Hillshade_Noon,
    Hillshade_3pm and a label-encoded Soil_Type (1-40).

    Columns appended, in this order:
      * fe_E_Min_02HDtH, fe_Distance_To_Hydrology
      * six pairwise differences (fe_*_Min_*), then the six matching sums (fe_*_Add_*)
      * fe_Hillshade_Mean and its ratios to Elevation / Aspect (clipped at 255)
      * fe_Hillshade_Ratio1 / fe_Hillshade_Ratio2 (clipped)
      * fe_Shade9_Mul_VDtH, Elevation_bins50, fe_Horizontal_Distance_To_Roadways_Log
      * Climatic_zone_LE, Geologic_zone_LE (uint8)

    Every column whose name starts with 'fe_' is finally cast to float32.

    NOTE(review): the ratio features divide by Elevation, Aspect and hillshade
    differences; a 0/0 case yields NaN, which `clip` does not remove. Also, a
    Soil_Type outside 1..40 maps to NaN and makes the uint8 cast fail.
    """
    hd_hydro = 'Horizontal_Distance_To_Hydrology'
    vd_hydro = 'Vertical_Distance_To_Hydrology'
    hd_road = 'Horizontal_Distance_To_Roadways'
    hd_fire = 'Horizontal_Distance_To_Fire_Points'
    shade_9am, shade_noon, shade_3pm = 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'

    #df_.drop('Elevation', axis=1, inplace=True)
    df_['fe_E_Min_02HDtH'] = df_['Elevation'] - df_[hd_hydro] * 0.2
    # Euclidean distance to the nearest water feature.
    df_['fe_Distance_To_Hydrology'] = np.sqrt(df_[hd_hydro] ** 2 + df_[vd_hydro] ** 2)

    # (difference feature name, left operand, right operand); the sum feature
    # reuses the same operands with '_Min_' swapped for '_Add_' in the name.
    operand_pairs = [
        ('E_Min_VDtH', 'Elevation', vd_hydro),
        ('HD_Hydrology_Min_Roadways', hd_hydro, hd_road),
        ('HD_Hydrology_Min_Fire', hd_hydro, hd_fire),
        ('Hillshade_9am_Min_Noon', shade_9am, shade_noon),
        ('Hillshade_Noon_Min_3pm', shade_noon, shade_3pm),
        ('Hillshade_9am_Min_3pm', shade_9am, shade_3pm),
    ]
    # All differences are created before any sum so the column order is stable.
    for diff_name, left, right in operand_pairs:
        df_['fe_' + diff_name] = df_[left] - df_[right]
    for diff_name, left, right in operand_pairs:
        df_['fe_' + diff_name.replace('_Min_', '_Add_')] = df_[left] + df_[right]

    shade_mean = (df_[shade_9am] + df_[shade_noon] + df_[shade_3pm]) / 3
    df_['fe_Hillshade_Mean'] = shade_mean
    df_['fe_Hillshade_Mean_Div_E'] = (df_['fe_Hillshade_Mean'] / df_['Elevation']).clip(upper=255)
    df_['fe_Hillshade_Mean_Div_Aspect'] = (df_['fe_Hillshade_Mean'] / df_['Aspect']).clip(upper=255)

    # A few composite variables
    morning_over_afternoon = df_['fe_Hillshade_9am_Min_Noon'] / df_['fe_Hillshade_Noon_Min_3pm']
    df_['fe_Hillshade_Ratio1'] = morning_over_afternoon.clip(lower=-5, upper=2)
    day_span_over_noon = df_['fe_Hillshade_9am_Min_3pm'] / df_[shade_noon]
    df_['fe_Hillshade_Ratio2'] = day_span_over_noon.clip(lower=-2, upper=2)

    # The feature is advertised in https://douglas-fraser.com/forest_cover_management.pdf
    df_['fe_Shade9_Mul_VDtH'] = df_[shade_9am] * df_[vd_hydro]

    # Features inherited from https://www.kaggle.com/leannelong3/r-random-forest
    df_['Elevation_bins50'] = df_['Elevation'] // 50
    df_['fe_Horizontal_Distance_To_Roadways_Log'] = np.log1p(df_[hd_road])

    # this mapping comes from https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info
    # Each row: (last soil type of the run, climatic zone, geologic zone); runs are
    # contiguous and start right after the previous run's last soil type.
    zone_runs = [
        (6, 2, 7),
        (8, 3, 5),
        (9, 4, 2),
        (13, 4, 7),
        (15, 5, 1),
        (17, 6, 1),
        (18, 6, 7),
        (21, 7, 1),
        (23, 7, 2),
        (34, 7, 7),
        (40, 8, 7),
    ]
    climatic_zone = {}
    geologic_zone = {}
    first_soil = 1
    for last_soil, climatic, geologic in zone_runs:
        for soil in range(first_soil, last_soil + 1):
            climatic_zone[soil] = climatic
            geologic_zone[soil] = geologic
        first_soil = last_soil + 1

    df_['Climatic_zone_LE'] = df_['Soil_Type'].map(climatic_zone).astype(np.uint8)
    df_['Geologic_zone_LE'] = df_['Soil_Type'].map(geologic_zone).astype(np.uint8)

    # Store the engineered 'fe_' columns as float32.
    for name in [c for c in df_.columns if c.startswith('fe_')]:
        df_[name] = df_[name].astype(np.float32)
    return df_

In [None]:
# Append the engineered features to the compressed training frame (columns are added in place).
train = preprocess(train)

In [None]:
# Preview the training frame with the engineered columns.
train.head()

In [None]:
# Separate the target, then remove the id and target so `train` holds features only.
# NOTE: not re-runnable as-is — the two columns are gone after the first run.
train_l = train["Cover_Type"]
train.drop(["Id","Cover_Type"],inplace=True,axis=1)

In [None]:
# Column-wise min-max scaling of all features, engineered ones included.
# A column whose min equals its max would divide by zero here.
train = (train - train.min())/(train.max()-train.min())

In [None]:
# Check the scaled feature ranges.
train.describe()

总结数据处理方法

In [None]:
def feature_preprocessing(df):
    """Run the whole feature pipeline on a features-only frame.

    Steps: label-encode the one-hot groups (`feature_compress`), append the
    engineered columns (`preprocess`), then min-max scale every column with
    that frame's OWN minimum and maximum.

    Returns the scaled frame.

    NOTE(review): because the statistics come from the frame being passed in,
    two frames processed separately (e.g. train and test) are not put on a
    common scale. A column whose min equals its max divides by zero.
    """
    engineered = preprocess(feature_compress(df))
    col_min = engineered.min()
    col_range = engineered.max() - col_min
    return (engineered - col_min) / col_range

In [None]:
# Reload fresh copies of both tables so the consolidated pipeline starts from raw data.
train2 = pd.read_csv("../input/train.csv")
test2 = pd.read_csv("../input/test.csv")

In [None]:
# Target labels, taken before the column is dropped.
train2_label = train2['Cover_Type']
train2.drop(["Id","Cover_Type"],inplace=True,axis=1)
# Compress, engineer and min-max scale the training features (scaled with their own min/max).
train2 = feature_preprocessing(train2)

In [None]:
# Keep the submission ids aside before the column is removed.
test2_id = test2['Id']
test2.drop(["Id"],inplace=True,axis=1)

# Engineer the test features but do NOT scale them with the test set's own
# min/max: the models are fitted on training features scaled with the TRAINING
# min/max, so the test set must go through exactly the same transform.
test2 = preprocess(feature_compress(test2))

# Rebuild the unscaled training features from the raw file to obtain the
# reference statistics (the in-memory training frame is already scaled).
train_reference = pd.read_csv("../input/train.csv").drop(["Id","Cover_Type"], axis=1)
train_reference = preprocess(feature_compress(train_reference))
reference_min = train_reference.min()
# A constant training column has zero range; divide by 1 instead of producing NaN/inf.
reference_range = (train_reference.max() - reference_min).replace(0, 1)
del train_reference

# Test values outside the training range fall outside [0, 1]; that is expected.
test2 = (test2 - reference_min) / reference_range

## Model Selection and Training

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the labelled rows for validation; random_state=0 fixes the split.
# NOTE(review): the split is not stratified by Cover_Type — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(train2,train2_label,test_size=0.3, random_state=0)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import ensemble

from sklearn.metrics import accuracy_score

In [None]:
# Empty frame that later receives one column of hold-out predictions per model.
preds = pd.DataFrame()

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# best params:{'criterion': 'entropy', 'max_depth': 60, 'max_features': 0.5, 'n_estimators': 300}
# best score:  0.8573318216175358
#clf1 = RandomForestClassifier()
"""
grid_values1={'n_estimators':[300],#[200,300,400,500,600,700,800]
             'max_features':[0.5],#[0.2,0.5,0.8]
             'max_depth':[60],#[50,60,70,80]
             'criterion':["entropy"]#["gini","entropy"]dxs 
}
grid1 = GridSearchCV(clf1,param_grid=grid_values1,cv=5,verbose=1,n_jobs = 4)
grid1.fit(X_train,y_train)
print(grid1.best_params_)
print(grid1.best_score_)

clf1 = RandomForestClassifier(n_estimators=grid1.best_params_["n_estimators"],\
                              max_features=grid1.best_params_["max_features"],\
                              max_depth=grid1.best_params_["max_depth"],\
                              criterion=grid1.best_params_["criterion"])
"""
# Random forest with the hyper-parameters recorded from the grid search above.
# No random_state is passed, so results vary between runs.
clf1 = RandomForestClassifier(n_estimators=300,max_features=0.5,max_depth=60,criterion='entropy')

In [None]:
# best params:{'base_estimator__criterion': 'gini', 'base_estimator__max_depth': 80, 'base_estimator__max_features': 0.8, 'base_estimator__n_estimators': 200}
# best score:  0.8738662131519275
"""
clf2 = ensemble.AdaBoostClassifier(ensemble.ExtraTreesClassifier(),n_estimators=250, learning_rate=0.01, algorithm='SAMME')
grid_values2={'base_estimator__n_estimators':[200],#[200,300,400,500,600,700,800]
             'base_estimator__max_features':[0.8],#[0.2,0.5,0.8]
             'base_estimator__max_depth':[80],#[50,60,70,80]
             'base_estimator__criterion':["gini"]#["gini","entropy"]
}
grid2 = GridSearchCV(clf2,param_grid=grid_values2,cv=5,verbose=1,n_jobs = 4)
grid2.fit(X_train,y_train)
print(grid2.best_params_)
print(grid2.best_score_)
clf2 =  ensemble.AdaBoostClassifier(ensemble.ExtraTreesClassifier(n_estimators=grid2.best_params_["base_estimator__n_estimators"],\
                              max_features=grid2.best_params_["base_estimator__max_features"],\
                              max_depth=grid2.best_params_["base_estimator__max_depth"],\
                              criterion=grid2.best_params_["base_estimator__criterion"]),
                                   n_estimators=250, learning_rate=0.01, algorithm='SAMME')
"""
# AdaBoost (SAMME) over an ExtraTrees base model, using the hyper-parameters recorded above.
# No random_state is passed, so results vary between runs.
clf2 =  ensemble.AdaBoostClassifier(ensemble.ExtraTreesClassifier(n_estimators=200,max_features=0.8,max_depth=80,criterion='gini'),
                                   n_estimators=250, learning_rate=0.01, algorithm='SAMME')

In [None]:
# best params:{'criterion': 'gini', 'max_depth': 60, 'max_features': 0.8, 'n_estimators': 600}
# best score:  0.8735827664399093
"""
clf3 = ensemble.ExtraTreesClassifier() 
grid_values3={'n_estimators':[600],#[200,300,400,500,600,700,800]
             'max_features':[0.8],#[0.2,0.5,0.8]
             'max_depth':[60],#[50,60,70,80]
             'criterion':["gini"]#["gini","entropy"]
}
grid3 = GridSearchCV(clf3,param_grid=grid_values3,cv=5,verbose=1,n_jobs = 4)
grid3.fit(X_train,y_train)
print(grid3.best_params_)
print(grid3.best_score_)
clf3 =  ensemble.ExtraTreesClassifier(n_estimators=grid3.best_params_["n_estimators"],\
                              max_features=grid3.best_params_["max_features"],\
                              max_depth=grid3.best_params_["max_depth"],\
                              criterion=grid3.best_params_["criterion"])
"""
# Plain ExtraTrees model with the hyper-parameters recorded above.
# No random_state is passed, so results vary between runs.
clf3 =  ensemble.ExtraTreesClassifier(n_estimators=600,max_features=0.8,max_depth=60,criterion='gini')

In [None]:
# best params:{'criterion': 'friedman_mse', 'max_depth': 50, 'max_features': 0.2, 'n_estimators': 500}
# best score:  0.8564814814814815
"""
clf4 = ensemble.AdaBoostClassifier(ensemble.GradientBoostingClassifier(),
                                   n_estimators=250, learning_rate=0.01, algorithm="SAMME")
grid_values4={'base_estimator__n_estimators':[500],#[200,300,400,500,600,700,800]
             'base_estimator__max_features':[0.2],#[0.2,0.5,0.8]
             'base_estimator__max_depth':[50],#[50,60,70,80]
             'base_estimator__criterion':["friedman_mse"]#["friedman_mse","mse","mae"]
}
grid4 = GridSearchCV(clf4,param_grid=grid_values4,cv=5,verbose=1,n_jobs = 4)
grid4.fit(X_train,y_train)
print(grid4.best_params_)
print(grid4.best_score_)
clf4 =  ensemble.AdaBoostClassifier(ensemble.GradientBoostingClassifier(n_estimators=grid4.best_params_["base_estimator__n_estimators"],\
                              max_features=grid4.best_params_["base_estimator__max_features"],\
                              max_depth=grid4.best_params_["base_estimator__max_depth"],\
                              criterion=grid4.best_params_["base_estimator__criterion"]),
                                   n_estimators=250, learning_rate=0.01, algorithm="SAMME")
"""
# AdaBoost (SAMME) over a GradientBoosting base model.
# max_features is 0.2, the value the recorded grid search selected; the previous
# hard-coded 0.8 contradicted the "best params" noted at the top of this cell.
# No random_state is passed, so results vary between runs.
clf4 =  ensemble.AdaBoostClassifier(ensemble.GradientBoostingClassifier(n_estimators=500,max_features=0.2,
                                                                        max_depth=50,criterion='friedman_mse'),
                                    n_estimators=250, learning_rate=0.01, algorithm="SAMME")

In [None]:
# Fit each model on the training split, score it on the hold-out split and keep
# its hold-out predictions as one column of `preds` (column name = model label).
for clf, label in zip([clf1,
                       clf2,
                       clf3, 
                       clf4
                      ], 
                      [
                          'Random Forest',
                          'AdaBoostClassifier_ExtraTreesClassifier',
                          'ExtraTreesClassifier',
                          'AdaBoostClassifier_GradientBoostingClassifier'
                      ]):
    
    clf.fit(X_train,y_train)
    y_hat = clf.predict(X_test)
    preds[label]=y_hat
    # Accuracy on the 30% hold-out split.
    test_score = accuracy_score(y_test,y_hat)
    # (A "train Accuracy" print used to sit here; it is disabled because it relies on a
    # `scores` variable that this cell never defines.)
    print("test Accuracy: %0.2f [%s]" % (test_score,  label))

In [None]:
# Display the per-model hold-out predictions.
preds

In [None]:
# Row-wise mode across the model columns, i.e. a majority vote per sample.
# NOTE(review): with four voters ties are possible; only column 0 of this result is used
# later, so presumably a tie resolves to the smallest tied label — verify.
pred_vote = preds.mode(axis=1)

In [None]:
# Hold-out accuracy of the voted prediction (first mode column).
print(accuracy_score(y_test,pred_vote[0]))

预测

In [None]:
# Predict the test set with every fitted model; one column per model label.
# The models are used as fitted in the scoring loop, i.e. on the training split only.
test_preds = pd.DataFrame()
for clf,label in zip([clf1,clf2,clf3,clf4],
                     ['Random Forest','AdaBoostClassifier_ExtraTreesClassifier',
                      'ExtraTreesClassifier','AdaBoostClassifier_GradientBoostingClassifier']):
    test_preds[label]=clf.predict(test2)

In [None]:
# NOTE(review): a bare expression that is not the last line of a cell displays nothing,
# so the next line has no visible effect.
test_preds
# Row-wise mode across the model columns, i.e. a majority vote per test sample.
test_pred_vote = test_preds.mode(axis=1)

In [None]:
# Build the submission: the saved test ids plus the voted label (first mode column) cast to int.
sub = pd.DataFrame({"Id":test2_id,"Cover_Type": test_pred_vote[0].astype('int').values})
# Write it to sub.csv in the working directory, without the index column.
sub.to_csv("sub.csv", index=False)