## 3. Forest Cover Type Dataset  
- Voting Classifier for victory!

<div style="text-align: right"> <b>Author : Kwang Myung Yu</b></div>
<div style="text-align: right\"> Initial upload: 2021.9.27 </div>
<div style="text-align: right\"> Last update: 2021.9.27</div>

- 출처 : https://www.kaggle.com/thebrownviking20/voting-classifier-for-victory

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings; warnings.filterwarnings('ignore')
plt.style.use('ggplot')
%matplotlib inline

In [2]:
colors = ["#00798c", "#d1495b", '#edae49', '#66a182', '#4a4a4a',
          '#1a508b', '#e3120b', '#c5a880', '#9F5F80', '#6F9EAF',
          '#0278ae','#F39233', '#A7C5EB', '#54E346', '#ABCE74',
        '#d6b0b1', '#58391c', '#cdd0cb', '#ffb396', '#6930c3']
sns.color_palette(colors[:10])

In [3]:
from sklearn import ensemble 
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier

In [4]:
train = pd.read_csv('data/forest-cover-type-kernels-only/train.csv')
test = pd.read_csv('data/forest-cover-type-kernels-only/test.csv')

In [5]:
train.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,0,0,0,0,5
1,2,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,0,0,0,0,5
2,3,2804,139,9,268,65,3180,234,238,135,...,0,0,0,0,0,0,0,0,0,2
3,4,2785,155,18,242,118,3090,238,238,122,...,0,0,0,0,0,0,0,0,0,2
4,5,2595,45,2,153,-1,391,220,234,150,...,0,0,0,0,0,0,0,0,0,5


In [6]:
print(train.shape)
print(test.shape)

(15120, 56)
(565892, 55)


In [7]:
####################### Train data #############################################
train['HF1'] = train['Horizontal_Distance_To_Hydrology']+train['Horizontal_Distance_To_Fire_Points']
train['HF2'] = abs(train['Horizontal_Distance_To_Hydrology']-train['Horizontal_Distance_To_Fire_Points'])
train['HR1'] = abs(train['Horizontal_Distance_To_Hydrology']+train['Horizontal_Distance_To_Roadways'])
train['HR2'] = abs(train['Horizontal_Distance_To_Hydrology']-train['Horizontal_Distance_To_Roadways'])
train['FR1'] = abs(train['Horizontal_Distance_To_Fire_Points']+train['Horizontal_Distance_To_Roadways'])
train['FR2'] = abs(train['Horizontal_Distance_To_Fire_Points']-train['Horizontal_Distance_To_Roadways'])
train['ele_vert'] = train.Elevation-train.Vertical_Distance_To_Hydrology

train['slope_hyd'] = (train['Horizontal_Distance_To_Hydrology']**2+train['Vertical_Distance_To_Hydrology']**2)**0.5
train.slope_hyd=train.slope_hyd.map(lambda x: 0 if np.isinf(x) else x) # remove infinite value if any

#Mean distance to Amenities 
train['Mean_Amenities']=(train.Horizontal_Distance_To_Fire_Points + train.Horizontal_Distance_To_Hydrology + train.Horizontal_Distance_To_Roadways) / 3 
#Mean Distance to Fire and Water 
train['Mean_Fire_Hyd']=(train.Horizontal_Distance_To_Fire_Points + train.Horizontal_Distance_To_Hydrology) / 2 

####################### Test data #############################################
test['HF1'] = test['Horizontal_Distance_To_Hydrology']+test['Horizontal_Distance_To_Fire_Points']
test['HF2'] = abs(test['Horizontal_Distance_To_Hydrology']-test['Horizontal_Distance_To_Fire_Points'])
test['HR1'] = abs(test['Horizontal_Distance_To_Hydrology']+test['Horizontal_Distance_To_Roadways'])
test['HR2'] = abs(test['Horizontal_Distance_To_Hydrology']-test['Horizontal_Distance_To_Roadways'])
test['FR1'] = abs(test['Horizontal_Distance_To_Fire_Points']+test['Horizontal_Distance_To_Roadways'])
test['FR2'] = abs(test['Horizontal_Distance_To_Fire_Points']-test['Horizontal_Distance_To_Roadways'])
test['ele_vert'] = test.Elevation-test.Vertical_Distance_To_Hydrology

test['slope_hyd'] = (test['Horizontal_Distance_To_Hydrology']**2+test['Vertical_Distance_To_Hydrology']**2)**0.5
test.slope_hyd=test.slope_hyd.map(lambda x: 0 if np.isinf(x) else x) # remove infinite value if any

#Mean distance to Amenities 
test['Mean_Amenities']=(test.Horizontal_Distance_To_Fire_Points + test.Horizontal_Distance_To_Hydrology + test.Horizontal_Distance_To_Roadways) / 3 
#Mean Distance to Fire and Water 
test['Mean_Fire_Hyd']=(test.Horizontal_Distance_To_Fire_Points + test.Horizontal_Distance_To_Hydrology) / 2

In [8]:
feature = [col for col in train.columns if col not in ['Cover_Type', 'Id']]

In [9]:
X_train = train[feature]
X_test = test[feature]

In [10]:
preds = pd.DataFrame()

In [11]:
m1 = ensemble.AdaBoostClassifier(base_estimator=ensemble.ExtraTreesClassifier(n_estimators=500), 
                                n_estimators=250, learning_rate=0.01, algorithm = 'SAMME')
m1.fit(X_train, train['Cover_Type'])
preds['Model1'] = m1.predict(X_test)

In [12]:
m2 = ensemble.ExtraTreesClassifier(n_estimators=550)  
m2.fit(X_train, train['Cover_Type'])
preds["Model2"] = m2.predict(X_test)

In [13]:
m3 = XGBClassifier(max_depth=20, n_estimators=1000)  
m3.fit(X_train, train['Cover_Type'])
preds["Model3"] = m3.predict(X_test)



In [14]:
m4 = LGBMClassifier(n_estimators=2000, max_depth=15)
m4.fit(X_train, train['Cover_Type'])
preds["Model4"] = m4.predict(X_test)

In [15]:
m5 = ensemble.AdaBoostClassifier(ensemble.GradientBoostingClassifier(n_estimators=1000, max_depth=10), n_estimators=1000, learning_rate=0.01, algorithm="SAMME")
m5.fit(X_train, train['Cover_Type'])
preds["Model5"] = m5.predict(X_test)

In [16]:
m6 = SGDClassifier(loss='hinge')
m6.fit(X_train, train['Cover_Type'])
preds["Model6"] = m6.predict(X_test)

In [18]:
preds.head()

Unnamed: 0,Model1,Model2,Model3,Model4,Model5,Model6
0,1,1,5,5,1,4
1,1,1,1,1,1,4
2,1,1,2,1,2,3
3,1,1,2,1,2,3
4,1,1,1,1,2,3


In [20]:
pred = preds.mode(axis = 1)

In [21]:
pred

Unnamed: 0,0,1,2
0,1.0,,
1,1.0,,
2,1.0,,
3,1.0,,
4,1.0,,
...,...,...,...
565887,3.0,,
565888,3.0,,
565889,3.0,,
565890,3.0,,


In [22]:
sub = pd.DataFrame({"Id": test['Id'],"Cover_Type": pred[0].astype('int').values})
sub.to_csv("data/forest-cover-type-kernels-only/sub.csv", index=False)

In [23]:
sub.head()

Unnamed: 0,Id,Cover_Type
0,15121,1
1,15122,1
2,15123,1
3,15124,1
4,15125,1
