## 3. Forest Cover Type Dataset  
- LGB + cv ensemble + normalisation + FE

<div style="text-align: right"> <b>Author : Kwang Myung Yu</b></div>
<div style="text-align: right\"> Initial upload: 2021.10.18 </div>
<div style="text-align: right\"> Last update: 2021.10.18</div>

- 출처 : https://www.kaggle.com/schlerp/lgb-cv-ensemble-normalisation-fe

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings; warnings.filterwarnings('ignore')
plt.style.use('ggplot')
%matplotlib inline

In [2]:
colors = ["#00798c", "#d1495b", '#edae49', '#66a182', '#4a4a4a',
          '#1a508b', '#e3120b', '#c5a880', '#9F5F80', '#6F9EAF',
          '#0278ae','#F39233', '#A7C5EB', '#54E346', '#ABCE74',
        '#d6b0b1', '#58391c', '#cdd0cb', '#ffb396', '#6930c3']
sns.color_palette(colors[:10])

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import random

In [5]:
print(os.listdir('data/forest-cover-type-kernels-only/'))

['my_submission.csv', 'output_ceil.csv', 'output_floor.csv', 'sampleSubmission.csv', 'sample_submission.csv', 'sub.csv', 'test.csv', 'train.csv']


In [7]:
number_classes = 7
train = pd.read_csv('data/forest-cover-type-kernels-only/train.csv')
test = pd.read_csv('data/forest-cover-type-kernels-only/test.csv')

In [8]:
train.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,0,0,0,0,5
1,2,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,0,0,0,0,5
2,3,2804,139,9,268,65,3180,234,238,135,...,0,0,0,0,0,0,0,0,0,2
3,4,2785,155,18,242,118,3090,238,238,122,...,0,0,0,0,0,0,0,0,0,2
4,5,2595,45,2,153,-1,391,220,234,150,...,0,0,0,0,0,0,0,0,0,5


In [9]:
train.shape

(15120, 56)

In [10]:
X_train = train.drop(['Id', 'Cover_Type'], axis = 1)
y_train = train['Cover_Type'].values

In [16]:
X_test = test.drop(['Id'], axis = 1)
ID_test = test['Id'].values

In [17]:
X = pd.concat([X_train, X_test], axis = 0)

In [20]:
X.shape

(581012, 54)

In [21]:
# mean hillshade
def mean_hillshade(df):
    df['mean_hillshade'] = (df['Hillshade_9am'] + df['Hillshade_Noon'] + df['Hillshade_3pm']) / 3
    return df

# calculate the distance to hydrology using pythagoras theorem
def distance_to_hydrology(df):
    df['distance_to_hydrology'] = np.sqrt(np.power(df['Horizontal_Distance_To_Hydrology'], 2) + \
                                          np.power(df['Vertical_Distance_To_Hydrology'], 2))
    return df

# calculate diagnial distance down to sea level?
def diag_to_sealevl(df):
    df['diag_to_sealevel'] = np.divide(df['Elevation'], np.cos(180-df['Slope']))
    return df

# calculate mean distance to features
def mean_dist_to_feature(df):
    df['mean_dist_to_feature'] = (df['Horizontal_Distance_To_Hydrology'] + \
                                  df['Horizontal_Distance_To_Roadways'] + \
                                  df['Horizontal_Distance_To_Fire_Points']) / 3
    return df

def binned_columns(df):
    bin_defs = [
        # col name, bin size, new name
        ('Elevation', 200, 'Binned_Elevation'),
        ('Aspect', 45, 'Binned_Aspect'),
        ('Slope', 6, 'Binned_Slope'),
        ('Horizontal_Distance_To_Hydrology', 140, 'Binned_Horizontal_Distance_To_Hydrology'),
        ('Horizontal_Distance_To_Roadways', 712, 'Binned_Horizontal_Distance_To_Roadways'),
        ('Hillshade_9am', 32, 'Binned_Hillshade_9am'),
        ('Hillshade_Noon', 32, 'Binned_Hillshade_Noon'),
        ('Hillshade_3pm', 32, 'Binned_Hillshade_3pm'),
        ('Horizontal_Distance_To_Fire_Points', 717, 'Binned_Horizontal_Distance_To_Fire_Points')
    ]
    
    for col_name, bin_size, new_name in bin_defs:
        df[new_name] = np.floor(df[col_name]/bin_size)
    
    return df

In [22]:
X = mean_hillshade(X)
X = distance_to_hydrology(X)
X = diag_to_sealevl(X)
X = mean_dist_to_feature(X)
X = binned_columns(X)

In [24]:
X.shape

(581012, 67)

In [25]:
# 정규화
def normalise_df(df):
    df_mean = df.mean()
    df_std = df.std()    
    df_norm = (df - df_mean) / (df_std)
    return df_norm, df_mean, df_std

In [26]:
cols_non_onehot = ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
                   'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
                   'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 
                   'Horizontal_Distance_To_Fire_Points', 'mean_hillshade',
                   'distance_to_hydrology', 'diag_to_sealevel', 'mean_dist_to_feature']

X_norm, df_mean, df_std = normalise_df(X[cols_non_onehot])

# replace columns with normalised versions
X = X.drop(cols_non_onehot, axis=1)
X = pd.concat([X_norm, X], axis=1)

In [27]:
# split back into test and train sets
X_train = np.array(X[:len(X_train)])
X_test = np.array(X[len(X_train):])

In [28]:
X_train

array([[-1.29780397, -0.93515618, -1.48281851, ...,  7.        ,
         4.        ,  8.        ],
       [-1.31923371, -0.8904789 , -1.6163612 , ...,  7.        ,
         4.        ,  8.        ],
       [-0.55490633, -0.14883615, -0.68156233, ...,  7.        ,
         4.        ,  8.        ],
       ...,
       [-1.6692528 , -0.19351342,  1.45512081, ...,  6.        ,
         2.        ,  1.        ],
       [-1.68711091,  0.10135658,  1.85574889, ...,  7.        ,
         3.        ,  1.        ],
       [-1.72997039,  0.36942023,  2.65700507, ...,  7.        ,
         5.        ,  1.        ]])

In [29]:
# Kfold 설정 
n_splits = 12
kfolds = StratifiedKFold(n_splits= n_splits, shuffle = True)

In [30]:
kfolds

StratifiedKFold(n_splits=12, random_state=None, shuffle=True)

In [32]:
scores = []
models = []

lrs = [0.1, 0.03, 0.01]
nls = [46, 48, 50]
n_ests = [200, 225, 250]

current_fold = 1

In [34]:
for train, val in kfolds.split(X_train, y_train):
    print('commencing fold {}'.format(current_fold))
    print('     preparing data...')
    Xt = X_train[train]
    yt = y_train[train]
    Xv = X_train[val]
    yv = y_train[val]
    
    n_est = random.choice(n_ests)
    lr = random.choice(lrs)
    nl = random.choice(nls)
    
    print('     building model with ests={}, lr={}, nl={}...'.format(n_est, lr, nl))
    
    classifier = lgb.LGBMClassifier(n_estimators=n_est, boosting_type='dart', learning_rate=lr, num_leaves=nl)
    
    print('     fitting model...')
    classifier.fit(Xt, yt, eval_set=(Xv, yv), early_stopping_rounds=50, verbose=False)
    print('     evaluating model...')
    
    y_pred = classifier.predict(Xv)
    score = accuracy_score(yv, y_pred)
    scores.append(score)
    models.append(classifier)
    print('  fold {} accuracy: {} %'.format(current_fold, score*100))
    current_fold += 1

commencing fold 1
     preparing data...
     building model with ests=200, lr=0.1, nl=46...
     fitting model...
     evaluating model...
  fold 1 accuracy: 86.90476190476191 %
commencing fold 2
     preparing data...
     building model with ests=225, lr=0.03, nl=46...
     fitting model...
     evaluating model...
  fold 2 accuracy: 83.96825396825398 %
commencing fold 3
     preparing data...
     building model with ests=200, lr=0.03, nl=48...
     fitting model...
     evaluating model...
  fold 3 accuracy: 83.17460317460318 %
commencing fold 4
     preparing data...
     building model with ests=225, lr=0.03, nl=48...
     fitting model...
     evaluating model...
  fold 4 accuracy: 85.39682539682539 %
commencing fold 5
     preparing data...
     building model with ests=225, lr=0.1, nl=48...
     fitting model...
     evaluating model...
  fold 5 accuracy: 86.82539682539682 %
commencing fold 6
     preparing data...
     building model with ests=200, lr=0.01, nl=48...
     fit

In [35]:
print('ensemble average accuracy: {} % (+/- {} %)'.format(np.mean(scores)*100, np.std(scores)*100))

ensemble average accuracy: 85.9457671957672 % (+/- 2.0477775494755956 %)


In [36]:
print('testing ensemble accuracy on whole training set...')
y_preds = []
for index, classifier in enumerate(models):
    print('getting predictions from model {}...'.format(index+1))
    y_pred = classifier.predict(X_train)
    y_preds.append(y_pred)

testing ensemble accuracy on whole training set...
getting predictions from model 1...
getting predictions from model 2...
getting predictions from model 3...
getting predictions from model 4...
getting predictions from model 5...
getting predictions from model 6...
getting predictions from model 7...
getting predictions from model 8...
getting predictions from model 9...
getting predictions from model 10...
getting predictions from model 11...
getting predictions from model 12...


In [37]:
y_preds

[array([5, 5, 2, ..., 3, 3, 3], dtype=int64),
 array([5, 5, 2, ..., 3, 3, 3], dtype=int64),
 array([5, 5, 2, ..., 3, 3, 3], dtype=int64),
 array([5, 5, 2, ..., 3, 3, 3], dtype=int64),
 array([5, 5, 2, ..., 3, 3, 3], dtype=int64),
 array([5, 5, 2, ..., 3, 3, 3], dtype=int64),
 array([5, 5, 2, ..., 3, 3, 3], dtype=int64),
 array([5, 5, 2, ..., 3, 3, 3], dtype=int64),
 array([5, 5, 2, ..., 3, 3, 3], dtype=int64),
 array([5, 5, 2, ..., 3, 3, 3], dtype=int64),
 array([5, 5, 2, ..., 3, 3, 3], dtype=int64),
 array([5, 5, 2, ..., 3, 3, 3], dtype=int64)]

In [38]:
print('taking average and rounding...')
y_pred = np.rint(np.mean(y_preds, axis=0))
y_pred = y_pred.astype(int)

taking average and rounding...


In [39]:
print('calcualting accuracy...')
ensemble_accuracy = accuracy_score(y_train, y_pred)

calcualting accuracy...


In [40]:
print('ensemble accuracy: {} %'.format(ensemble_accuracy*100))

ensemble accuracy: 92.1031746031746 %


In [41]:
print('producing test data predictions...')
y_preds = []
for index, classifier in enumerate(models):
    print('getting predictions from model {}...'.format(index+1))
    y_pred = classifier.predict(X_test)
    y_preds.append(y_pred)

print('taking average and rounding...')
y_pred = np.rint(np.mean(y_preds, axis=0))
y_pred = y_pred.astype(int)

print(max(y_pred))
print(min(y_pred))

producing test data predictions...
getting predictions from model 1...
getting predictions from model 2...
getting predictions from model 3...
getting predictions from model 4...
getting predictions from model 5...
getting predictions from model 6...
getting predictions from model 7...
getting predictions from model 8...
getting predictions from model 9...
getting predictions from model 10...
getting predictions from model 11...
getting predictions from model 12...
taking average and rounding...
7
1


In [None]:
# sub = pd.DataFrame()
# sub['Id'] = ID_test
# sub['Cover_Type'] = y_pred
# sub.to_csv('my_submission.csv', index=False)
# print('good luck!')