In [0]:
import os
import random
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE

from lightgbm import LGBMClassifier

from mlxtend.classifier import StackingCVClassifier

from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# One shared seed: applied to NumPy and the stdlib generator here, and passed
# as `random_state` to the estimators and sampler configured later in the file.
random_state = 1
np.random.seed(random_state)
random.seed(random_state)


print('> Loading data')
# Both CSV files are indexed by their 'Id' column.
X_test = pd.read_csv('/kaggle/input/learn-together/test.csv', index_col='Id')
X_train = pd.read_csv('/kaggle/input/learn-together/train.csv', index_col='Id')

# Split the target off the training frame: `pop` removes the column from
# X_train and returns it as a Series.
y_train = X_train.pop('Cover_Type')


# https://www.kaggle.com/kwabenantim/forest-cover-feature-engineering
def add_features(X_):
    """Return a copy of ``X_`` with engineered columns appended.

    The input frame is not modified. Columns are appended in this order:
    ``Hydro_Elevation_diff``, ``Hydro_Euclidean``, ``Hydro_Fire_sum``,
    ``Hydro_Fire_diff``, ``Hydro_Road_sum``, ``Hydro_Road_diff``,
    ``Road_Fire_sum``, ``Road_Fire_diff`` and ``Stoneyness``.

    ``X_`` must contain ``Elevation``, ``Vertical_Distance_To_Hydrology``,
    ``Horizontal_Distance_To_Hydrology``, ``Horizontal_Distance_To_Roadways``,
    ``Horizontal_Distance_To_Fire_Points`` and ``Soil_Type1`` through
    ``Soil_Type40``.
    """
    X = X_.copy()

    # Read each source column once; none of them is overwritten below.
    elevation = X['Elevation']
    hydro_v = X['Vertical_Distance_To_Hydrology']
    hydro_h = X['Horizontal_Distance_To_Hydrology']
    road_h = X['Horizontal_Distance_To_Roadways']
    fire_h = X['Horizontal_Distance_To_Fire_Points']

    X['Hydro_Elevation_diff'] = elevation - hydro_v

    # Straight-line distance to hydrology from its two components.
    X['Hydro_Euclidean'] = np.sqrt(hydro_h**2 + hydro_v**2)

    # Pairwise sum and absolute difference of the horizontal distances.
    distance_pairs = (
        ('Hydro_Fire_sum', 'Hydro_Fire_diff', hydro_h, fire_h),
        ('Hydro_Road_sum', 'Hydro_Road_diff', hydro_h, road_h),
        ('Road_Fire_sum', 'Road_Fire_diff', road_h, fire_h),
    )
    for sum_name, diff_name, first, second in distance_pairs:
        X[sum_name] = first + second
        X[diff_name] = (first - second).abs()

    # One entry per soil type 1..40:
    # 1=rubbly, 2=stony, 3=very stony, 4=extremely stony, 0=?
    stoneyness = [4, 3, 1, 1, 1, 2, 0, 0, 3, 1,
                  1, 2, 1, 0, 0, 0, 0, 3, 0, 0,
                  0, 4, 0, 4, 4, 3, 4, 4, 4, 4,
                  4, 4, 4, 4, 1, 4, 4, 4, 4, 4]
    soil_ids = range(1, 41)

    # Weighted sum of the indicator columns gives the soil type number
    # (a row with no indicator set yields 0).
    soil_number = 0
    for soil_id in soil_ids:
        soil_number = soil_number + soil_id * X['Soil_Type{}'.format(soil_id)]

    # Translate soil type number -> stoneyness value; numbers outside 1..40
    # are left as they are.
    X['Stoneyness'] = soil_number.replace(soil_ids, stoneyness)

    return X


def drop_features(X_):
    """Return a new frame without the near-constant columns of ``X_``.

    A column is removed when its most frequent value occurs in more than
    99% of the rows. ``X_`` itself is left unchanged.
    """
    threshold = 0.99 * len(X_)

    near_constant = []
    for name in X_.columns:
        # value_counts() sorts by frequency, so position 0 is the top count.
        top_count = X_[name].value_counts().iat[0]
        if top_count > threshold:
            near_constant.append(name)

    # drop() returns a new frame even when nothing is removed.
    return X_.drop(near_constant, axis='columns')


print('> Adding features')
# Apply the same feature engineering to the training and the test frame.
X_train, X_test = add_features(X_train), add_features(X_test)


print('> Setting up classifiers')
# Weak learner that AdaBoost boosts.
ab_tree = DecisionTreeClassifier(min_samples_leaf=2,
                                 random_state=random_state)

# scikit-learn 1.2 renamed AdaBoost's `base_estimator` argument to
# `estimator`, and 1.4 removed the old name. Try the current keyword first
# and fall back to the old one so the script runs on either side of the rename.
try:
    ab_clf = AdaBoostClassifier(n_estimators=200,
                                estimator=ab_tree,
                                random_state=random_state)
except TypeError:
    ab_clf = AdaBoostClassifier(n_estimators=200,
                                base_estimator=ab_tree,
                                random_state=random_state)

# max_features=30 is only a starting value: the training loop below resets it
# with set_params() to min(30, number of remaining columns) for each area.
et_clf = ExtraTreesClassifier(n_estimators=300,
                              min_samples_leaf=2,
                              min_samples_split=2,
                              max_features=30,
                              max_depth=50,
                              random_state=random_state)

# Used both as a first-level classifier and as the stack's meta-classifier.
rf_clf = RandomForestClassifier(n_estimators=200,
                                random_state=random_state)


# Train a separate stacked ensemble for each wilderness area and predict only
# the test rows that belong to that area.
wilderness_areas = ['Wilderness_Area{}'.format(i) for i in range(1, 5)]

# Per-area prediction frames are collected here and concatenated once after
# the loop, rather than growing the result with a pd.concat per iteration
# that mixes an empty Series with DataFrames.
area_predictions = []

for wa, wa_column in enumerate(wilderness_areas, start=1):
    print('> Preparing data for Wilderness_Area{}'.format(wa))

    # Keep only the rows flagged as belonging to this wilderness area.
    X_train_wa = X_train[X_train[wa_column] == 1]
    X_test_wa = X_test[X_test[wa_column] == 1]

    # Remove all four area indicator columns from the per-area subsets.
    X_train_wa = X_train_wa.drop(wilderness_areas, axis='columns')
    X_test_wa = X_test_wa.drop(wilderness_areas, axis='columns')

    y_train_wa = y_train.loc[X_train_wa.index]


    print('> Adding soil count features')
    # First store the soil type number (weighted sum of the indicator columns).
    X_train_wa['Soil_counts'] = sum(i * X_train_wa['Soil_Type{}'.format(i)] for i in range(1, 41))
    X_test_wa['Soil_counts'] = sum(i * X_test_wa['Soil_Type{}'.format(i)] for i in range(1, 41))

    # Count how often each soil number occurs in this area, train and test
    # rows combined.
    soils = pd.concat([X_train_wa['Soil_counts'], X_test_wa['Soil_counts']], ignore_index=True)
    soil_counts = soils.value_counts()

    # Replace each soil number with that frequency.
    X_train_wa['Soil_counts'] = X_train_wa['Soil_counts'].map(soil_counts)
    X_test_wa['Soil_counts'] = X_test_wa['Soil_counts'].map(soil_counts)


    print('> Dropping features')
    # The columns to drop are chosen from the training subset only; the test
    # subset is then restricted to the same columns in the same order.
    X_train_wa = drop_features(X_train_wa)
    X_test_wa = X_test_wa[X_train_wa.columns]
    print('> {} features remaining'.format(len(X_train_wa.columns)))


    print('> Up-sampling')
    # Every class present in this area is over-sampled to the size of the
    # area's largest class.
    max_samples = y_train_wa.value_counts().iat[0]
    classes = y_train_wa.unique().tolist()
    sampling_strategy = {cl: max_samples for cl in classes}

    sampler = SMOTE(sampling_strategy=sampling_strategy,
                    random_state=random_state)

    X_train_wa, y_train_wa = sampler.fit_resample(X_train_wa, y_train_wa)
    # Wrap the resampled output so the `.columns` access below works whether
    # the sampler returned arrays or pandas objects.
    X_train_wa = pd.DataFrame(X_train_wa)
    y_train_wa = pd.Series(y_train_wa)


    print('> Setting up stack')
    # Cap ExtraTrees' max_features at the number of columns left in this area.
    max_features = min(30, X_train_wa.columns.size)
    et_clf.set_params(max_features=max_features)

    stack = StackingCVClassifier(classifiers=[ab_clf, et_clf, rf_clf],
                                 meta_classifier=rf_clf,
                                 cv=5,
                                 stratify=True,
                                 shuffle=True,
                                 use_probas=True,
                                 use_features_in_secondary=True,
                                 verbose=1,
                                 random_state=random_state,
                                 n_jobs=-1
                                )


    print('> Fitting stack for Wilderness_Area{}'.format(wa))
    stack = stack.fit(X_train_wa, y_train_wa)


    print('> Making predictions for Wilderness_Area{}'.format(wa))
    prediction_wa = stack.predict(X_test_wa)
    # Re-attach the test Ids so the rows can be put back in Id order later.
    prediction_wa = pd.DataFrame(prediction_wa, index=X_test_wa.index)

    area_predictions.append(prediction_wa)


# Single-column frame of predicted cover types, indexed by test Id.
predictions = pd.concat(area_predictions)
    

print('> Creating submission')
# Predictions were collected one wilderness area at a time; put them back in
# Id order before writing.
predictions = predictions.sort_index()
predictions.to_csv(
    'submission.csv',
    index=True,
    index_label='Id',
    header=['Cover_Type'],
)

print('> Done !')