In [None]:
import numpy as np

import pandas as pd

from lightgbm import LGBMClassifier

from sklearn import model_selection
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

from IPython.display import display

In [None]:
# Utility function
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to save memory.

    Integer columns are converted to the first of int8/int16/int32/int64 whose
    range contains the column's minimum and maximum (bounds inclusive, so a
    column peaking at exactly 127 becomes int8).  Float columns become float32
    when their range fits and float64 otherwise.  Columns whose dtype is not
    listed in ``numerics`` are left untouched.

    Note that float32 stores fewer significant digits than float64, so the
    float downcast can lose precision.

    Args:
        df: pandas DataFrame whose columns are reassigned in place.
        verbose: when True, print the final footprint and percentage saved.

    Returns:
        The same DataFrame object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Ordered from narrowest to widest so the first match is the smallest fit.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive comparison: the type's own min/max are valid values.
                # An empty column yields NaN here, so nothing matches and the
                # column keeps its dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            limits = np.finfo(np.float32)
            # NaN or infinite extremes fail this test and fall back to float64.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid a 0/0 when the frame reports no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Load the competition's train and test tables from the Kaggle input directory.
train = pd.read_csv("../input/tabular-playground-series-dec-2021/train.csv")
test = pd.read_csv("../input/tabular-playground-series-dec-2021/test.csv")

In [None]:
# Remove every row labelled Cover_Type 5.
# NOTE(review): presumably this class is too rare to learn from - confirm
# against the value counts before the filter.
# .copy() gives `train` its own data, so the in-place column edits in later
# cells act on a real frame rather than on a slice of the loaded one.
train = train[train.Cover_Type != 5].copy()

In [None]:
# Show how many rows remain for each target label after the filter.
train.Cover_Type.value_counts()

In [None]:
%%time

# Reduce memory: downcast the numeric columns of both frames; each call prints
# the resulting footprint and returns the frame it was given.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

In [None]:
# Drop the row identifier and two soil indicator columns from both frames.
# NOTE(review): presumably Soil_Type7 / Soil_Type15 carry no signal - confirm.
# errors="ignore" plus reassignment keeps this cell safe to re-run: the second
# run finds the columns already gone instead of raising KeyError.
train = train.drop(columns=["Soil_Type7", "Id", "Soil_Type15"], errors="ignore")
test = test.drop(columns=["Soil_Type7", "Id", "Soil_Type15"], errors="ignore")

In [None]:
# Wrap Aspect back into the 0..359 range: negative values gain 360 and values
# above 359 lose 360 (a single shift each, as before).
# A single .loc call writes to the frame itself. The previous chained form
# (frame["Aspect"][mask] += 360) assigns through a temporary Series, which
# pandas does not guarantee to propagate and never does under Copy-on-Write.
for frame in (train, test):
    frame.loc[frame["Aspect"] < 0, "Aspect"] += 360
    frame.loc[frame["Aspect"] > 359, "Aspect"] -= 360

In [None]:
# Distances to hydrology, computed on float64 copies of the two components.
# The memory-reduction step earlier in the notebook stores integer columns in
# the narrowest type that fits their range, so squaring them directly can wrap
# around (e.g. 200**2 does not fit in int16) and turn the square root into NaN.
for frame in (train, test):
    horizontal = frame["Horizontal_Distance_To_Hydrology"].astype("float64")
    vertical = frame["Vertical_Distance_To_Hydrology"].astype("float64")

    # Manhattan distance to Hydrology
    frame["mnhttn_dist_hydrlgy"] = np.abs(horizontal) + np.abs(vertical)

    # Euclidean distance to Hydrology
    frame["ecldn_dist_hydrlgy"] = (horizontal**2 + vertical**2)**0.5

In [None]:
# Clamp the three hillshade readings of both frames to the 0..255 range:
# anything below 0 is set to 0 and anything above 255 is set to 255.
for frame in (train, test):
    for column in ("Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"):
        frame.loc[frame[column] < 0, column] = 0
        frame.loc[frame[column] > 255, column] = 255

In [None]:
# Column groups consumed by addFeature.
features_Hillshade = ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']
soil_features = [x for x in train.columns if x.startswith("Soil_Type")]
# "Wilderness_Area_Count" is a column addFeature creates and it shares the
# prefix; exclude it so re-running this cell after addFeature does not feed
# the count back into itself.
wilderness_features = [
    x for x in train.columns
    if x.startswith("Wilderness_Area") and x != "Wilderness_Area_Count"
]

def addFeature(X):
    """Add four row-wise summary columns to ``X`` in place.

    Reads the module-level lists ``soil_features``, ``wilderness_features`` and
    ``features_Hillshade`` and writes:

    * ``Soil_Count`` - sum of the soil columns,
    * ``Wilderness_Area_Count`` - sum of the wilderness columns,
    * ``Hillshade_mean`` - mean of the hillshade columns,
    * ``amp_Hillshade`` - max minus min of the hillshade columns.

    Args:
        X: pandas DataFrame containing every column named in those lists.

    Returns:
        None; ``X`` is modified in place.
    """
    # DataFrame.sum(axis=1) is vectorised; apply(sum, axis=1) ran the Python
    # builtin once per row, which is very slow on millions of rows.
    X["Soil_Count"] = X[soil_features].sum(axis=1)
    X["Wilderness_Area_Count"] = X[wilderness_features].sum(axis=1)

    hillshade = X[features_Hillshade]
    X["Hillshade_mean"] = hillshade.mean(axis=1)
    X['amp_Hillshade'] = hillshade.max(axis=1) - hillshade.min(axis=1)

In [None]:
%%time

# Add the engineered summary columns to both frames (addFeature works in place).
addFeature(train)
addFeature(test)

In [None]:
# Columns to rescale. The Soil_Type* / Wilderness_Area* columns are not listed
# here, so they are left unscaled.
cols = [
    "Elevation",
    "Aspect",
    "mnhttn_dist_hydrlgy",
    "ecldn_dist_hydrlgy",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",    
    "Soil_Count",
    "Wilderness_Area_Count",
    "Hillshade_mean",
    "amp_Hillshade"
]

# Fit the scaler on the training frame only, then apply that same fitted
# transform to the test frame.
scaler = RobustScaler()
train[cols] = scaler.fit_transform(train[cols])
test[cols] = scaler.transform(test[cols])

In [None]:
def create(hyperparams):
    """Build an unfitted LGBMClassifier from a dict of keyword arguments.

    Args:
        hyperparams: mapping unpacked into the LGBMClassifier constructor.

    Returns:
        The new classifier instance.
    """
    return LGBMClassifier(**hyperparams)

def fit_with_stop(model, X, y, X_val, y_val, esr):
    """Fit ``model`` on (X, y) with early stopping monitored on (X_val, y_val).

    LightGBM 4 removed the ``early_stopping_rounds`` and ``verbose`` keyword
    arguments of ``fit`` in favour of callbacks.  The signature of
    ``model.fit`` is inspected so the same call works with either API.

    Args:
        model: estimator exposing a LightGBM-style ``fit``.
        X, y: training features and target.
        X_val, y_val: validation features and target used for early stopping.
        esr: number of rounds without improvement before training stops.

    Returns:
        The fitted ``model`` (the same object that was passed in).
    """
    import inspect

    fit_parameters = inspect.signature(model.fit).parameters
    if "early_stopping_rounds" in fit_parameters:
        # Legacy API: early stopping and logging period are fit() keywords.
        model.fit(X, y,
                  eval_set=(X_val, y_val),
                  early_stopping_rounds=esr,
                  verbose=200)
    else:
        # Callback API: same stopping patience and same logging period (200).
        import lightgbm

        model.fit(X, y,
                  eval_set=(X_val, y_val),
                  callbacks=[lightgbm.early_stopping(stopping_rounds=esr),
                             lightgbm.log_evaluation(period=200)])
    return model

def evaluate(model, X, y):
    """Return the accuracy of ``model``'s predictions on (X, y).

    Predictions equal to 4 are rewritten to 3 before scoring.
    NOTE(review): the reason is not stated in this notebook - presumably label 4
    is considered too rare to predict; confirm before relying on the score.

    Args:
        model: fitted estimator exposing ``predict``.
        X: features to predict on.
        y: true labels.

    Returns:
        The value returned by ``accuracy_score`` for the remapped predictions.
    """
    predicted = model.predict(X)
    remapped = np.where(predicted == 4, 3, predicted)
    return accuracy_score(y, remapped)

def kfold_evaluation(X, y, k, hyperparams, esr):
    """Run a k-fold evaluation and collect the per-fold scores.

    For each fold of ``KFold(k)`` a fresh model is created, fitted with early
    stopping on the held-out part, and scored on both parts with ``evaluate``.

    Args:
        X, y: pandas objects holding features and target (indexed with iloc).
        k: number of folds.
        hyperparams: keyword arguments forwarded to ``create``.
        esr: early-stopping patience forwarded to ``fit_with_stop``.

    Returns:
        DataFrame with one row per fold and the columns
        'train score' and 'validation score'.
    """
    print(f"\n------ {k}-fold evaluation -----")
    print(hyperparams)

    fold_scores = []
    splitter = KFold(k)
    for fold, (fit_rows, holdout_rows) in enumerate(splitter.split(X)):
        print(f"\n----- FOLD {fold} -----")

        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_holdout, y_holdout = X.iloc[holdout_rows], y.iloc[holdout_rows]

        fitted = fit_with_stop(create(hyperparams),
                               X_fit, y_fit, X_holdout, y_holdout, esr)

        fit_score = evaluate(fitted, X_fit, y_fit)
        holdout_score = evaluate(fitted, X_holdout, y_holdout)
        fold_scores.append((fit_score, holdout_score))

        print(f"Fold {fold} | Eval ACC: {holdout_score}")

    return pd.DataFrame(fold_scores, columns=['train score', 'validation score'])

def kfold_prediction(X, y, X_test, k, hyperparams, esr):
    """Predict class labels for ``X_test`` with a k-fold bagged model.

    One model is trained per fold of ``KFold(k)`` (early stopping on the
    held-out part).  The folds' class-probability matrices are averaged and the
    most probable label is returned for every test row.

    The previous version accumulated only ``predict_proba(...)[:, 1]``, i.e. the
    averaged probability of a single class, which is not a class prediction.

    Args:
        X, y: pandas objects holding training features and target.
        X_test: features to predict.
        k: number of folds / bagged models.
        hyperparams: keyword arguments forwarded to ``create``.
        esr: early-stopping patience forwarded to ``fit_with_stop``.

    Returns:
        1-D numpy array of length ``len(X_test)`` holding predicted labels
        (values taken from the labels present in ``y``).
    """
    # All labels present in the target, sorted (np.unique flattens and sorts).
    classes = np.unique(np.asarray(y))
    proba = np.zeros((len(X_test), len(classes)))

    print(f"\n------ {k}-fold evaluation -----")
    print(hyperparams)

    kf = KFold(k)
    for i, (train_idx, val_idx) in enumerate(kf.split(X)):
        print(f"\n----- FOLD {i} -----")
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx]
        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx]

        model = create(hyperparams)
        model = fit_with_stop(model, X_train, y_train, X_val, y_val, esr)

        # A fold that never saw a rare label has fewer probability columns;
        # place each column under its own label before accumulating.
        columns = np.searchsorted(classes, model.classes_)
        proba[:, columns] += model.predict_proba(X_test) / k

    return classes[proba.argmax(axis=1)]

In [None]:
# Features & target

x_train = train.loc[:, train.columns != 'Cover_Type']
y_train = train.loc[:, train.columns == 'Cover_Type']
x_test = test.loc[:, test.columns != 'Cover_Type']

# Subset used by the hyper-parameter search and the final fit.
# random_state pins the draw so scores are comparable between runs, and n is
# capped at the frame length so the cell cannot raise on a smaller frame.
train_sample = train.sample(n=min(300000, len(train)), random_state=42)
x_train_sample = train_sample.loc[:, train_sample.columns != 'Cover_Type']
y_train_sample = train_sample.loc[:, train_sample.columns == 'Cover_Type']


In [None]:
# Fixed LightGBM settings shared by the search and the final fit; the search
# merges its sampled values on top of a copy of these.
BEST_PARAMS = dict(
    n_estimators=10000,      # upper bound only: early stopping ends training sooner
    learning_rate=0.05,      # rate used while searching
    metric="multi_logloss",
    device='gpu',            # requires a GPU-enabled LightGBM build
)

In [None]:
# Objective function
def objective(trial):
    """Optuna objective: mean validation score of a 5-fold run on the sample.

    Samples one value per entry of the search space from ``trial``, overlays the
    fixed ``BEST_PARAMS`` (which win on key clashes) and evaluates the result
    with ``kfold_evaluation`` using an early-stopping patience of 100.

    Args:
        trial: Optuna trial providing ``suggest_float`` / ``suggest_int``.

    Returns:
        Mean of the 'validation score' column over the folds.
    """
    # (name, kind, low, high) - kept in the order the values are requested.
    search_space = [
        ('reg_alpha', 'float', 0.001, 10.0),
        ('reg_lambda', 'float', 0.001, 10.0),
        ('num_leaves', 'int', 5, 1000),
        ('min_child_samples', 'int', 5, 100),
        ('max_depth', 'int', 5, 64),
        ('colsample_bytree', 'float', 0.1, 0.5),
        ('cat_smooth', 'int', 10, 100),
        ('cat_l2', 'int', 1, 20),
        ('min_data_per_group', 'int', 50, 200),
    ]
    suggest = {'float': trial.suggest_float, 'int': trial.suggest_int}
    sampled = {name: suggest[kind](name, low, high)
               for name, kind, low, high in search_space}

    # Fixed settings are layered on top of the sampled ones.
    hyperparams = {**sampled, **BEST_PARAMS}

    fold_scores = kfold_evaluation(x_train_sample, y_train_sample, 5, hyperparams, 100)
    return fold_scores['validation score'].mean()

In [None]:
%%time

# Optimization: maximise the objective's mean validation score, running new
# trials until the 5-hour time budget (3600 * 5 seconds) is used up.
study = optuna.create_study(direction='maximize')
study.optimize(objective, timeout=3600*5)

In [None]:
# Best score: highest objective value reached by any trial of the study.
study.best_value

In [None]:
# Optimization history: objective value of each trial over the course of the study.
plot_optimization_history(study)

In [None]:
# Importance: relative influence of each searched hyper-parameter on the objective.
plot_param_importances(study)

In [None]:
# Best parameters: merge the best trial's sampled values into BEST_PARAMS
# (updated in place) and display the combined dict.
BEST_PARAMS.update(study.best_params)
BEST_PARAMS

In [None]:
# Update hyperparams for prediction: the final fit uses a learning rate ten
# times smaller than the 0.05 used during the search.
BEST_PARAMS['learning_rate'] = 0.005

In [None]:
# Predictions on test set and submission
test['Cover_Type'] = kfold_prediction(x_train_sample, y_train_sample, x_test, 5, BEST_PARAMS, 150)

# The "Id" column was dropped from `test` during preprocessing, so writing the
# frame's own index would label rows 0..n-1 instead of their real ids.
# Re-read just that column from the source file (same row order as `test`).
test_ids = pd.read_csv("../input/tabular-playground-series-dec-2021/test.csv", usecols=["Id"])["Id"]
submission = pd.DataFrame({
    "Id": test_ids.to_numpy(),
    "Cover_Type": test['Cover_Type'].to_numpy(),
})
submission.to_csv('submission.csv', index=False)