In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import lightgbm as lgb
import joblib
from joblib import Parallel, delayed
from functools import partial
from multiprocessing import Pool

import warnings
warnings.filterwarnings('ignore')

In [None]:
def parallelize_df(df, func, n_cores=8):
    """Apply ``func`` to row-wise chunks of ``df`` in parallel and stitch the results together.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is cut into ``n_cores`` consecutive row chunks
        (chunk sizes differ by at most one row; chunks may be empty when
        ``df`` has fewer rows than ``n_cores``).
    func : callable
        Called once per chunk with that chunk as its only argument. It runs in
        a worker process, so it must be picklable, and it must return an
        object ``pd.concat`` accepts.
    n_cores : int, default 8
        Number of chunks and number of worker processes.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        ``pd.concat`` of the per-chunk results, in chunk order.

    Raises
    ------
    Exception
        Whatever ``func`` raises in a worker is re-raised here by ``Pool.map``.
    """
    # Split row *positions* and slice with .iloc instead of handing the frame
    # itself to np.array_split: this yields the same chunks without relying on
    # numpy's handling of DataFrame objects.
    row_chunks = np.array_split(np.arange(len(df)), n_cores)
    pieces = [df.iloc[rows] for rows in row_chunks]

    # The context manager tears the pool down even when func raises, so worker
    # processes are not left behind after a failed run.
    with Pool(n_cores) as pool:
        results = pool.map(func, pieces)

    return pd.concat(results)

In [None]:
# Read the three competition CSV files: training data, test data and the
# submission template.
train_df, test_df, sample_submission = map(
    pd.read_csv,
    (
        '../input/tabular-playground-series-dec-2021/train.csv',
        '../input/tabular-playground-series-dec-2021/test.csv',
        '../input/tabular-playground-series-dec-2021/sample_submission.csv',
    ),
)

In [None]:
# From public notebooks
def feature_engineering(df):
    """Clean raw columns and add derived features, modifying ``df`` in place.

    Steps, in order:
      * ``Aspect``: values below 0 get 360 added, then values above 359 get
        360 subtracted.
      * ``Hillshade_9am`` / ``Hillshade_Noon`` / ``Hillshade_3pm``: clamped to
        the range 0..255.
      * ``manhattan_dist_hydrology`` / ``euclidean_dist_hydrology``: L1 and L2
        norms of the horizontal and vertical distance-to-hydrology columns.
      * ``soil_type_count`` / ``wilderness_area_count``: row sums over all
        columns whose name starts with ``Soil_Type`` / ``Wilderness_Area``.
      * ``Soil_Type7`` and ``Soil_Type15`` are dropped when present.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns named above. It is mutated.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Use .loc rather than chained indexing (df["Aspect"][mask] += ...):
    # the chained form writes to an intermediate Series, which triggers
    # SettingWithCopyWarning and is silently lost under pandas Copy-on-Write.
    df.loc[df["Aspect"] < 0, "Aspect"] += 360
    df.loc[df["Aspect"] > 359, "Aspect"] -= 360

    for shade_col in ("Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"):
        df[shade_col] = df[shade_col].clip(lower=0, upper=255)

    horizontal = df["Horizontal_Distance_To_Hydrology"]
    vertical = df["Vertical_Distance_To_Hydrology"]
    df["manhattan_dist_hydrology"] = np.abs(horizontal) + np.abs(vertical)
    df["euclidean_dist_hydrology"] = (horizontal**2 + vertical**2)**0.5

    # The counts are taken before the drop below, so Soil_Type7/15 still
    # contribute to soil_type_count when they are present.
    soil_features = [x for x in df.columns if x.startswith("Soil_Type")]
    df["soil_type_count"] = df[soil_features].sum(axis=1)
    wilderness_features = [x for x in df.columns if x.startswith("Wilderness_Area")]
    df["wilderness_area_count"] = df[wilderness_features].sum(axis=1)

    # errors="ignore" keeps the function re-runnable on a frame that has
    # already been processed (e.g. when the notebook cell is executed twice).
    df.drop(columns=["Soil_Type7", "Soil_Type15"], inplace=True, errors="ignore")
    return df

In [None]:
# Run the same feature pipeline over the training frame first, then the test frame.
train_df, test_df = map(feature_engineering, (train_df, test_df))

In [None]:
# Test-time feature matrix: the engineered test frame, used as-is.
X_ = test_df
# Training features: every row whose Cover_Type is not 5.
X = train_df.loc[train_df['Cover_Type'].ne(5)]
# Pull the label column out of X and keep it as a plain array.
y = X.pop('Cover_Type').to_numpy()

In [None]:
# LightGBM settings. Anything not listed here is left at the library default;
# none of these values has been tuned, so a search (e.g. with Optuna) may help.
# NOTE(review): LightGBM's parameter docs describe is_unbalance as applying to
# the binary and multiclassova objectives — confirm it has any effect together
# with objective='multiclass'.
params = dict(
    metric='multi_logloss',
    is_unbalance=True,
    n_estimators=20000,
    objective='multiclass',
    learning_rate=0.001,
)

In [None]:
NFOLDS = 5
# Fixed seed: with shuffle=True and no random_state the fold assignment (and
# therefore every downstream result) changes on each run.
SEED = 42
folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
# Zero-filled buffers: one slot per training row, and one row per test sample
# with 6 columns.
# `predictions` is rebound to the list of per-fold results by the parallel
# training step; `oof` and `best_prediction` are not written to anywhere in
# the visible part of the notebook.
oof = np.zeros(X.shape[0])
predictions = np.zeros((X_.shape[0], 6))
best_prediction = np.zeros((X_.shape[0], 6))

In [None]:
def worker(p, fold_, X_train, y_train, X_test, y_test):
    """Train one LightGBM classifier for a fold and predict the test set.

    Parameters
    ----------
    p : dict
        Keyword arguments forwarded to ``lgb.LGBMClassifier`` (``n_jobs=-1``
        is added on top).
    fold_ : int
        Fold number. Accepted for bookkeeping by the caller; not used here.
    X_train, y_train
        Features and labels the model is fitted on.
    X_test, y_test
        Held-out features and labels, passed to ``fit`` as ``eval_set`` for
        early stopping.

    Returns
    -------
    The result of ``predict`` on the module-level ``X_`` — note that the
    prediction target is read from the global namespace, not from an argument.
    """
    model = lgb.LGBMClassifier(n_jobs=-1, **p)
    fit_options = {
        'eval_set': (X_test, y_test),
        'verbose': 250,
        'early_stopping_rounds': 50,
    }
    model.fit(X_train, y_train, **fit_options)
    return model.predict(X_)

In [None]:
# Build one argument tuple per fold, then train all folds concurrently.
clf = None
parameters = []
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    print(f"Fold {fold_}")
    X_train, X_test = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_test = y[trn_idx], y[val_idx]
    parameters.append((params, fold_, X_train, y_train, X_test, y_test))
# One joblib job per fold; `predictions` ends up holding one worker() result per fold.
predictions = Parallel(n_jobs=NFOLDS)(
    delayed(worker)(*job_args) for job_args in parameters
)


In [None]:
# Lay the per-fold predictions side by side: one column per fold.
predictions_df = pd.concat(list(map(pd.DataFrame, predictions)), axis=1)
# Majority vote: row-wise mode across the fold columns, computed in parallel chunks.
final_predictions = parallelize_df(
    predictions_df,
    func=partial(pd.DataFrame.mode, axis=1),
)

In [None]:
# Assemble the submission: take the template, swap its Cover_Type column for
# the first column of the voting result, and write it out as CSV.
submission = pd.concat(
    [sample_submission, final_predictions[0]],
    axis=1,
).drop(columns='Cover_Type')
# After the drop, the remaining columns are relabelled positionally.
submission.columns = ['Id', 'Cover_Type']
submission['Cover_Type'] = submission['Cover_Type'].astype(int)
submission.to_csv('submission.csv', index=False)