In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This is an example of using transformers and pipelines in scikit-learn. The feature engineering transformer is based on this research: https://www.kaggle.com/balamurugan1603/tps-dec-21-nn-feature-engg-tf

In [None]:
# Read the competition training set, using the `Id` column as the row index.
train_path = '/kaggle/input/tabular-playground-series-dec-2021/train.csv'
train_ds = pd.read_csv(train_path, index_col='Id')
train_ds.head(10)

Removing the empty columns, the rows with `Cover_Type == 5`, and any rows containing NaN

In [None]:
# Drop the two soil-type columns this notebook treats as empty.
train_ds = train_ds.drop(columns=["Soil_Type7", "Soil_Type15"])

# Remove the Cover_Type == 5 rows with a boolean mask. Masking those rows to
# NaN with DataFrame.where and dropping them afterwards keeps the same rows,
# but it upcasts every integer column (labels included) to float on the way.
train_ds = train_ds[train_ds['Cover_Type'] != 5]

# With the mask above, this count covers only values that are actually missing.
print('Before removing null: ', train_ds.isnull().sum().sum())
train_ds = train_ds.dropna()

In [None]:
# Separate the target from the features: keep an independent copy of the
# target column, then rebind train_ds to the frame without it.
target_col = 'Cover_Type'
labels = train_ds[target_col].copy()
train_ds = train_ds.drop(columns=[target_col])

Here we define a feature engineering transformer; it will adjust existing features and add new ones.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion


class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Stateless transformer that adjusts raw columns and adds derived features.

    Parameters
    ----------
    hill_feat : list of str
        Hillshade column names; each is limited to the range [0, 255] and the
        group is summarised into ``Hillshade_Mean`` and ``Amp_Hillshade``.
    soil_feat : list of str
        Soil-type column names, summed row-wise into ``Soil_Count``.
    wild_feat : list of str
        Wilderness-area column names, summed row-wise into ``Wild_Area_Count``.
    """

    def __init__(self, hill_feat, soil_feat, wild_feat):
        self.hill_feat = hill_feat
        self.soil_feat = soil_feat
        self.wild_feat = wild_feat

    def fit(self, X, y=None):
        """Learn nothing from the data and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with adjusted columns and six added features.

        ``X`` must contain ``Aspect``, ``Horizontal_Distance_To_Hydrology``,
        ``Vertical_Distance_To_Hydrology`` and every column named in the
        constructor lists. The caller's frame is not modified.
        """
        # Work on a copy so the caller's DataFrame is left untouched.
        X = X.copy()

        # Shift Aspect by one full turn when it is negative or above 359.
        # A single .loc assignment replaces chained indexing, which is not
        # guaranteed to write back into the frame.
        X.loc[X["Aspect"] < 0, "Aspect"] += 360
        X.loc[X["Aspect"] > 359, "Aspect"] -= 360

        # Manhattan and Euclidean distance to hydrology.
        horizontal = X["Horizontal_Distance_To_Hydrology"]
        vertical = X["Vertical_Distance_To_Hydrology"]
        X["Mnhttn_Dist_Hydrlgy"] = np.abs(horizontal) + np.abs(vertical)
        X["Ecldn_Dist_Hydrlgy"] = (horizontal**2 + vertical**2)**0.5

        # Limit every hillshade column to [0, 255].
        for hill in self.hill_feat:
            X[hill] = X[hill].clip(lower=0, upper=255)

        # Vectorised row sums instead of calling Python's sum() once per row.
        X["Soil_Count"] = X[self.soil_feat].sum(axis=1)
        X["Wild_Area_Count"] = X[self.wild_feat].sum(axis=1)

        # Summary statistics over the (already clipped) hillshade columns.
        hillshade = X[self.hill_feat]
        X["Hillshade_Mean"] = hillshade.mean(axis=1)
        X['Amp_Hillshade'] = hillshade.max(axis=1) - hillshade.min(axis=1)

        return X


In [None]:
# Group the raw columns into families by their name prefix.
all_columns = list(train_ds.columns)
hill_feat = list(filter(lambda name: name.startswith("Hillshade"), all_columns))
soil_feat = list(filter(lambda name: name.startswith("Soil_Type"), all_columns))
wild_feat = list(filter(lambda name: name.startswith("Wilderness_Area"), all_columns))

# Build the transformer from those groups and apply it to the training frame.
feature_engineer = FeatureEngineer(
    hill_feat=hill_feat,
    soil_feat=soil_feat,
    wild_feat=wild_feat,
)
train_ds = feature_engineer.fit_transform(train_ds)
train_ds.head(10)

Here we add a data selector. It will separate the dataset based on column types. We also define pipelines to preprocess the dataset.

In [None]:
class DataSelector(BaseEstimator, TransformerMixin):
    """Select a fixed set of DataFrame columns and return them as a NumPy array.

    Parameters
    ----------
    attributes : list of str
        Column labels to extract, in the order they should appear in the output.
    dtype : NumPy dtype, default=np.float16
        Element type of the returned array. Half precision keeps the array
        small but carries only an 11-bit significand and cannot represent
        magnitudes above 65504; pass ``np.float32`` when that is too coarse.
    """
    def __init__(self, attributes, dtype=np.float16):
        self.attributes = attributes
        self.dtype = dtype
    def fit(self, X, y=None):
        """Learn nothing from the data and return ``self``."""
        return self
    def transform(self, X, y=None):
        """Return the selected columns of ``X`` as an array of ``self.dtype``."""
        return X[self.attributes].to_numpy().astype(self.dtype)


# Columns whose name starts with one of these prefixes go to the pass-through
# branch; every other column goes to the scaled branch.
binary_prefixes = ('Soil_Type', 'Wilderness')
bool_cols = [col for col in train_ds.columns if col.startswith(binary_prefixes)]
num_cols = [col for col in train_ds.columns if not col.startswith(binary_prefixes)]

# Scaled branch: select the columns, then standardise them.
num_pipeline = Pipeline(steps=[
    ('num_selector', DataSelector(num_cols)),
    ('scaler', StandardScaler()),
])

# Pass-through branch: select the columns only.
int_pipeline = Pipeline(steps=[
    ('int_selector', DataSelector(bool_cols)),
])

# Concatenate both branches column-wise (scaled columns first).
preprocessor = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('int_pipeline', int_pipeline),
])

Prepare dataset with pipelines

In [None]:
# Unwrap the target Series into a plain NumPy array for the classifier.
labels = labels.to_numpy()
# Fit the preprocessing union on the engineered training frame and replace the
# frame with the resulting feature matrix.
train_ds = preprocessor.fit_transform(train_ds)

I didn't reduce memory usage for this task, so *n_estimators=25*. Sorry :) This is all for demonstration purposes.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Configure the forest first, then fit it; fit() returns the estimator itself,
# so `clf` is the trained model.
forest = RandomForestClassifier(
    n_estimators=25,
    random_state=42,
    verbose=2,
    n_jobs=-1,
)
clf = forest.fit(train_ds, labels)

Our pipeline in action on the test dataset.

In [None]:
# Load the test set with the same index column as the training data.
test_path = '/kaggle/input/tabular-playground-series-dec-2021/test.csv'
test_ds = pd.read_csv(test_path, index_col='Id')

# Add the engineered features, run the already-fitted preprocessor, and predict.
test_engineer = FeatureEngineer(hill_feat=hill_feat, soil_feat=soil_feat, wild_feat=wild_feat)
added_features = test_engineer.fit_transform(test_ds)
test_data = preprocessor.transform(added_features)
test_labels = clf.predict(test_data)

In [None]:
# Store the predicted classes as unsigned 8-bit integers.
predictions = test_labels.astype(np.uint8)

# Pair each prediction with its test-row Id, then turn the Id index into a
# regular column so the frame has exactly the columns `Id` and `Cover_Type`.
sub_df = (
    pd.Series(predictions, index=test_ds.index, name='Cover_Type')
    .rename_axis('Id')
    .reset_index()
)

sub_df.to_csv("submission.csv", index=False)

If you found this useful, please upvote ;)