In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Single source of truth for the dataset location (previously repeated in every call).
DATA_DIR = '/kaggle/input/richters-predictor-modeling-earthquake-damage'

# Feature tables and training labels, all indexed by building_id.
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train_values.csv'), index_col='building_id')
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test_values.csv'), index_col='building_id')
target_df = pd.read_csv(os.path.join(DATA_DIR, 'train_labels.csv'), index_col='building_id')

In [None]:
# Columns handed to the target encoder as categorical features.
cat_features = [
    'geo_level_1_id',
    'geo_level_2_id',
    'geo_level_3_id',
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'legal_ownership_status',
]

# Numeric feature columns (not referenced again in the visible part of the notebook).
num_features = [
    'count_floors_pre_eq',
    'age',
    'area_percentage',
    'height_percentage',
    'count_families',
]

 **Реализация K-Fold MultiEncoding**

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

from category_encoders.woe import WOEEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.m_estimate import MEstimateEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.james_stein import JamesSteinEncoder

In [None]:
from sklearn.model_selection import KFold 

class MultiEncoderKF(BaseEstimator, TransformerMixin):
    """Cross-fold wrapper around several supervised categorical encoders.

    ``fit`` trains a single encoder on all the data it is given; that encoder
    is used by ``transform`` when no target is supplied (e.g. test data).
    When ``transform`` receives a target, one additional encoder is fitted per
    K-fold split on the in-fold rows and used to encode only the held-out
    rows, so each row is encoded by an encoder that was not fitted on it.
    """

    def __init__(self, encoder_name, n_splits=3, cols=None, shuffle=True,
                 random_state=None, **encoder_params):
        """Configure the cross-fold encoder.

        Parameters
        ----------
        encoder_name : str
            One of "WOEEncoder", "TargetEncoder", "MEstimateEncoder",
            "LeaveOneOutEncoder", "JamesSteinEncoder", "CatBoostEncoder".
        n_splits : int
            Number of cross-fold splits. Default = 3.
        cols : list of str
            Columns to encode; passed to the underlying encoder as ``cols``.
        shuffle : bool
            Whether to shuffle the data when splitting into folds.
        random_state : int or None
            Seed for the fold shuffling; only used when ``shuffle`` is True.
            Default = None (folds differ between runs).
        **encoder_params :
            Optional encoder settings. Recognised keys and their defaults:
            ``smoothing`` (1.0) and ``min_samples_leaf`` (1) for
            TargetEncoder, ``regularization`` (1.0) for WOEEncoder,
            ``m`` (1.0) for MEstimateEncoder, ``a`` (1) for CatBoostEncoder.
            Any other key is ignored.
        """
        self.encoder_name = encoder_name
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.cols = cols
        self.random_state = random_state

        # Target Encoder
        self.encoder_smoothing = encoder_params.get('smoothing', 1.0)
        self.encoder_min_samples_leaf = encoder_params.get('min_samples_leaf', 1)

        # Weight of Evidence
        self.encoder_regularization = encoder_params.get('regularization', 1.0)

        # MEstimateEncoder
        self.encoder_m = encoder_params.get('m', 1.0)

        # CatBoostEncoder
        self.encoder_a = encoder_params.get('a', 1)

    def _make_encoder(self):
        """Return a new, unfitted encoder of the type named by ``encoder_name``.

        Raises
        ------
        ValueError
            If ``encoder_name`` is not one of the supported names.
        """
        # Lambdas defer construction so only the requested encoder is built.
        factories = {
            "WOEEncoder": lambda: WOEEncoder(
                cols=self.cols,
                regularization=self.encoder_regularization),
            "TargetEncoder": lambda: TargetEncoder(
                cols=self.cols,
                smoothing=self.encoder_smoothing,
                min_samples_leaf=self.encoder_min_samples_leaf),
            "MEstimateEncoder": lambda: MEstimateEncoder(
                cols=self.cols,
                m=self.encoder_m),
            "LeaveOneOutEncoder": lambda: LeaveOneOutEncoder(cols=self.cols),
            "JamesSteinEncoder": lambda: JamesSteinEncoder(cols=self.cols),
            "CatBoostEncoder": lambda: CatBoostEncoder(
                cols=self.cols,
                a=self.encoder_a),
        }
        try:
            factory = factories[self.encoder_name]
        except KeyError:
            raise ValueError(
                'Unknown encoder_name %r; expected one of %s'
                % (self.encoder_name, sorted(factories))) from None
        return factory()

    @staticmethod
    def _take_rows(data, positions):
        """Select rows by integer position from a pandas object or an array-like."""
        # Positional access must go through .iloc for pandas objects: plain []
        # on a Series would look the positions up as index labels.
        if hasattr(data, 'iloc'):
            return data.iloc[positions]
        return np.asarray(data)[positions]

    def fit(self, X, y):
        """Fit the full-data encoder to X and y.

        Parameters
        ----------
        X : pandas DataFrame, shape [n_samples, n_columns]
            DataFrame containing columns to encode
        y : pandas Series, shape = [n_samples]
            Target values.

        Returns
        -------
        self : encoder
            Returns self.

        Raises
        ------
        ValueError
            If ``encoder_name`` is not supported.
        """
        self._encoder = self._make_encoder()
        self._encoder.fit(X, y)
        return self

    def transform(self, X, y=None):
        """Perform the target encoding transformation.

        Without ``y`` the encoder trained in ``fit`` is applied as-is (test
        data). With ``y`` the data is split into ``n_splits`` folds and every
        held-out fold is encoded by an encoder fitted on the remaining folds.

        Parameters
        ----------
        X : pandas DataFrame or numpy ndarray, shape [n_samples, n_columns]
            Data containing columns to encode
        y : pandas Series or array-like, shape = [n_samples], optional
            Target values; when given, cross-fold encoding is used.

        Returns
        -------
        pandas DataFrame or numpy ndarray
            Copy of the input with the held-out rows replaced by their
            encoded values (or the fitted encoder's output when y is None).

        Raises
        ------
        TypeError
            If ``y`` is given and X is neither a DataFrame nor an ndarray.
        ValueError
            If ``encoder_name`` is not supported.
        """
        # Use target encoding from fit() if this is test data
        if y is None:
            return self._encoder.transform(X)

        is_frame = isinstance(X, pd.DataFrame)
        if not is_frame and not isinstance(X, np.ndarray):
            raise TypeError('X must be DataFrame or ndarray')

        self._train_ix = []
        self._test_ix = []
        self._fit_tes = []
        # KFold rejects a random_state when shuffle is off, so only pass it on
        # when it can actually take effect.
        kf = KFold(n_splits=self.n_splits, shuffle=self.shuffle,
                   random_state=self.random_state if self.shuffle else None)

        Xo = X.copy()
        for train_ix, test_ix in kf.split(X):
            self._train_ix.append(train_ix)
            self._test_ix.append(test_ix)

            # Fit on the in-fold rows only ...
            encoder = self._make_encoder()
            encoder.fit(self._take_rows(X, train_ix),
                        self._take_rows(y, train_ix))
            self._fit_tes.append(encoder)

            # ... and encode just the held-out rows with that encoder.
            encoded = encoder.transform(self._take_rows(X, test_ix))
            if is_frame:
                Xo.iloc[test_ix, :] = encoded
            else:
                Xo[test_ix, :] = encoded
        return Xo

    def fit_transform(self, X, y=None):
        """Fit and transform the data via target encoding.

        Parameters
        ----------
        X : pandas DataFrame, shape [n_samples, n_columns]
            DataFrame containing columns to encode
        y : pandas Series, shape = [n_samples]
            Target values (required!).

        Returns
        -------
        pandas DataFrame
            Input DataFrame with transformed columns

        Raises
        ------
        ValueError
            If ``y`` is None.
        """
        if y is None:
            raise ValueError('y is required for fit_transform')
        return self.fit(X, y).transform(X, y)

In [None]:
X_train = train_df
X_test = test_df
# Keep the labels as a 1-D Series rather than a one-column DataFrame, which is
# what the encoder docstring documents for y.
# NOTE(review): assumes the first column of train_labels.csv (after the
# building_id index) is the target — confirm.
y_train = target_df.iloc[:, 0]

**Кодирование**

In [None]:
%%time
enc = MultiEncoderKF(encoder_name='TargetEncoder', cols=cat_features, n_splits=5)
X_train_enc = enc.fit_transform(X_train, y_train)
X_test_enc = enc.transform(X_test)

In [None]:
# Cast the encoded categorical columns to float64 in both frames.
for encoded_frame in (X_train_enc, X_test_enc):
    encoded_frame[cat_features] = encoded_frame[cat_features].astype('float64')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the encoded training data for validation (fixed seed).
split_parts = train_test_split(
    X_train_enc,
    y_train,
    test_size=0.2,
    random_state=42,
)
X_train_split, X_test_split, y_train_split, y_test_split = split_parts

**Запуск модели**

In [None]:
from lightgbm import LGBMRegressor

# LightGBM regressor on the damage grade.
# `loss_function` is a CatBoost keyword that LightGBM does not recognise; the
# L2 regression loss is selected with `objective='regression'`, and the seed is
# set through the scikit-learn wrapper's `random_state`.
lgb_reg = LGBMRegressor(
    n_estimators=10000,
    max_depth=9,
    learning_rate=0.1,
    random_state=42,
    objective='regression'
)

In [None]:
# Fit on the training split, monitoring the hold-out split with
# early_stopping_rounds=100 and verbose=100.
holdout = [(X_test_split, y_test_split)]
lgb_reg.fit(
    X_train_split,
    y_train_split,
    eval_set=holdout,
    early_stopping_rounds=100,
    verbose=100,
)

In [None]:
# Round the regression output to the nearest grade and clamp it to the label
# range seen in the training split, so no out-of-range grade can be produced.
grade_min = np.asarray(y_train_split).min()
grade_max = np.asarray(y_train_split).max()
y_pred = np.clip(np.around(lgb_reg.predict(X_test_split)), grade_min, grade_max)

In [None]:
from sklearn.metrics import f1_score

# Micro-averaged F1 of the rounded predictions against the hold-out labels.
f1_score(y_true=y_test_split, y_pred=y_pred, average='micro')

In [None]:
# Display the fitted LightGBM model's raw feature-importance array.
lgb_reg.feature_importances_

In [None]:
# Pair each LightGBM importance with its column name, sort ascending and
# draw the result as a horizontal bar chart.
fi_df = (
    pd.DataFrame({
        'importance': lgb_reg.feature_importances_,
        'feature': X_train.columns,
    })
    .sort_values(by='importance')
)
fi_df.plot(kind='barh', x='feature', y='importance', figsize=(10, 10))

In [None]:
from xgboost import XGBRegressor

# XGBoost regressor on the damage grade with squared-error loss.
# The scikit-learn wrapper takes its seed as `random_state`; `random_seed` is
# not one of its parameters.
xgb_reg = XGBRegressor(
    n_estimators=10000,
    max_depth=9,
    learning_rate=0.1,
    random_state=42,
    objective='reg:squarederror'
)

In [None]:
# Fit on the training split, monitoring the hold-out split with
# early_stopping_rounds=100 and verbose=100.
holdout = [(X_test_split, y_test_split)]
xgb_reg.fit(
    X_train_split,
    y_train_split,
    eval_set=holdout,
    early_stopping_rounds=100,
    verbose=100,
)

In [None]:
# Round the regression output to the nearest grade and clamp it to the label
# range seen in the training split, so no out-of-range grade can be produced.
grade_min = np.asarray(y_train_split).min()
grade_max = np.asarray(y_train_split).max()
y_pred = np.clip(np.around(xgb_reg.predict(X_test_split)), grade_min, grade_max)
f1_score(y_test_split, y_pred, average='micro')

In [None]:
# Display the fitted XGBoost model's raw feature-importance array.
xgb_reg.feature_importances_

In [None]:
# Pair each XGBoost importance with its column name, sort ascending and
# draw the result as a horizontal bar chart.
fi_df = (
    pd.DataFrame({
        'importance': xgb_reg.feature_importances_,
        'feature': X_train.columns,
    })
    .sort_values(by='importance')
)
fi_df.plot(kind='barh', x='feature', y='importance', figsize=(10, 10))

In [None]:
from catboost import CatBoostRegressor

# CatBoost regressor settings, collected in one place before construction.
catb_params = {
    'iterations': 10000,
    'depth': 9,
    'learning_rate': 0.1,
    'random_seed': 42,
    'loss_function': 'RMSE',
}
catb_reg = CatBoostRegressor(**catb_params)

In [None]:
# Fit on the training split, monitoring the hold-out split with
# early_stopping_rounds=100 and verbose=100.
holdout = [(X_test_split, y_test_split)]
catb_reg.fit(
    X_train_split,
    y_train_split,
    eval_set=holdout,
    early_stopping_rounds=100,
    verbose=100,
)

In [None]:
# Round the regression output to the nearest grade and clamp it to the label
# range seen in the training split, so no out-of-range grade can be produced.
grade_min = np.asarray(y_train_split).min()
grade_max = np.asarray(y_train_split).max()
y_pred = np.clip(np.around(catb_reg.predict(X_test_split)), grade_min, grade_max)
f1_score(y_test_split, y_pred, average='micro')

In [None]:
# Display the fitted CatBoost model's raw feature-importance array.
catb_reg.feature_importances_

In [None]:
# Pair each CatBoost importance with its column name, sort ascending and
# draw the result as a horizontal bar chart.
fi_df = (
    pd.DataFrame({
        'importance': catb_reg.feature_importances_,
        'feature': X_train.columns,
    })
    .sort_values(by='importance')
)
fi_df.plot(kind='barh', x='feature', y='importance', figsize=(10, 10))

**Библиотека eli5**

In [None]:
# https://eli5.readthedocs.io/en/latest/
!pip install eli5

In [None]:
from eli5 import explain_weights

In [None]:
# Render eli5's weight explanation for the LightGBM model, limited to the top 38 entries.
explain_weights(lgb_reg, top=38)

In [None]:
# Render eli5's weight explanation for the XGBoost model, limited to the top 38 entries.
explain_weights(xgb_reg, top=38)

In [None]:
# Render eli5's weight explanation for the CatBoost model, limited to the top 38 entries.
explain_weights(catb_reg, top=38)