In [None]:
import numpy as np
import pandas as pd 

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
# Load the three competition tables; each one is indexed by building_id.
train_df, test_df, target_df = (
    pd.read_csv(csv_path, index_col='building_id')
    for csv_path in (
        '/kaggle/input/richters-predictor-modeling-earthquake-damage/train_values.csv',
        '/kaggle/input/richters-predictor-modeling-earthquake-damage/test_values.csv',
        '/kaggle/input/richters-predictor-modeling-earthquake-damage/train_labels.csv',
    )
)

In [None]:
# Preview the first rows of the training features (read from train_values.csv).
train_df.head()

In [None]:
# Preview the first rows of the training labels (read from train_labels.csv).
target_df.head()

In [None]:
# Preview the first rows of the test features (read from test_values.csv).
test_df.head()

In [None]:
# Columns treated as categorical (these are the ones that get target encoded).
cat_features = [
    # geographic identifiers
    'geo_level_1_id',
    'geo_level_2_id',
    'geo_level_3_id',
    # building descriptors
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'legal_ownership_status',
]

# Columns treated as plain numeric features.
num_features = [
    'count_floors_pre_eq',
    'age',
    'area_percentage',
    'height_percentage',
    'count_families',
]

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class TargetEncoder(BaseEstimator, TransformerMixin):
    """Target encoder.

    Replaces categorical column(s) with the mean target value for
    each category.  Missing values and categories that were not seen
    during ``fit`` are encoded as NaN.

    Attributes
    ----------
    maps : dict
        Set by ``fit``.  Maps each encoded column name to a
        ``{category: mean target}`` dict.
    """

    def __init__(self, cols=None):
        """Target encoder

        Parameters
        ----------
        cols : str or list of str
            Columns to target encode.  Default is to target
            encode all object/string-typed columns in the DataFrame.
        """
        if isinstance(cols, str):
            self.cols = [cols]
        else:
            self.cols = cols

    def fit(self, X, y):
        """Fit target encoder to X and y

        Parameters
        ----------
        X : pandas DataFrame, shape [n_samples, n_columns]
            DataFrame containing columns to encode
        y : array-like, shape = [n_samples] or [n_samples, 1]
            Target values.  Matched to the rows of X by position.

        Returns
        -------
        self : encoder
            Returns self.

        Raises
        ------
        ValueError
            If a requested column is missing from X, or if y does not
            hold exactly one value per row of X.
        """

        # Encode all object/string-typed cols by default
        if self.cols is None:
            self.cols = [col for col in X
                         if X[col].dtype == object
                         or pd.api.types.is_string_dtype(X[col].dtype)]

        # Check columns are in X
        for col in self.cols:
            if col not in X:
                raise ValueError(f"Column '{col}' not in X")

        # Flatten y so that a Series, an ndarray, or a one-column DataFrame
        # all produce scalar means (a DataFrame used to yield Series here).
        target = np.asarray(y, dtype=float).reshape(-1)
        if target.shape[0] != X.shape[0]:
            raise ValueError('y must hold exactly one target value '
                             'per row of X')
        target = pd.Series(target)

        # One grouped pass per column instead of one full-column scan per
        # unique value.  Rows whose category is NaN are left out of the map.
        self.maps = dict()  # dict to store map for each column
        for col in self.cols:
            means = target.groupby(X[col].to_numpy(), sort=False).mean()
            self.maps[col] = means.to_dict()

        return self

    def transform(self, X, y=None):
        """Perform the target encoding transformation.

        Parameters
        ----------
        X : pandas DataFrame, shape [n_samples, n_columns]
            DataFrame containing columns to encode
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        pandas DataFrame
            Copy of the input DataFrame in which every encoded column
            is float; values without a fitted mean become NaN.
        """
        Xo = X.copy()
        for col, tmap in self.maps.items():
            # Assign a bare array so the result never depends on index
            # alignment (e.g. duplicated index labels).
            Xo[col] = X[col].map(tmap).to_numpy(dtype=float)
        return Xo

    def fit_transform(self, X, y=None):
        """Fit and transform the data via target encoding.

        Parameters
        ----------
        X : pandas DataFrame, shape [n_samples, n_columns]
            DataFrame containing columns to encode
        y : array-like, shape = [n_samples]
            Target values (required!).

        Returns
        -------
        pandas DataFrame
            Input DataFrame with transformed columns
        """
        return self.fit(X, y).transform(X, y)

In [None]:
from sklearn.model_selection import KFold 

class TargetEncoderCV(TargetEncoder):
    """Cross-fold target encoder.

    ``fit`` learns an ordinary target encoding on all the data it is
    given; that encoding is what ``transform(X)`` applies to unseen
    data.  ``transform(X, y)`` / ``fit_transform(X, y)`` instead encode
    every row with means computed on the *other* folds only, so a row's
    own target never contributes to its encoding.
    """

    def __init__(self, n_splits=3, shuffle=True, cols=None,
                 random_state=None):
        """Cross-fold target encoding for categorical features.

        Parameters
        ----------
        n_splits : int
            Number of cross-fold splits. Default = 3.
        shuffle : bool
            Whether to shuffle the data when splitting into folds.
        cols : list of str
            Columns to target encode.
        random_state : int or None
            Seed for the fold shuffling.  Only used when ``shuffle`` is
            True.  Default = None (a different split on every call).
        """
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.cols = cols
        self.random_state = random_state

    def fit(self, X, y):
        """Fit cross-fold target encoder to X and y

        Parameters
        ----------
        X : pandas DataFrame, shape [n_samples, n_columns]
            DataFrame containing columns to encode
        y : pandas Series, shape = [n_samples]
            Target values.

        Returns
        -------
        self : encoder
            Returns self.
        """
        # Encoder fitted on everything; used for data without targets.
        self._target_encoder = TargetEncoder(cols=self.cols)
        self._target_encoder.fit(X, y)
        return self

    def transform(self, X, y=None):
        """Perform the target encoding transformation.

        Uses cross-fold target encoding when ``y`` is given (training
        data), and the normal target encoding learned in ``fit`` when
        ``y`` is None (test data).

        Parameters
        ----------
        X : pandas DataFrame, shape [n_samples, n_columns]
            DataFrame containing columns to encode
        y : array-like, shape = [n_samples] or [n_samples, 1], optional
            Target values for the rows of X, matched by position.

        Returns
        -------
        pandas DataFrame
            Copy of the input DataFrame in which every encoded column
            is float.  A category that does not occur in the other
            folds is encoded as NaN.

        Raises
        ------
        TypeError
            If ``y`` is given and X is not a pandas DataFrame.
        ValueError
            If ``y`` does not hold exactly one value per row of X.
        """

        # Use target encoding from fit() if this is test data
        if y is None:
            return self._target_encoder.transform(X)

        # The per-fold encoders look columns up by label, so only
        # DataFrames can be handled here.
        if not isinstance(X, pd.DataFrame):
            raise TypeError('X must be a pandas DataFrame')

        target = np.asarray(y, dtype=float).reshape(-1)
        if target.shape[0] != X.shape[0]:
            raise ValueError('y must hold exactly one target value '
                             'per row of X')
        target = pd.Series(target, index=X.index)

        # KFold rejects a random_state when shuffling is disabled.
        kf = KFold(n_splits=self.n_splits, shuffle=self.shuffle,
                   random_state=self.random_state if self.shuffle else None)

        # Fit one encoder per fold and collect its out-of-fold encodings
        self._train_ix = []
        self._test_ix = []
        self._fit_tes = []
        encoded = dict()  # column name -> float array covering all rows
        for train_ix, test_ix in kf.split(X):
            te = TargetEncoder(cols=self.cols)
            te.fit(X.iloc[train_ix], target.iloc[train_ix])
            fold_encoded = te.transform(X.iloc[test_ix])
            for col in te.maps:
                if col not in encoded:
                    encoded[col] = np.full(X.shape[0], np.nan)
                encoded[col][test_ix] = fold_encoded[col].to_numpy(dtype=float)
            self._train_ix.append(train_ix)
            self._test_ix.append(test_ix)
            self._fit_tes.append(te)

        # Replace whole columns so they become float instead of keeping
        # the dtype of the raw categorical column.
        Xo = X.copy()
        for col, vals in encoded.items():
            Xo[col] = vals
        return Xo

    def fit_transform(self, X, y=None):
        """Fit and transform the data via cross-fold target encoding.

        Parameters
        ----------
        X : pandas DataFrame, shape [n_samples, n_columns]
            DataFrame containing columns to encode
        y : pandas Series, shape = [n_samples]
            Target values (required!).

        Returns
        -------
        pandas DataFrame
            Input DataFrame with transformed columns
        """
        return self.fit(X, y).transform(X, y)

In [None]:
# Target-encode the categorical features.
# The training rows go through fit_transform(X, y): passing y is what makes
# TargetEncoderCV encode each row from the other folds only.  Calling
# transform(train_df) without y would encode every training row with a mean
# that includes its own label (target leakage).
te_cv = TargetEncoderCV(cols=cat_features, n_splits=5)
train_df_te_cv = te_cv.fit_transform(train_df, target_df['damage_grade'])
# Test rows have no labels, so they get the encoding fitted on all training rows.
test_df_te_cv = te_cv.transform(test_df)

# Make sure every encoded column is float, whatever its original dtype was.
train_df_te_cv[cat_features] = train_df_te_cv[cat_features].astype(float)
test_df_te_cv[cat_features] = test_df_te_cv[cat_features].astype(float)

In [None]:
# Interaction feature: concatenation of the two raw category codes.
combo_col = 'land_surface_condition+foundation_type'
train_df_te_cv[combo_col] = train_df['land_surface_condition'] + train_df['foundation_type']
test_df_te_cv[combo_col] = test_df['land_surface_condition'] + test_df['foundation_type']

# Encode the new column.  Training rows are encoded out-of-fold by passing y
# to fit_transform; transform() without y would leak each row's own label.
te_cv = TargetEncoderCV(cols=[combo_col], n_splits=5)
train_df_te_cv_new = te_cv.fit_transform(train_df_te_cv, target_df['damage_grade'])
test_df_te_cv_new = te_cv.transform(test_df_te_cv)

# Make sure the encoded column is float, whatever its original dtype was.
train_df_te_cv_new[combo_col] = train_df_te_cv_new[combo_col].astype(float)
test_df_te_cv_new[combo_col] = test_df_te_cv_new[combo_col].astype(float)

In [None]:
# Model inputs: the fully encoded frames, plus the labels as a plain array.
X_train, X_test = train_df_te_cv_new, test_df_te_cv_new
y_train = target_df['damage_grade'].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the training rows for validation (fixed seed).
split_parts = train_test_split(X_train, y_train, train_size=0.8, random_state=42)
X_train_split, X_valid_split, y_train_split, y_valid_split = split_parts

In [None]:
from xgboost.sklearn import XGBClassifier

In [None]:
# Hyper-parameters of the gradient-boosted classifier, kept in one place.
xgb_params = dict(
    learning_rate=0.1,
    max_depth=5,
    n_estimators=1000,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method='gpu_hist',
)
model = XGBClassifier(**xgb_params)

In [None]:
# Train on the 80% split.  The validation split is passed as eval_set only so
# that `verbose=10` has something to report: XGBoost prints evaluation metrics
# only when an evaluation set is supplied.
model.fit(
    X_train_split, y_train_split,
    eval_set=[(X_valid_split, y_valid_split)],
    verbose=10,
)

In [None]:
# Predict labels for the held-out validation split.
y_pred = model.predict(X_valid_split)

In [None]:
from sklearn.metrics import f1_score
# Micro-averaged F1 of the validation predictions.
f1_score(y_valid_split, y_pred, average='micro')

In [None]:
# Predict the test set and write the result to baseline.csv,
# one damage_grade per building_id.
y_pred = model.predict(X_test)
predicted_df = pd.DataFrame(
    {'damage_grade': y_pred.astype(np.int8)},
    index=test_df.index,
)
predicted_df.to_csv('baseline.csv')

In [None]:
# Point the working names back at the raw (un-encoded) frames and the label frame.
X_train, X_test, y_train = train_df, test_df, target_df

In [None]:
from sklearn.model_selection import KFold

# 5-fold cross-validation of the encoder + model pipeline.
kf = KFold(n_splits=5)

results = []  # micro-F1 of each validation fold

for train_idx, test_idx in kf.split(X_train):

    X_train_cv, X_test_cv = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_train_cv, y_test_cv = y_train.iloc[train_idx], y_train.iloc[test_idx]

    # The encoder is fitted on the training fold only.  Its training rows are
    # encoded out-of-fold by passing y to fit_transform; transform() without y
    # would encode them with their own labels and inflate the score.
    te_cv = TargetEncoderCV(cols=cat_features, n_splits=5)
    X_train_te_cv = te_cv.fit_transform(X_train_cv, y_train_cv['damage_grade'])
    X_test_te_cv = te_cv.transform(X_test_cv)

    # Make sure every encoded column is float, whatever its original dtype was.
    X_train_te_cv[cat_features] = X_train_te_cv[cat_features].astype(float)
    X_test_te_cv[cat_features] = X_test_te_cv[cat_features].astype(float)

    model.fit(
        X_train_te_cv, y_train_cv,
        eval_set=[(X_test_te_cv, y_test_cv)],
        verbose=100,
    )

    y_pred = model.predict(X_test_te_cv)
    eval_result = f1_score(y_test_cv, y_pred, average='micro')

    results.append(eval_result)

In [None]:
# Micro-F1 scores appended by the cross-validation loop, one per fold.
results

In [None]:
# Re-encode the full training set and refit the model on all of it.
# Training rows are encoded out-of-fold by passing y to fit_transform;
# transform(train_df) without y would encode each row with its own label.
te_cv = TargetEncoderCV(cols=cat_features, n_splits=5)
train_df_te_cv = te_cv.fit_transform(train_df, target_df['damage_grade'])
test_df_te_cv = te_cv.transform(test_df)

# Make sure every encoded column is float, whatever its original dtype was.
train_df_te_cv[cat_features] = train_df_te_cv[cat_features].astype(float)
test_df_te_cv[cat_features] = test_df_te_cv[cat_features].astype(float)

X_train = train_df_te_cv
X_test = test_df_te_cv
y_train = target_df['damage_grade'].values

model.fit(
    X_train, y_train,
    verbose=10)