In [1]:
import numpy as np
import pandas as pd
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

X = pd.read_csv('../input/richters-predictor-modeling-earthquake-damage/train_values.csv', index_col = 'building_id')
y = pd.read_csv('../input/richters-predictor-modeling-earthquake-damage/train_labels.csv', index_col = 'building_id')
X_test = pd.read_csv('../input/richters-predictor-modeling-earthquake-damage/test_values.csv', index_col = 'building_id')

/kaggle/input/richters-predictor-modeling-earthquake-damage/train_labels.csv
/kaggle/input/richters-predictor-modeling-earthquake-damage/submission_format.csv
/kaggle/input/richters-predictor-modeling-earthquake-damage/train_values.csv
/kaggle/input/richters-predictor-modeling-earthquake-damage/test_values.csv


In [2]:
num_cols = [col for col in X.columns if X[col].dtypes=='int64']
cat_cols = [col for col in X.columns if X[col].dtypes=='object']

In [3]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer 
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline 
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import f1_score

In [4]:
X_train = pd.get_dummies(X)

0.727 {'subsample': 0.5, 'min_child_weight': 1, 'max_depth': 7, 'gamma': 1.5, 'colsample_bytree': 0.7}

0.744 {'subsample': 0.3, 'min_child_weight': 1, 'max_depth': 9, 'gamma': 1.5, 'colsample_bytree': 0.65}

In [5]:
params = {
        'min_child_weight': [0.9, 1, 1.1],
        'gamma': [1.4, 1.5, 1.6],
        'subsample': [0.15, 0.2, 0.25, 0.3],
        'colsample_bytree': [0.6, 0.65, 0.7],
        'max_depth': [8, 9, 10, 11]
        }
xgb = XGBClassifier(n_estimators=1000, 
                    learning_rate=0.02, 
                    tree_method='gpu_hist',
                    objective='multi:softprob',
                    silent=True)

folds = 5
param_comb = 5
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001)
random_search = RandomizedSearchCV(xgb, param_distributions=params, 
                                   n_iter=param_comb, 
                                   scoring='f1_micro', 
                                   n_jobs=4, 
                                   cv=skf.split(X_train, y), 
                                   verbose=3,
                                   random_state=1001)

In [6]:
random_search.fit(X_train, y)
print('\n Best hyperparameters:')
print(random_search.best_params_)
print('\n Best estimator:')
print(random_search.best_estimator_)


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed: 31.2min finished
  return f(**kwargs)


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Best hyperparameters:
{'subsample': 0.25, 'min_child_weight': 1, 'max_depth': 11, 'gamma': 1.4, 'colsample_bytree': 0.65}

 Best estimator:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.65, gamma=1.4, gpu_id=0,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.02, max_delta_step=0, max_depth=11,
              min_child_weight=1, missing=nan,
              monotone_constraints='(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)',
              n_estimators=1000, n_jobs=0, num_parallel_tree=1,
          

In [7]:
model = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.65, gamma=1.4, gpu_id=0,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.02, max_delta_step=0, max_depth=11,
              min_child_weight=1, missing=None,
              monotone_constraints='(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)',
              n_estimators=1000, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, silent=True, subsample=0.25,
              tree_method='gpu_hist', validate_parameters=1, verbosity=None)



In [8]:
from sklearn.metrics import f1_score
model.fit(X_train,y)
preds = model.predict(X_train)
print('f1 micro score:', f1_score(y, preds, average='micro'))


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


f1 micro score: 0.7694214527189075


In [9]:
X_test_transform = pd.get_dummies(X_test)
y_test = model.predict(X_test_transform)
outputfull = pd.DataFrame({'building_id': X_test.index,
                       'damage_grade': y_test,
                      })
outputfull.to_csv('Submission.csv', index=False)