In [52]:
# include tools
%matplotlib inline

from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [53]:
# load data info and path
DATA_DIR = Path('data')
train_values = pd.read_csv(DATA_DIR / 'train_values.csv', index_col='building_id')
train_labels = pd.read_csv(DATA_DIR / 'train_labels.csv', index_col='building_id')
test_values = pd.read_csv(DATA_DIR / 'test_values.csv', index_col='building_id')

data = train_values.join(train_labels)

data.head(3)

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
28830,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,2
94947,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3


In [54]:
# for preprocessing the data
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# the model
import xgboost as xgb

# for combining to preprocess with model training
from sklearn.pipeline import make_pipeline


# for optimizing the hyperparameters of the pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [55]:
# pipeline and selective features
categorical_features = ['land_surface_condition',
            'foundation_type',
            'roof_type',
            'ground_floor_type',
            'other_floor_type',
            'position',
            'plan_configuration',
            'legal_ownership_status']

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 802906 to 747594
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype 
---  ------                                  --------------   ----- 
 0   geo_level_1_id                          260601 non-null  int64 
 1   geo_level_2_id                          260601 non-null  int64 
 2   geo_level_3_id                          260601 non-null  int64 
 3   count_floors_pre_eq                     260601 non-null  int64 
 4   age                                     260601 non-null  int64 
 5   area_percentage                         260601 non-null  int64 
 6   height_percentage                       260601 non-null  int64 
 7   land_surface_condition                  260601 non-null  object
 8   foundation_type                         260601 non-null  object
 9   roof_type                               260601 non-null  object
 10  ground_floor_type                       260601 non-

In [56]:
# preprocess OneHotEncoder
encoder = OneHotEncoder(sparse=False)
encoded_categorical_data = encoder.fit_transform(data[categorical_features])
# Merge encoded categorical data with the original dataset and drop the original categorical columns
encoded_data = pd.concat([data.drop(categorical_features, axis=1),
                          pd.DataFrame(encoded_categorical_data, index=data.index)], axis=1)

encoded_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 802906 to 747594
Data columns (total 69 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   geo_level_1_id                          260601 non-null  int64  
 1   geo_level_2_id                          260601 non-null  int64  
 2   geo_level_3_id                          260601 non-null  int64  
 3   count_floors_pre_eq                     260601 non-null  int64  
 4   age                                     260601 non-null  int64  
 5   area_percentage                         260601 non-null  int64  
 6   height_percentage                       260601 non-null  int64  
 7   has_superstructure_adobe_mud            260601 non-null  int64  
 8   has_superstructure_mud_mortar_stone     260601 non-null  int64  
 9   has_superstructure_stone_flag           260601 non-null  int64  
 10  has_superstructure_cement_mortar_stone 

In [57]:
# Split the dataset into training and testing sets
X = encoded_data.drop(['damage_grade'], axis=1)
y = encoded_data['damage_grade']
# Label conversion, Damage Label (1, 2, 3), need Label (0, 1, 2)
y = y - 1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an XGBoost classifier
xgb_clf = xgb.XGBClassifier(objective='multi:softmax', num_class=3, seed=42)

In [58]:
# Set up the parameter grid
param_grid = {
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'subsample': [0.5, 0.8, 1.0],
    'colsample_bytree': [0.5, 0.8, 1.0],
}

# Set up the GridSearchCV object
grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring='f1_micro',  # Use micro-averaged F1 score as the evaluation metric
    n_jobs=-1,  # Use all available CPU cores
    cv=5,  # Use 5-fold cross-validation
    verbose=1,
)

# Perform the grid search
grid_search.fit(X_train, y_train)
# Print the best parameters and corresponding score
print("Best parameters found: ", grid_search.best_params_)
print("Best score found: ", grid_search.best_score_)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best parameters found:  {'colsample_bytree': 0.5, 'learning_rate': 0.2, 'max_depth': 10, 'n_estimators': 200, 'subsample': 0.8}
Best score found:  0.7444359171143514


In [59]:
# Train the final model using the best parameters
best_params = grid_search.best_params_
best_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    max_depth=best_params['max_depth'],
    learning_rate=best_params['learning_rate'],
    n_estimators=best_params['n_estimators'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree'],
    seed=42
)

best_model.fit(X_train, y_train)

# Make predictions
y_pred = best_model.predict(X_test)

from sklearn.metrics import f1_score
# Evaluate the model
f1 = f1_score(y_test, y_pred, average='micro')
print("F1 Score: ", f1)

F1 Score:  0.7436925615394946


In [67]:
# make prediction on test
# preprocess OneHotEncoder
test_encoder = OneHotEncoder(sparse=False)
test_encoded_categorical_data = encoder.fit_transform(test_values[categorical_features])
# Merge encoded categorical data with the original dataset and drop the original categorical columns
test_encoded_data = pd.concat([test_values.drop(categorical_features, axis=1),
                          pd.DataFrame(test_encoded_categorical_data, index=test_values.index)], axis=1)

test_values_pred = best_model.predict(test_encoded_data)
test_values_pred = test_values_pred + 1

In [68]:
# save submission
submission_format = pd.read_csv(DATA_DIR / 'submission_format.csv', index_col='building_id')
my_submission = pd.DataFrame(data=test_values_pred,
                             columns=submission_format.columns,
                             index=submission_format.index)
my_submission.head()

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,3
99355,2
890251,2
745817,1
421793,3


In [69]:
# save to submission
my_submission.to_csv('submission.csv')
# check the head of the saved file
!type submission.csv

building_id,damage_grade
300051,3
99355,2
890251,2
745817,1
421793,3
871976,2
691228,2
896100,3
343471,2
766647,2
800548,2
652685,2
590834,3
612530,3
535718,2
269418,2
281421,2
797571,2
216579,2
81554,2
132785,2
976125,3
851237,3
322922,3
786774,2
922680,2
134349,3
551456,2
194812,2
520031,3
765502,3
517764,1
174470,2
395012,2
159776,2
62638,2
65760,2
761527,2
37201,2
853163,2
562505,2
349010,3
780817,2
179897,2
845103,3
171560,3
586661,2
84102,2
610806,3
269531,2
3781,2
127674,2
820462,2
955190,2
405557,2
143219,3
360041,1
378036,2
613583,3
992040,2
849585,2
491236,3
847580,3
229211,3
173624,2
642562,2
86467,2
684629,1
304627,2
255913,3
861624,3
367328,1
13216,3
838446,3
981040,2
335849,2
823769,2
666855,2
398783,2
829946,2
326177,3
359387,2
924578,2
712624,2
946541,2
373943,2
865274,1
522404,2
841632,3
649142,1
332157,1
722082,1
279597,2
107512,2
26477,2
138895,2
553866,2
157718,3
582842,3
123399,2
189875,2
70843,1
225218,2
703273,2
245176,2
760257,3
641933,2
76772,2
302226,2
535232,