In [1]:
import matplotlib as plt  # NOTE(review): binds the top-level matplotlib package, not matplotlib.pyplot — confirm intent
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the three input tables from the working directory, in this order:
# training features, training labels, test features
train_values, train_labels, test_values = (
    pd.read_csv(csv_name)
    for csv_name in ('train_values.csv', 'train_labels.csv', 'test_values.csv')
)

*EDA and preprocessing for training and test data*

In [4]:
# Join training values and labels on 'building_id'.
# validate="one_to_one" makes pandas raise a MergeError if building_id is
# duplicated on either side, instead of silently multiplying rows.
train_df = pd.merge(train_values, train_labels, on="building_id", validate="one_to_one")

# The default inner join drops rows whose building_id has no match in the
# labels; fail loudly rather than train on a silently shrunken data set.
assert len(train_df) == len(train_values), "some training rows have no matching label"
display(train_df.head())

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,3
1,28830,8,900,2812,2,10,8,7,o,r,...,0,0,0,0,0,0,0,0,0,2
2,94947,21,363,8973,2,10,5,5,t,r,...,0,0,0,0,0,0,0,0,0,3
3,590882,22,418,10694,2,10,6,5,t,r,...,0,0,0,0,0,0,0,0,0,2
4,201944,11,131,1488,3,30,8,9,t,r,...,0,0,0,0,0,0,0,0,0,3


In [5]:
# Inspect each column's dtype in the merged training frame; the object-typed
# columns listed here are the ones the preprocessing cell below drops.
train_df.dtypes

building_id                                int64
geo_level_1_id                             int64
geo_level_2_id                             int64
geo_level_3_id                             int64
count_floors_pre_eq                        int64
age                                        int64
area_percentage                            int64
height_percentage                          int64
land_surface_condition                    object
foundation_type                           object
roof_type                                 object
ground_floor_type                         object
other_floor_type                          object
position                                  object
plan_configuration                        object
has_superstructure_adobe_mud               int64
has_superstructure_mud_mortar_stone        int64
has_superstructure_stone_flag              int64
has_superstructure_cement_mortar_stone     int64
has_superstructure_mud_mortar_brick        int64
has_superstructure_c

In [6]:
# Inspect each column's dtype in the test features so they can be compared
# against the training frame's dtypes.
test_values.dtypes

building_id                                int64
geo_level_1_id                             int64
geo_level_2_id                             int64
geo_level_3_id                             int64
count_floors_pre_eq                        int64
age                                        int64
area_percentage                            int64
height_percentage                          int64
land_surface_condition                    object
foundation_type                           object
roof_type                                 object
ground_floor_type                         object
other_floor_type                          object
position                                  object
plan_configuration                        object
has_superstructure_adobe_mud               int64
has_superstructure_mud_mortar_stone        int64
has_superstructure_stone_flag              int64
has_superstructure_cement_mortar_stone     int64
has_superstructure_mud_mortar_brick        int64
has_superstructure_c

In [7]:
# Drop the object-typed columns from both the training data and the test data.
# The list of columns is taken from the training frame only and then applied
# unchanged to the test frame.
object_cols = train_df.select_dtypes(include='object').columns
train_df = train_df.drop(object_cols, axis=1)
test_values = test_values.drop(object_cols, axis=1)

*Train/validation split*

In [10]:
# Target: the damage grade column of the merged training frame
y = train_df['damage_grade']

# Features: everything except the target and the row identifier. The test
# features have no target column, so only the identifier is removed there.
x = train_df.drop(['damage_grade', 'building_id'], axis=1)
x_test_data = test_values.drop('building_id', axis=1)

# Hold out 20% of the labelled rows; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

*Grid search*

In [11]:
# Model to tune: standardise the features, then fit a gradient-boosting classifier
pipe = make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=42))

# Candidate values per hyperparameter. make_pipeline names the step after the
# lower-cased class name, so each key is "<step name>__<parameter name>".
param_grid = {
    f"gradientboostingclassifier__{hyperparameter}": candidates
    for hyperparameter, candidates in (
        ("n_estimators", [50, 100, 200]),
        ("learning_rate", [0.01, 0.1, 0.2]),
        ("max_depth", [3, 5, 7]),
    )
}


*Training the model and validating it*

In [None]:
# Exhaustive search over param_grid with 5-fold cross-validation on the
# training split. The grid holds 3 x 3 x 3 = 27 candidates, i.e. 135 fits plus
# the final refit, so spread the fits over all CPU cores. n_jobs only changes
# how the fits are scheduled, not which parameters are selected.
clf = GridSearchCV(pipe, param_grid, scoring="accuracy", cv=5, n_jobs=-1)
clf.fit(x_train, y_train)

# Best mean cross-validated accuracy and the parameters that produced it
print(f"Best score: {clf.best_score_}: with parameters: {clf.best_params_}")

# best_estimator_ is the pipeline refit on all of x_train with the best
# parameters; score it on the held-out validation split
best_model = clf.best_estimator_
val_score = best_model.score(x_test, y_test)
print(f"Validation accuracy: {val_score}")

*Making predictions*

In [None]:
# Predict damage grades for the held-out validation split (x_test comes from
# train_test_split on the labelled data, not from test_values.csv)
damage_pred = best_model.predict(x_test)

In [None]:
# Score the validation predictions: overall accuracy first, then the
# per-class precision / recall / F1 table
accuracy = accuracy_score(y_true=y_test, y_pred=damage_pred)
print(f"Validation Accuracy: {accuracy}")
print(f"Classification report: \n{classification_report(y_test, damage_pred)}")

Validation Accuracy: 0.7147982578998868
Classification report: 
               precision    recall  f1-score   support

           1       0.65      0.47      0.55      5170
           2       0.72      0.82      0.77     29487
           3       0.71      0.61      0.66     17464

    accuracy                           0.71     52121
   macro avg       0.70      0.63      0.66     52121
weighted avg       0.71      0.71      0.71     52121



In [None]:
# Refit on all labelled data using a fresh copy of the tuned pipeline.
# best_model is the same object as clf.best_estimator_, so calling
# best_model.fit(x, y) directly would overwrite the validated model with one
# that has seen x_test; re-running the validation cells afterwards would then
# report leaked, over-optimistic scores. clone() copies the hyperparameters
# into an unfitted estimator and leaves best_model untouched.
final_model = clone(best_model)
final_model.fit(x, y)

# Predictions for the features loaded from test_values.csv
test_predictions = final_model.predict(x_test_data)

# Pair every prediction with its building_id (test_predictions is positional,
# in the same row order as test_values)
predictions = pd.DataFrame({
    'building_id': test_values['building_id'],
    'damage_grade': test_predictions})

predictions.to_csv('submission.csv', index=False)