# In [0]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix, hstack
from catboost import CatBoostClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder


# Load the feature tables and the training labels; each file is indexed by
# its building_id column.
train_df = pd.read_csv(
    '/kaggle/input/richters-predictor-modeling-earthquake-damage/train_values.csv',
    index_col='building_id',
)
test_df = pd.read_csv(
    '/kaggle/input/richters-predictor-modeling-earthquake-damage/test_values.csv',
    index_col='building_id',
)
target_df = pd.read_csv(
    '/kaggle/input/richters-predictor-modeling-earthquake-damage/train_labels.csv',
    index_col='building_id',
)

# Number of training rows: the stacked frame below holds the training rows
# first, so this is the row offset at which the test rows begin.
idx = len(train_df)

# Stack train on top of test so both go through identical preprocessing.
data_df = pd.concat((train_df, test_df), sort=False)
# Columns that are one-hot encoded; every remaining column of data_df is
# treated as numeric and min-max scaled.
cat_features = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id', 'land_surface_condition', 'foundation_type',
                'roof_type',
                'ground_floor_type', 'other_floor_type', 'position', 'plan_configuration', 'legal_ownership_status']

# Plain column selection raises KeyError if a listed column is missing,
# instead of silently producing an all-NaN column.
data_cat = data_df[cat_features]
data_num = data_df.drop(columns=cat_features)
num_features = data_num.columns

# NOTE(review): both transformers are fitted on the stacked train+test frame,
# so the category vocabulary and the min/max ranges include the test rows.
enc = OneHotEncoder()
data_cat_encoded = enc.fit_transform(data_cat)

scaler = MinMaxScaler()
data_num_scaled = coo_matrix(scaler.fit_transform(data_num))

# Build the combined matrix directly as float32 CSR:
#  * scipy.sparse does not support float16, so the former float16 cast either
#    fails or only discards precision, depending on the scipy version;
#  * CSR supports row slicing, so a single conversion serves both splits.
data = hstack((data_cat_encoded, data_num_scaled), format='csr', dtype=np.float32)

# The first `idx` rows are the training rows, the rest are the test rows.
X_train = data[:idx]
X_test = data[idx:]
# Look the labels up by building_id so they are guaranteed to line up with the
# rows of X_train (which follow train_df's order); a missing id raises
# KeyError instead of silently training on misaligned labels.
y_train = target_df.loc[train_df.index, 'damage_grade'].values

# Hold out 20% of the training rows to estimate the generalisation score.
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=42,
)

cat_cls = CatBoostClassifier(iterations=2500,
                             loss_function='MultiClass',
                             depth=4,
                             l2_leaf_reg=20,
                             eval_metric='Accuracy',
                             leaf_estimation_iterations=10)

# Step 1: fit on the reduced training split and score on the hold-out rows.
cat_cls.fit(X_train_split, y_train_split)
y_pred = cat_cls.predict(X_valid_split)
valid_f1 = f1_score(y_valid_split, y_pred, average='micro')
# Report the score explicitly: outside a notebook's last-expression display
# the bare f1_score(...) call would be computed and thrown away.
print(f'Validation micro-F1: {valid_f1:.4f}')

# Step 2: refit the same estimator on all labelled rows and predict the test set.
cat_cls.fit(X_train, y_train)
y_pred = cat_cls.predict(X_test)

# Write the submission file: one damage_grade per test building_id.
predicted_df = pd.DataFrame(y_pred.astype(np.int8), index=test_df.index, columns=['damage_grade'])
predicted_df.to_csv('baseline.csv')