In [None]:
# Mount Google Drive inside the Colab VM so the notebook can read and write
# files stored there (the recorded run reports the mount at /content/drive).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip3 install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/96/3b/bb419654adcf7efff42ed8a3f84e50c8f236424b7ed1cc8ccd290852e003/catboost-0.24.4-cp37-none-manylinux1_x86_64.whl (65.7MB)
[K     |████████████████████████████████| 65.7MB 63kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.4


In [None]:
import os

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn_pandas import DataFrameMapper

In [None]:
# Project folder on the mounted Drive. The path is relative, so it resolves
# against the kernel's current working directory.
base_path = os.path.join('drive', 'MyDrive', 'Earthquake prediction')

# Raw competition tables and the submission template live directly in the project folder.
(train_values_path,
 train_labels_path,
 test_values_path,
 submission_format_path) = (
    os.path.join(base_path, file_name)
    for file_name in ('train_values.csv',
                      'train_labels.csv',
                      'test_values.csv',
                      'submission_format.csv')
)

# Folder holding the artefacts written by the preprocessing step.
preprocessed_path = os.path.join(base_path, 'preprocessing_outputs')

In [None]:
# Show which artefacts are available in the preprocessing output folder.
os.listdir(preprocessed_path)

['train_values.csv',
 'train_labels.csv',
 'test_values.csv',
 'train_data.npz',
 'test_data.npz',
 'lightning_logs']

In [None]:
# Read the three raw tables in one pass: training features, training labels
# and test features (in that order).
X_train, y_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (train_values_path, train_labels_path, test_values_path)
)

NameError: ignored

In [None]:
# Keep only the target column of the labels table.
# The labels are re-read from disk rather than sliced out of the existing
# `y_train`, so the cell can be re-run safely: slicing in place would, on a
# second run, look up 'damage_grade' in a Series and raise a KeyError.
y_train = pd.read_csv(train_labels_path)['damage_grade']

In [None]:
# Step 1: store every `has*` flag (plus 'count_families' / 'damage_grade' when
#         present) as `object`.
# Step 2: cast the columns at positions 1..6 to float.
# NOTE(review): the slice [1:7] covers six columns; if the frame starts with an
# id column followed by seven numeric features, the seventh stays integer.
# Confirm the slice bound is intended.
X_train = (
    X_train
    .astype({
        name: 'object'
        for name in X_train.columns
        if name.startswith('has') or name in ('count_families', 'damage_grade')
    })
    .pipe(lambda frame: frame.astype(dict.fromkeys(frame.columns[1:7], 'float')))
)

In [None]:
# Numeric predictors: all numeric columns except the first one, which is
# skipped as the id column.
numerical_features = list(X_train.select_dtypes(include=[np.number]).columns[1:])
# Categorical predictors: all object-typed columns. The builtin `object` is
# used because the `np.object` alias was deprecated in NumPy 1.20 and removed
# in 1.24, where it raises AttributeError.
categorical_features = list(X_train.select_dtypes(include=[object]).columns)

In [None]:
%%time
tree = DecisionTreeClassifier(random_state=29)
max_depth_values = [k for k in range(3, 10)]
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=29)
tree_params = {'max_depth': max_depth_values}
tree_grid = GridSearchCV(estimator=tree, param_grid=tree_params, cv=skf, 
                         scoring='f1_micro', refit=True)



cat = [([feature], [SimpleImputer(strategy='constant', fill_value='UNK'),
              OneHotEncoder()]) for feature in categorical_features]  #or OneHotEncoder ?
num = [([feature], [SimpleImputer()]) for feature in numerical_features]

mapper = DataFrameMapper(num + cat, df_out=True)

CPU times: user 403 µs, sys: 0 ns, total: 403 µs
Wall time: 393 µs


In [None]:
# CatBoost configuration: 1000 iterations, depth 4, learning rate 0.001;
# training metrics are reported every 100 iterations.
clf = CatBoostClassifier(
    depth=4,
    iterations=1000,
    learning_rate=0.001,
    metric_period=100,
)

# Chain the column-wise preprocessing and the classifier so they are fitted together.
pipeline = Pipeline(steps=[('preprocess', mapper), ('clf', clf)])

pipeline.fit(X_train, y_train)

0:	learn: 1.0981067	total: 209ms	remaining: 3m 28s
100:	learn: 1.0519162	total: 13.5s	remaining: 1m 59s
200:	learn: 1.0136911	total: 26.5s	remaining: 1m 45s
300:	learn: 0.9807757	total: 39.8s	remaining: 1m 32s
400:	learn: 0.9528020	total: 53.1s	remaining: 1m 19s
500:	learn: 0.9287120	total: 1m 6s	remaining: 1m 6s
600:	learn: 0.9080076	total: 1m 19s	remaining: 52.9s
700:	learn: 0.8899749	total: 1m 32s	remaining: 39.7s
800:	learn: 0.8742201	total: 1m 46s	remaining: 26.4s
900:	learn: 0.8604017	total: 1m 59s	remaining: 13.1s
999:	learn: 0.8484308	total: 2m 12s	remaining: 0us


Pipeline(memory=None,
         steps=[('preprocess',
                 DataFrameMapper(default=False, df_out=True,
                                 features=[(['geo_level_1_id'],
                                            [SimpleImputer(add_indicator=False,
                                                           copy=True,
                                                           fill_value=None,
                                                           missing_values=nan,
                                                           strategy='mean',
                                                           verbose=0)]),
                                           (['geo_level_2_id'],
                                            [SimpleImputer(add_indicator=False,
                                                           copy=True,
                                                           fill_value=None,
                                                           missing_values=nan,

In [None]:
#print('best params: {} \nbest score: {}'.format(cat_grid.best_params_, cat_grid.best_score_))

In [None]:
# Run the mapper on its own over the test frame so the encoded columns can be inspected.
# NOTE(review): this relies on `mapper` already being fitted, presumably by the
# earlier `pipeline.fit` call. Confirm before running this cell in isolation.
preprocessed_X_test = mapper.transform(X_test)

In [None]:
# Preview the raw test rows restricted to the modelled columns
# (numeric features first, then categorical ones).
X_test[numerical_features + categorical_features].head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,17,596,11307,3,20,7,6,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
1,6,141,11987,2,25,13,5,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,1,1,0,0,0,0,0,0,0,0,0
2,22,19,10044,2,5,4,5,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
3,26,39,633,1,0,19,3,t,r,x,v,j,t,d,0,0,0,0,0,1,0,0,0,0,0,v,2,1,0,0,1,0,0,0,0,0,0,0
4,17,289,7970,3,15,8,7,t,r,q,f,q,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Preview the same rows after imputation and one-hot encoding.
preprocessed_X_test.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition_x0_n,land_surface_condition_x0_o,land_surface_condition_x0_t,foundation_type_x0_h,foundation_type_x0_i,foundation_type_x0_r,foundation_type_x0_u,foundation_type_x0_w,roof_type_x0_n,roof_type_x0_q,roof_type_x0_x,ground_floor_type_x0_f,ground_floor_type_x0_m,ground_floor_type_x0_v,ground_floor_type_x0_x,ground_floor_type_x0_z,other_floor_type_x0_j,other_floor_type_x0_q,other_floor_type_x0_s,other_floor_type_x0_x,position_x0_j,position_x0_o,position_x0_s,position_x0_t,plan_configuration_x0_a,plan_configuration_x0_c,plan_configuration_x0_d,plan_configuration_x0_f,plan_configuration_x0_m,plan_configuration_x0_n,plan_configuration_x0_o,plan_configuration_x0_q,plan_configuration_x0_s,...,has_superstructure_rc_engineered_x0_0,has_superstructure_rc_engineered_x0_1,has_superstructure_other_x0_0,has_superstructure_other_x0_1,legal_ownership_status_x0_a,legal_ownership_status_x0_r,legal_ownership_status_x0_v,legal_ownership_status_x0_w,count_families_x0_0,count_families_x0_1,count_families_x0_2,count_families_x0_3,count_families_x0_4,count_families_x0_5,count_families_x0_6,count_families_x0_7,count_families_x0_8,count_families_x0_9,has_secondary_use_x0_0,has_secondary_use_x0_1,has_secondary_use_agriculture_x0_0,has_secondary_use_agriculture_x0_1,has_secondary_use_hotel_x0_0,has_secondary_use_hotel_x0_1,has_secondary_use_rental_x0_0,has_secondary_use_rental_x0_1,has_secondary_use_institution_x0_0,has_secondary_use_institution_x0_1,has_secondary_use_school_x0_0,has_secondary_use_school_x0_1,has_secondary_use_industry_x0_0,has_secondary_use_industry_x0_1,has_secondary_use_health_post_x0_0,has_secondary_use_health_post_x0_1,has_secondary_use_gov_office_x0_0,has_secondary_use_gov_office_x0_1,has_secondary_use_use_police_x0_0,has_secondary_use_use_police_x0_1,has_secondary_use_other_x0_0,has_secondary_use_other_x0_1
0,17.0,596.0,11307.0,3.0,20.0,7.0,6.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,6.0,141.0,11987.0,2.0,25.0,13.0,5.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,22.0,19.0,10044.0,2.0,5.0,4.0,5.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,26.0,39.0,633.0,1.0,0.0,19.0,3.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,17.0,289.0,7970.0,3.0,15.0,8.0,7.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [None]:
# Predict damage grades with the fitted pipeline, place them in the submission
# template and save the result next to the input data.
pred = pipeline.predict(X_test)
submission = pd.read_csv(submission_format_path).assign(damage_grade=pred)
submission.to_csv(os.path.join(base_path, 'submission10.csv'), index=False)

With preprocessed data

In [None]:
# Load the precomputed feature matrices stored under the 'data' key of each
# .npz archive. `np.load` returns an NpzFile that keeps the archive open, so it
# is used as a context manager to close the file once the array has been read.
with np.load(os.path.join(preprocessed_path, 'train_data.npz')) as train_npz:
    x_train = train_npz['data']
with np.load(os.path.join(preprocessed_path, 'test_data.npz')) as test_npz:
    x_test = test_npz['data']

# Target vector as a plain NumPy array.
y_train = pd.read_csv(train_labels_path)['damage_grade'].values

In [None]:
# Quick look at the target array.
y_train

array([3, 2, 3, ..., 3, 2, 3])

In [None]:
# Depth search for a random forest: candidate depths 3..9, evaluated with
# shuffled 5-fold stratified CV and micro-averaged F1. The best model is refit
# on the full data.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=29)
max_depth_values = list(range(3, 10))
tree_params = dict(max_depth=max_depth_values)
tree = RandomForestClassifier(random_state=29)
tree_grid = GridSearchCV(
    estimator=tree,
    param_grid=tree_params,
    scoring='f1_micro',
    cv=skf,
    refit=True,
)

In [None]:
# Run the random-forest depth search on the preprocessed features.
# The recorded run of this cell ended with a KeyboardInterrupt.
tree_grid.fit(x_train, y_train)

KeyboardInterrupt: ignored

In [None]:
# 5-fold cross-validated micro-F1 for CatBoost on the preprocessed feature matrix.
clf = CatBoostClassifier(iterations=1000,
                         learning_rate=0.001,
                         metric_period=100,
                         depth=4)

# Keep the per-fold scores in a variable so they can be reused instead of
# being copied by hand from the printed output.
cv_scores = cross_val_score(clf, x_train, y_train, cv=5, scoring='f1_micro')
print(cv_scores)
print('mean f1_micro:', cv_scores.mean())

0:	learn: 1.0977529	total: 165ms	remaining: 2m 44s
100:	learn: 1.0207337	total: 15.5s	remaining: 2m 18s
200:	learn: 0.9584419	total: 30.9s	remaining: 2m 2s
300:	learn: 0.9071332	total: 46.2s	remaining: 1m 47s
400:	learn: 0.8640704	total: 1m 1s	remaining: 1m 32s
500:	learn: 0.8275492	total: 1m 17s	remaining: 1m 16s
600:	learn: 0.7962327	total: 1m 32s	remaining: 1m 1s
700:	learn: 0.7691538	total: 1m 48s	remaining: 46.2s
800:	learn: 0.7457505	total: 2m 4s	remaining: 30.9s
900:	learn: 0.7252821	total: 2m 20s	remaining: 15.4s
999:	learn: 0.7074495	total: 2m 36s	remaining: 0us
0:	learn: 1.0977460	total: 162ms	remaining: 2m 41s
100:	learn: 1.0204947	total: 15.7s	remaining: 2m 19s
200:	learn: 0.9579908	total: 31.3s	remaining: 2m 4s
300:	learn: 0.9064853	total: 46.7s	remaining: 1m 48s
400:	learn: 0.8632292	total: 1m 2s	remaining: 1m 33s
500:	learn: 0.8264806	total: 1m 17s	remaining: 1m 17s
600:	learn: 0.7950000	total: 1m 33s	remaining: 1m 2s
700:	learn: 0.7679269	total: 1m 49s	remaining: 46.6s


In [None]:
# Average of five hard-coded scores.
# NOTE(review): presumably the per-fold micro-F1 values printed by the
# cross-validation cell, copied by hand. Confirm the source.
np.mean([0.74307861, 0.74092479 ,0.7444551 , 0.7447429 , 0.74247889])

0.7431360579999999

In [None]:
# Fit CatBoost on the full preprocessed training matrix and predict the test set.
clf.fit(x_train, y_train)
pred = clf.predict(x_test)

# Fill the submission template with the predictions.
submission = pd.read_csv(submission_format_path)
submission['damage_grade'] = pred

# Saved under its own name: 'submission10.csv' is already written by the
# raw-feature pipeline cell, and reusing that name would silently overwrite it.
submission.to_csv(os.path.join(base_path, 'submission10_preprocessed.csv'), index=False)

In [None]:
# Dimensions of the preprocessed training matrix.
x_train.shape

(260601, 90)

In [None]:
%%time
model = RandomForestClassifier(n_estimators=100, min_samples_split=50, n_jobs=-1)
model.fit(x_train, y_train)

CPU times: user 2min 2s, sys: 325 ms, total: 2min 2s
Wall time: 1min 2s


In [None]:
%%time
#We choose logistic regression
#let's do sequential feature selection with the mlxtend library
from mlxtend.feature_selection import SequentialFeatureSelector as SFS 
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=29)
final_model = RandomForestClassifier(n_estimators=100, min_samples_split=50, n_jobs=-1) 
sfs = SFS(estimator=final_model,
          k_features=45,
          forward=True,
          floating=False,
          scoring='f1_micro',
          cv=skf)

sfs.fit(x_train, y_train)