In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import sklearn.preprocessing as skp
#from sklearn.svm import LinearSVC
#from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
#from sklearn.metrics import confusion_matrix, plot_confusion_matrix, f1_score, classification_report
#from sklearn.utils.class_weight import compute_class_weight
#from sklearn.model_selection import GridSearchCV
#from tensorflow import keras
#import tensorflow as tf
import tensorflow.keras.preprocessing as tfp

In [2]:
# Training parameters
TRAIN_FRAC = 0.8 # fraction of the labeled data to use for training, remainder is used for validation
RANDOM_STATE = 1 # random state for random sampling 

In [49]:
# Load data
train_data_all = pd.read_csv('data/train_values.csv')
label_data_all = pd.read_csv('data/train_labels.csv')
test_data_all = pd.read_csv('data/test_values.csv')
train_data_all.shape

(260601, 39)

In [4]:
# Compare totals for each category
val_counts = label_data_all['damage_grade'].value_counts()
val_counts

2    148259
3     87218
1     25124
Name: damage_grade, dtype: int64

In [5]:
# Undersample the majority classes
n_1, n_2, n_3 = val_counts[1], val_counts[2], val_counts[3]

sample_to = 1 # choose 1 or 3
mult_factor = .85

train_data_drop = train_data_all.copy()
if sample_to == 1:
    # sample 2 and 3 down to n_1
    label_3_inds = label_data_all.index[label_data_all['damage_grade']==3].to_list()
    label_2_inds = label_data_all.index[label_data_all['damage_grade']==2].to_list()
    drop_3 = np.random.choice(label_3_inds, int((len(label_3_inds)-n_1)*mult_factor), replace=False)
    drop_2 = np.random.choice(label_2_inds, len(drop_3)+(n_2-n_3), replace=False)
    print('Dropping -- 3: {}, 2: {}'.format(len(drop_3),len(drop_2)))
    train_data_drop.drop(drop_3,axis=0,inplace=True)
    train_data_drop.drop(drop_2,axis=0,inplace=True)
    label_data_drop = label_data_all.copy()
    label_data_drop.drop(drop_3,axis=0,inplace=True)
    label_data_drop.drop(drop_2,axis=0,inplace=True)
elif sample_to == 3:
    # sample 2 down to n_3
    label_2_inds = label_data_all.index[label_data_all['damage_grade']==2].to_list()
    drop_2 = np.random.choice(label_2_inds, len(label_2_inds)-int(n_3*mult_factor), replace=False)
    print('Dropping -- 2: {}'.format(len(drop_2)))
    train_data_drop.drop(drop_2,axis=0,inplace=True)
    label_data_drop = label_data_all.copy()
    label_data_drop.drop(drop_2,axis=0,inplace=True)

new_val_counts = label_data_drop['damage_grade'].value_counts()
new_val_counts

Dropping -- 3: 52779, 2: 113820


3    34439
2    34439
1    25124
Name: damage_grade, dtype: int64

In [50]:
# Inspect columns
print(train_data_drop.shape)
train_data_drop.dtypes

(94002, 39)


building_id                                int64
geo_level_1_id                             int64
geo_level_2_id                             int64
geo_level_3_id                             int64
count_floors_pre_eq                        int64
age                                        int64
area_percentage                            int64
height_percentage                          int64
land_surface_condition                    object
foundation_type                           object
roof_type                                 object
ground_floor_type                         object
other_floor_type                          object
position                                  object
plan_configuration                        object
has_superstructure_adobe_mud               int64
has_superstructure_mud_mortar_stone        int64
has_superstructure_stone_flag              int64
has_superstructure_cement_mortar_stone     int64
has_superstructure_mud_mortar_brick        int64
has_superstructure_c

In [51]:
# create lists of the columns that are categorical, binary and numerical
cat_cols = [train_data_drop.columns[kk] for kk in range(len(train_data_drop.columns)) if train_data_drop[train_data_drop.columns[kk]].dtypes=='object']
bin_cols = [train_data_drop.columns[kk] for kk in range(len(train_data_drop.columns)) if 'has' in train_data_drop.columns[kk]]
num_cols = [train_data_drop.columns[kk] for kk in range(len(train_data_drop.columns)) if \
            (train_data_drop.columns[kk] not in cat_cols) and (train_data_drop.columns[kk] not in bin_cols) \
            and ('building_id' not in train_data_drop.columns[kk])]
print('\nCategorical columns: ',cat_cols)
print('\nBinary columns: ',bin_cols)
print('\nNumerical columns: ',num_cols)


Categorical columns:  ['land_surface_condition', 'foundation_type', 'roof_type', 'ground_floor_type', 'other_floor_type', 'position', 'plan_configuration', 'legal_ownership_status']

Binary columns:  ['has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag', 'has_superstructure_cement_mortar_stone', 'has_superstructure_mud_mortar_brick', 'has_superstructure_cement_mortar_brick', 'has_superstructure_timber', 'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered', 'has_superstructure_rc_engineered', 'has_superstructure_other', 'has_secondary_use', 'has_secondary_use_agriculture', 'has_secondary_use_hotel', 'has_secondary_use_rental', 'has_secondary_use_institution', 'has_secondary_use_school', 'has_secondary_use_industry', 'has_secondary_use_health_post', 'has_secondary_use_gov_office', 'has_secondary_use_use_police', 'has_secondary_use_other']

Numerical columns:  ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id', 'count_fl

In [52]:
# One hot encoding for categorical variables
oh_enc = skp.OneHotEncoder()
oh_cols = oh_enc.fit_transform(train_data_drop[cat_cols])
oh_mat = oh_cols.toarray().astype(int)

In [53]:
# Drop categorical columns
train_data_num = train_data_drop.drop(columns=cat_cols,axis=1).copy()
train_data_num.shape

(94002, 31)

In [54]:
# Append one-hot version of categorical features
new_cols = ['col_'+str(xx) for xx in range(oh_mat.shape[1])]
train_data_oh_df = pd.DataFrame(oh_mat,columns=new_cols,index=train_data_num.index)
train_data_tot = pd.concat((train_data_num,train_data_oh_df),axis=1)
train_data_tot.shape

(94002, 69)

In [55]:
# Scale numerical features
std_scaler = skp.StandardScaler()
train_data_sc = train_data_tot.copy()
train_data_sc[num_cols] = std_scaler.fit_transform(train_data_tot[num_cols])
train_data_sc[num_cols].head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,count_families
2,0.787175,-0.791749,0.737842,-0.107994,-0.196462,-0.684251,-0.186463,0.060691
4,-0.445276,-1.346559,-1.303909,1.219542,0.062502,-0.059193,1.761083,0.060691
6,-0.691767,-0.52391,1.581548,-0.107994,-0.002239,-1.100957,-0.67335,0.060691
7,0.66393,-0.887406,1.62792,-0.107994,-0.325944,-0.059193,0.300423,0.060691
8,-1.800973,0.150471,0.259388,-0.107994,-0.131721,-0.059193,0.300423,0.060691


In [57]:
# prepare test set in the same way
oh_cols_test = oh_enc.fit_transform(train_data_drop[cat_cols])
oh_mat_test = oh_cols_test.toarray().astype(int)
test_data_num = test_data_all.drop(columns=cat_cols,axis=1).copy()
print(test_data_num.shape)

test_data_oh_df = pd.DataFrame(oh_mat_test,columns=new_cols,index=test_data_num.index)
test_data_tot = pd.concat((test_data_num,test_data_oh_df),axis=1)
test_data_tot.shape

test_data_sc = test_data_tot.copy()
test_data_sc[num_cols] = std_scaler.transform(test_data_tot[num_cols])
test_data_sc[num_cols].head()

(86868, 31)


ValueError: Shape of passed values is (94002, 38), indices imply (86868, 38)

In [None]:
test_data_cat[cat_cols]
test_cat_results

In [None]:
print('All finite in training? ',train_data_drop.notnull().values.all())
print('Any NaN in training? ',train_data_drop.isnull().values.any())
print('All finite in cat? ',train_data_cat.notnull().values.all())
print('Any NaN in cat? ',train_data_cat.isnull().values.any())

In [None]:
(train_data_cat.dtypes == 'int64').all()

In [None]:
# Split into training and validation sets
data_train_unscaled = train_data_cat.sample(frac=TRAIN_FRAC,random_state=RANDOM_STATE)
data_val_unscaled = train_data_cat.drop(data_train_unscaled.index,axis=0)
label_train = label_data_drop.sample(frac=TRAIN_FRAC,random_state=RANDOM_STATE)
label_val = label_data_drop.drop(label_train.index,axis=0)

In [None]:
# Scale the training data
#mm_scaler = preprocessing.StandardScaler()
#scale_cols = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id', 
#              'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage']
#data_train = data_train_unscaled.copy()
#data_train[scale_cols] = mm_scaler.fit_transform(data_train[scale_cols])
#data_val = data_val_unscaled.copy()
#data_val[scale_cols] = mm_scaler.transform(data_val[scale_cols])

In [None]:
# compute class weights
class_weights = compute_class_weight('balanced',
                                    classes=np.unique(label_train['damage_grade'].values),
                                    y=label_train['damage_grade'].values)
weights_dict = {np.unique(label_train['damage_grade'].values)[kk]:class_weights[kk] for kk in range(len(class_weights))}
weights_dict

In [None]:
# Train the classifier
cl_f = RandomForestClassifier(random_state = 1)
n_estimators = [925]
max_depth = [36]
min_samples_split = [5]
min_samples_leaf = [1] 

hyper_f = dict(n_estimators = n_estimators, 
               max_depth = max_depth,
               min_samples_split = min_samples_split,
               min_samples_leaf = min_samples_leaf)
grid_f = GridSearchCV(cl_f, hyper_f, cv = 3, verbose = 2, 
                      n_jobs = -1)
best_f = grid_f.fit(data_train_unscaled.values.astype(float), label_train.values[:,1].astype(int))

In [None]:
# Evaluate the classifier
pred_val = best_f.predict(data_val_unscaled)
print(classification_report(label_val.values[:,1], pred_val))

In [None]:
plot_confusion_matrix(best_f,data_val_unscaled,label_val.values[:,1].astype(int))

In [None]:
best_f.best_params_

In [None]:
# predict from test set
pred_test = best_f.predict(test_data_cat)

In [None]:
pred_test_df = pd.DataFrame(test_data_cat.building_id,columns=['building_id'])
pred_test_df['damage_grade'] = pred_test
pred_test_df.to_csv('data/submission.csv',index=False)

In [None]:
test_data_cat.building_id