In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, f1_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import GridSearchCV

In [None]:
# Training parameters

# Share of the labeled rows sampled into the training split; whatever is left
# over forms the validation split.
TRAIN_FRAC = 0.85

# Seed handed to the random sampling so the split is reproducible.
RANDOM_STATE = 123

In [None]:
# Load data: training features, training labels and test features are read
# from CSV files located in the current working directory.
train_data_all, label_data_all, test_data_all = (
    pd.read_csv(csv_path)
    for csv_path in ('train_values.csv', 'train_labels.csv', 'test_values.csv')
)

In [None]:
# Inspect columns: show the dtype of every column in the training features.
# Columns reported as `object` are the ones one_hot() will later encode.
train_data_all.dtypes

In [None]:
def one_hot(train_df):
    """Return a copy of ``train_df`` with every text column one-hot encoded.

    Each column holding text (``object`` dtype, or pandas' dedicated string
    dtype) is replaced by indicator columns named ``<col>_0``, ``<col>_1``, ...
    following the column order produced by ``pd.get_dummies``. All other
    columns keep their original order; the indicator columns are appended
    after them, grouped by source column in the order the source columns
    appear in ``train_df``.

    The input frame is NOT modified, so the cell calling this function can be
    re-run safely and the raw frame stays available to the caller.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame whose text columns should be encoded.

    Returns
    -------
    pandas.DataFrame
        New frame with text columns replaced by their indicator columns.
    """
    encoded_cols = []
    dummy_frames = []
    for col, dtype in train_df.dtypes.items():
        # Newer pandas versions may load text as a StringDtype instead of
        # `object`; both have to be encoded before the data is cast to float.
        is_text = pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)
        if not is_text:
            continue
        dummies = pd.get_dummies(train_df[col])
        # Keep the positional-suffix naming scheme (<col>_<k>).
        dummies.columns = [f"{col}_{k}" for k in range(dummies.shape[1])]
        encoded_cols.append(col)
        dummy_frames.append(dummies)

    # Build the result in one concat instead of mutating the caller's frame.
    return pd.concat([train_df.drop(columns=encoded_cols), *dummy_frames], axis=1)

# Encode the text columns of the training features, then list every resulting
# column next to its dtype to confirm nothing non-numeric is left.
train_data_all = one_hot(train_data_all)
for col, col_dtype in train_data_all.dtypes.items():
    print(col, '\t', col_dtype)

In [None]:
# One-hot encode the labels: the indicator columns derived from `damage_grade`
# are stored on the label frame under the names '1', '2' and '3'.
grade_dummies = pd.get_dummies(label_data_all['damage_grade'])
label_data_all[['1', '2', '3']] = grade_dummies
label_data_all.head()

In [None]:
# Split into training and validation sets
#
# The feature rows are sampled once and the label rows are then selected by
# that same index. Sampling features and labels independently only lines up
# when both frames happen to share length, index and seed; selecting by index
# keeps every feature row paired with its own label row by construction.
assert train_data_all.index.equals(label_data_all.index), \
    "features and labels must share the same index to be split consistently"

data_train_unscaled = train_data_all.sample(frac=TRAIN_FRAC, random_state=RANDOM_STATE)
data_val_unscaled = train_data_all.drop(index=data_train_unscaled.index)

# Labels follow the feature split row for row (same rows, same order).
label_train = label_data_all.loc[data_train_unscaled.index]
label_val = label_data_all.drop(index=label_train.index)

In [None]:
# Scale the training data
#
# Only the columns listed in `scale_cols` are standardised. The scaler is
# fitted on the training split alone and then applied to both splits, so the
# validation rows never influence the fitted statistics.
# Note: despite its name, `mm_scaler` holds a StandardScaler.
scale_cols = [
    'geo_level_1_id',
    'geo_level_2_id',
    'geo_level_3_id',
    'count_floors_pre_eq',
    'age',
    'area_percentage',
    'height_percentage',
]
mm_scaler = preprocessing.StandardScaler()
mm_scaler.fit(data_train_unscaled[scale_cols])

# Work on copies so the unscaled splits stay available.
data_train = data_train_unscaled.copy()
data_val = data_val_unscaled.copy()
data_train[scale_cols] = mm_scaler.transform(data_train[scale_cols])
data_val[scale_cols] = mm_scaler.transform(data_val[scale_cols])

In [None]:
# Compute 'balanced' class weights from the damage grades of the training split
# and expose them as a {grade: weight} mapping.
# NOTE(review): `weights_dict` is not passed to the classifier in the cells
# visible here — confirm whether it is meant to be used as `class_weight`.
train_grades = label_train['damage_grade'].values
grade_classes = np.unique(train_grades)
class_weights = compute_class_weight('balanced', classes=grade_classes, y=train_grades)
weights_dict = dict(zip(grade_classes, class_weights))
weights_dict

In [None]:
# Train the classifier: grid search over random-forest hyperparameters.
cl_f = RandomForestClassifier(random_state=1)

# Candidate values for each tuned hyperparameter. Every combination is tried,
# so the number of fits grows with the product of these list lengths.
n_estimators = [50, 100, 300, 500, 800]
max_depth = [5, 8, 15, 25, 35]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

hyper_f = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}

# 3-fold cross-validated search, parallelised with n_jobs=-1.
grid_f = GridSearchCV(estimator=cl_f, param_grid=hyper_f, cv=3, verbose=1, n_jobs=-1)

# Features are passed as a float array; the target is column 1 of the label
# frame cast to int (presumably `damage_grade` — TODO confirm column order).
train_features = data_train.values.astype(float)
train_targets = label_train.values[:, 1].astype(int)

# `fit` hands back the fitted search object, so `best_f` refers to the search.
best_f = grid_f.fit(train_features, train_targets)

In [None]:
# Evaluate the classifier on the held-out validation split.
#
# The fitted model is the grid-search object `best_f` from the training cell;
# no variable named `clf` is defined in this notebook.
# The validation inputs are prepared exactly like the training inputs: a float
# array of features and column 1 of the label frame cast to int.
val_features = data_val.values.astype(float)
val_targets = label_val.values[:, 1].astype(int)

pred_val = best_f.predict(val_features)

# NOTE(review): `plot_confusion_matrix` is what this notebook imports, but it
# was removed in scikit-learn 1.2; on newer versions switch to
# `ConfusionMatrixDisplay.from_estimator` (and update the import cell).
plot_confusion_matrix(best_f, val_features, val_targets)

# f1_score needs the true and predicted labels themselves (not their shapes).
# The target has three classes, so an explicit `average` is required; 'micro'
# aggregates over all samples — change it if per-class averaging is wanted.
print(f1_score(val_targets, pred_val, average='micro'))