In [77]:
# include tools
%matplotlib inline

from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# load data info and path
# Every competition CSV lives under DATA_DIR and is indexed by building_id,
# so the three frames are read with one shared call pattern.
DATA_DIR = Path('data')
train_values, train_labels, test_values = (
    pd.read_csv(DATA_DIR / csv_name, index_col='building_id')
    for csv_name in ('train_values.csv', 'train_labels.csv', 'test_values.csv')
)

# Show the column dtypes of the training features as the cell output.
train_values.dtypes

In [None]:
# explore the data
# Count how many buildings fall into each damage grade, ordered by grade,
# then draw the counts as a bar chart.
damage_grade_counts = train_labels['damage_grade'].value_counts().sort_index()
damage_grade_counts.plot.bar(title="Number of Buildings with Each Damage Grade")

In [None]:
# a quick look at the relationships between our numeric features and labels
# train_values also contains categorical (object) columns; DataFrame.corr()
# raises on those in pandas >= 2.0, so restrict the joined frame to numeric
# and boolean columns before computing the correlation matrix.
numeric_features_and_labels = (train_values.join(train_labels)
                                           .select_dtypes(include=['number', 'bool']))
sns.heatmap(numeric_features_and_labels.corr(), annot=False, fmt=".2f")
plt.show()

In [None]:
# for preprocessing the data
from sklearn.preprocessing import StandardScaler

# the model
from sklearn.ensemble import RandomForestClassifier

# for combining to preprocess with model training
from sklearn.pipeline import make_pipeline

# for optimizing the hyperparameters of the pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
# select features and preprocess the data
# Each column is listed exactly once: a repeated name would make
# DataFrame[selected_features] return duplicate columns and hand the model
# the same feature twice.
selected_features = ['age',
                     'geo_level_1_id',
                     'geo_level_2_id',
                     'geo_level_3_id',
                     'area_percentage',
                     'height_percentage',
                     'ground_floor_type',
                     'roof_type',
                     'other_floor_type',
                     'position',
                     'foundation_type',
                     'count_floors_pre_eq',
                     'land_surface_condition',
                     'has_superstructure_cement_mortar_stone',
                     'has_superstructure_mud_mortar_brick',
                     'count_families',
                     'has_secondary_use',
                     'plan_configuration']

# select the subset of features we used to train the model and create dummy variables.
# Drop any repeated feature name (first occurrence wins, order preserved) so the
# selected frames have unique column labels.
feature_columns = list(dict.fromkeys(selected_features))
train_values_subset = pd.get_dummies(train_values[feature_columns])

# select test values
# The test set is one-hot encoded on its own, so its dummy columns can differ from
# the training ones. Align it to the training layout: categories absent from the
# test data become all-zero columns, categories never seen in training are dropped,
# and the column order matches what the model is fitted on.
test_values_subset = (pd.get_dummies(test_values[feature_columns])
                        .reindex(columns=train_values_subset.columns, fill_value=0))

In [None]:
# training pipeline
# Two steps: standardise the features, then fit a random forest with a fixed
# seed so repeated runs build the same trees.
scaler_step = StandardScaler()
forest_step = RandomForestClassifier(random_state=2018)
pipe = make_pipeline(scaler_step, forest_step)
pipe

In [None]:
# hyperparameter search
# Exhaustive 5-fold cross-validated search over forest size and minimum leaf
# size; the grid keys are prefixed with the random-forest step of the pipeline.
param_grid = {
    'randomforestclassifier__n_estimators': [50, 100],
    'randomforestclassifier__min_samples_leaf': [1, 5],
}
gs = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
# Labels are flattened to a 1-D array before fitting.
gs.fit(train_values_subset, train_labels.values.ravel())
gs.best_params_

In [None]:
# calculate train F1 micro score
from sklearn.metrics import f1_score

# Predict on the same rows the search was fitted on; this is an in-sample
# score, not an estimate of performance on unseen data.
in_sample_preds = gs.predict(train_values_subset)
train_f1_micro = f1_score(y_true=train_labels,
                          y_pred=in_sample_preds,
                          average='micro')
train_f1_micro

In [None]:
# make prediction
predictions = gs.predict(test_values_subset)

# save submission
# The submission template supplies the building_id index and the column
# header; the predictions are placed into it positionally.
submission_format = pd.read_csv(DATA_DIR / 'submission_format.csv',
                                index_col='building_id')
my_submission = pd.DataFrame(predictions,
                             index=submission_format.index,
                             columns=submission_format.columns)
my_submission.head()

In [None]:
# save to submission
my_submission.to_csv('submission.csv')
# check the head of the saved file
# Read back only the first rows with pandas instead of shelling out to `type`,
# which is available only in the Windows shell and prints the entire file.
pd.read_csv('submission.csv', index_col=0, nrows=5)