In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# **Imports**

In [None]:
import os
import numpy as np
import pandas as pd

# No warnings about setting value on copy of slice
pd.options.mode.chained_assignment = None

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.size'] = 24

import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

#Preprocessing, model selection & metrics import
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import f1_score

#Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

In [None]:
# Folder that holds the competition CSV files.
root_path = (
    '/kaggle/input/'
    'richters-predictor-modeling-earthquake-damage'
)

# **Data Cleaning & Formatting**
## **Load data**

In [None]:
# Read the three competition files (train features, train labels, test features) from root_path.
train_values, train_labels, test_values = (
    pd.read_csv(os.path.join(root_path, csv_name))
    for csv_name in ('train_values.csv', 'train_labels.csv', 'test_values.csv')
)

In [None]:
# Report (rows, columns) of the train and test feature tables.
for split_name, frame in (('Train', train_values), ('Test', test_values)):
    print(f'{split_name} shape: {frame.shape}')

In [None]:
# Display the first 5 rows of the training features
train_values.head(n=5)

The data contains details about buildings/constructions damaged by an earthquake that occurred in a region.

For each building we are given geographical details, the number of floors, the age of the construction, its height, the land surface condition, the materials used for the construction, etc.

Here we have supervised data, the label/target is damage level indicated as value 1, 2, & 3.

This comes under *multiclass* or *multinomial* classification where we need to predict one class from more than two classes.

## Data types & missing values

In [None]:
# Summary of the training features: column dtypes and non-null counts.
train_values.info()

Most of the columns are numeric; 8 columns are categorical features which need to be converted to numeric later in order to train a model.

In [None]:
# Descriptive statistics of the numeric columns, transposed so each feature is a row.
train_values.describe().transpose()

## Missing values

Data is clean without missing values or nan, so we can proceed further for EDA.

In [None]:
# Attach the label/target column to the features, matching rows on building_id
train = train_values.merge(train_labels, on='building_id')

# **Exploratory Data Analysis**

To start the data analysis we begin with the target variable. It has ordinal values, so let us draw a count plot for it.

In [None]:
# Bar chart of how many buildings fall into each damage grade.
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(8,8))
# Pass the series as `x=`: recent seaborn releases reserve the first
# positional argument of countplot for `data`, so a bare positional series
# is no longer interpreted as the x variable.
sns.countplot(x=train_labels['damage_grade'], ax=ax)
ax.set_xlabel('Values')
ax.set_ylabel('counts')
ax.set_title('Damage grade');

In [None]:
# Percentage share of each damage grade in the training set, drawn as a pie chart.
grade_counts = train['damage_grade'].value_counts()
percent = list(grade_counts / len(train['damage_grade']) * 100)
label = list(grade_counts.index)
explode = (0.1,0,0)  # offset only the first slice
fig, ax1 = plt.subplots()
ax1.pie(x=percent, explode=explode, labels=label, autopct='%1.1f%%', shadow=True)
ax1.axis('equal')
plt.show()

Damage category 2 & 3 are higher than 1

### Density plot on continuous value features

In [None]:
#continuous values
cont_values = ['geo_level_1_id','geo_level_2_id','geo_level_3_id','age','area_percentage','height_percentage']

def densityPlot(cont_values):
  """Draw one KDE panel per column, with a separate curve for each damage grade.

  Parameters
  ----------
  cont_values : list of str
      Names of columns of the notebook-level `train` frame to plot.

  The panels are laid out two per row; the number of rows is derived from
  the number of columns instead of being fixed at three. Each panel gets a
  title and a legend, because the `label=` passed to kdeplot is not shown
  unless a legend is requested.
  """
  plt.style.use('fivethirtyeight')
  n_cols = 2
  n_rows = max(1, -(-len(cont_values) // n_cols))  # ceiling division
  fig = plt.figure(figsize=(18,16))
  for i,txt in enumerate(cont_values):
    ax = fig.add_subplot(n_rows,n_cols,i+1)
    for grade in (1, 2, 3):
      sns.kdeplot(train.loc[train['damage_grade'] == grade, txt], ax=ax, label=f'damage_grade=={grade}')
    ax.set_title(txt)
    ax.legend()
  plt.show()
densityPlot(cont_values)

From plotting the continuous features with kdeplot (kernel density plot), we can see a few differences in pattern between the damage grades for the columns `geo_level_1_id` and `geo_level_2_id`, which should help prediction.

### Count plot on binary features

In [None]:
# Binary columns are the ones whose name starts with 'has'
bin_cols = train.filter(regex='^has').columns

def countPlot(bin_cols):
  """Draw a count plot per column of `train`, split by damage grade.

  Panels are placed on a fixed 8x3 grid in the order the columns are given.
  Also sets the global matplotlib font size to 18.
  """
  plt.rcParams['font.size'] = 18
  plt.style.use('fivethirtyeight')
  fig = plt.figure(figsize=(20,37))
  for slot, column in enumerate(bin_cols, start=1):
    panel = fig.add_subplot(8,3,slot)
    sns.countplot(x=train[column], hue=train['damage_grade'], ax=panel)
  plt.show()
countPlot(bin_cols)

1. All binary columns have more zero than 1, except `has_superstructure_cement_mortar_stone`
2. Some columns seem to contain only a single value (all 0's); this needs to be checked.

### Count plot on categorical columns

In [None]:
# Categorical (object-dtype) columns. The `np.object` alias was removed from
# NumPy, so the dtype is selected by its name instead.
cat_cols = train.select_dtypes(include='object').columns

def catPlot(cat_cols):
  """Draw a count plot per categorical column of `train`, split by damage grade.

  Parameters
  ----------
  cat_cols : iterable of str
      Column names of the notebook-level `train` frame; at most 9 fit on the
      fixed 3x3 grid.

  Also sets the global matplotlib font size to 18.
  """
  plt.rcParams['font.size'] = 18
  plt.style.use('fivethirtyeight')
  fig = plt.figure(figsize=(18,15))
  for i,txt in enumerate(cat_cols):
    ax = fig.add_subplot(3,3,i+1)
    sns.countplot(x=train[txt], ax=ax, hue=train['damage_grade'])
  plt.show()
catPlot(cat_cols)

### Feature engineering

As a first step we need to concatenate the train & test data so that the same feature engineering is applied to both.

In [None]:
# Stack train (with labels) on top of test and renumber the rows 0..n-1.
df_full = pd.concat([train, test_values], axis=0, ignore_index=True)
df_full.shape

Function to merge new columns generated through feature engineering

In [None]:
def merge_by_concat(df1, df2, merge_on):
  """Append the non-key columns of `df2` to `df1`, matched on `merge_on`.

  Parameters
  ----------
  df1 : pandas.DataFrame
      Frame to extend; its rows, order and index are kept.
  df2 : pandas.DataFrame
      Lookup frame holding the key columns plus the new columns.
  merge_on : list of str
      Key column names present in both frames.

  Returns
  -------
  pandas.DataFrame
      `df1` with the extra columns of `df2` added on the right (NaN where a
      key has no match in `df2`).
  """
  merged_gf = df1[merge_on].merge(df2, on=merge_on, how='left')
  new_columns = [col for col in merged_gf.columns if col not in merge_on]
  if len(merged_gf) == len(df1):
    # The left merge returns rows in df1's order but with a fresh 0..n-1
    # index. Restore df1's index so the axis=1 concat pairs rows one-to-one
    # instead of aligning on mismatched labels and padding with NaN.
    merged_gf.index = df1.index
  return pd.concat([df1, merged_gf[new_columns]], axis=1)

Now let us create new columns by grouping *geographical* columns.
1. Creating mean `age`,`height_percentage`,`area_percentage` from grouping geographic level columns `geo_level_1_id`,`geo_level_2_id`,`geo_level_3_id`

In [None]:
# Mean age / height / area of the buildings sharing the same three geo level ids.
geo_keys = ['geo_level_1_id','geo_level_2_id','geo_level_3_id']
geo_groups = df_full.groupby(geo_keys)
geo_age_mean = geo_groups['age'].mean().reset_index(name='geo_grp_age_mean')
geo_height_mean = geo_groups['height_percentage'].mean().reset_index(name='geo_grp_height_mean')
geo_area_mean = geo_groups['area_percentage'].mean().reset_index(name='geo_grp_area_mean')

Now let us create new columns by grouping different types of *roof* columns

2. Creating the mean of `age`, `height_percentage`, `area_percentage` by grouping on the foundation, roof & floor columns `foundation_type`, `roof_type`, `ground_floor_type`, `other_floor_type`

In [None]:
# Mean age / height / area of the buildings sharing the same foundation, roof and floor types.
type_keys = ['foundation_type','roof_type','ground_floor_type','other_floor_type']
type_groups = df_full.groupby(type_keys)
type_age_mean = type_groups['age'].mean().reset_index(name='type_grp_age_mean')
type_height_mean = type_groups['height_percentage'].mean().reset_index(name='type_grp_height_mean')
type_area_mean = type_groups['area_percentage'].mean().reset_index(name='type_grp_area_mean')

In [None]:
#Merge the newly created columns
geo_merge_keys = ['geo_level_1_id','geo_level_2_id','geo_level_3_id']
type_merge_keys = ['foundation_type','roof_type','ground_floor_type','other_floor_type']

# (group-mean table, key columns it is matched on), applied in this order
merge_plan = [
  (geo_age_mean, geo_merge_keys),
  (geo_height_mean, geo_merge_keys),
  (geo_area_mean, geo_merge_keys),
  (type_age_mean, type_merge_keys),
  (type_height_mean, type_merge_keys),
  (type_area_mean, type_merge_keys),
]
for group_means, keys in merge_plan:
  df_full = merge_by_concat(df_full, group_means, keys)

Let us check whether any binary column has only a single value, because such columns will not help the model.

In [None]:
# For every column whose name contains 'has', collect the count of each
# distinct value (ordered by value) and show them as one table row per column.
bin_cols = df_full.columns[df_full.columns.str.contains('has')]
colname = list(bin_cols)
val = [df_full[bcol].value_counts().sort_index().values for bcol in bin_cols]
pd.DataFrame(val, index=colname)

Our assumption was wrong: there is no column with a single value; every binary column has both 0 & 1.

Now let us apply the frequency encoding technique to the categorical columns (each category is replaced by the number of times it occurs) as a further feature engineering step.

In [None]:
# Categorical columns to be frequency-encoded
freq_cols = [
  'land_surface_condition',
  'foundation_type',
  'roof_type',
  'ground_floor_type',
  'other_floor_type',
  'position',
  'plan_configuration',
  'legal_ownership_status',
]

def frequency_encode(cols, df, self_encoding=False):
  """Frequency-encode columns: map each value to how often it occurs in its column.

  Parameters
  ----------
  cols : iterable of str
      Columns of `df` to encode.
  df : pandas.DataFrame
      Input frame. It is NOT modified; the encoding is applied to a copy, so
      calling this again on the same input gives the same result instead of
      encoding the already-encoded counts.
  self_encoding : bool, default False
      If True, each column is replaced by its counts. If False, the counts
      are stored in a new column named `<col>fq_enc` and the original column
      is kept.

  Returns
  -------
  pandas.DataFrame
      A new frame with the encoded columns.
  """
  encoded = df.copy()
  for c in cols:
    fq_dict = encoded[c].value_counts().to_dict()
    target = c if self_encoding else c+'fq_enc'
    encoded[target] = encoded[c].map(fq_dict)
  return encoded
# Replace each listed categorical column with the occurrence count of its value.
df_full_freq = frequency_encode(
  cols=freq_cols,
  df=df_full,
  self_encoding=True,
)

We have finished feature engineering; as a next step we split the data back into train & test for further processing.

In [None]:
# The first len(train_values) rows are the training rows; everything after is test.
n_train_rows = train_values.shape[0]
train_new = df_full_freq.iloc[:n_train_rows]
test_new = df_full_freq.iloc[n_train_rows:]

As a final step we need to check whether any columns in our training data are collinear with each other.

In [None]:
# Pairwise correlation between the feature columns (id and target excluded).
corr_matrix = train_new.drop(columns=['building_id','damage_grade']).corr()
# Keep only the strict upper triangle so every pair is examined once.
# The `np.bool` alias was removed from NumPy; the builtin `bool` is the
# replacement.
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_mask)
# A column is flagged when its absolute correlation with any earlier column exceeds 0.95.
to_drop = [column for column in upper.columns if any(abs(upper[column]) > 0.95)]
print('Number of columns having collinearity with other columns other than target: ', len(to_drop))

## Scaling data

We will scale all columns so that no column gets undue weight merely because of its numeric range: binary columns only contain 0 & 1, whereas the geographic columns contain values with up to 4 or more digits.

In [None]:
# Columns that are not model inputs: the row id and the target.
non_feature_cols = ['building_id','damage_grade']

scaler = StandardScaler()

X = train_new.drop(columns=non_feature_cols)
features = X.columns
test_new = test_new.drop(columns=non_feature_cols)

y = train_new['damage_grade']

# Fit the scaler on the training features only, then apply it to both sets.
sc = scaler.fit(X)
temp_train_X = sc.transform(X)
test_scaled = sc.transform(test_new)

# Wrap the scaled arrays back into frames that carry the feature names.
X, test = (
  pd.DataFrame(scaled, columns=features)
  for scaled in (temp_train_X, test_scaled)
)

# Model creation

### Function to check f1 score of model

In [None]:
def check_model_f1_score(model):
  """Fit `model` on the notebook-level X_train / y_train and score it.

  Returns the micro-averaged F1 score of the model's predictions on
  X_test against y_test. The model is (re)fitted on every call.
  """
  model.fit(X_train, y_train)
  predictions = model.predict(X_test)
  score = f1_score(y_true=y_test, y_pred=predictions, average='micro')
  return score

Splitting the train data: 70% is used for training & the remaining 30% as test data for validating our model.

In [None]:
# Hold out 30% of the labelled rows for validation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.3,
  random_state=42,
)

Let us create a base prediction by guessing the damage grade.
Here we use value_counts and select the grade with the highest count as the guessed damage grade for every building.
The score of this guess is our baseline: a trained model needs to score higher than it to be useful.

In [None]:
# Baseline: always predict the most frequent damage grade of the hold-out labels.
holdout_grade_counts = y_test.value_counts()
grade_guess = holdout_grade_counts.index[0]
base_prediction = np.full(np.shape(y_test), grade_guess)
baseline_f1 = f1_score(y_test, base_prediction, average='micro')
print('F1 score of base prediction with guess value: ', baseline_f1)

In [None]:
# First learned model: multinomial logistic regression with otherwise default settings.
lr = LogisticRegression(multi_class='multinomial')
lr_score = check_model_f1_score(model=lr)
print('Basic LOGISTIC REGRESSION model with default params: ', lr_score)

Our initial logistic model has crossed the base model score, so we can proceed to try other types of models, such as ensembles, for a better score.

In [None]:
# Decision tree with default hyperparameters. The seed (same value as the
# train/test split) makes the fitted tree, and therefore the reported score,
# repeatable between runs.
dc = DecisionTreeClassifier(random_state=42)
dc_score = check_model_f1_score(dc)
print('DECISION CLASSIFIER model with default params: ', dc_score)

In [None]:
# Random forest with default hyperparameters. The seed (same value as the
# train/test split) fixes the bootstrap samples and feature subsets, so the
# reported score is repeatable between runs.
rc = RandomForestClassifier(random_state=42)
rc_score = check_model_f1_score(rc)
print('Basic RANDOM FOREST model with default params: ', rc_score)

In [None]:
# k-nearest-neighbours classifier with default settings.
knc = KNeighborsClassifier()
knc_score = check_model_f1_score(model=knc)
print('KNEIGHBORS model with default params: ', knc_score)

In [None]:
# Gradient-boosted trees (XGBoost) with default settings.
xg = xgb.XGBClassifier()
xg_score = check_model_f1_score(model=xg)
print('XGB model with default params: ', xg_score)

In [None]:
plt.style.use('fivethirtyeight')
model_comparison = pd.DataFrame({'model':['Logistic classifier','Decision Tree classifier',
                       'Random forest classifier', 'kneighbors classifier',
                       'xgb classifier'], 'f1_score':[lr_score,dc_score,rc_score,
                                                      knc_score,xg_score]})

# Horizontal bar chart of the hold-out F1 score of each model.
# The bars are drawn on an explicitly created axes: without `ax=`,
# DataFrame.plot opens its own figure, which leaves an empty figure behind and
# ignores the requested figsize.
fig, ax = plt.subplots(figsize=(8,6))
model_comparison.sort_values('f1_score', ascending = False).plot(x = 'model', y = 'f1_score', kind = 'barh',
                                                           color = 'red', edgecolor = 'black', ax = ax)

# Plot formatting
plt.ylabel(''); plt.yticks(size = 14); plt.xlabel('F1 score'); plt.xticks(size = 14)
plt.title('Model Comparison on Test F1 score', size = 20);

From the model comparison plot, it is clear that the *Random forest classifier* outperforms the other models, so let us proceed to fine-tune its hyperparameters for further analysis.

Fine-tuning the Random forest hyperparameters did not improve the score, so let us use another model for prediction.

In [None]:
# Hand-picked hyperparameters for the tuned XGBoost classifier.
xgb_params = dict(
    n_estimators=2000,
    learning_rate=0.02,
    max_depth=12,
    subsample=0.8,
    colsample_bytree=0.4,
    objective='multi:softmax',
    num_class=3,
    eval_metric='mlogloss',
    # NOTE(review): -1 is treated as "missing"; the features are standard-scaled
    # at this point, so confirm this sentinel is intended.
    missing=-1,
    tree_method='hist',
    nthread=4,
)
clf = xgb.XGBClassifier(**xgb_params)

In [None]:
# Train while monitoring the hold-out set: log every 50 rounds and stop once
# the evaluation metric has not improved for 100 consecutive rounds.
clf.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=100,
    verbose=50,
)

In [None]:
# Score the classifier exactly as it was fitted in the previous cell (with
# early stopping). check_model_f1_score() is not used here because it calls
# fit() again without an eval_set, which throws the early-stopped model away
# and retrains for all n_estimators rounds.
xgb_score = f1_score(y_test, clf.predict(X_test), average='micro')
print('XGB score with tuned params: ', xgb_score)