In [None]:
import pandas as pd
import numpy as np
# Load the three CSV inputs used by this notebook, in the same order as before.
ownership, train, structure = (
    pd.read_csv(csv_name)
    for csv_name in ('Building_Ownership_Use.csv', 'train.csv', 'Building_Structure.csv')
)

In [None]:
# Preview the first rows of the ownership table.
ownership.head()

### From the initial analysis
* district_id, vdcmun_id and ward_id describe the geolocation of a building
* since each ward_id value is unique, it already determines both district_id and vdcmun_id
* <b>therefore we can drop the district_id and vdcmun_id features</b>
* it is logically understood that there is no relation between legal_ownership_status and building_grade
* <b>therefore let's drop the legal_ownership_status feature too</b>


In [None]:
# Remove the two coarse location ids and the legal-ownership column.
ownership = ownership.drop(
    columns=['district_id', 'vdcmun_id', 'legal_ownership_status']
)

In [None]:
# Per-column count of missing values in the ownership table.
ownership.isna().sum()


In [None]:
# Replace every missing value in the ownership table with 0.
ownership = ownership.fillna(0)

In [None]:
# Summary statistics of the ownership table after filling missing values.
ownership.describe()

### XGBoost
* For most structured datasets, tree-based algorithms are considered the best approach
* XGBoost is a tree-based algorithm that is considered the go-to choice for most classification and regression tasks on structured data
* Therefore we will be using XGBoost in this notebook
* Tree-based algorithms perform better on categorical data
* The ordering of labels introduced by label encoding matters less for trees

In [None]:

# Preview the first rows of the structure table.
structure.head()

In [None]:
# Per-column count of missing values in the structure table.
structure.isna().sum()

In [None]:
# Replace every missing value in the structure table with 0.
structure = structure.fillna(0)

In [None]:
# Summary statistics of the structure table after filling missing values.
structure.describe()

* The height of the building and the floor counts are highly correlated

In [None]:
# Drop the coarse location ids and both floor-count columns from the structure table.
structure = structure.drop(
    columns=[
        'district_id',
        'vdcmun_id',
        'count_floors_pre_eq',
        'count_floors_post_eq',
    ]
)

In [None]:
import seaborn as sns
# Box plot of the pre-earthquake height column, drawn along the x axis.
sns.boxplot(data=structure, x='height_ft_pre_eq')

In [None]:
# Frequency of each distinct value in condition_post_eq.
structure.condition_post_eq.value_counts()

In [None]:
# Keep only the rows whose height_ft_pre_eq is below 200.
# .copy() makes the filtered result an independent frame, so later column
# assignments on `structure` write to it directly instead of triggering
# pandas' SettingWithCopyWarning on a slice of the original table.
structure = structure[structure['height_ft_pre_eq'] < 200].copy()

In [None]:
# Box plot of building age along the x axis.
# The series is passed as `x=` explicitly: a bare positional argument is
# deprecated in seaborn 0.11 and is interpreted as `data` from 0.12 on, which
# would not match the height box plot drawn earlier in this notebook.
sns.boxplot(x=structure['age_building'])

In [None]:
# age_building uses 999 as a placeholder value; replace it with the median age.
AGE_PLACEHOLDER = 999
has_known_age = structure['age_building'] != AGE_PLACEHOLDER
# The median is taken over the non-placeholder rows only, so the 999 entries
# being replaced cannot pull the replacement value upwards.
replaceval = structure.loc[has_known_age, 'age_building'].median()
# assign() builds a new frame, which avoids writing into a filtered slice
# (and the SettingWithCopyWarning that comes with it).
structure = structure.assign(
    age_building=structure['age_building'].where(has_known_age, replaceval)
)

In [None]:
# Join the cleaned structure table with the train table on building_id
# (pandas' default inner join).
data = pd.merge(structure, train, on='building_id')

In [None]:
# Preview the first rows of the merged table.
data.head()

In [None]:
# Turn the damage_grade strings into integer class labels using their last character.
# The labels are stored zero-based (last digit minus 1) because current
# XGBClassifier versions only accept classes numbered 0..n_classes-1 and raise
# "Invalid classes inferred from unique values of `y`" for labels starting at 1.
# Add 1 to a label or prediction to get the original grade digit back.
# .str[-1] is the vectorised equivalent of applying `lambda x: x[-1]` row by row.
data['damage_grade'] = data['damage_grade'].str[-1].astype('int') - 1

In [None]:
# Derived feature: pre-earthquake height minus post-earthquake height.
data['height_diff'] = data['height_ft_pre_eq'].sub(data['height_ft_post_eq'])

In [None]:
# Feature table: the merged data without the identifier columns.
data_f = data.drop(columns=['vdcmun_id', 'building_id', 'ward_id'])

In [None]:
# Per-column count of missing values in the feature table.
data_f.isna().sum()

In [None]:
# Replace every remaining missing value in the feature table with 0.
data_f = data_f.fillna(value=0)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode every object-dtype column of the feature table.
# A single encoder instance is reused; fit_transform refits it for each column.
enc = LabelEncoder()
object_columns = [name for name in data_f.columns if data_f[name].dtype == 'O']
for col in object_columns:
    data_f[col] = enc.fit_transform(y=data_f[col])

In [None]:
from sklearn.decomposition import PCA
# Names of the has_* flag columns that are fed to the PCA step.
flag_columns = [
    'has_superstructure_adobe_mud',
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber',
    'has_superstructure_bamboo',
    'has_superstructure_rc_non_engineered',
    'has_superstructure_rc_engineered',
    'has_superstructure_other',
    'has_geotechnical_risk',
    'has_geotechnical_risk_fault_crack',
    'has_geotechnical_risk_flood',
    'has_geotechnical_risk_land_settlement',
    'has_geotechnical_risk_landslide',
    'has_geotechnical_risk_liquefaction',
    'has_geotechnical_risk_other',
    'has_geotechnical_risk_rock_fall',
    'has_repair_started',
]
# Sub-frame holding only those columns, in the order listed above.
temp = data_f[flag_columns]

In [None]:
# Project the selected flag columns onto their first principal component.
# random_state is fixed because scikit-learn may auto-select a randomized SVD
# solver for an input of this shape; without a seed the derived feature could
# differ between runs.
# NOTE(review): the PCA is fit on every row of `temp`, including rows that are
# later held out for evaluation.
pca = PCA(n_components=1, random_state=0)
result = pca.fit_transform(temp)

In [None]:
# Store the single principal-component score as a new 'PCA' feature column.
data_f['PCA'] = result[:, 0]

In [None]:
# Positional hold-out split: the first N_TRAIN_ROWS rows are used for training
# and the remaining rows for evaluation (no shuffling).
# NOTE: `train` is rebound here and no longer refers to the raw train.csv table
# loaded at the top of the notebook.
N_TRAIN_ROWS = 600000
train = data_f.iloc[:N_TRAIN_ROWS]
test = data_f.iloc[N_TRAIN_ROWS:]
# Separate the target column from the training features.
trainy = train['damage_grade']
trainx = train.drop(columns='damage_grade')


In [None]:
# Preview the first rows of the training split.
train.head()

In [None]:
from xgboost import XGBClassifier
# Train an XGBoost classifier with default hyper-parameters on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
model = XGBClassifier().fit(trainx, trainy)
# Show the fitted estimator as the cell output.
model

In [None]:
# Separate the held-out rows into target labels and feature matrix.
testy = test['damage_grade']
testx = test.drop(columns='damage_grade')


In [None]:
# Preview the first rows of the held-out feature matrix.
testx.head()

In [None]:
# Predict a damage-grade label for every held-out row.
predictions = model.predict(testx)
# Display the predicted labels as the cell output.
predictions

In [None]:
# Element-wise hit/miss mask between the held-out labels and the predictions.
score = testy == predictions
# Accuracy on the held-out rows as a percentage; this is the value the cell displays.
# (The former bare `score.sum()` line was discarded without being shown, so it is removed.)
score.sum() / len(predictions) * 100

In [None]:
# Raw importance scores of the fitted model, one value per training feature.
model.feature_importances_


In [None]:
# Print every feature whose importance exceeds 0.01, together with its score.
# Names are taken from `trainx.columns`, the frame the model was fitted on.
# `data_f.columns` still contains the 'damage_grade' target, so indexing it by
# importance position would mislabel every feature that follows that column.
for feature_name, importance in zip(trainx.columns, model.feature_importances_):
    if importance > 0.01:
        print(feature_name, importance)