In [7]:
# All you may need as the IMPORTS :)
# Basics
# This ensures that sklearn 0.24 is loaded on Google colab
#!pip uninstall scikit-learn -y
#!pip install -U scikit-learn
import pandas as pd
import numpy as np
import mlxtend

# Data Ingestion
import json
import os
from sklearn.datasets import make_classification

#Plot
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.plotting import plot_decision_regions

# Data Preprocessing
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, StandardScaler, Normalizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

# Splitting
from sklearn.model_selection import train_test_split, cross_val_score

# Pipelines and Models
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier

# Performance
import statistics
from sklearn.metrics import confusion_matrix, accuracy_score, plot_confusion_matrix, classification_report
from mlxtend.evaluate import bootstrap_point632_score

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
#Load Earthquake datasets  here
damage = pd.read_csv('drive/MyDrive/ADS504/csv_building_damage_assessment_featex.csv')
own = pd.read_csv('drive/MyDrive/ADS504/csv_building_ownership_and_use.csv')
struc = pd.read_csv('drive/MyDrive/ADS504/csv_building_structure.csv')


In [10]:
print(damage.columns)
print(own.columns)
print(struc.columns)

Index(['building_id', 'district_id', 'vdcmun_id', 'ward_id',
       'damage_overall_collapse', 'damage_overall_leaning', 'area_assesed',
       'damage_grade', 'technical_solution_proposed', 'has_geotechnical_risk',
       'has_geotechnical_risk_land_settlement',
       'has_geotechnical_risk_fault_crack',
       'has_geotechnical_risk_liquefaction', 'has_geotechnical_risk_landslide',
       'has_geotechnical_risk_rock_fall', 'has_geotechnical_risk_flood',
       'has_geotechnical_risk_other'],
      dtype='object')
Index(['building_id', 'district_id', 'vdcmun_id', 'ward_id',
       'legal_ownership_status', 'count_families', 'has_secondary_use',
       'has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_industry',
       'has_secondary_use_health_post', 'has_secondary_use_gov_office',
       'has_secondary_use_use_police', 'has_secondary_use_other'],
      

#Verify building_ids are all unique.
#Verify other information that should be identical, ARE identical before data merge

In [11]:
print(len(damage['building_id'].unique()))
print(len(own['building_id'].unique()))
print(len(struc['building_id'].unique()))

damage_building_id = pd.DataFrame(damage['building_id'])
own_building_id = pd.DataFrame(own['building_id'])
struc_building_id = pd.DataFrame(struc['building_id'])

print(damage_building_id.equals(own_building_id))
print(own_building_id.equals(struc_building_id))
print(struc_building_id.equals(damage_building_id))

print(damage_building_id.compare(own_building_id))
print(own_building_id.compare(struc_building_id))
print(struc_building_id.compare(damage_building_id))

'''
damage_assessment_featex is missing 366707000161 at position 761609
ownership is missing 120306000341 at position 42
structure is missing 120306000341  at position 42
'''

762105
762105
762105
False
True
False
         building_id              
                self         other
118     1.203060e+11  1.201020e+11
119     1.201020e+11  1.201020e+11
120     1.201020e+11  1.201020e+11
121     1.201020e+11  1.201020e+11
122     1.201020e+11  1.201020e+11
...              ...           ...
761541  3.667060e+11  3.667060e+11
761542  3.667060e+11  3.667060e+11
761543  3.667060e+11  3.667060e+11
761544  3.667060e+11  3.667060e+11
761545  3.667070e+11  3.667060e+11

[696264 rows x 2 columns]
Empty DataFrame
Columns: []
Index: []
         building_id              
                self         other
118     1.201020e+11  1.203060e+11
119     1.201020e+11  1.201020e+11
120     1.201020e+11  1.201020e+11
121     1.201020e+11  1.201020e+11
122     1.201020e+11  1.201020e+11
...              ...           ...
761541  3.667060e+11  3.667060e+11
761542  3.667060e+11  3.667060e+11
761543  3.667060e+11  3.667060e+11
761544  3.667060e+11  3.667060e+11
761545  3.667060e+11  

'\ndamage_assessment_featex is missing 366707000161 at position 761609\nownership is missing 120306000341 at position 42\nstructure is missing 120306000341  at position 42\n'

In [12]:
struc_dr = struc.drop(columns = ['height_ft_post_eq', 'condition_post_eq', 
                                 'technical_solution_proposed', 'count_floors_post_eq'])
own_dr = own.drop(columns = ['district_id', 'vdcmun_id', 'ward_id'])
damage_dr = damage.drop(columns=['district_id', 'vdcmun_id', 'ward_id',
                              'damage_overall_collapse', 'damage_overall_leaning',
                              'damage_grade', 'technical_solution_proposed'])

struc_own = pd.merge(struc_dr, own_dr, how="inner", on='building_id')
print(struc_own.shape)
full = pd.merge(struc_own, damage_dr, how="inner", on='building_id')
print(full.shape)
cols = full.columns
print(full.columns)
full.head()

(762105, 40)
(762104, 49)
Index(['building_id', 'district_id', 'vdcmun_id', 'ward_id',
       'count_floors_pre_eq', 'age_building', 'plinth_area_sq_ft',
       'height_ft_pre_eq', 'land_surface_condition', 'foundation_type',
       'roof_type', 'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'damage_grade', 'legal_ownership_status', 'count_families',
       'has_secondary_use', 'has_secondary_use_agriculture',
       'has_secondary_use_hotel', 'has_secondary_use_rental',
       'has_secondary_use_institution', 'has_second

Unnamed: 0,building_id,district_id,vdcmun_id,ward_id,count_floors_pre_eq,age_building,plinth_area_sq_ft,height_ft_pre_eq,land_surface_condition,foundation_type,...,has_secondary_use_other,area_assesed,has_geotechnical_risk,has_geotechnical_risk_land_settlement,has_geotechnical_risk_fault_crack,has_geotechnical_risk_liquefaction,has_geotechnical_risk_landslide,has_geotechnical_risk_rock_fall,has_geotechnical_risk_flood,has_geotechnical_risk_other
0,120101000000.0,12,1207,120703,1,9,288,9,Flat,Other,...,0,Both,0.0,0,0,0,0,0,0,0
1,120101000000.0,12,1207,120703,1,15,364,9,Flat,Other,...,0,Exterior,0.0,0,0,0,0,0,0,0
2,120101000000.0,12,1207,120703,1,20,384,9,Flat,Other,...,0,Both,0.0,0,0,0,0,0,0,0
3,120101000000.0,12,1207,120703,1,20,312,9,Flat,Other,...,0,Both,0.0,0,0,0,0,0,0,0
4,120101000000.0,12,1207,120703,1,30,308,9,Flat,Other,...,0,Exterior,0.0,0,0,0,0,0,0,0


In [13]:
for col in cols:
  print(full[col].value_counts())

1.201010e+11    1
3.002090e+11    1
3.002090e+11    1
3.002090e+11    1
3.002090e+11    1
               ..
2.321070e+11    1
2.321070e+11    1
2.321070e+11    1
2.321070e+11    1
3.667090e+11    1
Name: building_id, Length: 762104, dtype: int64
24    98019
31    90994
30    89122
23    88741
36    78073
28    77148
20    68750
22    60639
21    58623
12    39351
29    12644
Name: district_id, dtype: int64
3104    32696
2005    15659
3009    15239
2802    15204
2001    15002
        ...  
2406     2746
2905     2611
2903     2589
2902     2224
2901     1981
Name: vdcmun_id, Length: 110, dtype: int64
310405    2584
310404    2440
310412    2337
200506    2048
310411    2001
          ... 
360501     221
290204     182
360601     173
280801     160
290202     159
Name: ward_id, Length: 945, dtype: int64
2    468112
3    166993
1    118900
4      6085
5      1569
6       332
7        88
9        13
8        12
Name: count_floors_pre_eq, dtype: int64
15     49983
20     46525
10     39794


In [14]:
full = full.dropna()

In [15]:
full.loc[full['has_geotechnical_risk'] < 0.5, 'has_geotechnical_risk'] = -1
full.loc[full['has_geotechnical_risk'] > 0.5, 'has_geotechnical_risk'] = 0
full.loc[full['has_geotechnical_risk'] <-0.5, 'has_geotechnical_risk'] = 1

full.loc[full['has_secondary_use'] < 0.5, 'has_secondary_use'] = -1
full.loc[full['has_secondary_use'] > 0.5, 'has_secondary_use'] = 0
full.loc[full['has_secondary_use'] <-0.5, 'has_secondary_use'] = 1

full = full.rename(columns={'has_geotechnical_risk': 'no_geo_risk', 'has_secondary_use': 'no_2ary_use'})

full = full[full.area_assesed != 'Not able to inspect']
full = full.drop(columns = ['area_assesed'])
full = full.drop(columns = ['building_id'])
print(full['no_geo_risk'].value_counts())
print(full['no_2ary_use'].value_counts())
full['damage_grade'].value_counts()

1.0    634246
0.0    104960
Name: no_geo_risk, dtype: int64
1.0    648751
0.0     90455
Name: no_2ary_use, dtype: int64


Grade 5    254338
Grade 4    182756
Grade 3    136228
Grade 2     87195
Grade 1     78689
Name: damage_grade, dtype: int64

In [16]:
train = full.sample(frac=0.5, random_state = 42)
test = full.drop(train.index)
X_train = train.drop(labels = 'damage_grade', axis = 1)
y_train = train.damage_grade
X_test = test.drop(labels = 'damage_grade', axis = 1)
y_test = test.damage_grade

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
print(full.columns)
X_test.head

(369603, 46) (369603,)
(369603, 46) (369603,)
Index(['district_id', 'vdcmun_id', 'ward_id', 'count_floors_pre_eq',
       'age_building', 'plinth_area_sq_ft', 'height_ft_pre_eq',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'damage_grade', 'legal_ownership_status', 'count_families',
       'no_2ary_use', 'has_secondary_use_agriculture',
       'has_secondary_use_hotel', 'has_secondary_use_rental',
       'has_secondary_use_institution', 'has_seconda

<bound method NDFrame.head of         district_id  vdcmun_id  ward_id  count_floors_pre_eq  age_building  \
1                12       1207   120703                    1            15   
3                12       1207   120703                    1            20   
4                12       1207   120703                    1            30   
5                12       1207   120703                    1            18   
8                12       1207   120703                    1            22   
...             ...        ...      ...                  ...           ...   
762096           36       3603   360302                    2            13   
762097           36       3603   360302                    2             9   
762098           36       3603   360302                    2            13   
762099           36       3603   360302                    2            60   
762100           36       3603   360302                    2            35   

        plinth_area_sq_ft  height

In [17]:
categorical = ['district_id', 'vdcmun_id',
               'ward_id', 'land_surface_condition',
               'foundation_type', 'roof_type',
               'ground_floor_type',  'other_floor_type',
               'position', 'plan_configuration',
               'legal_ownership_status']
numerical = ['count_floors_pre_eq', 'age_building',
             'plinth_area_sq_ft', 'height_ft_pre_eq']

binary_cols = []

#Pipeline Code for single var type

In [15]:
#SGD_hinge = make_pipeline(StandardScaler(), SGDClassifier(loss = 'hinge', max_iter=1000, tol=1e-3))
#SGD_hinge.fit(X, y)
#Pipeline(steps=[('standardscaler', StandardScaler()),
#                ('sgdclassifier', SGDClassifier())])
#print(SGD_hinge.predict([[-0.8, -1]]))

In [18]:
full.isna().sum().sum()

0

#Multiple Pipelines

In [19]:
# Categorical pipeline
cat_pipe = Pipeline([('encoder', OneHotEncoder(handle_unknown='ignore',sparse=False))])
# Simple Imputer pipeline
#impute_pipe = Pipeline([('imputer', SimpleImputer(missing_values=np.nan,strategy='median'))])
# Normalizer pipeline
norm_pipe = Pipeline(steps = [('imputer', SimpleImputer(missing_values=np.nan,strategy='median')), ('normalizer', Normalizer())])
# Scaler pipeline
#scaler_pipe = Pipeline(steps = [('imputer', SimpleImputer(missing_values=np.nan,strategy='median')), ('scaler', StandardScaler())])

# PIPELINE 1: Simple Imputer pipeline
#impute_model = make_pipeline(ColumnTransformer([('cat', cat_pipe, categorical),('num', impute_pipe, numerical)]), 
#                             Perceptron(class_weight='balanced'))
#impute_model.fit(X_train, y_train)
#print(impute_model.score(X_test, y_test))
#simple_imputation_CVscores = cross_val_score(impute_model, X_test, y_test, cv=5, scoring='accuracy')

# PIPELINE 2: Normalize pipeline:
normalizer_model = make_pipeline(ColumnTransformer(transformers=[('cat', cat_pipe, categorical), ('num', norm_pipe, numerical)]),
                                 Perceptron(class_weight='balanced'))
normalizer_model.fit(X_train, y_train)
print(normalizer_model.score(X_test, y_test))
normalizer_CVscores = cross_val_score(normalizer_model, X_test, y_test, cv=5, scoring='accuracy')

# PIPELINE 3: Standardize pipeline:
#scaler_model = make_pipeline(ColumnTransformer(transformers=[('cat', cat_pipe, categorical), ('num', scaler_pipe, numerical)]),
#                                 Perceptron(class_weight='balanced'))
#scaler_model.fit(X_train, y_train)
#print(scaler_model.score(X_test, y_test))
#scaler_CVscores = cross_val_score(scaler_model, X_test, y_test, cv=5, scoring='accuracy')

0.45059158069604416


In [20]:
normod = confusion_matrix(y_test, normalizer_model.predict(X_test))
normod
pd.DataFrame(normod, index=['1','2','3','4','5'], columns=['1','2','3','4','5'])

Unnamed: 0,1,2,3,4,5
1,24414,10313,3804,599,408
2,11918,13350,15440,2052,882
3,8686,10452,38940,6549,3532
4,5521,6152,41608,24903,13295
5,4851,4611,34056,18334,64933


In [None]:
audio_data_nona = audio_data.dropna()
print(audio_data.shape)  # Confirm no missing values
print(audio_data.head())  # Take a look at the data
print(audio_data.tail())   # See the frequencies at the end of the file

for field in ['label_text']:  
  fig,ax = plt.subplots()
  sns.countplot(audio_data_nona[field])
  xticks = ax.get_yticklabels()
  ax.tick_params(axis = 'x', rotation = 45)

for field in ['label']:  # Ensuring that the 'label' and 'label_text' fields have the same information
  fig,ax = plt.subplots()
  sns.countplot(audio_data_nona[field])
  xticks = ax.get_yticklabels()
  ax.tick_params(axis = 'x', rotation = 45)

audio_data.drop(['Unnamed: 0', 'filename', 'label_text'], axis=1, inplace=True)  # Drop unnecessary columns
select = audio_data.sample(20)
select1 = select.iloc[:,:20]
pd.set_option('max_columns', None)  # This is to see more columns and observe which carry signals for snare vs kick. Several columns appear separate each label
print(select1)
pd.reset_option('max_columns')
# This time you should drop two columns first: filename and label