<a href="https://colab.research.google.com/github/mzagari/ADS504_Team_8/blob/main/Post_Zoom_Version.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [101]:
__author__ = 'Martin Zagari, Summer Purschke, Dave Friesen'
__email__ = 'mzagari@sandiego.edu, spurschke@sandiego.edu, dfriesen@sandiego.edu'
__version__ = '1.0'
__date__ = 'August 2022'
__license__ = 'MIT'

In [185]:
# Basics
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.compose import ColumnTransformer

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report

# Utilities
import joblib
import re
from time import time
import warnings
#warnings.filterwarnings('ignore')
#warnings.resetwarnings()

# Set basic options for consistent output
PRECISION = 2
np.set_printoptions(precision = PRECISION)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
pd.set_option('display.precision', PRECISION)
pd.set_option('display.width', 1000)
pd.set_option('display.colheader_justify', 'center')

# Set Matplotlib defaults for consistent visualization look 'n' feel
FONTSIZE_S = 10
FONTSIZE_M = 12
FONTSIZE_L = 14
plt.style.use('default')
plt.rcParams['figure.titlesize'] = FONTSIZE_L
plt.rcParams['figure.figsize'] = (7, 7 / (16 / 9))
plt.rcParams['figure.subplot.left'] = '0.1'
plt.rcParams['figure.subplot.bottom'] = '0.1'
plt.rcParams['figure.subplot.top'] = '0.9'
plt.rcParams['figure.subplot.wspace'] = '0.4'
plt.rcParams['lines.linewidth'] = '2'
plt.rcParams['axes.linewidth'] = '2'
plt.rcParams['axes.titlesize'] = '8'
#plt.rcParams['axes.titleweight'] = 'bold'
plt.rcParams['axes.labelsize'] = FONTSIZE_M
plt.rcParams['xtick.labelsize'] = FONTSIZE_S
plt.rcParams['ytick.labelsize'] = FONTSIZE_S
plt.rcParams['grid.linewidth'] = '1'
plt.rcParams['legend.fontsize'] = FONTSIZE_S
plt.rcParams['legend.title_fontsize'] = FONTSIZE_S

In [186]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# RUN for Local File Retrieval

In [104]:
'''
# Set working directory (depending on coder)
#%cd '/Users/davidfriesen/Desktop/OneDrive/projects/ADS504_Team_8/data'

# Set row count control totals
structure_ctrl = sum(1 for line in open('csv_building_structure.csv'))
ownership_ctrl = sum(1 for line in open('csv_building_ownership_and_use.csv'))
damage_ctrl = sum(1 for line in open('csv_building_damage_assessment_featex.csv'))
'''

"\n# Set working directory (depending on coder)\n#%cd '/Users/davidfriesen/Desktop/OneDrive/projects/ADS504_Team_8/data'\n\n# Set row count control totals\nstructure_ctrl = sum(1 for line in open('csv_building_structure.csv'))\nownership_ctrl = sum(1 for line in open('csv_building_ownership_and_use.csv'))\ndamage_ctrl = sum(1 for line in open('csv_building_damage_assessment_featex.csv'))\n"

In [105]:
'''
# Read files, accomodating any 'bad' rows
structure_df = pd.read_csv('csv_building_structure.csv', on_bad_lines = 'skip', low_memory = False)
ownership_df = pd.read_csv('csv_building_ownership_and_use.csv', on_bad_lines = 'skip', low_memory = False)
damage_df = pd.read_csv('csv_building_damage_assessment_featex.csv', on_bad_lines = 'skip', low_memory = False)
'''

"\n# Read files, accomodating any 'bad' rows\nstructure_df = pd.read_csv('csv_building_structure.csv', on_bad_lines = 'skip', low_memory = False)\nownership_df = pd.read_csv('csv_building_ownership_and_use.csv', on_bad_lines = 'skip', low_memory = False)\ndamage_df = pd.read_csv('csv_building_damage_assessment_featex.csv', on_bad_lines = 'skip', low_memory = False)\n"

# Run for Google Drive/Colab/Cloud File Retrieval

In [187]:
# Set row count control totals
structure_ctrl = sum(1 for line in open('drive/MyDrive/ADS504/csv_building_structure.csv'))
ownership_ctrl = sum(1 for line in open('drive/MyDrive/ADS504/csv_building_ownership_and_use.csv'))
damage_ctrl = sum(1 for line in open('drive/MyDrive/ADS504/csv_building_damage_assessment_featex.csv'))

damage_df = pd.read_csv('drive/MyDrive/ADS504/csv_building_damage_assessment_featex.csv', on_bad_lines = 'skip', low_memory = False)
ownership_df = pd.read_csv('drive/MyDrive/ADS504/csv_building_ownership_and_use.csv', on_bad_lines = 'skip', low_memory = False)
structure_df = pd.read_csv('drive/MyDrive/ADS504/csv_building_structure.csv', on_bad_lines = 'skip', low_memory = False)

# Confirm load counts

In [188]:
# Confirm load counts
print('Structure: file=%0d, import=%0d, delta=%0d' %
      (structure_ctrl, len(structure_df), structure_ctrl - len(structure_df)))
print('Ownership: file=%0d, import=%0d, delta=%0d' %
      (ownership_ctrl, len(ownership_df), ownership_ctrl - len(ownership_df)))
print('Damage: file=%0d, import=%0d, delta=%0d' %
      (damage_ctrl, len(damage_df), damage_ctrl - len(damage_df)))

Structure: file=762106, import=762105, delta=1
Ownership: file=762106, import=762105, delta=1
Damage: file=762106, import=762105, delta=1


In [189]:
structure_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 762105 entries, 0 to 762104
Data columns (total 31 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   building_id                             762105 non-null  float64
 1   district_id                             762105 non-null  int64  
 2   vdcmun_id                               762105 non-null  int64  
 3   ward_id                                 762105 non-null  int64  
 4   count_floors_pre_eq                     762105 non-null  int64  
 5   count_floors_post_eq                    762105 non-null  int64  
 6   age_building                            762105 non-null  int64  
 7   plinth_area_sq_ft                       762105 non-null  int64  
 8   height_ft_pre_eq                        762105 non-null  int64  
 9   height_ft_post_eq                       762105 non-null  int64  
 10  land_surface_condition                  7621

In [190]:
ownership_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 762105 entries, 0 to 762104
Data columns (total 17 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   building_id                    762105 non-null  float64
 1   district_id                    762105 non-null  int64  
 2   vdcmun_id                      762105 non-null  int64  
 3   ward_id                        762105 non-null  int64  
 4   legal_ownership_status         762105 non-null  object 
 5   count_families                 762103 non-null  float64
 6   has_secondary_use              762095 non-null  float64
 7   has_secondary_use_agriculture  762105 non-null  int64  
 8   has_secondary_use_hotel        762105 non-null  int64  
 9   has_secondary_use_rental       762105 non-null  int64  
 10  has_secondary_use_institution  762105 non-null  int64  
 11  has_secondary_use_school       762105 non-null  int64  
 12  has_secondary_use_industry    

In [191]:
damage_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 762105 entries, 0 to 762104
Data columns (total 17 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   building_id                            762105 non-null  float64
 1   district_id                            762105 non-null  int64  
 2   vdcmun_id                              762105 non-null  int64  
 3   ward_id                                762105 non-null  int64  
 4   damage_overall_collapse                500743 non-null  object 
 5   damage_overall_leaning                 500742 non-null  object 
 6   area_assesed                           762093 non-null  object 
 7   damage_grade                           762093 non-null  object 
 8   technical_solution_proposed            762093 non-null  object 
 9   has_geotechnical_risk                  762093 non-null  float64
 10  has_geotechnical_risk_land_settlement  762105 non-null  

# Build the primary working df for processing: 'building_df'

In [347]:
# Eliminate features considered n/a to problem statement and hypothesis
structure_dr = structure_df.drop(columns = ['vdcmun_id', 'ward_id',
                                            'height_ft_post_eq', 'condition_post_eq', 'technical_solution_proposed'])

##################Drop Addl Cols
structure_dr = structure_dr.drop(columns = ['count_floors_post_eq'])
##################

ownership_dr = ownership_df.drop(columns = ['district_id', 'vdcmun_id', 'ward_id']
)
damage_dr = damage_df.drop(columns=['district_id', 'vdcmun_id', 'ward_id', 'damage_grade'])

##################Drop Addl Cols
damage_dr = damage_dr.drop(columns=['damage_overall_collapse','damage_overall_leaning','technical_solution_proposed'])
##################

# Merge all three dataframes into common set of label and prospective features
temp_df = pd.merge(structure_dr, ownership_dr, how = 'inner', on = 'building_id')
building_df = pd.merge(temp_df, damage_dr, how = 'inner', on = 'building_id')
building_df.drop('building_id', axis = 1, inplace = True)
building_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 762104 entries, 0 to 762103
Data columns (total 46 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   district_id                             762104 non-null  int64  
 1   count_floors_pre_eq                     762104 non-null  int64  
 2   age_building                            762104 non-null  int64  
 3   plinth_area_sq_ft                       762104 non-null  int64  
 4   height_ft_pre_eq                        762104 non-null  int64  
 5   land_surface_condition                  762104 non-null  object 
 6   foundation_type                         762104 non-null  object 
 7   roof_type                               762104 non-null  object 
 8   ground_floor_type                       762104 non-null  object 
 9   other_floor_type                        762104 non-null  object 
 10  position                                7621

In [348]:
# Identify target (y)
target_col = 'damage_grade'

# Summarize target class balance
label_count = len(building_df)
label_props = (building_df.groupby(target_col, dropna = False)[target_col].count() / label_count * 100).sort_values(ascending = False)
print('Target class (label) proportions:'); print(label_props.to_string(header = False))

Target class (label) proportions:
Grade 5   36.18
Grade 4   24.12
Grade 3   17.90
Grade 2   11.45
Grade 1   10.34
NaN        0.00


In [349]:
# Drop (relatively immaterial number of) null labels
print('Starting rows:', len(building_df), end = ' ')

#building_df.dropna(axis = 0, subset = [target_col], inplace = True)
building_df.dropna(axis = 0, inplace = True)

print('Ending rows:', len(building_df))

target_classes = sorted(building_df[target_col].unique())

Starting rows: 762104 Ending rows: 762091


# Extract Targets after merge

In [350]:
'''
target1_damagegrade = pd.DataFrame(building_df['damage_grade'])
target2_collapse = pd.DataFrame(building_df['damage_overall_collapse'])
target3_leaning = pd.DataFrame(building_df['damage_overall_leaning'])
target4_soln = pd.DataFrame(building_df['technical_solution_proposed'])
target5_floors =   pd.DataFrame(building_df['count_floors_post_eq'])

targets = pd.DataFrame(building_df[['damage_overall_collapse',
                                    'damage_overall_leaning',
                                    'damage_grade',
                                    'technical_solution_proposed',
                                    'count_floors_post_eq']])

building_df = building_df.drop(columns=['damage_overall_collapse',
                                        'damage_overall_leaning', 
                                        'damage_grade', 
                                        'technical_solution_proposed',
                                        'count_floors_post_eq'])
'''

"\ntarget1_damagegrade = pd.DataFrame(building_df['damage_grade'])\ntarget2_collapse = pd.DataFrame(building_df['damage_overall_collapse'])\ntarget3_leaning = pd.DataFrame(building_df['damage_overall_leaning'])\ntarget4_soln = pd.DataFrame(building_df['technical_solution_proposed'])\ntarget5_floors =   pd.DataFrame(building_df['count_floors_post_eq'])\n\ntargets = pd.DataFrame(building_df[['damage_overall_collapse',\n                                    'damage_overall_leaning',\n                                    'damage_grade',\n                                    'technical_solution_proposed',\n                                    'count_floors_post_eq']])\n\nbuilding_df = building_df.drop(columns=['damage_overall_collapse',\n                                        'damage_overall_leaning', \n                                        'damage_grade', \n                                        'technical_solution_proposed',\n                                        'count_floors_post_eq'])

In [351]:
#################### Edited to allow easy selection of features

# Identify prospective feature columns by 'type'
num_cols = [
    'age_building',
            'plinth_area_sq_ft',
            'height_ft_pre_eq',
#            'count_floors_post_eq',
            'count_families'
            ]
cat_cols = [
    'land_surface_condition',
            'foundation_type',
            'roof_type',
            'ground_floor_type',
            'other_floor_type',
            'position',
            'plan_configuration',
            'legal_ownership_status',
            'district_id',
#            'damage_overall_collapse',
#            'damage_overall_leaning',          
#            'technical_solution_proposed',
#            'area_assesed'
            ]
bin_cols = [
    'has_superstructure_adobe_mud',
            'has_superstructure_mud_mortar_stone',
            'has_superstructure_stone_flag',
            'has_superstructure_cement_mortar_stone',
            'has_superstructure_mud_mortar_brick',
            'has_superstructure_cement_mortar_brick',
            'has_superstructure_timber',
            'has_superstructure_bamboo',
            'has_superstructure_rc_non_engineered',
            'has_superstructure_rc_engineered',
            'has_superstructure_other',
            'has_secondary_use',
 #           'has_secondary_use_agriculture',
 #           'has_secondary_use_hotel',
 #           'has_secondary_use_rental',
 #           'has_secondary_use_institution',
 #           'has_secondary_use_school',
 #           'has_secondary_use_industry',
 #           'has_secondary_use_health_post',
 #           'has_secondary_use_gov_office',
 #           'has_secondary_use_use_police',
 #           'has_secondary_use_other',
            'has_geotechnical_risk',
 #           'has_geotechnical_risk_land_settlement',
 #           'has_geotechnical_risk_fault_crack',
 #           'has_geotechnical_risk_liquefaction',
 #           'has_geotechnical_risk_landslide',
 #           'has_geotechnical_risk_rock_fall',
 #           'has_geotechnical_risk_flood',
 #           'has_geotechnical_risk_other'
            ]

In [352]:
# Confirm [uniform] types
print(building_df.dtypes[num_cols].to_string())
print(building_df.dtypes[cat_cols].to_string())
print(building_df.dtypes[bin_cols].to_string())

age_building           int64
plinth_area_sq_ft      int64
height_ft_pre_eq       int64
count_families       float64
land_surface_condition    object
foundation_type           object
roof_type                 object
ground_floor_type         object
other_floor_type          object
position                  object
plan_configuration        object
legal_ownership_status    object
district_id                int64
has_superstructure_adobe_mud                int64
has_superstructure_mud_mortar_stone         int64
has_superstructure_stone_flag               int64
has_superstructure_cement_mortar_stone      int64
has_superstructure_mud_mortar_brick         int64
has_superstructure_cement_mortar_brick      int64
has_superstructure_timber                   int64
has_superstructure_bamboo                   int64
has_superstructure_rc_non_engineered        int64
has_superstructure_rc_engineered            int64
has_superstructure_other                    int64
has_secondary_use                    

# Merging only selected features (from lists above)

In [353]:
#################### New block that edits out columns with '#' in above block

building_df= pd.concat([building_df[num_cols], 
                        building_df[cat_cols],
                        building_df[bin_cols], 
                        building_df['damage_grade']], axis = 1)
building_df.head()

Unnamed: 0,age_building,plinth_area_sq_ft,height_ft_pre_eq,count_families,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,...,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,has_secondary_use,has_geotechnical_risk,damage_grade
0,9,288,9,1.0,Flat,Other,Bamboo/Timber-Light roof,Mud,Not applicable,Not attached,...,0,0,0,1,0,0,0,0.0,0.0,Grade 3
1,15,364,9,1.0,Flat,Other,Bamboo/Timber-Light roof,Mud,Not applicable,Not attached,...,0,0,0,1,0,0,0,0.0,0.0,Grade 5
2,20,384,9,1.0,Flat,Other,Bamboo/Timber-Light roof,Mud,Not applicable,Not attached,...,0,0,0,0,0,0,0,0.0,0.0,Grade 2
3,20,312,9,1.0,Flat,Other,Bamboo/Timber-Light roof,Mud,Not applicable,Not attached,...,0,0,0,0,0,0,0,0.0,0.0,Grade 2
4,30,308,9,1.0,Flat,Other,Bamboo/Timber-Light roof,Mud,Not applicable,Not attached,...,0,0,0,0,0,0,0,0.0,0.0,Grade 1


In [354]:
################# No longer need to name columns

# Update types as needed (for consistency)

building_df[num_cols] = building_df[num_cols].astype(int)
building_df[bin_cols] = building_df[bin_cols].astype(int)
#building_df['has_secondary_use'] = building_df['has_secondary_use'].astype(int)
#building_df['has_geotechnical_risk'] = building_df['has_geotechnical_risk'].astype(int)

In [355]:
# Confirm [uniform] types
print(building_df.dtypes[num_cols].to_string())
print(building_df.dtypes[cat_cols].to_string())
print(building_df.dtypes[bin_cols].to_string())

age_building         int64
plinth_area_sq_ft    int64
height_ft_pre_eq     int64
count_families       int64
land_surface_condition    object
foundation_type           object
roof_type                 object
ground_floor_type         object
other_floor_type          object
position                  object
plan_configuration        object
legal_ownership_status    object
district_id                int64
has_superstructure_adobe_mud              int64
has_superstructure_mud_mortar_stone       int64
has_superstructure_stone_flag             int64
has_superstructure_cement_mortar_stone    int64
has_superstructure_mud_mortar_brick       int64
has_superstructure_cement_mortar_brick    int64
has_superstructure_timber                 int64
has_superstructure_bamboo                 int64
has_superstructure_rc_non_engineered      int64
has_superstructure_rc_engineered          int64
has_superstructure_other                  int64
has_secondary_use                         int64
has_geotechnical_ri

In [356]:
'''
# Update types as needed (for consistency)

building_df['count_families'] = building_df['count_families'].astype(int)
building_df['has_superstructure_other'] = building_df['has_superstructure_other'].astype(int)
building_df['has_secondary_use'] = building_df['has_secondary_use'].astype(int)
building_df['has_geotechnical_risk'] = building_df['has_geotechnical_risk'].astype(int)

SyntaxError: ignored

# Sample the full df to run model tests

In [357]:
building_df = building_df.sample(frac=.1, random_state=42)

#Remap the target before splitting

In [358]:
####################### Remap the target

dict = {'Grade 1' : 0, 
        'Grade 2' : 0,
        'Grade 3' : 0,
        'Grade 4' : 1,
        'Grade 5' : 2
        }
  
# Remap the values of the dataframe
building_df = building_df.replace({'damage_grade': dict})

In [359]:


# Segregate data into predictor (X) and target (y) dataframes
building_X = building_df.loc[:, building_df.columns != 'damage_grade'].copy()
building_y = building_df['damage_grade']

# Partition data 70/30, stratifying for class balance (ref. above)
X_train, X_test, y_train, y_test = train_test_split(
    building_X, building_y, test_size = 0.3, random_state = 42, stratify = building_y
)

In [360]:
building_y.head()

7532      0
336294    1
266036    2
225007    2
132635    1
Name: damage_grade, dtype: int64

In [361]:
na_df = pd.DataFrame(X_train[num_cols + cat_cols + bin_cols].isna().sum()) / len(X_train) * 100
na_df.index.name = 'column'; na_df.reset_index(inplace = True)
print('Proportion of nulls:'); print(na_df[na_df[0] > 0].to_string(index = False, header = False))

Proportion of nulls:
Empty DataFrame
Columns: [column, 0]
Index: []


In [None]:
# Address missing/null values here? Or, pipeline? For both train, test. . .

############## Eliminated
'''
X_train['damage_overall_collapse'].fillna('[missing]', inplace = True)
X_test['damage_overall_collapse'].fillna('[missing]', inplace = True)

X_train['damage_overall_leaning'].fillna('[missing]', inplace = True)
X_test['damage_overall_leaning'].fillna('[missing]', inplace = True)
'''

In [362]:
cat_count = len(X_train)
for cc in cat_cols:
    print()
    print(cc, '-')
    for cv in pd.unique(X_train[cc]):
        if isinstance(cv, float) != True:
            print(cv, '(%.2f)' % (X_train[X_train[cc] == cv][cc].count() / cat_count * 100))


land_surface_condition -
Flat (82.81)
Moderate slope (13.78)
Steep slope (3.42)

foundation_type -
Mud mortar-Stone/Brick (82.36)
Bamboo/Timber (7.52)
Other (0.61)
RC (4.20)
Cement-Stone/Brick (5.31)

roof_type -
Bamboo/Timber-Light roof (65.93)
Bamboo/Timber-Heavy roof (28.17)
RCC/RB/RBC (5.90)

ground_floor_type -
Mud (81.00)
RC (9.60)
Brick/Stone (8.80)
Timber (0.49)
Other (0.12)

other_floor_type -
TImber/Bamboo-Mud (63.55)
Timber-Planck (16.60)
Not applicable (15.56)
RCC/RB/RBC (4.29)

position -
Not attached (79.13)
Attached-2 side (3.59)
Attached-1 side (17.09)
Attached-3 side (0.19)

plan_configuration -
Rectangular (95.82)
L-shape (1.42)
Square (2.34)
Multi-projected (0.14)
T-shape (0.12)
U-shape (0.06)
Others (0.07)
E-shape (0.01)
H-shape (0.01)
Building with Central Courtyard (0.01)

legal_ownership_status -
Private (96.06)
Institutional (1.00)
Public (2.49)
Other (0.44)

district_id -
23 (11.66)
28 (10.30)
24 (12.96)
20 (9.09)
12 (5.03)
36 (10.12)
29 (1.69)
21 (7.74)
22 (7

In [363]:
# Encode categorical label (y)
label_enc = LabelEncoder().fit(y_train)
y_train = label_enc.transform(y_train)
y_test = label_enc.transform(y_test)

# Encode other here? Or, pipeline (currently handled in pipeline)?

In [364]:
print('Skewness:'); print(pd.DataFrame(building_X[num_cols].skew()).to_string(header = False))

Skewness:
age_building      13.82
plinth_area_sq_ft  3.77
height_ft_pre_eq   2.53
count_families     1.35


In [None]:
# Address outliers here?

In [None]:
# Address centering/scaling here? Or, pipeline (currently handled in pipeline)?

In [365]:
print(building_X[bin_cols].max() - building_X[bin_cols].min())

has_superstructure_adobe_mud              1
has_superstructure_mud_mortar_stone       1
has_superstructure_stone_flag             1
has_superstructure_cement_mortar_stone    1
has_superstructure_mud_mortar_brick       1
has_superstructure_cement_mortar_brick    1
has_superstructure_timber                 1
has_superstructure_bamboo                 1
has_superstructure_rc_non_engineered      1
has_superstructure_rc_engineered          1
has_superstructure_other                  1
has_secondary_use                         1
has_geotechnical_risk                     1
dtype: int64


In [366]:
#
corr_df = X_train[num_cols].corr()
corr_df.rename(columns = lambda s: s[0:19], index = lambda s: s[0:19], inplace = True)
print(corr_df)

#
# Do something like Cramer's V for categorical correlation?

                   age_building  plinth_area_sq_ft  height_ft_pre_eq  count_families
age_building           1.00           -0.01               0.03             0.00     
plinth_area_sq_ft     -0.01            1.00               0.20             0.10     
height_ft_pre_eq       0.03            0.20               1.00             0.06     
count_families         0.00            0.10               0.06             1.00     


In [367]:
# Standardized modeling function, to create and execute a modeling pipeline through
#   to model performance metrics
def model(algorithm, 
          iteration, 
          params, 
#          classes, 
          X_tr, y_tr, X_te, y_te, 
          cv, 
          plot_cm):
    # Create transformers
    cat_encoder = OneHotEncoder(handle_unknown = 'ignore', sparse = False)
    standardizer = StandardScaler()
    
    # Create preprocessor
    cat_transformer = make_pipeline(cat_encoder)
    num_transformer = make_pipeline(standardizer)
    preprocessor = ColumnTransformer(
        [('cat', cat_transformer, cat_cols),
         ('num', num_transformer, num_cols)]
    )

    # Prepare algorithm with hyperparameters+
    algorithm.set_params(**params)
    algorithm_name = type(algorithm).__name__ + ' (iteration ' + str(iteration) + ')'
    
    # Create pipeline and cross-validate model (for later output)
    pipe = make_pipeline(preprocessor, algorithm)
    
    # Fit model
    print('\n\n', 'Fitting ' + algorithm_name + '...', end = ' ')
    fit_start = time()
    pipe.fit(X_tr, y_tr)
    fit_time = time() - fit_start
    print('done in %0.3fs.' % fit_time)
    
    # Cross-validate model
    scores = []
    cv_time = 0
    if (cv > 0):
        print('Cross-validating ' + algorithm_name + '...', end = ' ')
        cv_start = time()
        scores = cross_val_score(pipe, X_tr, y_tr, cv = cv, scoring = 'accuracy')
        cv_time = time() - cv_start
        print('done in %0.3fs.' % cv_time)
        print(scores)
    
    # Validate model
    print('Validating ' + algorithm_name + '...', end = ' ')
    val_start = time()
    y_tr_pred = pipe.predict(X_tr)
    y_te_pred = pipe.predict(X_te)
    val_time = time() - val_start
    print('done in %0.2fs.' % val_time)

    # Show validation results
    if plot_cm:
        fig, ax = plt.subplots()
        cmd = ConfusionMatrixDisplay(confusion_matrix(y_te, y_te_pred), display_labels = classes)
        cmd.plot(ax = ax)
        plt.suptitle(algorithm_name, y = 1)
        plt.title(params)
        plt.show()

    print('\n', algorithm_name)
    print(params)
    print(classification_report(y_te, y_te_pred))    
    
    # Persist pipeline (+model)
    pipe_filename = algorithm_name + '.pipe'
    joblib.dump(pipe, pipe_filename)
    
    # Persist results
    results = []
    results.append ({
        'algorithm': algorithm_name,
        'parameters': params,
        'fit_time': fit_time,
        'cv_time': cv_time,
        'val_time': val_time,
        'scores': tuple(scores),
        'train_acc': accuracy_score(y_tr_pred, y_tr),
        'test_acc': accuracy_score(y_te_pred, y_te)
    })
    results_filename = algorithm_name + '.results'
    joblib.dump(results, results_filename)

    return pd.DataFrame(results)

In [368]:
# Iterate on models (preliminary)
clf_results = pd.DataFrame()

############### Function Structure

#def model(algorithm, 
#          iteration, 
#          params, 
#          classes, 
#          X_tr, y_tr, X_te, y_te, 
#          cv, 
#          plot_cm)


for depth in range(5, 11):
    tree_r = model(DecisionTreeClassifier(),
                   (depth - 4),
                   {'max_depth': depth},
#                   target_classes,
                   X_train, y_train, X_test, y_test,
                   cv = 0, 
                   plot_cm = False)
    clf_results = pd.concat([clf_results, tree_r])

#
perc_r = model(Perceptron(), 
               1,
               {'class_weight': 'balanced'},
#               target_classes, 
               X_train, y_train, X_test, y_test,
               cv = 0, 
               plot_cm = False)
clf_results = pd.concat([clf_results, perc_r])

#for i in range(1, 3):
#    knn_r = model(KNeighborsClassifier(),
#                  i,
#                  {'n_neighbors': 2 * i + 1},
#                  target_classes, 
#                  X_train, y_train, X_test, y_test,
#                  cv = 0, 
#                  plot_cm = False)
#    clf_results = pd.concat([clf_results, knn_r])

logr_r = model(LogisticRegression(), 
               1,
               {'solver': 'saga', 'max_iter': 400, 'multi_class': 'multinomial'},
#               target_classes, 
               X_train, y_train, X_test, y_test,
               cv = 0, plot_cm = False)
clf_results = pd.concat([clf_results, logr_r])

svm_r = model(LinearSVC(), 
              1,
              {'multi_class': 'crammer_singer', 'max_iter': 2000},
#              target_classes, 
              X_train, y_train, X_test, y_test,
              cv = 0, 
              plot_cm = False)
clf_results = pd.concat([clf_results, svm_r])

clf_results



 Fitting DecisionTreeClassifier (iteration 1)... done in 0.364s.
Validating DecisionTreeClassifier (iteration 1)... done in 0.22s.

 DecisionTreeClassifier (iteration 1)
{'max_depth': 5}
              precision    recall  f1-score   support

           0       0.77      0.53      0.63      9110
           1       0.00      0.00      0.00      5499
           2       0.46      0.93      0.62      8254

    accuracy                           0.55     22863
   macro avg       0.41      0.49      0.41     22863
weighted avg       0.47      0.55      0.47     22863



 Fitting DecisionTreeClassifier (iteration 2)... done in 0.374s.
Validating DecisionTreeClassifier (iteration 2)... done in 0.21s.

 DecisionTreeClassifier (iteration 2)
{'max_depth': 6}
              precision    recall  f1-score   support

           0       0.52      0.92      0.66      9110
           1       0.11      0.00      0.00      5499
           2       0.68      0.54      0.60      8254

    accuracy           



done in 41.857s.
Validating LinearSVC (iteration 1)... done in 0.25s.

 LinearSVC (iteration 1)
{'multi_class': 'crammer_singer', 'max_iter': 2000}
              precision    recall  f1-score   support

           0       0.62      0.75      0.68      9110
           1       0.33      0.27      0.30      5499
           2       0.67      0.60      0.63      8254

    accuracy                           0.58     22863
   macro avg       0.54      0.54      0.54     22863
weighted avg       0.57      0.58      0.57     22863



Unnamed: 0,algorithm,parameters,fit_time,cv_time,val_time,scores,train_acc,test_acc
0,DecisionTreeClassifier (iteration 1),{'max_depth': 5},0.36,0,0.22,(),0.55,0.55
0,DecisionTreeClassifier (iteration 2),{'max_depth': 6},0.37,0,0.21,(),0.56,0.56
0,DecisionTreeClassifier (iteration 3),{'max_depth': 7},0.42,0,0.21,(),0.57,0.57
0,DecisionTreeClassifier (iteration 4),{'max_depth': 8},0.42,0,0.22,(),0.59,0.59
0,DecisionTreeClassifier (iteration 5),{'max_depth': 9},0.46,0,0.2,(),0.6,0.59
0,DecisionTreeClassifier (iteration 6),{'max_depth': 10},0.46,0,0.21,(),0.61,0.59
0,Perceptron (iteration 1),{'class_weight': 'balanced'},0.57,0,0.25,(),0.45,0.45
0,LogisticRegression (iteration 1),"{'solver': 'saga', 'max_iter': 400, 'multi_cla...",38.81,0,0.25,(),0.6,0.6
0,LinearSVC (iteration 1),"{'multi_class': 'crammer_singer', 'max_iter': ...",41.86,0,0.25,(),0.58,0.58


In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier

# Iterate on models (preliminary)
clf_results2 = pd.DataFrame()

############### Function Structure

nn_r = model(MLPClassifier(),
             1,
             {'random_state' : 1, 'max_iter': 300},
#             target_classes,
             X_train, y_train, X_test, y_test,
             cv = 0,
             plot_cm = False)
clf_results2 = pd.concat([clf_results2, nn_r])

rf_r = model(RandomForestClassifier(),
             1,
             {'max_depth' : 2},
#             target_classes,
             X_train, y_train, X_test, y_test,
             cv = 0,
             plot_cm = False)
clf_results2 = pd.concat([clf_results2, rf_r])

ridge_r = model(RidgeClassifier(),
             1,
             {'alpha' : 1, 'max_iter': 1000},
#             target_classes,
             X_train, y_train, X_test, y_test,
             cv = 0,
             plot_cm = False)
clf_results2 = pd.concat([clf_results2, ridge_r])

clf_results2

# Summer's kNN Minkowski
## Still need to add a for loop for k

In [None]:
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB

In [None]:
clf_results3 = pd.DataFrame()

knnmink_r = model(KNeighborsClassifier(),
              1,
              {'metric' : 'minkowski', 'n_neighbors' : 5},
#              target_classes,
              X_train, y_train, X_test, y_test,
              cv = 0,
              plot_cm = False)
clf_results3 = pd.concat([clf_results3, knnmink_r])

clf_results3

In [None]:
############# Summer's previous kNN
'''
# creating a function to create and test a KNN model and output accuracy scores 
def k_nn(xtrain, ytrain, xtest, ytest, kvalues, metric):
  knn_accuracy = []
  clfs = []
  for i in kvalues: 
    clf = KNeighborsClassifier(metric = metric, n_neighbors = i).fit(X_train, y_train)
    clf_train_pred = clf.predict(X_train)
    clf_test_pred = clf.predict(X_test)
    clfs.append(clf)
    knn_accuracy.append({'k-values':i, 'Training Accuracy': accuracy_score(clf_train_pred, y_train), 
                         'Test Accuracy': accuracy_score(clf_test_pred, y_test) })
    print('K-value', i, 'complete')
  return pd.DataFrame(knn_accuracy), clfs

# Summer's kNN Manhattan
## Still need to add a for loop for k

In [None]:
knnman_r = model(KNeighborsClassifier(),
              1,
              {'metric' : 'manhattan', 'n_neighbors' : 5},
#              target_classes,
              X_train, y_train, X_test, y_test,
              cv = 0,
              plot_cm = False)
clf_results3 = pd.concat([clf_results3, knnman_r])

clf_results3

# Summer's NB Bernoulli

In [None]:
alpha = [0.1,1,5,10]
for a in alpha:
  nbbern_r = model(BernoulliNB(),
                   1,
                   {'alpha' : a},
#                   target_classes,
                   X_train, y_train, X_test, y_test,
                   cv = 0,
                   plot_cm = False)
  clf_results3 = pd.concat([clf_results3, nbbern_r])

clf_results3

In [None]:
############# Summer's previous NB_bern

# Bernoulli 
# creating a function to create and test a Naive Bayes model and output accuracy scores 
def nb_bern(xtrain, ytrain, xtest, ytest, alpha):
  nb_bern_acc = []
  clfs_bern = []
  for a in alpha: 
    clf = BernoulliNB(alpha = a).fit(X_train, y_train)
    clf_train_pred = clf.predict(X_train)
    clf_test_pred = clf.predict(X_test)
    clfs_bern.append(clf)
    nb_bern_acc.append({'alpha':a, 'Training Accuracy': accuracy_score(clf_train_pred, y_train), 
                         'Test Accuracy': accuracy_score(clf_test_pred, y_test) })
    print('alpha', a, 'complete')
  return pd.DataFrame(nb_bern_acc), clfs_bern



In [None]:
############# Summer's previous NB_bern

nb_bern_acc, nb_bern_clf = nb_bern(X_train, y_train, X_test, y_test, (np.arange(0,10,.5)))
display(nb_bern_acc)

