# Data Cleaning and Preparation

### Import libraries

In [80]:
import os
import numpy as np
import pandas as pd
import shapely as shapely
import geopandas as gpd

### Load Data and Keep Relevant Columns

In [94]:
# Input files: the claims table (parquet) and its accompanying metadata (CSV).
claims_data_file = 'FimaNfipClaims.parquet'
metadata_file = 'fima-claims-metadata.csv'

# The metadata carries a 'Used in Model' flag per row; collect the 'Name'
# of every row flagged with 1. These names are the features to load.
metadata = pd.read_csv(metadata_file)
cols = metadata.loc[metadata['Used in Model'].eq(1), 'Name'].tolist()

#### Keep useful features for Harris County (FIPS = 48201)

In [95]:
# Read only the selected feature columns from the claims file.
claims = pd.read_parquet(claims_data_file, columns=cols)
# Keep the rows whose countyCode is '48201' (Harris County). The code column
# holds a single value after filtering, so it is dropped.
claims = (
    claims.loc[claims['countyCode'].eq('48201')]
    .copy()
    .drop(columns=['countyCode'])
)

#### Check features with most missing values 

In [96]:
# Per-column share of missing entries, expressed in percent and listed from
# the most incomplete column to the least.
(
    claims.isna()
    .sum()
    .sort_values(ascending=False)
    .div(len(claims))
    .mul(100)
)

floodCharacteristicsIndicator         98.811421
lowestFloorElevation                  81.458397
elevationDifference                   80.087049
basementEnclosureCrawlspaceType       79.918340
floodZoneCurrent                      57.590740
nfipCommunityName                     57.238091
buildingDescriptionCode               54.244089
locationOfContents                    24.642079
buildingDamageAmount                  18.420929
buildingPropertyValue                 18.420929
floodEvent                            13.324507
waterDepth                             6.625349
floodWaterDuration                     4.037303
ratedFloodZone                         2.048527
causeOfDamage                          1.798978
numberOfFloorsInTheInsuredBuilding     0.603369
occupancyType                          0.012302
originalConstructionDate               0.002343
latitude                               0.000000
reportedCity                           0.000000
longitude                              0.000000

#### Keep columns with fewer than 60% missing values

In [None]:
# Count the missing entries of every column once, then keep the columns whose
# count is strictly below 60% of the number of rows.
keep_cols = [
    name
    for name, n_missing in claims.isnull().sum().items()
    if n_missing < 0.6 * len(claims)
]
claims = claims.loc[:, keep_cols].copy()
# Display the retained column names.
keep_cols

['elevatedBuildingIndicator',
 'ratedFloodZone',
 'locationOfContents',
 'numberOfFloorsInTheInsuredBuilding',
 'occupancyType',
 'originalConstructionDate',
 'yearOfLoss',
 'buildingDamageAmount',
 'buildingPropertyValue',
 'causeOfDamage',
 'floodWaterDuration',
 'floodproofedIndicator',
 'floodEvent',
 'waterDepth',
 'state',
 'reportedCity',
 'reportedZipCode',
 'latitude',
 'longitude',
 'id']

### Create derived features

#### Clean Original Construction Date

In [98]:
# Reduce originalConstructionDate to the construction year. Any missing marker
# (None, NaN or NaT) becomes NaN instead of being dereferenced with `.year`;
# the previous `is not None` check only recognised None.
# NOTE: this cell expects date-like values; re-running it on the already
# converted (numeric) column will fail on `.year`.
claims['originalConstructionDate'] = pd.to_numeric(
    claims['originalConstructionDate']
    .apply(lambda d: d.year if pd.notna(d) else np.nan)
)
# Compute the median on the numeric column (not on an object-dtype one) and
# use it to fill the missing construction years.
median_construction_date = claims['originalConstructionDate'].median()
claims['originalConstructionDate'] = (
    claims['originalConstructionDate'].fillna(median_construction_date)
)

#### Derive building age

In [105]:
# Building age in the year of the loss: loss year minus construction year,
# with every value below 1 raised to 1.
claims['buildingAgeAtLoss'] = (
    claims['yearOfLoss']
    .sub(claims['originalConstructionDate'])
    .clip(lower=1)
)